2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
19 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    uploader: Nickname of the video uploader.
    stitle: Simplified title.
    ext: Video filename extension.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:
    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): one line between the docstring and this call is
        # elided in this excerpt (presumably lazy-init state setup — verify).
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL must be defined by each concrete subclass.
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the `def initialize(self):` header and its guard logic
    # are elided from this excerpt; only the docstring and the delegation
    # to _real_initialize() remain visible.
    """Initializes an instance (authentication, etc)."""
    self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): one line (presumably a call triggering lazy
        # initialization) is elided before this return.
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # NOTE(review): the (no-op) body line is elided in this excerpt.

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # NOTE(review): the (no-op) body line is elided in this excerpt.
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    NOTE(review): this class is a sampled excerpt — `try:` headers,
    `return` statements, dict closers and several guard lines are elided
    throughout; code lines below are kept exactly as visible.
    """

    # Matches watch pages, youtu.be short links, embeds and nocookie hosts;
    # group 2 captures the 11-char video id (group 1 anchors the host part).
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; remaining entries and the closing brace
    # are elided in this excerpt.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display dimensions; entries and closing brace elided in excerpt.
    _video_dimensions = {

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube timedtext XML into SubRip (.srt) text.

        NOTE(review): excerpt — the accumulator initialization, the float
        conversion of `start`, the per-caption counter line and the final
        return are elided from this view.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            # Default duration when the <text> tag carries no dur attribute.
            if not dur: dur = '4'
            # NOTE(review): an elided line presumably converts `start` to
            # float before this arithmetic — verify against the full file.
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print 'Available formats:'
        # NOTE(review): the `for x in formats:` loop header is elided here.
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set language, then log in (params or .netrc) and confirm age.

        NOTE(review): excerpt — early returns, `try:` headers and the
        login/age form dict assignments are partially elided below.
        """
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # .netrc problems are non-fatal: warn and continue anonymously.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Force the English UI so later regexes match (failure is non-fatal).
        request = urllib2.Request(self._LANG_URL)
        urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        # NOTE(review): the `login_form = {` opener is elided in this excerpt.
        'current_form': 'loginForm',
        'action_login': 'Log In',
        'username': username,
        'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        login_results = urllib2.urlopen(request).read()
        # The login form reappearing in the response means authentication failed.
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # NOTE(review): the `age_form = {` opener is elided in this excerpt.
        'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Download the watch page and get_video_info, pick formats, and
        build the result dict(s).

        NOTE(review): excerpt — guard `if`s, `try:` headers, `return`s and
        the result-dict opener are partially elided below.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Unescape the JSON-escaped slashes in the SWF URL.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several 'el' variants; stop at the first response with a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            video_info_webpage = urllib2.urlopen(request).read()
            video_info = parse_qs(video_info_webpage)
            if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: scrape the watch page and normalize to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions (optional)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            self.report_video_subtitles_download(video_id)
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            srt_list = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
            # Prefer the user-requested language, then English, then the first listed.
            if self._downloader.params.get('subtitleslang', False):
                srt_lang = self._downloader.params.get('subtitleslang')
            elif 'en' in srt_lang_list:
            srt_lang = srt_lang_list[0]
            if not srt_lang in srt_lang_list:
                self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
            srt_xml = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            self._downloader.trouble(u'WARNING: video has no closed captions')

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap quality at the requested limit.
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')
            # NOTE(review): the result-dict opener is elided in this excerpt.
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    NOTE(review): sampled excerpt — `try:` headers, guard `if`s and
    `return` lines are elided throughout; code lines kept as visible.
    """

    # Group 1 = video id, group 2 = simplified title slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter opt-out."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # NOTE(review): the `disclaimer_form = {` opener is elided here.
        'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed ids to the YouTube extractor via the downloader.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = urllib.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        # Fallback: pull the media URL out of the flashvars blob.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        vardict = parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the result-dict opener/return is elided in this excerpt.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'upload_date': u'NA',
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion.

    NOTE(review): sampled excerpt — guard `if`s, `try:` headers and
    `return` lines are elided; code lines kept as visible.
    """

    # Group 1 = video id (before the first underscore), group 2 = title slug.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the SD media URL, title and uploader from a video page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-restricted pages are served.
        request.add_header('Cookie', 'family_filter=off')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        sequence = urllib.unquote(mobj.group(1))
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the result-dict opener/return is elided in this excerpt.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'upload_date': u'NA',
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com.

    NOTE(review): sampled excerpt — guard `if`s, `try:` headers and
    `return` lines are elided; code lines kept as visible.
    """

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and description from a videoplay page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # Fallback: no MP4 download URL -> scrape the escaped FLV stream URL.
        video_extension = 'flv'
        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # Undo the \xNN escaping used in the page's inline JavaScript.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # Search page is fetched only to scrape the thumbnail image.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info

        # NOTE(review): the result-dict opener/return is elided in this excerpt.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'upload_date': u'NA',
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    NOTE(review): sampled excerpt — guard `if`s, `try:` headers and
    `return` lines are elided; code lines kept as visible.
    """

    # Group 1 = the .flv filename from the `current` query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a media page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        # Title regex captures both the title (group 1) and uploader (group 2).
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the result-dict opener/return is elided in this excerpt.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': u'NA',
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    NOTE(review): sampled excerpt — guard `if`s, `try:` headers and
    `return` lines are elided; code lines kept as visible.
    """

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract media URL and metadata; rewrites non-/watch/ URLs and
        recurses once (new_video=False) on the rewritten URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        simple_title = simplify_title(video_title)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) here captures the 'people|profile' path
        # alternative, not the uploader name — group(2) looks intended; verify.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # NOTE(review): the result-dict opener/return is elided in this excerpt.
        'id': video_id.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': u'NA',
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension.decode('utf-8'),
        'thumbnail': video_thumbnail.decode('utf-8'),
        'description': video_description,
        # NOTE(review): 'thumbnail' appears twice in this dict literal; the
        # later entry silently wins — likely an unintended duplicate key.
        'thumbnail': video_thumbnail,
# Information extractor for vimeo.com. Scrapes the watch page, parses the
# inline "{config:...}" JSON for title/uploader/thumbnail, picks the first
# available codec from a fixed preference list, and builds a signed
# play_redirect URL.
# NOTE(review): this view of the source is elided — control-flow lines
# (try:, "if mobj is None:", return) are missing between sampled lines.
# Comments only; code tokens left byte-identical.
991 class VimeoIE(InfoExtractor):
992 """Information extractor for vimeo.com."""
994 # _VALID_URL matches Vimeo URLs
995 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
998 def __init__(self, downloader=None):
999 InfoExtractor.__init__(self, downloader)
1001 def report_download_webpage(self, video_id):
1002 """Report webpage download."""
1003 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1005 def report_extraction(self, video_id):
1006 """Report information extraction."""
1007 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1009 def _real_extract(self, url, new_video=True):
1010 # Extract ID from URL
1011 mobj = re.match(self._VALID_URL, url)
1013 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1016 video_id = mobj.group(1)
1018 # Retrieve video webpage to extract further information
1019 request = urllib2.Request(url, None, std_headers)
1021 self.report_download_webpage(video_id)
1022 webpage = urllib2.urlopen(request).read()
1023 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1024 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1027 # Now we begin extracting as much information as we can from what we
1028 # retrieved. First we extract the information common to all extractors,
1029 # and latter we extract those that are Vimeo specific.
1030 self.report_extraction(video_id)
1032 # Extract the config JSON
# Brittle string split on the page's embedded JS config object; any markup
# change upstream breaks this — TODO confirm still matches the live page.
1033 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1035 config = json.loads(config)
1037 self._downloader.trouble(u'ERROR: unable to extract info section')
1041 video_title = config["video"]["title"]
1042 simple_title = simplify_title(video_title)
1045 video_uploader = config["video"]["owner"]["name"]
1047 # Extract video thumbnail
1048 video_thumbnail = config["video"]["thumbnail"]
1050 # Extract video description
1051 video_description = get_element_by_id("description", webpage.decode('utf8'))
1052 if video_description: video_description = clean_html(video_description)
1053 else: video_description = ''
1055 # Extract upload date
1056 video_upload_date = u'NA'
1057 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1058 if mobj is not None:
1059 video_upload_date = mobj.group(1)
1061 # Vimeo specific: extract request signature and timestamp
1062 sig = config['request']['signature']
1063 timestamp = config['request']['timestamp']
1065 # Vimeo specific: extract video codec and quality information
1066 # TODO bind to format param
# Preference order: h264 first, then vp8, then vp6; first match wins.
1067 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1068 for codec in codecs:
1069 if codec[0] in config["video"]["files"]:
1070 video_codec = codec[0]
1071 video_extension = codec[1]
1072 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1073 else: quality = 'sd'
1076 self._downloader.trouble(u'ERROR: no known codec found')
1079 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1080 %(video_id, sig, timestamp, quality, video_codec.upper())
# Result dictionary fields match the InfoExtractor contract described in the
# class docstring at the top of the file (id/url lines elided from this view).
1085 'uploader': video_uploader,
1086 'upload_date': video_upload_date,
1087 'title': video_title,
1088 'stitle': simple_title,
1089 'ext': video_extension,
1090 'thumbnail': video_thumbnail,
1091 'description': video_description,
# Generic last-resort extractor: follows URL-shortener redirects via HEAD
# requests, then scans the page for a JW-Player/SWFObject "file=" video URL
# and derives id/title/uploader heuristically.
# NOTE(review): this view of the source is elided — try:/if/return lines are
# missing between sampled lines. Comments only; code tokens left untouched.
1096 class GenericIE(InfoExtractor):
1097 """Generic last-resort information extractor."""
1100 IE_NAME = u'generic'
1102 def __init__(self, downloader=None):
1103 InfoExtractor.__init__(self, downloader)
1105 def report_download_webpage(self, video_id):
1106 """Report webpage download."""
1107 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1108 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1110 def report_extraction(self, video_id):
1111 """Report information extraction."""
1112 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1114 def report_following_redirect(self, new_url):
1115 """Report information extraction."""
1116 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1118 def _test_redirect(self, url):
1119 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Local Request subclass that issues HEAD instead of GET (body elided here).
1120 class HeadRequest(urllib2.Request):
1121 def get_method(self):
1124 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1126 Subclass the HTTPRedirectHandler to make it use our
1127 HeadRequest also on the redirected URL
1129 def redirect_request(self, req, fp, code, msg, headers, newurl):
1130 if code in (301, 302, 303, 307):
# Some servers emit unencoded spaces in Location headers; patch them up.
1131 newurl = newurl.replace(' ', '%20')
1132 newheaders = dict((k,v) for k,v in req.headers.items()
1133 if k.lower() not in ("content-length", "content-type"))
1134 return HeadRequest(newurl,
1136 origin_req_host=req.get_origin_req_host(),
1139 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1141 class HTTPMethodFallback(urllib2.BaseHandler):
1143 Fallback to GET if HEAD is not allowed (405 HTTP error)
1145 def http_error_405(self, req, fp, code, msg, headers):
1149 newheaders = dict((k,v) for k,v in req.headers.items()
1150 if k.lower() not in ("content-length", "content-type"))
1151 return self.parent.open(urllib2.Request(req.get_full_url(),
1153 origin_req_host=req.get_origin_req_host(),
# Hand-built opener: deliberately omits the default redirect handler so the
# HEAD-preserving handlers above control redirect behavior.
1157 opener = urllib2.OpenerDirector()
1158 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1159 HTTPMethodFallback, HEADRedirectHandler,
1160 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1161 opener.add_handler(handler())
1163 response = opener.open(HeadRequest(url))
1164 new_url = response.geturl()
# No redirect happened; caller proceeds with normal extraction.
1166 if url == new_url: return False
# Redirect detected: restart the whole extractor chain on the target URL.
1168 self.report_following_redirect(new_url)
1169 self._downloader.download([new_url])
1172 def _real_extract(self, url):
1173 if self._test_redirect(url): return
1175 video_id = url.split('/')[-1]
1176 request = urllib2.Request(url)
1178 self.report_download_webpage(video_id)
1179 webpage = urllib2.urlopen(request).read()
1180 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1181 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1183 except ValueError, err:
1184 # since this is the last-resort InfoExtractor, if
1185 # this error is thrown, it'll be thrown here
1186 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1189 self.report_extraction(video_id)
1190 # Start with something easy: JW Player in SWFObject
1191 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1193 # Broaden the search a little bit
1194 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1196 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1199 # It's possible that one of the regexes
1200 # matched, but returned an empty group:
1201 if mobj.group(1) is None:
1202 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1205 video_url = urllib.unquote(mobj.group(1))
1206 video_id = os.path.basename(video_url)
1208 # here's a fun little line of code for you:
1209 video_extension = os.path.splitext(video_id)[1][1:]
1210 video_id = os.path.splitext(video_id)[0]
1212 # it's tempting to parse this further, but you would
1213 # have to take into account all the variations like
1214 # Video Title - Site Name
1215 # Site Name | Video Title
1216 # Video Title - Tagline | Site Name
1217 # and so on and so forth; it's just not practical
1218 mobj = re.search(r'<title>(.*)</title>', webpage)
1220 self._downloader.trouble(u'ERROR: unable to extract title')
1222 video_title = mobj.group(1).decode('utf-8')
1223 video_title = sanitize_title(video_title)
1224 simple_title = simplify_title(video_title)
1226 # video uploader is domain name
1227 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): error message says "title" but this step extracts the
# uploader (domain) — message looks copy-pasted; flag for a future fix.
1229 self._downloader.trouble(u'ERROR: unable to extract title')
1231 video_uploader = mobj.group(1).decode('utf-8')
1234 'id': video_id.decode('utf-8'),
1235 'url': video_url.decode('utf-8'),
1236 'uploader': video_uploader,
1237 'upload_date': u'NA',
1238 'title': video_title,
1239 'stitle': simple_title,
1240 'ext': video_extension.decode('utf-8'),
# Handles "ytsearchN:query" / "ytsearchall:query" pseudo-URLs: pages through
# the GData JSON API (50 results per page) and hands each video id back to
# the downloader as a watch URL.
# NOTE(review): elided source view — try:/if/return lines missing between
# sampled lines. Comments only; code tokens left untouched.
1246 class YoutubeSearchIE(InfoExtractor):
1247 """Information Extractor for YouTube search queries."""
1248 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1249 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1250 _max_youtube_results = 1000
1251 IE_NAME = u'youtube:search'
1253 def __init__(self, downloader=None):
1254 InfoExtractor.__init__(self, downloader)
1256 def report_download_page(self, query, pagenum):
1257 """Report attempt to download playlist page with given number."""
1258 query = query.decode(preferredencoding())
1259 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1261 def _real_extract(self, query):
1262 mobj = re.match(self._VALID_URL, query)
1264 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split "ytsearchN" prefix from the query text; prefix selects result count.
1267 prefix, query = query.split(':')
1269 query = query.encode('utf-8')
1271 self._download_n_results(query, 1)
1273 elif prefix == 'all':
1274 self._download_n_results(query, self._max_youtube_results)
1280 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1282 elif n > self._max_youtube_results:
1283 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1284 n = self._max_youtube_results
1285 self._download_n_results(query, n)
1287 except ValueError: # parsing prefix as integer fails
1288 self._download_n_results(query, 1)
1291 def _download_n_results(self, query, n):
1292 """Downloads a specified number of results for a query"""
# Page through the API until enough ids are collected; 'limit' is refined
# from the API's totalItems so we stop early on small result sets.
1298 while (50 * pagenum) < limit:
1299 self.report_download_page(query, pagenum+1)
1300 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1301 request = urllib2.Request(result_url)
1303 data = urllib2.urlopen(request).read()
1304 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1305 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1307 api_response = json.loads(data)['data']
1309 new_ids = list(video['id'] for video in api_response['items'])
1310 video_ids += new_ids
1312 limit = min(n, api_response['totalItems'])
1315 if len(video_ids) > n:
1316 video_ids = video_ids[:n]
1317 for id in video_ids:
1318 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# Handles "gvsearchN:query" pseudo-URLs for Google Video: scrapes result
# pages with _VIDEO_INDICATOR until N ids are found or no "next page" link
# remains, then dispatches each videoplay URL to the downloader.
# NOTE(review): elided source view — try:/if/return lines missing between
# sampled lines. Comments only; code tokens left untouched.
1322 class GoogleSearchIE(InfoExtractor):
1323 """Information Extractor for Google Video search queries."""
1324 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1325 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1326 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1327 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1328 _max_google_results = 1000
1329 IE_NAME = u'video.google:search'
1331 def __init__(self, downloader=None):
1332 InfoExtractor.__init__(self, downloader)
1334 def report_download_page(self, query, pagenum):
1335 """Report attempt to download playlist page with given number."""
1336 query = query.decode(preferredencoding())
1337 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1339 def _real_extract(self, query):
1340 mobj = re.match(self._VALID_URL, query)
1342 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Same prefix convention as the YouTube search extractor above.
1345 prefix, query = query.split(':')
1347 query = query.encode('utf-8')
1349 self._download_n_results(query, 1)
1351 elif prefix == 'all':
1352 self._download_n_results(query, self._max_google_results)
1358 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1360 elif n > self._max_google_results:
1361 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1362 n = self._max_google_results
1363 self._download_n_results(query, n)
1365 except ValueError: # parsing prefix as integer fails
1366 self._download_n_results(query, 1)
1369 def _download_n_results(self, query, n):
1370 """Downloads a specified number of results for a query"""
1376 self.report_download_page(query, pagenum)
1377 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1378 request = urllib2.Request(result_url)
1380 page = urllib2.urlopen(request).read()
1381 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1382 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1385 # Extract video identifiers
1386 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1387 video_id = mobj.group(1)
1388 if video_id not in video_ids:
1389 video_ids.append(video_id)
1390 if len(video_ids) == n:
1391 # Specified n videos reached
1392 for id in video_ids:
1393 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No more result pages: dispatch whatever was collected so far.
1396 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1397 for id in video_ids:
1398 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1401 pagenum = pagenum + 1
# Handles "yvsearchN:query" pseudo-URLs for Yahoo! Video. Same paging
# pattern as the Google search extractor, but tracks duplicates with an
# explicit 'already_seen' set instead of a list-membership test.
# NOTE(review): elided source view — try:/if/return lines missing between
# sampled lines. Comments only; code tokens left untouched.
1404 class YahooSearchIE(InfoExtractor):
1405 """Information Extractor for Yahoo! Video search queries."""
1406 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1407 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1408 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1409 _MORE_PAGES_INDICATOR = r'\s*Next'
1410 _max_yahoo_results = 1000
1411 IE_NAME = u'video.yahoo:search'
1413 def __init__(self, downloader=None):
1414 InfoExtractor.__init__(self, downloader)
1416 def report_download_page(self, query, pagenum):
1417 """Report attempt to download playlist page with given number."""
1418 query = query.decode(preferredencoding())
1419 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1421 def _real_extract(self, query):
1422 mobj = re.match(self._VALID_URL, query)
1424 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1427 prefix, query = query.split(':')
1429 query = query.encode('utf-8')
1431 self._download_n_results(query, 1)
1433 elif prefix == 'all':
1434 self._download_n_results(query, self._max_yahoo_results)
1440 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1442 elif n > self._max_yahoo_results:
1443 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1444 n = self._max_yahoo_results
1445 self._download_n_results(query, n)
1447 except ValueError: # parsing prefix as integer fails
1448 self._download_n_results(query, 1)
1451 def _download_n_results(self, query, n):
1452 """Downloads a specified number of results for a query"""
1455 already_seen = set()
1459 self.report_download_page(query, pagenum)
1460 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1461 request = urllib2.Request(result_url)
1463 page = urllib2.urlopen(request).read()
1464 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1465 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1468 # Extract video identifiers
1469 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1470 video_id = mobj.group(1)
1471 if video_id not in already_seen:
1472 video_ids.append(video_id)
1473 already_seen.add(video_id)
1474 if len(video_ids) == n:
1475 # Specified n videos reached
1476 for id in video_ids:
1477 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No more result pages: dispatch whatever was collected so far.
1480 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1481 for id in video_ids:
1482 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1485 pagenum = pagenum + 1
# Extractor for YouTube playlist/artist/course URLs: pages through the
# playlist HTML collecting watch?v= ids, applies the user's
# playliststart/playlistend window, and dispatches each video.
# NOTE(review): elided source view — try:/if/return lines missing between
# sampled lines. Comments only; code tokens left untouched.
1488 class YoutubePlaylistIE(InfoExtractor):
1489 """Information Extractor for YouTube playlists."""
1491 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1492 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1493 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
1494 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1495 IE_NAME = u'youtube:playlist'
1497 def __init__(self, downloader=None):
1498 InfoExtractor.__init__(self, downloader)
1500 def report_download_page(self, playlist_id, pagenum):
1501 """Report attempt to download playlist page with given number."""
1502 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1504 def _real_extract(self, url):
1505 # Extract playlist id
1506 mobj = re.match(self._VALID_URL, url)
1508 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Group 3 present means the URL points at a single video within the
# playlist; delegate just that video to the downloader.
1512 if mobj.group(3) is not None:
1513 self._downloader.download([mobj.group(3)])
1516 # Download playlist pages
1517 # prefix is 'p' as default for playlists but there are other types that need extra care
1518 playlist_prefix = mobj.group(1)
1519 if playlist_prefix == 'a':
1520 playlist_access = 'artist'
1522 playlist_prefix = 'p'
1523 playlist_access = 'view_play_list'
1524 playlist_id = mobj.group(2)
1529 self.report_download_page(playlist_id, pagenum)
1530 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1531 request = urllib2.Request(url)
1533 page = urllib2.urlopen(request).read()
1534 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1535 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1538 # Extract video identifiers
1540 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1541 if mobj.group(1) not in ids_in_page:
1542 ids_in_page.append(mobj.group(1))
1543 video_ids.extend(ids_in_page)
1545 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1547 pagenum = pagenum + 1
# Apply the user-selected playlist window (1-based start; -1 end = open).
1549 playliststart = self._downloader.params.get('playliststart', 1) - 1
1550 playlistend = self._downloader.params.get('playlistend', -1)
1551 if playlistend == -1:
1552 video_ids = video_ids[playliststart:]
1554 video_ids = video_ids[playliststart:playlistend]
1556 for id in video_ids:
1557 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# Extractor for YouTube user pages / "ytuser:" pseudo-URLs: pages through
# the GData uploads feed (_GDATA_PAGE_SIZE per request) until a short page
# signals the end, applies the playlist window, then dispatches each video.
# NOTE(review): elided source view — try:/if/return lines missing between
# sampled lines. Comments only; code tokens left untouched.
1561 class YoutubeUserIE(InfoExtractor):
1562 """Information Extractor for YouTube users."""
1564 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1565 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1566 _GDATA_PAGE_SIZE = 50
1567 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1568 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1569 IE_NAME = u'youtube:user'
1571 def __init__(self, downloader=None):
1572 InfoExtractor.__init__(self, downloader)
1574 def report_download_page(self, username, start_index):
1575 """Report attempt to download user page."""
1576 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1577 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1579 def _real_extract(self, url):
1581 mobj = re.match(self._VALID_URL, url)
1583 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1586 username = mobj.group(1)
1588 # Download video ids using YouTube Data API. Result size per
1589 # query is limited (currently to 50 videos) so we need to query
1590 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1597 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1598 self.report_download_page(username, start_index)
1600 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1603 page = urllib2.urlopen(request).read()
1604 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1605 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1608 # Extract video identifiers
1611 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1612 if mobj.group(1) not in ids_in_page:
1613 ids_in_page.append(mobj.group(1))
1615 video_ids.extend(ids_in_page)
1617 # A little optimization - if current page is not
1618 # "full", ie. does not contain PAGE_SIZE video ids then
1619 # we can assume that this page is the last one - there
1620 # are no more ids on further pages - no need to query
1623 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1628 all_ids_count = len(video_ids)
# Same playliststart/playlistend windowing as YoutubePlaylistIE.
1629 playliststart = self._downloader.params.get('playliststart', 1) - 1
1630 playlistend = self._downloader.params.get('playlistend', -1)
1632 if playlistend == -1:
1633 video_ids = video_ids[playliststart:]
1635 video_ids = video_ids[playliststart:playlistend]
1637 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1638 (username, all_ids_count, len(video_ids)))
1640 for video_id in video_ids:
1641 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
# Extractor for depositfiles.com: rewrites the URL to the English locale,
# POSTs the 'Free download' form, then scrapes the real fileshare URL and
# file title from the response.
# NOTE(review): elided source view — try:/if/return lines missing between
# sampled lines. Comments only; code tokens left untouched.
1644 class DepositFilesIE(InfoExtractor):
1645 """Information extractor for depositfiles.com"""
1647 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1648 IE_NAME = u'DepositFiles'
1650 def __init__(self, downloader=None):
1651 InfoExtractor.__init__(self, downloader)
1653 def report_download_webpage(self, file_id):
1654 """Report webpage download."""
1655 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1657 def report_extraction(self, file_id):
1658 """Report information extraction."""
1659 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1661 def _real_extract(self, url):
1662 file_id = url.split('/')[-1]
1663 # Rebuild url in english locale
1664 url = 'http://depositfiles.com/en/files/' + file_id
1666 # Retrieve file webpage with 'Free download' button pressed
# Passing POST data makes urllib2 issue a POST, simulating the button press.
1667 free_download_indication = { 'gateway_result' : '1' }
1668 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1670 self.report_download_webpage(file_id)
1671 webpage = urllib2.urlopen(request).read()
1672 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1673 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1676 # Search for the real file URL
1677 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1678 if (mobj is None) or (mobj.group(1) is None):
1679 # Try to figure out reason of the error.
# The site explains restrictions (e.g. download limits) in an
# "Attention..." banner; surface that text to the user when present.
1680 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1681 if (mobj is not None) and (mobj.group(1) is not None):
1682 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1683 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1685 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1688 file_url = mobj.group(1)
1689 file_extension = os.path.splitext(file_url)[1][1:]
1691 # Search for file title
1692 mobj = re.search(r'<b title="(.*?)">', webpage)
1694 self._downloader.trouble(u'ERROR: unable to extract title')
1696 file_title = mobj.group(1).decode('utf-8')
1699 'id': file_id.decode('utf-8'),
1700 'url': file_url.decode('utf-8'),
1702 'upload_date': u'NA',
1703 'title': file_title,
# Files have no separate simplified title; reuse the title as-is.
1704 'stitle': file_title,
1705 'ext': file_extension.decode('utf-8'),
# Extractor for Facebook videos. Logs in during _real_initialize (username/
# password from options or .netrc), scrapes the video page's inline JS
# ("video_title", "video_owner_name", "*_src" calls) via _parse_page, then
# selects formats with the same format/format_limit logic as the YouTube
# extractor.
# NOTE(review): elided source view — try:/if/return and several assignment
# lines are missing between sampled lines. Comments only; tokens untouched.
1712 class FacebookIE(InfoExtractor):
1713 """Information Extractor for Facebook"""
1715 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1716 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1717 _NETRC_MACHINE = 'facebook'
# Ordered best-to-worst; used both for scraping URLs and format selection.
1718 _available_formats = ['video', 'highqual', 'lowqual']
1719 _video_extensions = {
1724 IE_NAME = u'facebook'
1726 def __init__(self, downloader=None):
1727 InfoExtractor.__init__(self, downloader)
1729 def _reporter(self, message):
1730 """Add header and report message."""
1731 self._downloader.to_screen(u'[facebook] %s' % message)
1733 def report_login(self):
1734 """Report attempt to log in."""
1735 self._reporter(u'Logging in')
1737 def report_video_webpage_download(self, video_id):
1738 """Report attempt to download video webpage."""
1739 self._reporter(u'%s: Downloading video webpage' % video_id)
1741 def report_information_extraction(self, video_id):
1742 """Report attempt to extract video information."""
1743 self._reporter(u'%s: Extracting video information' % video_id)
1745 def _parse_page(self, video_webpage):
1746 """Extract video information from page"""
# Map of result keys to the JS/HTML patterns that carry each value.
1748 data = {'title': r'\("video_title", "(.*?)"\)',
1749 'description': r'<div class="datawrap">(.*?)</div>',
1750 'owner': r'\("video_owner_name", "(.*?)"\)',
1751 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1754 for piece in data.keys():
1755 mobj = re.search(data[piece], video_webpage)
1756 if mobj is not None:
1757 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one stream URL per known format name, keyed by format.
1761 for fmt in self._available_formats:
1762 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1763 if mobj is not None:
1764 # URL is in a Javascript segment inside an escaped Unicode format within
1765 # the generally utf-8 page
1766 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1767 video_info['video_urls'] = video_urls
1771 def _real_initialize(self):
1772 if self._downloader is None:
1777 downloader_params = self._downloader.params
1779 # Attempt to use provided username and password or .netrc data
1780 if downloader_params.get('username', None) is not None:
1781 useremail = downloader_params['username']
1782 password = downloader_params['password']
1783 elif downloader_params.get('usenetrc', False):
1785 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1786 if info is not None:
1790 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1791 except (IOError, netrc.NetrcParseError), err:
1792 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# No credentials available: skip login (anonymous access).
1795 if useremail is None:
1804 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1807 login_results = urllib2.urlopen(request).read()
# A login form in the response means authentication did not succeed.
1808 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1809 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1811 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1812 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1815 def _real_extract(self, url):
1816 mobj = re.match(self._VALID_URL, url)
1818 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1820 video_id = mobj.group('ID')
1823 self.report_video_webpage_download(video_id)
1824 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1826 page = urllib2.urlopen(request)
1827 video_webpage = page.read()
1828 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1829 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1832 # Start extracting information
1833 self.report_information_extraction(video_id)
1835 # Extract information
1836 video_info = self._parse_page(video_webpage)
1839 if 'owner' not in video_info:
1840 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1842 video_uploader = video_info['owner']
1845 if 'title' not in video_info:
1846 self._downloader.trouble(u'ERROR: unable to extract video title')
1848 video_title = video_info['title']
1849 video_title = video_title.decode('utf-8')
1850 video_title = sanitize_title(video_title)
1852 simple_title = simplify_title(video_title)
# Missing thumbnail is non-fatal: warn and continue with ''.
1855 if 'thumbnail' not in video_info:
1856 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1857 video_thumbnail = ''
1859 video_thumbnail = video_info['thumbnail']
1863 if 'upload_date' in video_info:
1864 upload_time = video_info['upload_date']
1865 timetuple = email.utils.parsedate_tz(upload_time)
1866 if timetuple is not None:
1868 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1873 video_description = video_info.get('description', 'No description available.')
1875 url_map = video_info['video_urls']
1876 if len(url_map.keys()) > 0:
1877 # Decide which formats to download
1878 req_format = self._downloader.params.get('format', None)
1879 format_limit = self._downloader.params.get('format_limit', None)
# format_limit trims the preference list to formats at or below the cap.
1881 if format_limit is not None and format_limit in self._available_formats:
1882 format_list = self._available_formats[self._available_formats.index(format_limit):]
1884 format_list = self._available_formats
1885 existing_formats = [x for x in format_list if x in url_map]
1886 if len(existing_formats) == 0:
1887 self._downloader.trouble(u'ERROR: no known formats available for video')
1889 if req_format is None:
1890 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1891 elif req_format == 'worst':
1892 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1893 elif req_format == '-1':
1894 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1897 if req_format not in url_map:
1898 self._downloader.trouble(u'ERROR: requested format not available')
1900 video_url_list = [(req_format, url_map[req_format])] # Specific format
1903 for format_param, video_real_url in video_url_list:
1905 video_extension = self._video_extensions.get(format_param, 'mp4')
1908 'id': video_id.decode('utf-8'),
1909 'url': video_real_url.decode('utf-8'),
1910 'uploader': video_uploader.decode('utf-8'),
1911 'upload_date': upload_date,
1912 'title': video_title,
1913 'stitle': simple_title,
1914 'ext': video_extension.decode('utf-8'),
1915 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1916 'thumbnail': video_thumbnail.decode('utf-8'),
1917 'description': video_description.decode('utf-8'),
# Extractor for blip.tv. Requests the JSON API view of the URL; if the
# server instead answers with a video/* Content-Type, treats the URL as a
# direct media download, otherwise parses the JSON 'Post' payload.
# NOTE(review): elided source view — try:/if/return lines missing between
# sampled lines. Comments only; code tokens left untouched.
1921 class BlipTVIE(InfoExtractor):
1922 """Information extractor for blip.tv"""
1924 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Captures the filename extension from a media URL.
1925 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1926 IE_NAME = u'blip.tv'
1928 def report_extraction(self, file_id):
1929 """Report information extraction."""
1930 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
1932 def report_direct_download(self, title):
1933 """Report information extraction."""
1934 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
1936 def _real_extract(self, url):
1937 mobj = re.match(self._VALID_URL, url)
1939 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar ('?' or '&') depends on whether the URL already has a query string
# (selection elided from this view — TODO confirm against full source).
1946 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1947 request = urllib2.Request(json_url)
1948 self.report_extraction(mobj.group(1))
1951 urlh = urllib2.urlopen(request)
1952 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1953 basename = url.split('/')[-1]
1954 title,ext = os.path.splitext(basename)
1955 title = title.decode('UTF-8')
1956 ext = ext.replace('.', '')
1957 self.report_direct_download(title)
1962 'stitle': simplify_title(title),
1966 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1967 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1969 if info is None: # Regular URL
1971 json_code = urlh.read()
1972 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1973 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
1977 json_data = json.loads(json_code)
1978 if 'Post' in json_data:
1979 data = json_data['Post']
# datestamp example format: '08-15-11 02:30PM' -> normalized to YYYYMMDD.
1983 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
1984 video_url = data['media']['url']
1985 umobj = re.match(self._URL_EXT, video_url)
1987 raise ValueError('Can not determine filename extension')
1988 ext = umobj.group(1)
1991 'id': data['item_id'],
1993 'uploader': data['display_name'],
1994 'upload_date': upload_date,
1995 'title': data['title'],
1996 'stitle': simplify_title(data['title']),
1998 'format': data['media']['mimeType'],
1999 'thumbnail': data['thumbnailUrl'],
2000 'description': data['description'],
2001 'player_url': data['embedUrl']
# Any parse failure (missing key, bad date) is reported, not raised.
2003 except (ValueError,KeyError), err:
2004 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# NOTE(review): line-elided dump; some guard/return lines are missing from view.
# Extractor for myvideo.de: scrapes the watch page, derives the FLV URL from the
# thumbnail ('image_src') link, and the title from the <title> tag.
2010 class MyVideoIE(InfoExtractor):
2011 """Information Extractor for myvideo.de."""
2013 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2014 IE_NAME = u'myvideo'
2016 def __init__(self, downloader=None):
2017 InfoExtractor.__init__(self, downloader)
2019 def report_download_webpage(self, video_id):
2020 """Report webpage download."""
2021 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2023 def report_extraction(self, video_id):
2024 """Report information extraction."""
2025 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2027 def _real_extract(self,url):
2028 mobj = re.match(self._VALID_URL, url)
# BUG: `self._download` should be `self._downloader` — as written, an invalid URL
# raises AttributeError instead of reporting the error through trouble().
2030 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2033 video_id = mobj.group(1)
2036 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2038 self.report_download_webpage(video_id)
2039 webpage = urllib2.urlopen(request).read()
2040 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2041 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2044 self.report_extraction(video_id)
# The media base URL is embedded in the thumbnail link; the FLV lives next to it.
2045 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2048 self._downloader.trouble(u'ERROR: unable to extract media URL')
2050 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2052 mobj = re.search('<title>([^<]+)</title>', webpage)
2054 self._downloader.trouble(u'ERROR: unable to extract title')
2057 video_title = mobj.group(1)
2058 video_title = sanitize_title(video_title)
2060 simple_title = simplify_title(video_title)
2066 'upload_date': u'NA',
2067 'title': video_title,
2068 'stitle': simple_title,
# NOTE(review): line-elided dump; `if mobj is None:` / `try:` / `return` lines are
# missing from view. Extractor for The Daily Show / Colbert Report full episodes:
# resolves shortname URLs, finds the MTV media URI in the page, downloads the MRSS
# index, then one mediaGen config per item, and picks the highest-bitrate rendition.
2074 class ComedyCentralIE(InfoExtractor):
2075 """Information extractor for The Daily Show and Colbert Report """
# Accepts both bare shortnames (":tds", ":colbert", ...) and full-episode URLs.
2077 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2078 IE_NAME = u'comedycentral'
2080 def report_extraction(self, episode_id):
2081 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2083 def report_config_download(self, episode_id):
2084 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2086 def report_index_download(self, episode_id):
2087 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2089 def report_player_url(self, episode_id):
2090 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2092 def _real_extract(self, url):
2093 mobj = re.match(self._VALID_URL, url)
2095 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortname form: rewrite to the show's full-episodes landing page and re-match.
2098 if mobj.group('shortname'):
2099 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2100 url = u'http://www.thedailyshow.com/full-episodes/'
2102 url = u'http://www.colbertnation.com/full-episodes/'
2103 mobj = re.match(self._VALID_URL, url)
2104 assert mobj is not None
# No episode slug means "download the newest episode".
2106 dlNewest = not mobj.group('episode')
2108 epTitle = mobj.group('showname')
2110 epTitle = mobj.group('episode')
2112 req = urllib2.Request(url)
2113 self.report_extraction(epTitle)
2115 htmlHandle = urllib2.urlopen(req)
2116 html = htmlHandle.read()
2117 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2118 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The landing page redirects to a concrete episode; re-match the final URL.
2121 url = htmlHandle.geturl()
2122 mobj = re.match(self._VALID_URL, url)
2124 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2126 if mobj.group('episode') == '':
2127 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2129 epTitle = mobj.group('episode')
# Locate the Flash player URL + media URI embedded in the page.
2131 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2132 if len(mMovieParams) == 0:
2133 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
# Resolve the player URL through its redirects so rtmpdump gets the final SWF.
2136 playerUrl_raw = mMovieParams[0][0]
2137 self.report_player_url(epTitle)
2139 urlHandle = urllib2.urlopen(playerUrl_raw)
2140 playerUrl = urlHandle.geturl()
2141 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2142 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2145 uri = mMovieParams[0][1]
2146 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2147 self.report_index_download(epTitle)
2149 indexXml = urllib2.urlopen(indexUrl).read()
2150 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2151 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# One <item> per video segment; each carries a guid like 'mgid:...:<showId>.com:<id>'.
2156 idoc = xml.etree.ElementTree.fromstring(indexXml)
2157 itemEls = idoc.findall('.//item')
2158 for itemEl in itemEls:
2159 mediaId = itemEl.findall('./guid')[0].text
2160 shortMediaId = mediaId.split(':')[-1]
2161 showId = mediaId.split(':')[-2].replace('.com', '')
2162 officialTitle = itemEl.findall('./title')[0].text
2163 officialDate = itemEl.findall('./pubDate')[0].text
2165 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2166 urllib.urlencode({'uri': mediaId}))
2167 configReq = urllib2.Request(configUrl)
2168 self.report_config_download(epTitle)
2170 configXml = urllib2.urlopen(configReq).read()
2171 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2172 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# Collect (bitrate, src) pairs from the mediaGen config.
2175 cdoc = xml.etree.ElementTree.fromstring(configXml)
2177 for rendition in cdoc.findall('.//rendition'):
2178 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2182 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2185 # For now, just pick the highest bitrate
2186 format,video_url = turls[-1]
2188 effTitle = showId + u'-' + epTitle
2193 'upload_date': officialDate,
2195 'stitle': simplify_title(effTitle),
2199 'description': officialTitle,
2200 'player_url': playerUrl
2203 results.append(info)
# NOTE(review): line-elided dump; guard lines are missing from view.
# Extractor for The Escapist: reads the og:video meta tag, pulls the player's
# `config=` querystring (a JSON-ish JS literal), and takes the media URL from
# the parsed playlist.
2208 class EscapistIE(InfoExtractor):
2209 """Information extractor for The Escapist """
2211 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2212 IE_NAME = u'escapist'
2214 def report_extraction(self, showName):
2215 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2217 def report_config_download(self, showName):
2218 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2220 def _real_extract(self, url):
2221 mobj = re.match(self._VALID_URL, url)
2223 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2225 showName = mobj.group('showname')
2226 videoId = mobj.group('episode')
2228 self.report_extraction(showName)
2230 webPage = urllib2.urlopen(url).read()
2231 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2232 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# NOTE(review): these re.search results are used without a None check — a page
# missing any of the meta tags would raise AttributeError on .group(1).
2235 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2236 description = unescapeHTML(descMatch.group(1))
2237 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2238 imgUrl = unescapeHTML(imgMatch.group(1))
2239 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2240 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The config URL is percent-encoded inside the player URL's querystring.
2241 configUrlMatch = re.search('config=(.*)$', playerUrl)
2242 configUrl = urllib2.unquote(configUrlMatch.group(1))
2244 self.report_config_download(showName)
2246 configJSON = urllib2.urlopen(configUrl).read()
2247 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2248 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2251 # Technically, it's JavaScript, not JSON
# Crude single-quote -> double-quote fix-up so json.loads accepts the JS literal;
# would corrupt any value that itself contains quotes.
2252 configJSON = configJSON.replace("'", '"')
2255 config = json.loads(configJSON)
2256 except (ValueError,), err:
2257 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# Second playlist entry holds the actual media URL (index 0 is presumably an ad
# or preroll — TODO confirm).
2260 playlist = config['playlist']
2261 videoUrl = playlist[1]['url']
2266 'uploader': showName,
2267 'upload_date': None,
2269 'stitle': simplify_title(showName),
2272 'thumbnail': imgUrl,
2273 'description': description,
2274 'player_url': playerUrl,
# NOTE(review): line-elided dump; guard/return lines are missing from view.
# Extractor for collegehumor.com: maps the public video id to an internal id via
# the page, then reads title/file/thumbnail from the moogaloop metadata XML.
2280 class CollegeHumorIE(InfoExtractor):
2281 """Information extractor for collegehumor.com"""
2283 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2284 IE_NAME = u'collegehumor'
2286 def report_webpage(self, video_id):
2287 """Report information extraction."""
2288 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2290 def report_extraction(self, video_id):
2291 """Report information extraction."""
2292 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2294 def _real_extract(self, url):
2295 mobj = re.match(self._VALID_URL, url)
2297 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2299 video_id = mobj.group('videoid')
2301 self.report_webpage(video_id)
2302 request = urllib2.Request(url)
2304 webpage = urllib2.urlopen(request).read()
2305 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2306 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds an element id of the form id="video:<internal id>".
2309 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2311 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2313 internal_video_id = m.group('internalvideoid')
2317 'internal_id': internal_video_id,
2320 self.report_extraction(video_id)
2321 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2323 metaXml = urllib2.urlopen(xmlUrl).read()
2324 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2325 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# Pull the fields out of <video>; any missing element raises inside the elided
# try and is reported as invalid metadata below.
2328 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2330 videoNode = mdoc.findall('./video')[0]
2331 info['description'] = videoNode.findall('./description')[0].text
2332 info['title'] = videoNode.findall('./caption')[0].text
2333 info['stitle'] = simplify_title(info['title'])
2334 info['url'] = videoNode.findall('./file')[0].text
2335 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension is whatever follows the last '.' of the media URL.
2336 info['ext'] = info['url'].rpartition('.')[2]
2337 info['format'] = info['ext']
2339 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# NOTE(review): line-elided dump; guard/return lines are missing from view.
# Extractor for xvideos.com: scrapes flv_url, <title>, and the thumbnail URL
# straight out of the watch page.
2345 class XVideosIE(InfoExtractor):
2346 """Information extractor for xvideos.com"""
2348 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2349 IE_NAME = u'xvideos'
2351 def report_webpage(self, video_id):
2352 """Report information extraction."""
2353 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2355 def report_extraction(self, video_id):
2356 """Report information extraction."""
2357 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2359 def _real_extract(self, url):
2360 mobj = re.match(self._VALID_URL, url)
2362 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2364 video_id = mobj.group(1).decode('utf-8')
2366 self.report_webpage(video_id)
# Always re-fetch the canonical URL form rather than the user-supplied one.
2368 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2370 webpage = urllib2.urlopen(request).read()
2371 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2372 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2375 self.report_extraction(video_id)
# The media URL is percent-encoded in a 'flv_url=' querystring parameter.
2379 mobj = re.search(r'flv_url=(.+?)&', webpage)
2381 self._downloader.trouble(u'ERROR: unable to extract video url')
2383 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title is the <title> tag minus the trailing " - XVID..." suffix.
2387 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2389 self._downloader.trouble(u'ERROR: unable to extract video title')
2391 video_title = mobj.group(1).decode('utf-8')
2394 # Extract video thumbnail
2395 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
2397 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2399 video_thumbnail = mobj.group(1).decode('utf-8')
2405 'upload_date': None,
2406 'title': video_title,
2407 'stitle': simplify_title(video_title),
2410 'thumbnail': video_thumbnail,
2411 'description': None,
# NOTE(review): line-elided dump; guard/return lines are missing from view.
# Extractor for soundcloud.com: scrapes the track page for the media uid and
# stream token, then builds the media.soundcloud.com stream URL from them.
2418 class SoundcloudIE(InfoExtractor):
2419 """Information extractor for soundcloud.com
2420 To access the media, the uid of the song and a stream token
2421 must be extracted from the page source and the script must make
2422 a request to media.soundcloud.com/crossdomain.xml. Then
2423 the media can be grabbed by requesting from an url composed
2424 of the stream token and uid
2427 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2428 IE_NAME = u'soundcloud'
2430 def __init__(self, downloader=None):
2431 InfoExtractor.__init__(self, downloader)
2433 def report_webpage(self, video_id):
2434 """Report information extraction."""
2435 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2437 def report_extraction(self, video_id):
2438 """Report information extraction."""
2439 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2441 def _real_extract(self, url):
2442 mobj = re.match(self._VALID_URL, url)
2444 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2447 # extract uploader (which is in the url)
2448 uploader = mobj.group(1).decode('utf-8')
2449 # extract simple title (uploader + slug of song title)
2450 slug_title = mobj.group(2).decode('utf-8')
2451 simple_title = uploader + '-' + slug_title
2453 self.report_webpage('%s/%s' % (uploader, slug_title))
2455 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2457 webpage = urllib2.urlopen(request).read()
2458 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2459 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2462 self.report_extraction('%s/%s' % (uploader, slug_title))
2464 # extract uid and stream token that soundcloud hands out for access
# NOTE(review): scraped from inline page JS/JSON — fragile against site changes,
# and the match is used without a visible None check.
2465 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2467 video_id = mobj.group(1)
2468 stream_token = mobj.group(2)
2470 # extract unsimplified title
2471 mobj = re.search('"title":"(.*?)",', webpage)
2473 title = mobj.group(1)
2475 # construct media url (with uid/token)
2476 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2477 mediaURL = mediaURL % (video_id, stream_token)
2480 description = u'No description available'
2481 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2483 description = mobj.group(1)
# Parse the pretty-printed date (e.g. "November 4, 2010 14:02") to YYYYMMDD;
# failures are swallowed on elided lines.
2487 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2490 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2491 except Exception, e:
2494 # for soundcloud, a request to a cross domain is required for cookies
2495 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
# NOTE(review): 'title' holds the real track title but the info dict uses
# simple_title for both 'title' and 'stitle' — looks unintentional; confirm.
2498 'id': video_id.decode('utf-8'),
2500 'uploader': uploader.decode('utf-8'),
2501 'upload_date': upload_date,
2502 'title': simple_title.decode('utf-8'),
2503 'stitle': simple_title.decode('utf-8'),
2507 'description': description.decode('utf-8')
# NOTE(review): line-elided dump; guard/return lines (and apparently the IE_NAME
# assignment) are missing from view.
# Extractor for infoq.com: decodes the base64 'jsclassref' attribute into an
# RTMPE path and scrapes title/description from the page.
2511 class InfoQIE(InfoExtractor):
2512 """Information extractor for infoq.com"""
2514 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2517 def report_webpage(self, video_id):
2518 """Report information extraction."""
2519 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2521 def report_extraction(self, video_id):
2522 """Report information extraction."""
2523 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2525 def _real_extract(self, url):
2526 mobj = re.match(self._VALID_URL, url)
2528 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2531 self.report_webpage(url)
2533 request = urllib2.Request(url)
2535 webpage = urllib2.urlopen(request).read()
2536 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2537 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2540 self.report_extraction(url)
2544 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2546 self._downloader.trouble(u'ERROR: unable to extract video url')
# jsclassref is base64; decoding it yields the percent-encoded RTMPE stream path.
2548 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2552 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2554 self._downloader.trouble(u'ERROR: unable to extract video title')
2556 video_title = mobj.group(1).decode('utf-8')
2558 # Extract description
2559 video_description = u'No description available.'
2560 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2561 if mobj is not None:
2562 video_description = mobj.group(1).decode('utf-8')
# Video id and extension come from the final path component ('<id>.<ext>').
2564 video_filename = video_url.split('/')[-1]
2565 video_id, extension = video_filename.split('.')
2571 'upload_date': None,
2572 'title': video_title,
2573 'stitle': simplify_title(video_title),
2575 'format': extension, # Extension is always(?) mp4, but seems to be flv
2577 'description': video_description,
# NOTE(review): line-elided dump; guard/return/continue lines are missing from view.
# Extractor for mixcloud.com: fetches the cloudcast JSON API, whose
# 'audio_formats' maps format -> (optionally bitrate ->) list of candidate URLs;
# picks the first URL that actually responds.
2583 class MixcloudIE(InfoExtractor):
2584 """Information extractor for www.mixcloud.com"""
2585 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2586 IE_NAME = u'mixcloud'
2588 def __init__(self, downloader=None):
2589 InfoExtractor.__init__(self, downloader)
2591 def report_download_json(self, file_id):
2592 """Report JSON download."""
2593 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2595 def report_extraction(self, file_id):
2596 """Report information extraction."""
2597 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2599 def get_urls(self, jsonData, fmt, bitrate='best'):
2600 """Get urls from 'audio_formats' section in json"""
2603 bitrate_list = jsonData[fmt]
# 'best' (or an unknown bitrate) falls back to the highest available one.
# NOTE(review): max() on string bitrate keys compares lexicographically —
# e.g. '96' > '128' — so "highest" may be wrong; confirm key types.
2604 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2605 bitrate = max(bitrate_list) # select highest
2607 url_list = jsonData[fmt][bitrate]
# Some entries have no bitrate level: jsonData[fmt] is already the URL list.
2608 except TypeError: # we have no bitrate info.
2609 url_list = jsonData[fmt]
2612 def check_urls(self, url_list):
2613 """Returns 1st active url from list"""
2614 for url in url_list:
2616 urllib2.urlopen(url)
2618 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# Print a "format<TAB>bitrate<TAB>[ext]" table for --list-formats.
2623 def _print_formats(self, formats):
2624 print 'Available formats:'
2625 for fmt in formats.keys():
2626 for b in formats[fmt]:
2628 ext = formats[fmt][b][0]
2629 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2630 except TypeError: # we have no bitrate info
2631 ext = formats[fmt][0]
2632 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2635 def _real_extract(self, url):
2636 mobj = re.match(self._VALID_URL, url)
2638 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2640 # extract uploader & filename from url
2641 uploader = mobj.group(1).decode('utf-8')
2642 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2644 # construct API request
2645 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2646 # retrieve .json file with links to files
2647 request = urllib2.Request(file_url)
2649 self.report_download_json(file_url)
2650 jsonData = urllib2.urlopen(request).read()
2651 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2652 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2656 json_data = json.loads(jsonData)
2657 player_url = json_data['player_swf_url']
2658 formats = dict(json_data['audio_formats'])
2660 req_format = self._downloader.params.get('format', None)
2663 if self._downloader.params.get('listformats', None):
2664 self._print_formats(formats)
# No explicit format: probe each format until one has a live URL.
2667 if req_format is None or req_format == 'best':
2668 for format_param in formats.keys():
2669 url_list = self.get_urls(formats, format_param)
2671 file_url = self.check_urls(url_list)
2672 if file_url is not None:
2675 if req_format not in formats.keys():
2676 self._downloader.trouble(u'ERROR: format is not available')
2679 url_list = self.get_urls(formats, req_format)
2680 file_url = self.check_urls(url_list)
2681 format_param = req_format
2684 'id': file_id.decode('utf-8'),
2685 'url': file_url.decode('utf-8'),
2686 'uploader': uploader.decode('utf-8'),
2687 'upload_date': u'NA',
2688 'title': json_data['name'],
2689 'stitle': simplify_title(json_data['name']),
2690 'ext': file_url.split('.')[-1].decode('utf-8'),
2691 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2692 'thumbnail': json_data['thumbnail_url'],
2693 'description': json_data['description'],
2694 'player_url': player_url.decode('utf-8'),
# NOTE(review): line-elided dump; guard/try/else lines are missing from view.
# Extractor for Stanford Open ClassRoom. Three cases by URL specificity:
# a single video (course+video), a course page (references to its VideoPages),
# or the site root (references to all CoursePages). The latter two return
# 'reference' entries that are recursively resolved via self.extract().
2697 class StanfordOpenClassroomIE(InfoExtractor):
2698 """Information extractor for Stanford's Open ClassRoom"""
2700 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2701 IE_NAME = u'stanfordoc'
2703 def report_download_webpage(self, objid):
2704 """Report information extraction."""
2705 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2707 def report_extraction(self, video_id):
2708 """Report information extraction."""
2709 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2711 def _real_extract(self, url):
2712 mobj = re.match(self._VALID_URL, url)
2714 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Case 1: a fully-specified single video.
2717 if mobj.group('course') and mobj.group('video'): # A specific video
2718 course = mobj.group('course')
2719 video = mobj.group('video')
2721 'id': simplify_title(course + '_' + video),
2724 self.report_extraction(info['id'])
2725 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2726 xmlUrl = baseUrl + video + '.xml'
2728 metaXml = urllib2.urlopen(xmlUrl).read()
2729 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2730 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2732 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2734 info['title'] = mdoc.findall('./title')[0].text
# The XML's <videoFile> is relative to the course's videos/ directory.
2735 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2737 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2739 info['stitle'] = simplify_title(info['title'])
2740 info['ext'] = info['url'].rpartition('.')[2]
2741 info['format'] = info['ext']
# Case 2: a course page — collect its VideoPage links as references.
2743 elif mobj.group('course'): # A course page
2744 course = mobj.group('course')
2746 'id': simplify_title(course),
2750 self.report_download_webpage(info['id'])
2752 coursepage = urllib2.urlopen(url).read()
2753 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2754 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2757 m = re.search('<h1>([^<]+)</h1>', coursepage)
2759 info['title'] = unescapeHTML(m.group(1))
2761 info['title'] = info['id']
2762 info['stitle'] = simplify_title(info['title'])
2764 m = re.search('<description>([^<]+)</description>', coursepage)
2766 info['description'] = unescapeHTML(m.group(1))
# orderedSet dedupes while preserving link order.
2768 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2771 'type': 'reference',
2772 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recursively extract each referenced VideoPage.
2776 for entry in info['list']:
2777 assert entry['type'] == 'reference'
2778 results += self.extract(entry['url'])
# Case 3: the site root — collect all CoursePage links as references.
2783 'id': 'Stanford OpenClassroom',
2787 self.report_download_webpage(info['id'])
2788 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2790 rootpage = urllib2.urlopen(rootURL).read()
2791 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2792 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2795 info['title'] = info['id']
2796 info['stitle'] = simplify_title(info['title'])
2798 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2801 'type': 'reference',
2802 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2807 for entry in info['list']:
2808 assert entry['type'] == 'reference'
2809 results += self.extract(entry['url'])
2812 class MTVIE(InfoExtractor):
2813 """Information extractor for MTV.com"""
2815 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2818 def report_webpage(self, video_id):
2819 """Report information extraction."""
2820 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2822 def report_extraction(self, video_id):
2823 """Report information extraction."""
2824 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2826 def _real_extract(self, url):
2827 mobj = re.match(self._VALID_URL, url)
2829 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2831 if not mobj.group('proto'):
2832 url = 'http://' + url
2833 video_id = mobj.group('videoid')
2834 self.report_webpage(video_id)
2836 request = urllib2.Request(url)
2838 webpage = urllib2.urlopen(request).read()
2839 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2840 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2843 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2845 self._downloader.trouble(u'ERROR: unable to extract song name')
2847 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2848 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2850 self._downloader.trouble(u'ERROR: unable to extract performer')
2852 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2853 video_title = performer + ' - ' + song_name
2855 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2857 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2859 mtvn_uri = mobj.group(1)
2861 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2863 self._downloader.trouble(u'ERROR: unable to extract content id')
2865 content_id = mobj.group(1)
2867 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2868 self.report_extraction(video_id)
2869 request = urllib2.Request(videogen_url)
2871 metadataXml = urllib2.urlopen(request).read()
2872 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2873 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2876 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2877 renditions = mdoc.findall('.//rendition')
2879 # For now, always pick the highest quality.
2880 rendition = renditions[-1]
2883 _,_,ext = rendition.attrib['type'].partition('/')
2884 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2885 video_url = rendition.find('./src').text
2887 self._downloader.trouble('Invalid rendition field.')
2893 'uploader': performer,
2894 'title': video_title,
2895 'stitle': simplify_title(video_title),