2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
19 import cStringIO as StringIO
# NOTE(review): this file is a numbered source listing — each line keeps its
# original line number — and the excerpt has gaps (e.g. the `def` line of
# initialize(), around original line 71, is elided). Code bytes below are kept
# as-is; only comments/docstrings are added.
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information from the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title and simplified
32 title, author and others. The information is stored in a dictionary
33 which is then passed to the FileDownloader. The FileDownloader
34 processes this information possibly downloading the video to the file
35 system, among other possible outcomes. The dictionaries must include
40 uploader: Nickname of the video uploader.
42 ext: Video filename extension.
44 player_url: SWF Player URL (may be None).
46 The following fields are optional. Their primary purpose is to allow
47 youtube-dl to serve as the backend for a video search function, such
48 as the one in youtube2mp3. They are only used when their respective
49 forced printing functions are called:
51 thumbnail: Full URL to a video thumbnail image.
52 description: One-line video description.
54 Subclasses of this one should re-define the _real_initialize() and
55 _real_extract() methods and define a _VALID_URL regexp.
56 Probably, they should also be added to the list of extractors.
# Template-method pattern: the public entry points (initialize/extract) defer
# to the _real_* hooks that subclasses override.
62 def __init__(self, downloader=None):
63 """Constructor. Receives an optional downloader."""
65 self.set_downloader(downloader)
67 def suitable(self, url):
68 """Receives a URL and returns True if suitable for this IE."""
69 return re.match(self._VALID_URL, url) is not None
# (the `def` line of initialize() is elided from this excerpt; the docstring
# and delegation to _real_initialize() below belong to it)
72 """Initializes an instance (authentication, etc)."""
74 self._real_initialize()
77 def extract(self, url):
78 """Extracts URL information and returns it in list of dicts."""
80 return self._real_extract(url)
82 def set_downloader(self, downloader):
83 """Sets the downloader for this IE."""
84 self._downloader = downloader
# Default hooks are no-ops; subclasses override them.
86 def _real_initialize(self):
87 """Real initialization process. Redefine in subclasses."""
90 def _real_extract(self, url):
91 """Real extraction process. Redefine in subclasses."""
95 class YoutubeIE(InfoExtractor):
96 """Information extractor for youtube.com."""
# _VALID_URL: group 1 is the URL prefix (scheme/host/path variants), group 2
# is the video id; the conditional `(?(1).+)?` requires a tail only when a
# prefix matched. _NEXT_URL_RE pulls the real target out of redirect-style
# URLs (age verification etc.).
98 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
99 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
100 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
101 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
102 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
103 _NETRC_MACHINE = 'youtube'
104 # Listed in order of quality
105 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
106 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> filename extension map (entries elided in this excerpt).
107 _video_extensions = {
113 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
119 _video_dimensions = {
# --- progress-reporting helpers: each just prints a status line via the
# --- downloader; no return value.
137 def report_lang(self):
138 """Report attempt to set language."""
139 self._downloader.to_screen(u'[youtube] Setting language')
141 def report_login(self):
142 """Report attempt to log in."""
143 self._downloader.to_screen(u'[youtube] Logging in')
145 def report_age_confirmation(self):
146 """Report attempt to confirm age."""
147 self._downloader.to_screen(u'[youtube] Confirming age')
149 def report_video_webpage_download(self, video_id):
150 """Report attempt to download video webpage."""
151 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
153 def report_video_info_webpage_download(self, video_id):
154 """Report attempt to download video info webpage."""
155 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
157 def report_video_subtitles_download(self, video_id):
158 """Report attempt to download video subtitles."""
159 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
161 def report_information_extraction(self, video_id):
162 """Report attempt to extract video information."""
163 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
165 def report_unavailable_format(self, video_id, format):
166 """Report that the requested format is not available."""
167 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
169 def report_rtmp_download(self):
170 """Indicate the download will use the RTMP protocol."""
171 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Converts YouTube's closed-caption XML into SubRip (.srt) text by regex,
# not by XML parsing (see TODO below). The srt accumulator's initialization
# and the final return are on lines elided from this excerpt.
173 def _closed_captions_xml_to_srt(self, xml_string):
175 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
176 # TODO parse xml instead of regex
177 for n, (start, dur_tag, dur, caption) in enumerate(texts):
178 if not dur: dur = '4'
# NOTE(review): an elided line (original 179) presumably converts `start`
# from str to float before the arithmetic below — confirm against full file.
180 end = start + float(dur)
181 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
182 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
183 caption = unescapeHTML(caption)
184 caption = unescapeHTML(caption) # double cycle, intentional
185 srt += str(n+1) + '\n'
186 srt += start + ' --> ' + end + '\n'
187 srt += caption + '\n\n'
# Print the itag / extension / dimensions table for --list-formats.
190 def _print_formats(self, formats):
191 print 'Available formats:'
193 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
# Initialization: resolve credentials (CLI options, then .netrc), force the
# English-language cookie, log in, and confirm age. Network failures are
# reported as warnings (language/login) or errors (age) — not raised.
195 def _real_initialize(self):
196 if self._downloader is None:
201 downloader_params = self._downloader.params
203 # Attempt to use provided username and password or .netrc data
204 if downloader_params.get('username', None) is not None:
205 username = downloader_params['username']
206 password = downloader_params['password']
207 elif downloader_params.get('usenetrc', False):
209 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
214 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
215 except (IOError, netrc.NetrcParseError), err:
216 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set language to English so later page-scraping regexes match.
220 request = urllib2.Request(self._LANG_URL)
223 urllib2.urlopen(request).read()
224 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
225 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
228 # No authentication to be performed
234 'current_form': 'loginForm',
236 'action_login': 'Log In',
237 'username': username,
238 'password': password,
240 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
243 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, login failed.
244 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
245 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
247 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
248 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
254 'action_confirm': 'Confirm',
256 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
258 self.report_age_confirmation()
259 age_results = urllib2.urlopen(request).read()
260 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
261 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Main extraction: resolve redirect URLs, fetch the watch page and
# get_video_info, pull metadata/subtitles, pick formats, and hand one info
# dict per chosen format to the downloader.
264 def _real_extract(self, url):
265 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
266 mobj = re.search(self._NEXT_URL_RE, url)
268 url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
270 # Extract video id from URL
271 mobj = re.match(self._VALID_URL, url)
273 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
275 video_id = mobj.group(2)
278 self.report_video_webpage_download(video_id)
279 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
281 video_webpage = urllib2.urlopen(request).read()
282 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
283 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
286 # Attempt to extract SWF player URL
287 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Un-escape the JS-escaped URL (\\/ -> /).
289 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several `el` variants of get_video_info until one yields a token.
294 self.report_video_info_webpage_download(video_id)
295 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
296 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
297 % (video_id, el_type))
298 request = urllib2.Request(video_info_url)
300 video_info_webpage = urllib2.urlopen(request).read()
301 video_info = parse_qs(video_info_webpage)
302 if 'token' in video_info:
304 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
305 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
307 if 'token' not in video_info:
308 if 'reason' in video_info:
309 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
311 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
314 # Check for "rental" videos
315 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
316 self._downloader.trouble(u'ERROR: "rental" videos not supported')
319 # Start extracting information
320 self.report_information_extraction(video_id)
# uploader
323 if 'author' not in video_info:
324 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
326 video_uploader = urllib.unquote_plus(video_info['author'][0])
# title
329 if 'title' not in video_info:
330 self._downloader.trouble(u'ERROR: unable to extract video title')
332 video_title = urllib.unquote_plus(video_info['title'][0])
333 video_title = video_title.decode('utf-8')
# thumbnail (optional: only a warning on failure)
336 if 'thumbnail_url' not in video_info:
337 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
339 else: # don't panic if we can't find it
340 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scraped from the watch page, normalized to YYYYMMDD; several
# textual date layouts are tried in order.
344 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
346 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
347 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
348 for expression in format_expressions:
350 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# description (scraped from the page DOM, cleaned to plain text)
355 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
356 if video_description: video_description = clean_html(video_description)
357 else: video_description = ''
# closed captions: list available subtitle tracks, choose a language
# (requested lang > 'en' > first available), download and convert to SRT.
# Failures raise Trouble, caught below and reported as warnings.
360 video_subtitles = None
361 if self._downloader.params.get('writesubtitles', False):
363 self.report_video_subtitles_download(video_id)
364 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
366 srt_list = urllib2.urlopen(request).read()
367 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
368 raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
369 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
370 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
371 if not srt_lang_list:
372 raise Trouble(u'WARNING: video has no closed captions')
373 if self._downloader.params.get('subtitleslang', False):
374 srt_lang = self._downloader.params.get('subtitleslang')
375 elif 'en' in srt_lang_list:
378 srt_lang = srt_lang_list.keys()[0]
379 if not srt_lang in srt_lang_list:
380 raise Trouble(u'WARNING: no closed captions found in the specified language')
381 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
383 srt_xml = urllib2.urlopen(request).read()
384 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
385 raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
387 raise Trouble(u'WARNING: unable to download video subtitles')
388 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
389 except Trouble as trouble:
390 self._downloader.trouble(trouble[0])
393 video_token = urllib.unquote_plus(video_info['token'][0])
395 # Decide which formats to download
396 req_format = self._downloader.params.get('format', None)
# RTMP streams carry the URL in 'conn'; otherwise parse the
# url_encoded_fmt_stream_map into an itag -> URL map.
398 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
399 self.report_rtmp_download()
400 video_url_list = [(None, video_info['conn'][0])]
401 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
402 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
403 url_data = [parse_qs(uds) for uds in url_data_strs]
404 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
405 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
407 format_limit = self._downloader.params.get('format_limit', None)
408 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
409 if format_limit is not None and format_limit in available_formats:
410 format_list = available_formats[available_formats.index(format_limit):]
412 format_list = available_formats
413 existing_formats = [x for x in format_list if x in url_map]
414 if len(existing_formats) == 0:
415 self._downloader.trouble(u'ERROR: no known formats available for video')
417 if self._downloader.params.get('listformats', None):
418 self._print_formats(existing_formats)
420 if req_format is None or req_format == 'best':
421 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
422 elif req_format == 'worst':
423 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
424 elif req_format in ('-1', 'all'):
425 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
427 # Specific formats. We pick the first in a slash-delimited sequence.
428 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
429 req_formats = req_format.split('/')
430 video_url_list = None
431 for rf in req_formats:
433 video_url_list = [(rf, url_map[rf])]
435 if video_url_list is None:
436 self._downloader.trouble(u'ERROR: requested format not available')
439 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Build one info dict per selected format (dict literal's open/close lines
# and the surrounding append/return are elided in this excerpt).
443 for format_param, video_real_url in video_url_list:
445 video_extension = self._video_extensions.get(format_param, 'flv')
448 'id': video_id.decode('utf-8'),
449 'url': video_real_url.decode('utf-8'),
450 'uploader': video_uploader.decode('utf-8'),
451 'upload_date': upload_date,
452 'title': video_title,
453 'ext': video_extension.decode('utf-8'),
454 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
455 'thumbnail': video_thumbnail.decode('utf-8'),
456 'description': video_description,
457 'player_url': player_url,
458 'subtitles': video_subtitles
463 class MetacafeIE(InfoExtractor):
464 """Information Extractor for metacafe.com."""
# _VALID_URL group 1 is the video id (may be a 'yt-…' YouTube passthrough id).
466 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
467 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
468 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
469 IE_NAME = u'metacafe'
471 def __init__(self, downloader=None):
472 InfoExtractor.__init__(self, downloader)
# --- status-reporting helpers ---
474 def report_disclaimer(self):
475 """Report disclaimer retrieval."""
476 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
478 def report_age_confirmation(self):
479 """Report attempt to confirm age."""
480 self._downloader.to_screen(u'[metacafe] Confirming age')
482 def report_download_webpage(self, video_id):
483 """Report webpage download."""
484 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
486 def report_extraction(self, video_id):
487 """Report information extraction."""
488 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetch the family-filter disclaimer page, then POST the over-18 confirmation
# so subsequent watch pages are not filtered. (try: lines elided in excerpt.)
490 def _real_initialize(self):
491 # Retrieve disclaimer
492 request = urllib2.Request(self._DISCLAIMER)
494 self.report_disclaimer()
495 disclaimer = urllib2.urlopen(request).read()
496 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
497 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
503 'submit': "Continue - I'm over 18",
505 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
507 self.report_age_confirmation()
508 disclaimer = urllib2.urlopen(request).read()
509 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
510 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
513 def _real_extract(self, url):
514 # Extract id and simplified title from URL
515 mobj = re.match(self._VALID_URL, url)
517 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
520 video_id = mobj.group(1)
522 # Check if video comes from YouTube
523 mobj2 = re.match(r'^yt-(.*)$', video_id)
524 if mobj2 is not None:
# Delegate 'yt-…' ids to the YouTube extractor via the downloader.
525 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
528 # Retrieve video webpage to extract further information
529 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
531 self.report_download_webpage(video_id)
532 webpage = urllib2.urlopen(request).read()
533 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
534 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
537 # Extract URL, uploader and title from webpage
538 self.report_extraction(video_id)
# Primary path: direct &mediaURL= parameter in the page.
539 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
541 mediaURL = urllib.unquote(mobj.group(1))
542 video_extension = mediaURL[-3:]
544 # Extract gdaKey if available
545 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
549 gdaKey = mobj.group(1)
550 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars query string for mediaData JSON.
552 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
554 self._downloader.trouble(u'ERROR: unable to extract media URL')
556 vardict = parse_qs(mobj.group(1))
557 if 'mediaData' not in vardict:
558 self._downloader.trouble(u'ERROR: unable to extract media URL')
560 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
562 self._downloader.trouble(u'ERROR: unable to extract media URL')
564 mediaURL = mobj.group(1).replace('\\/', '/')
565 video_extension = mediaURL[-3:]
566 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
568 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
570 self._downloader.trouble(u'ERROR: unable to extract title')
572 video_title = mobj.group(1).decode('utf-8')
574 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
576 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
578 video_uploader = mobj.group(1)
# Resulting info dict (literal's surrounding lines elided in excerpt).
581 'id': video_id.decode('utf-8'),
582 'url': video_url.decode('utf-8'),
583 'uploader': video_uploader.decode('utf-8'),
584 'upload_date': u'NA',
585 'title': video_title,
586 'ext': video_extension.decode('utf-8'),
592 class DailymotionIE(InfoExtractor):
593 """Information Extractor for Dailymotion"""
595 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
596 IE_NAME = u'dailymotion'
598 def __init__(self, downloader=None):
599 InfoExtractor.__init__(self, downloader)
# --- status-reporting helpers ---
601 def report_download_webpage(self, video_id):
602 """Report webpage download."""
603 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
605 def report_extraction(self, video_id):
606 """Report information extraction."""
607 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
609 def _real_extract(self, url):
610 # Extract id and simplified title from URL
611 mobj = re.match(self._VALID_URL, url)
613 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
616 video_id = mobj.group(1)
618 video_extension = 'flv'
620 # Retrieve video webpage to extract further information
621 request = urllib2.Request(url)
# Disable the family filter so restricted videos are served.
622 request.add_header('Cookie', 'family_filter=off')
624 self.report_download_webpage(video_id)
625 webpage = urllib2.urlopen(request).read()
626 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
627 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
630 # Extract URL, uploader and title from webpage
631 self.report_extraction(video_id)
# The player's "sequence" flashvar holds a URL-encoded JSON blob with the
# stream URLs; sdURL is the standard-definition stream.
632 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
634 self._downloader.trouble(u'ERROR: unable to extract media URL')
636 sequence = urllib.unquote(mobj.group(1))
637 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
639 self._downloader.trouble(u'ERROR: unable to extract media URL')
641 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
643 # if needed add http://www.dailymotion.com/ if relative URL
647 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
649 self._downloader.trouble(u'ERROR: unable to extract title')
651 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
653 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
655 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
657 video_uploader = mobj.group(1)
# Resulting info dict (literal's surrounding lines elided in excerpt).
660 'id': video_id.decode('utf-8'),
661 'url': video_url.decode('utf-8'),
662 'uploader': video_uploader.decode('utf-8'),
663 'upload_date': u'NA',
664 'title': video_title,
665 'ext': video_extension.decode('utf-8'),
671 class GoogleIE(InfoExtractor):
672 """Information extractor for video.google.com."""
674 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
675 IE_NAME = u'video.google'
677 def __init__(self, downloader=None):
678 InfoExtractor.__init__(self, downloader)
# --- status-reporting helpers ---
680 def report_download_webpage(self, video_id):
681 """Report webpage download."""
682 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
684 def report_extraction(self, video_id):
685 """Report information extraction."""
686 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
688 def _real_extract(self, url):
689 # Extract id from URL
690 mobj = re.match(self._VALID_URL, url)
692 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
695 video_id = mobj.group(1)
697 video_extension = 'mp4'
699 # Retrieve video webpage to extract further information
700 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
702 self.report_download_webpage(video_id)
703 webpage = urllib2.urlopen(request).read()
704 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
705 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
708 # Extract URL, uploader, and title from webpage
709 self.report_extraction(video_id)
# Prefer the mp4 download_url; fall back to the hex-escaped flv videoUrl.
710 mobj = re.search(r"download_url:'([^']+)'", webpage)
712 video_extension = 'flv'
713 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
715 self._downloader.trouble(u'ERROR: unable to extract media URL')
717 mediaURL = urllib.unquote(mobj.group(1))
# Decode the JS hex escapes (\x3d is '=', \x26 is '&').
718 mediaURL = mediaURL.replace('\\x3d', '\x3d')
719 mediaURL = mediaURL.replace('\\x26', '\x26')
723 mobj = re.search(r'<title>(.*)</title>', webpage)
725 self._downloader.trouble(u'ERROR: unable to extract title')
727 video_title = mobj.group(1).decode('utf-8')
729 # Extract video description
730 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
732 self._downloader.trouble(u'ERROR: unable to extract video description')
734 video_description = mobj.group(1).decode('utf-8')
735 if not video_description:
736 video_description = 'No description available.'
738 # Extract video thumbnail
739 if self._downloader.params.get('forcethumbnail', False):
# The thumbnail requires a second request against the search page.
740 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
742 webpage = urllib2.urlopen(request).read()
743 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
744 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
746 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
748 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
750 video_thumbnail = mobj.group(1)
751 else: # we need something to pass to process_info
# Resulting info dict (literal's surrounding lines elided in excerpt).
755 'id': video_id.decode('utf-8'),
756 'url': video_url.decode('utf-8'),
758 'upload_date': u'NA',
759 'title': video_title,
760 'ext': video_extension.decode('utf-8'),
766 class PhotobucketIE(InfoExtractor):
767 """Information extractor for photobucket.com."""
# _VALID_URL group 1 is the .flv filename from the ?current= query parameter.
769 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
770 IE_NAME = u'photobucket'
772 def __init__(self, downloader=None):
773 InfoExtractor.__init__(self, downloader)
# --- status-reporting helpers ---
775 def report_download_webpage(self, video_id):
776 """Report webpage download."""
777 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
779 def report_extraction(self, video_id):
780 """Report information extraction."""
781 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
783 def _real_extract(self, url):
784 # Extract id from URL
785 mobj = re.match(self._VALID_URL, url)
787 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
790 video_id = mobj.group(1)
792 video_extension = 'flv'
794 # Retrieve video webpage to extract further information
795 request = urllib2.Request(url)
797 self.report_download_webpage(video_id)
798 webpage = urllib2.urlopen(request).read()
799 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
800 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
803 # Extract URL, uploader, and title from webpage
804 self.report_extraction(video_id)
805 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
807 self._downloader.trouble(u'ERROR: unable to extract media URL')
809 mediaURL = urllib.unquote(mobj.group(1))
# Title and uploader come from the same <title> regex (groups 1 and 2).
813 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
815 self._downloader.trouble(u'ERROR: unable to extract title')
817 video_title = mobj.group(1).decode('utf-8')
819 video_uploader = mobj.group(2).decode('utf-8')
# Resulting info dict (literal's surrounding lines elided in excerpt).
822 'id': video_id.decode('utf-8'),
823 'url': video_url.decode('utf-8'),
824 'uploader': video_uploader,
825 'upload_date': u'NA',
826 'title': video_title,
827 'ext': video_extension.decode('utf-8'),
833 class YahooIE(InfoExtractor):
834 """Information extractor for video.yahoo.com."""
836 # _VALID_URL matches all Yahoo! Video URLs
837 # _VPAGE_URL matches only the extractable '/watch/' URLs
838 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
839 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
840 IE_NAME = u'video.yahoo'
842 def __init__(self, downloader=None):
843 InfoExtractor.__init__(self, downloader)
# --- status-reporting helpers ---
845 def report_download_webpage(self, video_id):
846 """Report webpage download."""
847 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
849 def report_extraction(self, video_id):
850 """Report information extraction."""
851 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# new_video guards the one-level recursion used to canonicalize non-/watch/
# URLs before real extraction.
853 def _real_extract(self, url, new_video=True):
854 # Extract ID from URL
855 mobj = re.match(self._VALID_URL, url)
857 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
860 video_id = mobj.group(2)
861 video_extension = 'flv'
863 # Rewrite valid but non-extractable URLs as
864 # extractable English language /watch/ URLs
865 if re.match(self._VPAGE_URL, url) is None:
866 request = urllib2.Request(url)
868 webpage = urllib2.urlopen(request).read()
869 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
870 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
873 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
875 self._downloader.trouble(u'ERROR: Unable to extract id field')
877 yahoo_id = mobj.group(1)
879 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
881 self._downloader.trouble(u'ERROR: Unable to extract vid field')
883 yahoo_vid = mobj.group(1)
# Recurse once with the canonical /watch/ URL.
885 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
886 return self._real_extract(url, new_video=False)
888 # Retrieve video webpage to extract further information
889 request = urllib2.Request(url)
891 self.report_download_webpage(video_id)
892 webpage = urllib2.urlopen(request).read()
893 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
894 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
897 # Extract uploader and title from webpage
898 self.report_extraction(video_id)
899 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
901 self._downloader.trouble(u'ERROR: unable to extract video title')
903 video_title = mobj.group(1).decode('utf-8')
905 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
907 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is the 'people|profile' path segment, while the
# uploader name is group(2) — looks like a wrong group index; verify against
# the full file before changing.
909 video_uploader = mobj.group(1).decode('utf-8')
911 # Extract video thumbnail
912 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
914 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
916 video_thumbnail = mobj.group(1).decode('utf-8')
918 # Extract video description
919 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
921 self._downloader.trouble(u'ERROR: unable to extract video description')
923 video_description = mobj.group(1).decode('utf-8')
924 if not video_description:
925 video_description = 'No description available.'
927 # Extract video height and width
928 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
930 self._downloader.trouble(u'ERROR: unable to extract video height')
932 yv_video_height = mobj.group(1)
934 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
936 self._downloader.trouble(u'ERROR: unable to extract video width')
938 yv_video_width = mobj.group(1)
940 # Retrieve video playlist to extract media URL
941 # I'm not completely sure what all these options are, but we
942 # seem to need most of them, otherwise the server sends a 401.
943 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
944 yv_bitrate = '700' # according to Wikipedia this is hard-coded
945 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
946 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
947 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
949 self.report_download_webpage(video_id)
950 webpage = urllib2.urlopen(request).read()
951 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
952 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
955 # Extract media URL from playlist XML
956 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
958 self._downloader.trouble(u'ERROR: Unable to extract media URL')
960 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
961 video_url = unescapeHTML(video_url)
# Resulting info dict (literal's surrounding lines elided in excerpt).
964 'id': video_id.decode('utf-8'),
966 'uploader': video_uploader,
967 'upload_date': u'NA',
968 'title': video_title,
969 'ext': video_extension.decode('utf-8'),
970 'thumbnail': video_thumbnail.decode('utf-8'),
971 'description': video_description,
# NOTE(review): 'thumbnail' is assigned twice in this dict literal (see line
# 970 above); the later value wins in Python — likely a leftover duplicate.
972 'thumbnail': video_thumbnail,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Pulls the clip id out of a Vimeo URL, downloads the clip page,
    parses the player config JSON embedded in the page, and builds a
    play_redirect URL from the request signature/timestamp and the best
    available codec.

    NOTE(review): this excerpt elides several guard / try / return
    lines; error paths call self._downloader.trouble() and bail out.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        """Receive an optional FileDownloader, like the other IEs."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the media URL and metadata for a Vimeo clip."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the "if mobj is None:" guard around this error
        # path is elided from this excerpt.
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url, None, std_headers)
        # NOTE(review): the enclosing "try:" for this network access is
        # elided from this excerpt.
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page (between
        # " = {config:" and ",assets:").
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        # NOTE(review): the error handling around this parse is elided.
        config = json.loads(config)
        self._downloader.trouble(u'ERROR: unable to extract info section')

        # Extract title and uploader from the config JSON.
        video_title = config["video"]["title"]
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description from the page HTML (not the JSON).
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date; left as u'NA' when the span is absent.
        video_upload_date = u'NA'
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        for codec in codecs:
            if codec[0] in config["video"]["files"]:
                video_codec = codec[0]
                video_extension = codec[1]
                if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
                else: quality = 'sd'
        # NOTE(review): the loop break / for-else scaffolding is elided;
        # this trouble() call is the "no codec matched" path.
        self._downloader.trouble(u'ERROR: no known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, quality, video_codec.upper())

        # NOTE(review): the "return [{" opener and the id/url entries of
        # the info dictionary are elided from this excerpt.
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Used when no site-specific IE claims the URL: first follows
    URL-shortener style redirects (restarting the download chain), then
    scrapes the page for an obvious media URL (JW Player flashvars,
    file=/source= parameters) and a <title>.

    NOTE(review): this excerpt elides several guard / try / return
    lines inside the methods below.
    """

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        """Receive an optional FileDownloader, like the other IEs."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect to new_url is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        # HEAD request: lets us discover the final URL without
        # downloading the response body.
        class HeadRequest(urllib2.Request):
            def get_method(self):
                # NOTE(review): body elided in this excerpt; presumably
                # returns "HEAD" — TODO confirm.

        class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Re-issue a HEAD against the new location, dropping
                    # body-related headers.
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    # NOTE(review): some keyword arguments of this call
                    # (e.g. headers=) are elided from this excerpt.
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                    # Any other redirect code is refused outright.
                    raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(urllib2.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Retry the same URL as a regular (GET) request.
                # NOTE(review): trailing keyword arguments are elided.
                return self.parent.open(urllib2.Request(req.get_full_url(),
                                        origin_req_host=req.get_origin_req_host(),

        # Build a bare opener; the default opener would follow
        # redirects with GET instead of our HeadRequest.
        opener = urllib2.OpenerDirector()
        for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Not a redirect: tell the caller to continue with this URL.
        if url == new_url: return False

        # Redirect detected: restart the download chain on the target.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        # NOTE(review): the "return True" tail appears elided here.

    def _real_extract(self, url):
        """Scrape a generic page for a direct media URL and title."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        # NOTE(review): the enclosing "try:" is elided from this excerpt.
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)

        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # NOTE(review): the "if mobj is None:" guards around the two
        # searches below are elided from this excerpt.
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1).decode('utf-8')

        # NOTE(review): the "return [{" opener of the info dictionary is
        # elided from this excerpt.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles "ytsearch:<terms>", "ytsearchN:<terms>" and
    "ytsearchall:<terms>" pseudo-URLs by querying the GData API and
    queueing the resulting watch URLs on the downloader.

    NOTE(review): this excerpt elides the guard / return lines and the
    "try: n = int(prefix)" scaffolding of _real_extract.
    """

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # GData search endpoint; %s is the quoted query, %i the 1-based
    # start index; 50 results per page.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000  # hard cap enforced below
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        """Receive an optional FileDownloader, like the other IEs."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # Empty prefix: download only the single best match.
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        # Numeric prefix: validate the requested count (n <= 0 is an error).
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): initialisation of video_ids / pagenum / limit
        # is elided from this excerpt.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
            request = urllib2.Request(result_url)
            # NOTE(review): the enclosing "try:" is elided here.
            data = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # totalItems bounds how many results actually exist.
            limit = min(n, api_response['totalItems'])

        # Trim to the requested count and queue the watch URLs.
        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles "gvsearch:<terms>" / "gvsearchN:" / "gvsearchall:"
    pseudo-URLs by scraping video.google.com result pages and queueing
    the videoplay URLs on the downloader.

    NOTE(review): this excerpt elides the guard / return lines, the
    "try: n = int(prefix)" scaffolding, and the loop header of
    _download_n_results.
    """

    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    # Result page URL; %s are the quoted query and the result offset.
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    # Matches a videoplay link in the result HTML, capturing the docid.
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # Present while there are further result pages.
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        """Receive an optional FileDownloader, like the other IEs."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # Empty prefix: download only the single best match.
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        # Numeric prefix: validate the requested count (n <= 0 is an error).
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): initialisation of video_ids / pagenum and the
        # enclosing pagination loop header are elided from this excerpt.
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
        request = urllib2.Request(result_url)
        # NOTE(review): the enclosing "try:" is elided here.
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        # No further pages: queue whatever was collected and stop.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles "yvsearch:<terms>" / "yvsearchN:" / "yvsearchall:"
    pseudo-URLs by scraping video.yahoo.com result pages and queueing
    the watch URLs on the downloader.

    NOTE(review): this excerpt elides the guard / return lines, the
    "try: n = int(prefix)" scaffolding, and the loop header of
    _download_n_results.
    """

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    # Result page URL; %s are the quoted query and the result offset.
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    # Matches a watch link in the result HTML, capturing "id1/id2".
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        """Receive an optional FileDownloader, like the other IEs."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # Empty prefix: download only the single best match.
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        # Numeric prefix: validate the requested count (n <= 0 is an error).
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # already_seen deduplicates ids across result pages.
        already_seen = set()
        # NOTE(review): initialisation of video_ids / pagenum and the
        # pagination loop header are elided from this excerpt.
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        # NOTE(review): the enclosing "try:" is elided here.
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        # No further pages: queue whatever was collected and stop.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Walks the paginated playlist HTML, collects the video ids, applies
    the playliststart/playlistend window, and queues each watch URL.

    NOTE(review): this excerpt elides the guard / return lines, the
    pagination loop header, and the if/else scaffolding in
    _real_extract.
    """

    # group(1): playlist type char (p/a/list); group(2): playlist id;
    # group(3): a trailing video id for the user/.../<vid> form.
    _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=(PL)?%s&'
    # Present while a "next" pager link exists.
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        """Receive an optional FileDownloader, like the other IEs."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of a playlist on the downloader."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Single video case: the URL carried an explicit video id.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        # NOTE(review): the "else:" branch scaffolding is elided here.
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        # NOTE(review): video_ids initialisation and the pagination
        # loop header are elided from this excerpt.
        self.report_download_page(playlist_id, pagenum)
        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
        request = urllib2.Request(url)
        # NOTE(review): the enclosing "try:" is elided here.
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # Stop paging once no "next" link is present.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:

        pagenum = pagenum + 1

        # Apply the --playlist-start/--playlist-end window (1-based
        # start; -1 end means "to the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            # NOTE(review): this slice is the elided "else:" branch.
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through a user's uploads feed via the GData API, collects all
    video ids, applies the playliststart/playlistend window, and queues
    each watch URL.

    NOTE(review): this excerpt elides the guard / return lines, the
    pagination loop header, and the if/else scaffolding in
    _real_extract.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50  # GData caps results per request
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        """Receive an optional FileDownloader, like the other IEs."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Queue every upload of a YouTube user on the downloader."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # NOTE(review): the rest of this comment and the loop header
        # (video_ids/pagenum init) are elided from this excerpt.
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
        # NOTE(review): the enclosing "try:" is elided here.
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        # Apply the --playlist-start/--playlist-end window.
        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            # NOTE(review): this slice is the elided "else:" branch.
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    Rebuilds the URL in the English locale, POSTs the "Free download"
    form, and scrapes the real fileshare URL and file title from the
    resulting page.

    NOTE(review): this excerpt elides the enclosing "try:" / "return" /
    "else:" lines in _real_extract.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        """Receive an optional FileDownloader, like the other IEs."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct download URL and title for a hosted file."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = urllib2.Request(url, urllib.urlencode(free_download_indication))
        # NOTE(review): the enclosing "try:" is elided here.
        self.report_download_webpage(file_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction banner
                # and surface it as the error message.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
                # NOTE(review): this is the elided "else:" fallback path.
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # NOTE(review): the "return [{" opener of the info dictionary
        # is elided from this excerpt.
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    Optionally logs in with the user's credentials (command line or
    .netrc), downloads the video page, scrapes metadata and the
    per-format source URLs out of the page's JavaScript, and builds one
    info dictionary per selected format.

    NOTE(review): this excerpt elides many guard / try / return /
    else lines and some dictionary literals; error paths call
    self._downloader.trouble() / to_stderr() and bail out.
    """

    # group('ID') is the numeric video id.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Ordered best-first; used both for scraping and format selection.
    _available_formats = ['video', 'highqual', 'lowqual']
    # Maps format name -> filename extension.
    # NOTE(review): the dictionary entries are elided from this excerpt.
    _video_extensions = {
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        """Receive an optional FileDownloader, like the other IEs."""
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General metadata: one regex per field.
        # NOTE(review): the closing of this literal and the
        # "video_info = {}" initialisation are elided from this excerpt.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Per-format source URLs.
        # NOTE(review): the "video_urls = {}" initialisation is elided.
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls
        # NOTE(review): the "return video_info" tail appears elided.

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied.

        Failures are reported as warnings; extraction proceeds
        unauthenticated.
        """
        if self._downloader is None:
            # NOTE(review): the early "return" appears elided here.

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): the enclosing "try:" is elided here.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # NOTE(review): credential unpacking and the "else:"
                # raising branch are partially elided here.
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # No credentials at all: skip logging in.
        if useremail is None:

        # Log in; the login_form construction is elided from this excerpt.
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        # NOTE(review): the enclosing "try:" is elided here.
        login_results = urllib2.urlopen(request).read()
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            # Login page served again -> credentials were rejected.
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

    def _real_extract(self, url):
        """Extract metadata and per-format URLs for a Facebook video."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        # NOTE(review): the enclosing "try:" is elided here.
        page = urllib2.urlopen(request)
        video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image: missing thumbnail is only a warning.
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
            # NOTE(review): this assignment is the elided "else:" branch.
            video_thumbnail = video_info['thumbnail']

        # upload date: parse the RFC-2822 style date if present.
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                # NOTE(review): the enclosing "try:" is elided here.
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
                # NOTE(review): this assignment is the elided "else:" branch.
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # NOTE(review): the final "else:" opener is elided here.
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # Build one info dictionary per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # NOTE(review): the dictionary opener (e.g. results.append({)
            # is elided from this excerpt.
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Requests the JSON skin of a blip.tv page. If the server answers
    with the media itself (Content-Type video/*) the URL is treated as
    a direct download; otherwise the returned JSON is parsed for the
    media URL and metadata.

    NOTE(review): this excerpt elides several guard / try / else lines
    and two info-dictionary openers in _real_extract.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract media URL and metadata for a blip.tv post."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # NOTE(review): determination of the query separator "cchar"
        # ('?' or '&') is elided from this excerpt.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        # NOTE(review): the "info = None" initialisation and enclosing
        # "try:" are elided here.
        urlh = urllib2.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            # Derive title/extension from the last URL component.
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # NOTE(review): the direct-download info dictionary is
            # elided from this excerpt.
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))

        if info is None: # Regular URL
            # NOTE(review): the enclosing "try:" is elided here.
            json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))

            # NOTE(review): the "try:" guarding this parse is elided.
            json_data = json.loads(json_code)
            # The payload may be wrapped in a 'Post' envelope.
            if 'Post' in json_data:
                data = json_data['Post']

            # datestamp is like "08-25-11 10:34AM" -> YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # NOTE(review): the "if umobj is None:" guard is elided here.
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # NOTE(review): the "info = {" opener is elided here.
                'id': data['item_id'],
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl']
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
1984 class MyVideoIE(InfoExtractor):
1985 """Information Extractor for myvideo.de."""
1987 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
1988 IE_NAME = u'myvideo'
1990 def __init__(self, downloader=None):
1991 InfoExtractor.__init__(self, downloader)
1993 def report_download_webpage(self, video_id):
1994 """Report webpage download."""
1995 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
1997 def report_extraction(self, video_id):
1998 """Report information extraction."""
1999 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2001 def _real_extract(self,url):
2002 mobj = re.match(self._VALID_URL, url)
2004 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2007 video_id = mobj.group(1)
2010 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2012 self.report_download_webpage(video_id)
2013 webpage = urllib2.urlopen(request).read()
2014 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2015 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2018 self.report_extraction(video_id)
2019 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2022 self._downloader.trouble(u'ERROR: unable to extract media URL')
2024 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2026 mobj = re.search('<title>([^<]+)</title>', webpage)
2028 self._downloader.trouble(u'ERROR: unable to extract title')
2031 video_title = mobj.group(1)
2037 'upload_date': u'NA',
2038 'title': video_title,
# NOTE(review): mangled extraction — fused line numbers, lost indentation,
# elided guard/try/return lines.  Code kept byte-identical; comments only.
2044 class ComedyCentralIE(InfoExtractor):
2045 """Information extractor for The Daily Show and Colbert Report """
# Matches either a bare shortname prefixed with ':' (e.g. ':tds') or a
# full-episodes URL on thedailyshow.com / colbertnation.com
2047 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2048 IE_NAME = u'comedycentral'
# Progress reporters — one per network round-trip below
2050 def report_extraction(self, episode_id):
2051 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2053 def report_config_download(self, episode_id):
2054 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2056 def report_index_download(self, episode_id):
2057 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2059 def report_player_url(self, episode_id):
2060 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2062 def _real_extract(self, url):
2063 mobj = re.match(self._VALID_URL, url)
# (guard elided) invalid URL -> trouble(); a return presumably follows
2065 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortnames (':tds', ':colbert', ...) expand to the show's newest-episodes URL
2068 if mobj.group('shortname'):
2069 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2070 url = u'http://www.thedailyshow.com/full-episodes/'
# (else branch header elided) all other shortnames map to Colbert
2072 url = u'http://www.colbertnation.com/full-episodes/'
# Re-match so the named groups reflect the expanded URL
2073 mobj = re.match(self._VALID_URL, url)
2074 assert mobj is not None
# No specific episode in the URL means "download the newest episode"
2076 dlNewest = not mobj.group('episode')
# (if/else lines elided) epTitle is the show name when dlNewest, else the episode slug
2078 epTitle = mobj.group('showname')
2080 epTitle = mobj.group('episode')
2082 req = urllib2.Request(url)
2083 self.report_extraction(epTitle)
# (try: elided) fetch the episode page; the server redirects newest-episode
# URLs to a concrete episode, hence the geturl() below
2085 htmlHandle = urllib2.urlopen(req)
2086 html = htmlHandle.read()
2087 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2088 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# (dlNewest branch header elided) follow the redirect to the concrete episode
2091 url = htmlHandle.geturl()
2092 mobj = re.match(self._VALID_URL, url)
2094 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2096 if mobj.group('episode') == '':
2097 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2099 epTitle = mobj.group('episode')
# The Flash player embed carries the mtvnservices media URL and its URI param
2101 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2102 if len(mMovieParams) == 0:
2103 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2106 playerUrl_raw = mMovieParams[0][0]
2107 self.report_player_url(epTitle)
# (try: elided) resolve redirects to get the canonical player URL
2109 urlHandle = urllib2.urlopen(playerUrl_raw)
2110 playerUrl = urlHandle.geturl()
2111 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2112 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# The second capture group is the mtvn URI used to query the MRSS show index
2115 uri = mMovieParams[0][1]
2116 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2117 self.report_index_download(epTitle)
# (try: elided)
2119 indexXml = urllib2.urlopen(indexUrl).read()
2120 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2121 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# One episode is split into several acts; each <item> is one media segment
2126 idoc = xml.etree.ElementTree.fromstring(indexXml)
2127 itemEls = idoc.findall('.//item')
2128 for itemEl in itemEls:
2129 mediaId = itemEl.findall('./guid')[0].text
2130 shortMediaId = mediaId.split(':')[-1]
2131 showId = mediaId.split(':')[-2].replace('.com', '')
2132 officialTitle = itemEl.findall('./title')[0].text
2133 officialDate = itemEl.findall('./pubDate')[0].text
# Per-segment config XML lists the available renditions (bitrate -> src URL)
2135 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2136 urllib.urlencode({'uri': mediaId}))
2137 configReq = urllib2.Request(configUrl)
2138 self.report_config_download(epTitle)
# (try: elided)
2140 configXml = urllib2.urlopen(configReq).read()
2141 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2142 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2145 cdoc = xml.etree.ElementTree.fromstring(configXml)
# (turls init elided) collect (bitrate, url) pairs from the renditions
2147 for rendition in cdoc.findall('.//rendition'):
2148 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
# (empty-turls guard elided)
2152 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2155 # For now, just pick the highest bitrate
2156 format,video_url = turls[-1]
2158 effTitle = showId + u'-' + epTitle
# (dict header and several entries elided) per-segment info dict
2163 'upload_date': officialDate,
2168 'description': officialTitle,
2169 'player_url': playerUrl
2172 results.append(info)
# NOTE(review): mangled extraction — fused line numbers, lost indentation,
# elided guard/try/return lines.  Code kept byte-identical; comments only.
2177 class EscapistIE(InfoExtractor):
2178 """Information extractor for The Escapist """
2180 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2181 IE_NAME = u'escapist'
2183 def report_extraction(self, showName):
2184 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2186 def report_config_download(self, showName):
2187 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2189 def _real_extract(self, url):
2190 mobj = re.match(self._VALID_URL, url)
# (guard elided) invalid URL -> trouble(); a return presumably follows
2192 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2194 showName = mobj.group('showname')
2195 videoId = mobj.group('episode')
2197 self.report_extraction(showName)
# (try: elided) fetch the page bytes and decode using the charset the server
# declares in Content-Type, falling back to utf-8
2199 webPage = urllib2.urlopen(url)
2200 webPageBytes = webPage.read()
2201 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2202 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2203 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2204 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Metadata comes from <meta> tags; no None-checks here, so a missing tag
# raises AttributeError on .group(1) — presumably caught upstream
2207 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2208 description = unescapeHTML(descMatch.group(1))
2209 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2210 imgUrl = unescapeHTML(imgMatch.group(1))
2211 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2212 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL carries a percent-encoded config URL in its query string
2213 configUrlMatch = re.search('config=(.*)$', playerUrl)
2214 configUrl = urllib2.unquote(configUrlMatch.group(1))
2216 self.report_config_download(showName)
# (try: elided)
2218 configJSON = urllib2.urlopen(configUrl).read()
2219 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2220 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2223 # Technically, it's JavaScript, not JSON
# Single->double quote swap makes the JS object literal parseable as JSON
2224 configJSON = configJSON.replace("'", '"')
# (try: elided)
2227 config = json.loads(configJSON)
2228 except (ValueError,), err:
2229 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# playlist[1] (not [0]) holds the actual video entry — presumably [0] is an ad
# or preroll; TODO confirm
2232 playlist = config['playlist']
2233 videoUrl = playlist[1]['url']
# (dict header and several entries elided) result info dict
2238 'uploader': showName,
2239 'upload_date': None,
2243 'thumbnail': imgUrl,
2244 'description': description,
2245 'player_url': playerUrl,
# NOTE(review): mangled extraction — fused line numbers, lost indentation,
# elided guard/try/return lines.  Code kept byte-identical; comments only.
2251 class CollegeHumorIE(InfoExtractor):
2252 """Information extractor for collegehumor.com"""
2254 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2255 IE_NAME = u'collegehumor'
2257 def report_webpage(self, video_id):
2258 """Report information extraction."""
2259 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2261 def report_extraction(self, video_id):
2262 """Report information extraction."""
2263 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2265 def _real_extract(self, url):
2266 mobj = re.match(self._VALID_URL, url)
# (guard elided) invalid URL -> trouble(); a return presumably follows
2268 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2270 video_id = mobj.group('videoid')
2272 self.report_webpage(video_id)
2273 request = urllib2.Request(url)
# (try: elided)
2275 webpage = urllib2.urlopen(request).read()
2276 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2277 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds an internal numeric ID distinct from the URL's videoid;
# the moogaloop metadata endpoint below is keyed on it
2280 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
# (guard elided)
2282 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2284 internal_video_id = m.group('internalvideoid')
# (info dict header and 'id' entry elided)
2288 'internal_id': internal_video_id,
2291 self.report_extraction(video_id)
2292 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
# (try: elided)
2294 metaXml = urllib2.urlopen(xmlUrl).read()
2295 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2296 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# (try: elided) any missing element raises IndexError on [0], reported below
2299 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2301 videoNode = mdoc.findall('./video')[0]
2302 info['description'] = videoNode.findall('./description')[0].text
2303 info['title'] = videoNode.findall('./caption')[0].text
2304 info['url'] = videoNode.findall('./file')[0].text
2305 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# extension taken from the media URL's suffix; format mirrors it
2306 info['ext'] = info['url'].rpartition('.')[2]
2307 info['format'] = info['ext']
# (except header elided)
2309 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# NOTE(review): mangled extraction — fused line numbers, lost indentation,
# elided guard/try/return lines.  Code kept byte-identical; comments only.
2315 class XVideosIE(InfoExtractor):
2316 """Information extractor for xvideos.com"""
2318 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2319 IE_NAME = u'xvideos'
2321 def report_webpage(self, video_id):
2322 """Report information extraction."""
2323 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2325 def report_extraction(self, video_id):
2326 """Report information extraction."""
2327 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2329 def _real_extract(self, url):
2330 mobj = re.match(self._VALID_URL, url)
# (guard elided) invalid URL -> trouble(); a return presumably follows
2332 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2334 video_id = mobj.group(1).decode('utf-8')
2336 self.report_webpage(video_id)
2338 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
# (try: elided)
2340 webpage = urllib2.urlopen(request).read()
2341 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2342 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2345 self.report_extraction(video_id)
# The percent-encoded flv URL is embedded in the page's query parameters
2349 mobj = re.search(r'flv_url=(.+?)&', webpage)
# (guard elided)
2351 self._downloader.trouble(u'ERROR: unable to extract video url')
2353 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title is the <title> tag minus the trailing " - XVIDEOS..." suffix
2357 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
# (guard elided)
2359 self._downloader.trouble(u'ERROR: unable to extract video title')
2361 video_title = mobj.group(1).decode('utf-8')
2364 # Extract video thumbnail
2365 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
# (guard elided)
2367 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0): the whole matched thumbnail URL, not just the filename capture
2369 video_thumbnail = mobj.group(0).decode('utf-8')
# (dict header and several entries elided) result info dict
2375 'upload_date': None,
2376 'title': video_title,
2379 'thumbnail': video_thumbnail,
2380 'description': None,
# NOTE(review): mangled extraction — fused line numbers, lost indentation,
# elided guard/try/return lines.  Code kept byte-identical; comments only.
2387 class SoundcloudIE(InfoExtractor):
2388 """Information extractor for soundcloud.com
2389 To access the media, the uid of the song and a stream token
2390 must be extracted from the page source and the script must make
2391 a request to media.soundcloud.com/crossdomain.xml. Then
2392 the media can be grabbed by requesting from an url composed
2393 of the stream token and uid
# (docstring closer elided)
2396 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2397 IE_NAME = u'soundcloud'
2399 def __init__(self, downloader=None):
2400 InfoExtractor.__init__(self, downloader)
2402 def report_webpage(self, video_id):
2403 """Report information extraction."""
2404 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2406 def report_extraction(self, video_id):
2407 """Report information extraction."""
2408 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2410 def _real_extract(self, url):
2411 mobj = re.match(self._VALID_URL, url)
# (guard elided) invalid URL -> trouble(); a return presumably follows
2413 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2416 # extract uploader (which is in the url)
2417 uploader = mobj.group(1).decode('utf-8')
2418 # extract simple title (uploader + slug of song title)
2419 slug_title = mobj.group(2).decode('utf-8')
2420 simple_title = uploader + u'-' + slug_title
2422 self.report_webpage('%s/%s' % (uploader, slug_title))
2424 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
# (try: elided)
2426 webpage = urllib2.urlopen(request).read()
2427 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2428 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2431 self.report_extraction('%s/%s' % (uploader, slug_title))
2433 # extract uid and stream token that soundcloud hands out for access
2434 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
# (guard elided) the groups below are only read when the search matched
2436 video_id = mobj.group(1)
2437 stream_token = mobj.group(2)
2439 # extract unsimplified title
2440 mobj = re.search('"title":"(.*?)",', webpage)
# (if/else lines elided) fall back to the slug-derived title when no match
2442 title = mobj.group(1).decode('utf-8')
2444 title = simple_title
2446 # construct media url (with uid/token)
2447 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2448 mediaURL = mediaURL % (video_id, stream_token)
# (comment/section lines elided) description is best-effort
2451 description = u'No description available'
2452 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
# (guard elided)
2454 description = mobj.group(1)
# (upload_date init and try: elided) e.g. "November 3, 2011 20:00" -> YYYYMMDD
2458 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2461 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
# broad catch is deliberate best-effort: date parse failure only logs to stderr
2462 except Exception, e:
2463 self._downloader.to_stderr(str(e))
2465 # for soundcloud, a request to a cross domain is required for cookies
# NOTE(review): only the Request construction is visible here — the urlopen()
# call that would actually perform it is not in view; confirm it isn't dead code
2466 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
# (return/dict header and several entries elided) result info dict
2469 'id': video_id.decode('utf-8'),
2471 'uploader': uploader.decode('utf-8'),
2472 'upload_date': upload_date,
2477 'description': description.decode('utf-8')
# NOTE(review): mangled extraction — fused line numbers, lost indentation,
# elided guard/try/return lines (including the IE_NAME assignment).
# Code kept byte-identical; comments only.
2481 class InfoQIE(InfoExtractor):
2482 """Information extractor for infoq.com"""
2484 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2487 def report_webpage(self, video_id):
2488 """Report information extraction."""
2489 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2491 def report_extraction(self, video_id):
2492 """Report information extraction."""
2493 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2495 def _real_extract(self, url):
2496 mobj = re.match(self._VALID_URL, url)
# (guard elided) invalid URL -> trouble(); a return presumably follows
2498 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# the full URL doubles as the progress-report id (no numeric video id in URL)
2501 self.report_webpage(url)
2503 request = urllib2.Request(url)
# (try: elided)
2505 webpage = urllib2.urlopen(request).read()
2506 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2507 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2510 self.report_extraction(url)
# jsclassref holds the base64-encoded RTMP path of the media
2514 mobj = re.search(r"jsclassref='([^']*)'", webpage)
# (guard elided)
2516 self._downloader.trouble(u'ERROR: unable to extract video url')
2518 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2522 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
# (guard elided)
2524 self._downloader.trouble(u'ERROR: unable to extract video title')
2526 video_title = mobj.group(1).decode('utf-8')
2528 # Extract description
2529 video_description = u'No description available.'
2530 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2531 if mobj is not None:
2532 video_description = mobj.group(1).decode('utf-8')
# id and extension come from the last path segment of the RTMP URL
2534 video_filename = video_url.split('/')[-1]
2535 video_id, extension = video_filename.split('.')
# (dict header and several entries elided) result info dict
2541 'upload_date': None,
2542 'title': video_title,
2544 'format': extension, # Extension is always(?) mp4, but seems to be flv
2546 'description': video_description,
# NOTE(review): mangled extraction — fused line numbers, lost indentation,
# elided guard/try/return lines.  Code kept byte-identical; comments only.
2552 class MixcloudIE(InfoExtractor):
2553 """Information extractor for www.mixcloud.com"""
2554 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2555 IE_NAME = u'mixcloud'
2557 def __init__(self, downloader=None):
2558 InfoExtractor.__init__(self, downloader)
2560 def report_download_json(self, file_id):
2561 """Report JSON download."""
2562 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2564 def report_extraction(self, file_id):
2565 """Report information extraction."""
2566 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2568 def get_urls(self, jsonData, fmt, bitrate='best'):
2569 """Get urls from 'audio_formats' section in json"""
# (try: elided) jsonData[fmt] is either {bitrate: [urls]} or a flat [urls]
2572 bitrate_list = jsonData[fmt]
2573 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2574 bitrate = max(bitrate_list) # select highest
2576 url_list = jsonData[fmt][bitrate]
# indexing a flat list with a string bitrate raises TypeError -> no bitrates
2577 except TypeError: # we have no bitrate info.
2578 url_list = jsonData[fmt]
# (return elided)
2581 def check_urls(self, url_list):
2582 """Returns 1st active url from list"""
2583 for url in url_list:
# (try: elided) probe each candidate; first one that opens wins
2585 urllib2.urlopen(url)
# (return url elided above; dead-url fallthrough elided below)
2587 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# (continue / return None lines elided)
2592 def _print_formats(self, formats):
2593 print 'Available formats:'
2594 for fmt in formats.keys():
2595 for b in formats[fmt]:
# (try: elided) same dict-vs-list shape handling as get_urls
2597 ext = formats[fmt][b][0]
2598 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2599 except TypeError: # we have no bitrate info
2600 ext = formats[fmt][0]
2601 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
# (break elided — a flat list need only be printed once)
2604 def _real_extract(self, url):
2605 mobj = re.match(self._VALID_URL, url)
# (guard elided) invalid URL -> trouble(); a return presumably follows
2607 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2609 # extract uploader & filename from url
2610 uploader = mobj.group(1).decode('utf-8')
2611 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2613 # construct API request
2614 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2615 # retrieve .json file with links to files
2616 request = urllib2.Request(file_url)
# (try: elided)
2618 self.report_download_json(file_url)
2619 jsonData = urllib2.urlopen(request).read()
2620 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2621 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
# (parse section header elided)
2625 json_data = json.loads(jsonData)
2626 player_url = json_data['player_swf_url']
2627 formats = dict(json_data['audio_formats'])
2629 req_format = self._downloader.params.get('format', None)
# (file_url reset elided)
2632 if self._downloader.params.get('listformats', None):
2633 self._print_formats(formats)
# (return elided) listformats only prints, it does not download
2636 if req_format is None or req_format == 'best':
# try every format, keep the first with a live URL
2637 for format_param in formats.keys():
2638 url_list = self.get_urls(formats, format_param)
# (comment elided)
2640 file_url = self.check_urls(url_list)
2641 if file_url is not None:
# (break / else branch header elided)
2644 if req_format not in formats.keys():
2645 self._downloader.trouble(u'ERROR: format is not available')
# (return elided)
2648 url_list = self.get_urls(formats, req_format)
2649 file_url = self.check_urls(url_list)
2650 format_param = req_format
# (return [{...}] header elided)
2653 'id': file_id.decode('utf-8'),
2654 'url': file_url.decode('utf-8'),
2655 'uploader': uploader.decode('utf-8'),
2656 'upload_date': u'NA',
2657 'title': json_data['name'],
2658 'ext': file_url.split('.')[-1].decode('utf-8'),
2659 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2660 'thumbnail': json_data['thumbnail_url'],
2661 'description': json_data['description'],
2662 'player_url': player_url.decode('utf-8'),
# NOTE(review): mangled extraction — fused line numbers, lost indentation,
# elided guard/try/return lines.  Code kept byte-identical; comments only.
2665 class StanfordOpenClassroomIE(InfoExtractor):
2666 """Information extractor for Stanford's Open ClassRoom"""
# Three URL shapes: a specific video (course+video), a course page (course
# only), or the site root — each handled by a branch of _real_extract
2668 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2669 IE_NAME = u'stanfordoc'
2671 def report_download_webpage(self, objid):
2672 """Report information extraction."""
2673 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2675 def report_extraction(self, video_id):
2676 """Report information extraction."""
2677 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2679 def _real_extract(self, url):
2680 mobj = re.match(self._VALID_URL, url)
# (guard elided) invalid URL -> trouble(); a return presumably follows
2682 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2685 if mobj.group('course') and mobj.group('video'): # A specific video
2686 course = mobj.group('course')
2687 video = mobj.group('video')
# (dict header elided) info dict keyed "course_video"
2689 'id': course + '_' + video,
2692 self.report_extraction(info['id'])
2693 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2694 xmlUrl = baseUrl + video + '.xml'
# (try: elided)
2696 metaXml = urllib2.urlopen(xmlUrl).read()
2697 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2698 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2700 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# (try: elided) missing elements raise IndexError on [0], reported below
2702 info['title'] = mdoc.findall('./title')[0].text
2703 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
# (except header elided)
2705 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2707 info['ext'] = info['url'].rpartition('.')[2]
2708 info['format'] = info['ext']
# (return elided)
2710 elif mobj.group('course'): # A course page
2711 course = mobj.group('course')
# (info dict init with id/type elided)
2717 self.report_download_webpage(info['id'])
# (try: elided)
2719 coursepage = urllib2.urlopen(url).read()
2720 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2721 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2724 m = re.search('<h1>([^<]+)</h1>', coursepage)
# (if/else lines elided) fall back to the id when no <h1> title
2726 info['title'] = unescapeHTML(m.group(1))
2728 info['title'] = info['id']
2730 m = re.search('<description>([^<]+)</description>', coursepage)
# (guard elided)
2732 info['description'] = unescapeHTML(m.group(1))
# collect every VideoPage link once, preserving order
2734 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
# (list-comprehension / loop header elided) each link becomes a reference entry
2737 'type': 'reference',
2738 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# (results init elided) recurse into each referenced video page
2742 for entry in info['list']:
2743 assert entry['type'] == 'reference'
2744 results += self.extract(entry['url'])
# (return results elided; else-branch "whole site" header elided)
2749 'id': 'Stanford OpenClassroom',
2753 self.report_download_webpage(info['id'])
2754 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
# (try: elided)
2756 rootpage = urllib2.urlopen(rootURL).read()
2757 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2758 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2761 info['title'] = info['id']
2763 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
# (loop/comprehension header elided) each course link becomes a reference entry
2766 'type': 'reference',
2767 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
# recurse into every course page found on the root page
2772 for entry in info['list']:
2773 assert entry['type'] == 'reference'
2774 results += self.extract(entry['url'])
# NOTE(review): mangled extraction — fused line numbers, lost indentation,
# elided lines; additionally this class is TRUNCATED at the end of the visible
# chunk (the result dict is cut off).  Code kept byte-identical; comments only.
2777 class MTVIE(InfoExtractor):
2778 """Information extractor for MTV.com"""
2780 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
# (IE_NAME assignment elided)
2783 def report_webpage(self, video_id):
2784 """Report information extraction."""
2785 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2787 def report_extraction(self, video_id):
2788 """Report information extraction."""
2789 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2791 def _real_extract(self, url):
2792 mobj = re.match(self._VALID_URL, url)
# (guard elided) invalid URL -> trouble(); a return presumably follows
2794 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# protocol is optional in _VALID_URL, so normalize before fetching
2796 if not mobj.group('proto'):
2797 url = 'http://' + url
2798 video_id = mobj.group('videoid')
2799 self.report_webpage(video_id)
2801 request = urllib2.Request(url)
# (try: elided)
2803 webpage = urllib2.urlopen(request).read()
2804 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2805 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# Song/performer metadata lives in mtv_* meta tags, iso-8859-1 encoded
2808 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
# (guard elided)
2810 self._downloader.trouble(u'ERROR: unable to extract song name')
2812 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2813 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
# (guard elided)
2815 self._downloader.trouble(u'ERROR: unable to extract performer')
2817 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2818 video_title = performer + ' - ' + song_name
2820 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# (guard elided) note the error message is missing a verb ("unable to mtvn_uri")
2822 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2824 mtvn_uri = mobj.group(1)
2826 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
# (guard elided)
2828 self._downloader.trouble(u'ERROR: unable to extract content id')
2830 content_id = mobj.group(1)
# mediaGen endpoint returns rendition XML for the uri/id/vid triple
2832 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2833 self.report_extraction(video_id)
2834 request = urllib2.Request(videogen_url)
# (try: elided)
2836 metadataXml = urllib2.urlopen(request).read()
2837 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2838 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2841 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2842 renditions = mdoc.findall('.//rendition')
2844 # For now, always pick the highest quality.
2845 rendition = renditions[-1]
# (try: elided) type attr is "video/<ext>"; format encodes ext+resolution+bitrate
2848 _,_,ext = rendition.attrib['type'].partition('/')
2849 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2850 video_url = rendition.find('./src').text
# (except header elided)
2852 self._downloader.trouble('Invalid rendition field.')
# (result dict header elided; dict continues beyond the visible chunk)
2858 'uploader': performer,
2859 'title': video_title,