2 # -*- coding: utf-8 -*-
17 import cStringIO as StringIO
21 # parse_qs was moved from the cgi module to the urlparse module recently.
23 from urlparse import parse_qs
25 from cgi import parse_qs
28 import xml.etree.ElementTree
29 except ImportError: # Python<2.5: Not officially supported, but let it slip
30 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include:

    uploader:   Nickname of the video uploader.
    stitle:     Simplified title.
    ext:        Video filename extension.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): the 'def initialize(self):' header was absent in the
        # reviewed text, leaving this docstring and the call below orphaned at
        # class scope; the header is restored here. Upstream versions also
        # guard this call with a self._ready flag set in __init__ -- confirm
        # whether that guard should be restored as well.
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this class body comes from a lossy extract of the file.
    # Several physical lines (try:/else:/return statements, dict entries and
    # closing braces) are missing. Code tokens are reproduced exactly as
    # found; comments describe only what the visible code shows.

    # Group 1 matches the optional scheme/host/path prefix; group 2 captures
    # the video id. The (?(1).+)? conditional requires a tail only when a
    # prefix matched.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    # Forces the site into English/US so scraped page text is predictable.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the 'next_url' query parameter of redirecting (age-gate) URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Maps itag -> filename extension.
    # NOTE(review): the remaining entries and the closing brace of this dict
    # are missing from the extract.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # Maps itag -> display dimensions for _print_formats.
    # NOTE(review): the body of this dict is missing from the extract.
    _video_dimensions = {

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is unavailable."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube timedtext XML into SRT subtitle text.

        NOTE(review): the initialization of the srt accumulator, the float()
        conversion of start, and the final return are missing from this
        extract.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none given
            end = start + float(dur)
            # Format as SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print 'Available formats:'
        # NOTE(review): the 'for x in formats:' loop header is missing here.
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Prepare the session: set language, optionally log in, confirm age.

        NOTE(review): several lines (try: headers, early returns, the
        headers/footers of the login_form and age_form dicts) are missing
        from this extract.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            # Raised when no entry for the machine exists in ~/.netrc
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError), err:
            self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language (best effort: only warn on failure)
        request = urllib2.Request(self._LANG_URL)
        urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        # Log in. NOTE(review): the 'login_form = {' header and closing brace
        # are missing around the following entries.
        'current_form': 'loginForm',
        'action_login': 'Log In',
        'username': username,
        'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        login_results = urllib2.urlopen(request).read()
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            # Login form served again => credentials were rejected
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age. NOTE(review): the 'age_form = {' header and closing
        # brace are missing around the following entry.
        'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            # Age confirmation failure is reported as a hard error
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract metadata and download URLs for a YouTube watch page.

        NOTE(review): many control-flow lines ('if mobj is None:', 'try:',
        'else:', return statements, and the results-list wrapper at the end)
        are missing from this extract.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Get video webpage (has_verified=1 skips the age-gate interstitial)
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Un-escape the backslash-escaped characters of the JSON-embedded URL
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' variants until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            video_info_webpage = urllib2.urlopen(request).read()
            video_info = parse_qs(video_info_webpage)
            if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Start extracting information
        self.report_information_extraction(video_id)

        # Uploader nickname (percent-encoded in the query-string response)
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # Title (decoded to unicode, then sanitized for use as a filename)
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        simple_title = simplify_title(video_title)

        # Thumbnail (optional)
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date, scraped from the watch page and normalized to YYYYMMDD
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # Description, pulled from the page body (may be absent)
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description: video_description = clean_html(video_description.decode('utf8'))
        else: video_description = ''

        # Closed captions: list available languages, pick one, convert to SRT
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            self.report_video_subtitles_download(video_id)
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            srt_list = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
            # Language priority: explicit option > English > first available
            if self._downloader.params.get('subtitleslang', False):
                srt_lang = self._downloader.params.get('subtitleslang')
            elif 'en' in srt_lang_list:
                srt_lang = srt_lang_list[0]
            if not srt_lang in srt_lang_list:
                self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
            srt_xml = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            self._downloader.trouble(u'WARNING: video has no closed captions')

        # Session token (required later for the download itself)
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Each comma-separated entry is itself a query string with
            # 'itag' (format id) and 'url' keys.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        for format_param, video_real_url in video_url_list:
            # Extension (defaults to flv when the itag is unknown)
            video_extension = self._video_extensions.get(format_param, 'flv')

            # NOTE(review): the 'results.append({' / return wrapper around
            # the following dict entries is missing from this extract.
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): lossy extract -- try:/return/if-guard lines and the
    # headers/footers of some dict literals are missing. Code tokens are
    # reproduced as found.

    # Group 1: video id; group 2: URL slug used as the simplified title.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter opt-out."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age. NOTE(review): the 'disclaimer_form = {' header and
        # closing brace are missing around the following entry.
        'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a Metacafe page.

        NOTE(review): 'if mobj is None:' guards, 'try:' headers, returns and
        the results wrapper are missing from this extract.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed ids straight to the YouTube extractor
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = urllib.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        # Fallback path: read mediaData out of the flashvars query string
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        vardict = parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        # Un-escape JSON's \/ sequences
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the surrounding return/list wrapper is missing.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'upload_date': u'NA',
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): lossy extract -- 'if mobj is None:' guards, 'try:'
    # headers, returns and the results wrapper are missing. Code tokens are
    # reproduced as found.

    # Group 1: video id (before the underscore); group 2: title slug.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the SD media URL, title and uploader from a Dailymotion page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-restricted videos are served
        request.add_header('Cookie', 'family_filter=off')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        sequence = urllib.unquote(mobj.group(1))
        # The 'sequence' flashvar embeds the SD stream URL as JSON
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the surrounding return/list wrapper is missing.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'upload_date': u'NA',
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # NOTE(review): lossy extract -- guards, 'try:' headers, returns and the
    # results wrapper are missing. Code tokens are reproduced as found.

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title and description from a Google Video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # No direct mp4 download link -- fall back to the flv stream
        video_extension = 'flv'
        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # Replace literal '\x3d'/'\x26' escapes with '=' and '&'
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # The thumbnail is only shown on the search page, so search for the id
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info

        # NOTE(review): the surrounding return/list wrapper is missing.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'upload_date': u'NA',
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): lossy extract -- guards, 'try:' headers, returns and the
    # results wrapper are missing. Code tokens are reproduced as found.

    # Group 1: the .flv filename from the 'current' query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        # Group 2 of the same <title> match is the uploader name
        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the surrounding return/list wrapper is missing.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': u'NA',
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): lossy extract -- guards, 'try:' headers, returns and the
    # results wrapper are missing. Code tokens are reproduced as found.

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract metadata and the playlist media URL for a Yahoo! video.

        new_video=False marks the single recursive retry after rewriting a
        non-'/watch/' URL into its canonical /watch/ form.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        simple_title = simplify_title(video_title)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) of this regex captures 'people'/'profile',
        # while the uploader name is captured by group(2) -- this looks like
        # a bug; confirm which group is intended.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # NOTE(review): the surrounding return/list wrapper is missing.
        # Also note 'thumbnail' appears twice below; in a dict literal the
        # second (undecoded) value would win -- likely unintended.
        'id': video_id.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': u'NA',
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension.decode('utf-8'),
        'thumbnail': video_thumbnail.decode('utf-8'),
        'description': video_description,
        'thumbnail': video_thumbnail,
# NOTE(review): this excerpt is whitespace-mangled — original file line numbers are
# embedded at the start of each line, Python indentation is lost, and the jumps in
# those numbers show lines are missing (e.g. "if mobj is None:" guards, "try:",
# "return"). Comments below annotate only what is visible; consult the full
# original file before changing any logic.
998 class VimeoIE(InfoExtractor):
999 """Information extractor for vimeo.com."""
1001 # _VALID_URL matches Vimeo URLs
1002 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1005 def __init__(self, downloader=None):
1006 InfoExtractor.__init__(self, downloader)
1008 def report_download_webpage(self, video_id):
1009 """Report webpage download."""
1010 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1012 def report_extraction(self, video_id):
1013 """Report information extraction."""
1014 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1016 def _real_extract(self, url, new_video=True):
1017 # Extract ID from URL
1018 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the "if mobj is None:" guard that should precede this trouble()
# call is among the missing lines (1019).
1020 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1023 video_id = mobj.group(1)
1025 # Retrieve video webpage to extract further information
1026 request = urllib2.Request(url, None, std_headers)
# NOTE(review): the enclosing "try:" (line 1027) is missing from this excerpt.
1028 self.report_download_webpage(video_id)
1029 webpage = urllib2.urlopen(request).read()
1030 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1031 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1034 # Now we begin extracting as much information as we can from what we
1035 # retrieved. First we extract the information common to all extractors,
1036 # and latter we extract those that are Vimeo specific.
1037 self.report_extraction(video_id)
1039 # Extract the config JSON
# Vimeo embeds a JSON config object inline in the page markup; this split()
# parsing is brittle — it breaks if the surrounding markup changes.
1040 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1042 config = json.loads(config)
1044 self._downloader.trouble(u'ERROR: unable to extract info section')
1048 video_title = config["video"]["title"]
1049 simple_title = simplify_title(video_title)
1052 video_uploader = config["video"]["owner"]["name"]
1054 # Extract video thumbnail
1055 video_thumbnail = config["video"]["thumbnail"]
1057 # Extract video description
1058 video_description = get_element_by_id("description", webpage)
1059 if video_description: video_description = clean_html(video_description.decode('utf8'))
1060 else: video_description = ''
1062 # Extract upload date
1063 video_upload_date = u'NA'
1064 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1065 if mobj is not None:
1066 video_upload_date = mobj.group(1)
1068 # Vimeo specific: extract request signature and timestamp
1069 sig = config['request']['signature']
1070 timestamp = config['request']['timestamp']
1072 # Vimeo specific: extract video codec and quality information
1073 # TODO bind to format param
# Codec preference order: first match in config["video"]["files"] wins.
1074 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1075 for codec in codecs:
1076 if codec[0] in config["video"]["files"]:
1077 video_codec = codec[0]
1078 video_extension = codec[1]
1079 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1080 else: quality = 'sd'
# NOTE(review): the loop "break" and the "else:" branch preceding this error
# appear to be among the missing lines.
1083 self._downloader.trouble(u'ERROR: no known codec found')
# Build the play_redirect URL from the signature/timestamp extracted above.
1086 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1087 %(video_id, sig, timestamp, quality, video_codec.upper())
# Info dict handed to the FileDownloader (opening/closing lines are missing
# from this excerpt).
1092 'uploader': video_uploader,
1093 'upload_date': video_upload_date,
1094 'title': video_title,
1095 'stitle': simple_title,
1096 'ext': video_extension,
1097 'thumbnail': video_thumbnail,
1098 'description': video_description,
# NOTE(review): this excerpt is whitespace-mangled — original file line numbers are
# embedded at the start of each line, Python indentation is lost, and the jumps in
# those numbers show lines are missing (guards, "try:"/"return" statements, class
# docstring quotes). Comments below annotate only what is visible.
1103 class GenericIE(InfoExtractor):
1104 """Generic last-resort information extractor."""
1107 IE_NAME = u'generic'
1109 def __init__(self, downloader=None):
1110 InfoExtractor.__init__(self, downloader)
1112 def report_download_webpage(self, video_id):
1113 """Report webpage download."""
# Warn loudly: reaching this extractor means no site-specific IE matched.
1114 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1115 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1117 def report_extraction(self, video_id):
1118 """Report information extraction."""
1119 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1121 def report_following_redirect(self, new_url):
# NOTE(review): docstring is copy-pasted from report_extraction; it actually
# reports that a redirect is being followed.
1122 """Report information extraction."""
1123 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1125 def _test_redirect(self, url):
1126 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Request subclass that issues HEAD instead of GET (cheap redirect probe).
1127 class HeadRequest(urllib2.Request):
1128 def get_method(self):
# NOTE(review): the 'return "HEAD"' body (line ~1129) is missing here.
1131 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1133 Subclass the HTTPRedirectHandler to make it use our
1134 HeadRequest also on the redirected URL
1136 def redirect_request(self, req, fp, code, msg, headers, newurl):
1137 if code in (301, 302, 303, 307):
1138 newurl = newurl.replace(' ', '%20')
# Drop body-related headers when re-issuing the request after a redirect.
1139 newheaders = dict((k,v) for k,v in req.headers.items()
1140 if k.lower() not in ("content-length", "content-type"))
1141 return HeadRequest(newurl,
1143 origin_req_host=req.get_origin_req_host(),
# Non-redirect status codes propagate as HTTPError.
1146 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1148 class HTTPMethodFallback(urllib2.BaseHandler):
1150 Fallback to GET if HEAD is not allowed (405 HTTP error)
1152 def http_error_405(self, req, fp, code, msg, headers):
# NOTE(review): fp.read()/fp.close() lines (~1153-1154) are missing here.
1156 newheaders = dict((k,v) for k,v in req.headers.items()
1157 if k.lower() not in ("content-length", "content-type"))
1158 return self.parent.open(urllib2.Request(req.get_full_url(),
1160 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with exactly the handlers needed for the HEAD probe.
1164 opener = urllib2.OpenerDirector()
1165 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1166 HTTPMethodFallback, HEADRedirectHandler,
1167 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1168 opener.add_handler(handler())
1170 response = opener.open(HeadRequest(url))
1171 new_url = response.geturl()
# No redirect happened; tell the caller to proceed with normal extraction.
1173 if url == new_url: return False
# Redirect detected: restart the download chain with the final URL.
1175 self.report_following_redirect(new_url)
1176 self._downloader.download([new_url])
1179 def _real_extract(self, url):
1180 if self._test_redirect(url): return
# Provisional id from the URL tail; replaced below once the media URL is known.
1182 video_id = url.split('/')[-1]
1183 request = urllib2.Request(url)
# NOTE(review): the enclosing "try:" (line ~1184) is missing from this excerpt.
1185 self.report_download_webpage(video_id)
1186 webpage = urllib2.urlopen(request).read()
1187 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1188 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1190 except ValueError, err:
1191 # since this is the last-resort InfoExtractor, if
1192 # this error is thrown, it'll be thrown here
1193 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1196 self.report_extraction(video_id)
1197 # Start with something easy: JW Player in SWFObject
1198 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1200 # Broaden the search a little bit
1201 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1203 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1206 # It's possible that one of the regexes
1207 # matched, but returned an empty group:
1208 if mobj.group(1) is None:
1209 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1212 video_url = urllib.unquote(mobj.group(1))
1213 video_id = os.path.basename(video_url)
1215 # here's a fun little line of code for you:
1216 video_extension = os.path.splitext(video_id)[1][1:]
1217 video_id = os.path.splitext(video_id)[0]
1219 # it's tempting to parse this further, but you would
1220 # have to take into account all the variations like
1221 # Video Title - Site Name
1222 # Site Name | Video Title
1223 # Video Title - Tagline | Site Name
1224 # and so on and so forth; it's just not practical
1225 mobj = re.search(r'<title>(.*)</title>', webpage)
1227 self._downloader.trouble(u'ERROR: unable to extract title')
1229 video_title = mobj.group(1).decode('utf-8')
1230 video_title = sanitize_title(video_title)
1231 simple_title = simplify_title(video_title)
1233 # video uploader is domain name
1234 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): error text says "title" but this branch failed to extract the
# uploader (domain name) — looks copy-pasted; confirm against the original.
1236 self._downloader.trouble(u'ERROR: unable to extract title')
1238 video_uploader = mobj.group(1).decode('utf-8')
# Info dict handed to the FileDownloader (opening/closing lines are missing).
1241 'id': video_id.decode('utf-8'),
1242 'url': video_url.decode('utf-8'),
1243 'uploader': video_uploader,
1244 'upload_date': u'NA',
1245 'title': video_title,
1246 'stitle': simple_title,
1247 'ext': video_extension.decode('utf-8'),
# NOTE(review): this excerpt is whitespace-mangled — original file line numbers are
# embedded at the start of each line, Python indentation is lost, and the jumps in
# those numbers show lines are missing (guards, "try:", "return"). Comments below
# annotate only what is visible.
1254 class YoutubeSearchIE(InfoExtractor):
1255 """Information Extractor for YouTube search queries."""
# "ytsearch:" pseudo-URL scheme; optional count prefix ("ytsearch5:", "ytsearchall:").
1256 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1257 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1258 _max_youtube_results = 1000
1259 IE_NAME = u'youtube:search'
1260 def __init__(self, downloader=None):
1261 InfoExtractor.__init__(self, downloader)
1263 def report_download_page(self, query, pagenum):
1264 """Report attempt to download playlist page with given number."""
1265 query = query.decode(preferredencoding())
1266 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1268 def _real_extract(self, query):
1269 mobj = re.match(self._VALID_URL, query)
# NOTE(review): the "if mobj is None:" guard before this trouble() is missing.
1271 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# NOTE(review): split(':') with no maxsplit raises ValueError if the query text
# itself contains a colon — confirm how the original handled that.
1274 prefix, query = query.split(':')
1276 query = query.encode('utf-8')
# Empty prefix ("ytsearch:") downloads a single result; the branch test for
# prefix == '' is among the missing lines.
1278 self._download_n_results(query, 1)
1280 elif prefix == 'all':
1281 self._download_n_results(query, self._max_youtube_results)
# Numeric prefix path: n = int(prefix) (conversion lines are missing here).
1287 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1289 elif n > self._max_youtube_results:
1290 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1291 n = self._max_youtube_results
1292 self._download_n_results(query, n)
1294 except ValueError: # parsing prefix as integer fails
1295 self._download_n_results(query, 1)
1298 def _download_n_results(self, query, n):
1299 """Downloads a specified number of results for a query"""
# Page through the GData API 50 results at a time until `limit` is reached.
1305 while (50 * pagenum) < limit:
1306 self.report_download_page(query, pagenum+1)
1307 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1308 request = urllib2.Request(result_url)
# NOTE(review): the enclosing "try:" (line ~1309) is missing from this excerpt.
1310 data = urllib2.urlopen(request).read()
1311 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1312 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1314 api_response = json.loads(data)['data']
1316 new_ids = list(video['id'] for video in api_response['items'])
1317 video_ids += new_ids
# Never ask for more than the API reports as available.
1319 limit = min(n, api_response['totalItems'])
1322 if len(video_ids) > n:
1323 video_ids = video_ids[:n]
# Hand each result back to the downloader as a regular watch URL.
1324 for id in video_ids:
1325 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): this excerpt is whitespace-mangled — original file line numbers are
# embedded at the start of each line, Python indentation is lost, and the jumps in
# those numbers show lines are missing (guards, "try:", "return"). Comments below
# annotate only what is visible. Structure parallels YoutubeSearchIE.
1330 class GoogleSearchIE(InfoExtractor):
1331 """Information Extractor for Google Video search queries."""
# "gvsearch:" pseudo-URL scheme; optional count prefix or "all".
1332 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1333 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1334 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of a "next page" link in the result HTML.
1335 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1336 _max_google_results = 1000
1337 IE_NAME = u'video.google:search'
1338 def __init__(self, downloader=None):
1339 InfoExtractor.__init__(self, downloader)
1341 def report_download_page(self, query, pagenum):
1342 """Report attempt to download playlist page with given number."""
1343 query = query.decode(preferredencoding())
1344 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1346 def _real_extract(self, query):
1347 mobj = re.match(self._VALID_URL, query)
# NOTE(review): the "if mobj is None:" guard before this trouble() is missing.
1349 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1352 prefix, query = query.split(':')
1354 query = query.encode('utf-8')
# Empty prefix downloads one result (branch test is among the missing lines).
1356 self._download_n_results(query, 1)
1358 elif prefix == 'all':
1359 self._download_n_results(query, self._max_google_results)
# Numeric prefix path: n = int(prefix) (conversion lines are missing here).
1365 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1367 elif n > self._max_google_results:
1368 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1369 n = self._max_google_results
1370 self._download_n_results(query, n)
1372 except ValueError: # parsing prefix as integer fails
1373 self._download_n_results(query, 1)
1376 def _download_n_results(self, query, n):
1377 """Downloads a specified number of results for a query"""
# Scrape result pages (10 results per page) until n ids are collected or no
# "next page" marker remains.
1383 self.report_download_page(query, pagenum)
1384 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1385 request = urllib2.Request(result_url)
# NOTE(review): the enclosing "try:" (line ~1386) is missing from this excerpt.
1387 page = urllib2.urlopen(request).read()
1388 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1389 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1392 # Extract video identifiers
1393 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1394 video_id = mobj.group(1)
1395 if video_id not in video_ids:
1396 video_ids.append(video_id)
1397 if len(video_ids) == n:
1398 # Specified n videos reached
1399 for id in video_ids:
1400 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No more pages: download whatever was collected.
1403 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1404 for id in video_ids:
1405 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1408 pagenum = pagenum + 1
# NOTE(review): this excerpt is whitespace-mangled — original file line numbers are
# embedded at the start of each line, Python indentation is lost, and the jumps in
# those numbers show lines are missing (guards, "try:", "return"). Comments below
# annotate only what is visible. Structure parallels Youtube/GoogleSearchIE.
1412 class YahooSearchIE(InfoExtractor):
1413 """Information Extractor for Yahoo! Video search queries."""
# "yvsearch:" pseudo-URL scheme; optional count prefix or "all".
1414 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1415 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1416 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1417 _MORE_PAGES_INDICATOR = r'\s*Next'
1418 _max_yahoo_results = 1000
1419 IE_NAME = u'video.yahoo:search'
1420 def __init__(self, downloader=None):
1421 InfoExtractor.__init__(self, downloader)
1423 def report_download_page(self, query, pagenum):
1424 """Report attempt to download playlist page with given number."""
1425 query = query.decode(preferredencoding())
1426 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1428 def _real_extract(self, query):
1429 mobj = re.match(self._VALID_URL, query)
# NOTE(review): the "if mobj is None:" guard before this trouble() is missing.
1431 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1434 prefix, query = query.split(':')
1436 query = query.encode('utf-8')
# Empty prefix downloads one result (branch test is among the missing lines).
1438 self._download_n_results(query, 1)
1440 elif prefix == 'all':
1441 self._download_n_results(query, self._max_yahoo_results)
# Numeric prefix path: n = int(prefix) (conversion lines are missing here).
1447 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1449 elif n > self._max_yahoo_results:
1450 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1451 n = self._max_yahoo_results
1452 self._download_n_results(query, n)
1454 except ValueError: # parsing prefix as integer fails
1455 self._download_n_results(query, 1)
1458 def _download_n_results(self, query, n):
1459 """Downloads a specified number of results for a query"""
# Unlike GoogleSearchIE, dedup is done via a set rather than scanning the list.
1462 already_seen = set()
1466 self.report_download_page(query, pagenum)
1467 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1468 request = urllib2.Request(result_url)
# NOTE(review): the enclosing "try:" (line ~1469) is missing from this excerpt.
1470 page = urllib2.urlopen(request).read()
1471 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1472 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1475 # Extract video identifiers
1476 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1477 video_id = mobj.group(1)
1478 if video_id not in already_seen:
1479 video_ids.append(video_id)
1480 already_seen.add(video_id)
1481 if len(video_ids) == n:
1482 # Specified n videos reached
1483 for id in video_ids:
1484 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No more pages: download whatever was collected.
1487 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1488 for id in video_ids:
1489 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1492 pagenum = pagenum + 1
# NOTE(review): this excerpt is whitespace-mangled — original file line numbers are
# embedded at the start of each line, Python indentation is lost, and the jumps in
# those numbers show lines are missing (guards, "try:", "return", loop headers).
# Comments below annotate only what is visible.
1495 class YoutubePlaylistIE(InfoExtractor):
1496 """Information Extractor for YouTube playlists."""
# Group 1: playlist type marker (p/a/list); group 2: playlist id; group 3:
# optional trailing single-video id.
1498 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1499 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1500 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
1501 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1502 IE_NAME = u'youtube:playlist'
1504 def __init__(self, downloader=None):
1505 InfoExtractor.__init__(self, downloader)
1507 def report_download_page(self, playlist_id, pagenum):
1508 """Report attempt to download playlist page with given number."""
1509 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1511 def _real_extract(self, url):
1512 # Extract playlist id
1513 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the "if mobj is None:" guard before this trouble() is missing.
1515 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Single-video case: the URL pointed at one video inside a playlist.
1519 if mobj.group(3) is not None:
1520 self._downloader.download([mobj.group(3)])
1523 # Download playlist pages
1524 # prefix is 'p' as default for playlists but there are other types that need extra care
1525 playlist_prefix = mobj.group(1)
1526 if playlist_prefix == 'a':
1527 playlist_access = 'artist'
# NOTE(review): the "else:" introducing this default branch is missing (~1528).
1529 playlist_prefix = 'p'
1530 playlist_access = 'view_play_list'
1531 playlist_id = mobj.group(2)
# Page loop (loop header and accumulator initialization are missing ~1532-1535).
1536 self.report_download_page(playlist_id, pagenum)
1537 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1538 request = urllib2.Request(url)
# NOTE(review): the enclosing "try:" (line ~1539) is missing from this excerpt.
1540 page = urllib2.urlopen(request).read()
1541 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1542 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1545 # Extract video identifiers
1547 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1548 if mobj.group(1) not in ids_in_page:
1549 ids_in_page.append(mobj.group(1))
1550 video_ids.extend(ids_in_page)
# No "Next" link: stop paging.
1552 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1554 pagenum = pagenum + 1
# Apply the user's --playlist-start/--playlist-end window (1-based option,
# converted to a 0-based slice here).
1556 playliststart = self._downloader.params.get('playliststart', 1) - 1
1557 playlistend = self._downloader.params.get('playlistend', -1)
1558 if playlistend == -1:
1559 video_ids = video_ids[playliststart:]
1561 video_ids = video_ids[playliststart:playlistend]
1563 for id in video_ids:
1564 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): this excerpt is whitespace-mangled — original file line numbers are
# embedded at the start of each line, Python indentation is lost, and the jumps in
# those numbers show lines are missing (guards, "try:", loop headers, "break").
# Comments below annotate only what is visible.
1568 class YoutubeUserIE(InfoExtractor):
1569 """Information Extractor for YouTube users."""
# Matches a user page URL or the "ytuser:NAME" shorthand.
1571 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1572 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps result size per query, hence the paging below.
1573 _GDATA_PAGE_SIZE = 50
1574 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1575 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1576 IE_NAME = u'youtube:user'
1578 def __init__(self, downloader=None):
1579 InfoExtractor.__init__(self, downloader)
1581 def report_download_page(self, username, start_index):
1582 """Report attempt to download user page."""
1583 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1584 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1586 def _real_extract(self, url):
1588 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the "if mobj is None:" guard before this trouble() is missing.
1590 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1593 username = mobj.group(1)
1595 # Download video ids using YouTube Data API. Result size per
1596 # query is limited (currently to 50 videos) so we need to query
1597 # page by page until there are no video ids - it means we got
# (comment continues on lines missing from this excerpt)
# GData start-index is 1-based.
1604 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1605 self.report_download_page(username, start_index)
1607 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
# NOTE(review): the enclosing "try:" (~1609) is missing from this excerpt.
1610 page = urllib2.urlopen(request).read()
1611 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1612 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1615 # Extract video identifiers
1618 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1619 if mobj.group(1) not in ids_in_page:
1620 ids_in_page.append(mobj.group(1))
1622 video_ids.extend(ids_in_page)
1624 # A little optimization - if current page is not
1625 # "full", ie. does not contain PAGE_SIZE video ids then
1626 # we can assume that this page is the last one - there
1627 # are no more ids on further pages - no need to query
# (comment continues on lines missing from this excerpt)
1630 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# NOTE(review): the "break" that should follow (~1631) is missing here.
1635 all_ids_count = len(video_ids)
# Apply the user's --playlist-start/--playlist-end window (1-based option,
# converted to a 0-based slice here).
1636 playliststart = self._downloader.params.get('playliststart', 1) - 1
1637 playlistend = self._downloader.params.get('playlistend', -1)
1639 if playlistend == -1:
1640 video_ids = video_ids[playliststart:]
1642 video_ids = video_ids[playliststart:playlistend]
1644 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1645 (username, all_ids_count, len(video_ids)))
1647 for video_id in video_ids:
1648 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
# NOTE(review): this excerpt is whitespace-mangled — original file line numbers are
# embedded at the start of each line, Python indentation is lost, and the jumps in
# those numbers show lines are missing (guards, "try:", "return", dict delimiters).
# Comments below annotate only what is visible.
1651 class DepositFilesIE(InfoExtractor):
1652 """Information extractor for depositfiles.com"""
# (?#locale) is a regex comment — the "../" part matches a two-char locale path.
1654 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1655 IE_NAME = u'DepositFiles'
1657 def __init__(self, downloader=None):
1658 InfoExtractor.__init__(self, downloader)
1660 def report_download_webpage(self, file_id):
1661 """Report webpage download."""
1662 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1664 def report_extraction(self, file_id):
1665 """Report information extraction."""
1666 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1668 def _real_extract(self, url):
1669 file_id = url.split('/')[-1]
1670 # Rebuild url in english locale
1671 url = 'http://depositfiles.com/en/files/' + file_id
1673 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the "Free download" button.
1674 free_download_indication = { 'gateway_result' : '1' }
1675 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
# NOTE(review): the enclosing "try:" (line ~1676) is missing from this excerpt.
1677 self.report_download_webpage(file_id)
1678 webpage = urllib2.urlopen(request).read()
1679 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1680 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1683 # Search for the real file URL
1684 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1685 if (mobj is None) or (mobj.group(1) is None):
1686 # Try to figure out reason of the error.
1687 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1688 if (mobj is not None) and (mobj.group(1) is not None):
# NOTE(review): '\s+' is a non-raw string — works ('\s' has no escape meaning)
# but r'\s+' would be the conventional spelling.
1689 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1690 self._downloader.trouble(u'ERROR: %s' % restriction_message)
# Fallback error when no restriction message could be parsed (the "else:"
# introducing this branch is among the missing lines).
1692 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1695 file_url = mobj.group(1)
1696 file_extension = os.path.splitext(file_url)[1][1:]
1698 # Search for file title
1699 mobj = re.search(r'<b title="(.*?)">', webpage)
# NOTE(review): the "if mobj is None:" guard before this trouble() is missing.
1701 self._downloader.trouble(u'ERROR: unable to extract title')
1703 file_title = mobj.group(1).decode('utf-8')
# Info dict handed to the FileDownloader (opening/closing lines are missing).
1706 'id': file_id.decode('utf-8'),
1707 'url': file_url.decode('utf-8'),
1709 'upload_date': u'NA',
1710 'title': file_title,
1711 'stitle': file_title,
1712 'ext': file_extension.decode('utf-8'),
# NOTE(review): this excerpt is whitespace-mangled — original file line numbers are
# embedded at the start of each line, Python indentation is lost, and the jumps in
# those numbers show lines are missing (guards, "try:", "return", assignments such
# as the netrc credential unpacking and the login form dict). Comments below
# annotate only what is visible.
1718 class FacebookIE(InfoExtractor):
1719 """Information Extractor for Facebook"""
1721 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1722 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1723 _NETRC_MACHINE = 'facebook'
# Ordered best-to-worst; format selection below slices this list.
1724 _available_formats = ['video', 'highqual', 'lowqual']
1725 _video_extensions = {
# NOTE(review): the dict entries (~1726-1729) are missing from this excerpt.
1730 IE_NAME = u'facebook'
1732 def __init__(self, downloader=None):
1733 InfoExtractor.__init__(self, downloader)
1735 def _reporter(self, message):
1736 """Add header and report message."""
1737 self._downloader.to_screen(u'[facebook] %s' % message)
1739 def report_login(self):
1740 """Report attempt to log in."""
1741 self._reporter(u'Logging in')
1743 def report_video_webpage_download(self, video_id):
1744 """Report attempt to download video webpage."""
1745 self._reporter(u'%s: Downloading video webpage' % video_id)
1747 def report_information_extraction(self, video_id):
1748 """Report attempt to extract video information."""
1749 self._reporter(u'%s: Extracting video information' % video_id)
1751 def _parse_page(self, video_webpage):
1752 """Extract video information from page"""
# Map of info-dict key -> regex that captures it from inline JavaScript.
1754 data = {'title': r'\("video_title", "(.*?)"\)',
1755 'description': r'<div class="datawrap">(.*?)</div>',
1756 'owner': r'\("video_owner_name", "(.*?)"\)',
1757 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
# Fields are optional: only matched patterns end up in video_info.
1760 for piece in data.keys():
1761 mobj = re.search(data[piece], video_webpage)
1762 if mobj is not None:
1763 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one URL per available format.
1767 for fmt in self._available_formats:
1768 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1769 if mobj is not None:
1770 # URL is in a Javascript segment inside an escaped Unicode format within
1771 # the generally utf-8 page
1772 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1773 video_info['video_urls'] = video_urls
1777 def _real_initialize(self):
1778 if self._downloader is None:
# NOTE(review): the "return" body of this guard (~1779) is missing here.
1783 downloader_params = self._downloader.params
1785 # Attempt to use provided username and password or .netrc data
1786 if downloader_params.get('username', None) is not None:
1787 useremail = downloader_params['username']
1788 password = downloader_params['password']
1789 elif downloader_params.get('usenetrc', False):
# NOTE(review): the "try:" (~1790) and the credential unpacking from `info`
# (~1793-1795) are missing from this excerpt.
1791 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1792 if info is not None:
1796 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1797 except (IOError, netrc.NetrcParseError), err:
1798 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Anonymous mode: skip login entirely when no credentials were found.
1801 if useremail is None:
# Log in (the login_form construction ~1802-1809 is missing here).
1810 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1813 login_results = urllib2.urlopen(request).read()
# The login form re-appearing in the response means authentication failed.
1814 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1815 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1817 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1818 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1821 def _real_extract(self, url):
1822 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the "if mobj is None:" guard before this trouble() is missing.
1824 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1826 video_id = mobj.group('ID')
1829 self.report_video_webpage_download(video_id)
1830 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
# NOTE(review): the enclosing "try:" (~1831) is missing from this excerpt.
1832 page = urllib2.urlopen(request)
1833 video_webpage = page.read()
1834 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1835 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1838 # Start extracting information
1839 self.report_information_extraction(video_id)
1841 # Extract information
1842 video_info = self._parse_page(video_webpage)
# Uploader and title are mandatory; thumbnail and date degrade gracefully.
1845 if 'owner' not in video_info:
1846 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1848 video_uploader = video_info['owner']
1851 if 'title' not in video_info:
1852 self._downloader.trouble(u'ERROR: unable to extract video title')
1854 video_title = video_info['title']
1855 video_title = video_title.decode('utf-8')
1856 video_title = sanitize_title(video_title)
1858 simple_title = simplify_title(video_title)
1861 if 'thumbnail' not in video_info:
1862 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1863 video_thumbnail = ''
1865 video_thumbnail = video_info['thumbnail']
# Parse the RFC-2822-style upload date into YYYYMMDD.
1869 if 'upload_date' in video_info:
1870 upload_time = video_info['upload_date']
1871 timetuple = email.utils.parsedate_tz(upload_time)
1872 if timetuple is not None:
1874 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1879 video_description = video_info.get('description', 'No description available.')
1881 url_map = video_info['video_urls']
1882 if len(url_map.keys()) > 0:
1883 # Decide which formats to download
1884 req_format = self._downloader.params.get('format', None)
1885 format_limit = self._downloader.params.get('format_limit', None)
# format_limit caps quality: only formats at or below the limit are eligible.
1887 if format_limit is not None and format_limit in self._available_formats:
1888 format_list = self._available_formats[self._available_formats.index(format_limit):]
1890 format_list = self._available_formats
1891 existing_formats = [x for x in format_list if x in url_map]
1892 if len(existing_formats) == 0:
1893 self._downloader.trouble(u'ERROR: no known formats available for video')
1895 if req_format is None:
1896 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1897 elif req_format == 'worst':
1898 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1899 elif req_format == '-1':
1900 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
# Specific format requested (the "else:" introducing this branch is missing).
1903 if req_format not in url_map:
1904 self._downloader.trouble(u'ERROR: requested format not available')
1906 video_url_list = [(req_format, url_map[req_format])] # Specific format
1909 for format_param, video_real_url in video_url_list:
# Extension, with a sane default in case the format is unknown.
1911 video_extension = self._video_extensions.get(format_param, 'mp4')
# Info dict handed to the FileDownloader (opening/closing lines are missing).
1914 'id': video_id.decode('utf-8'),
1915 'url': video_real_url.decode('utf-8'),
1916 'uploader': video_uploader.decode('utf-8'),
1917 'upload_date': upload_date,
1918 'title': video_title,
1919 'stitle': simple_title,
1920 'ext': video_extension.decode('utf-8'),
1921 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1922 'thumbnail': video_thumbnail.decode('utf-8'),
1923 'description': video_description.decode('utf-8'),
# NOTE(review): this excerpt is whitespace-mangled — original file line numbers are
# embedded at the start of each line, Python indentation is lost, and the jumps in
# those numbers show lines are missing (guards, "try:", "return", parts of the
# info dicts). Comments below annotate only what is visible.
1928 class BlipTVIE(InfoExtractor):
1929 """Information extractor for blip.tv"""
1931 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension off the direct media URL.
1932 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1933 IE_NAME = u'blip.tv'
1935 def report_extraction(self, file_id):
1936 """Report information extraction."""
1937 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
1939 def report_direct_download(self, title):
1940 """Report information extraction."""
1941 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
1943 def _real_extract(self, url):
1944 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the "if mobj is None:" guard before this trouble() is missing.
1946 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv for a JSON description of the page; cchar ('?' or '&') is chosen
# on lines missing from this excerpt.
1953 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1954 request = urllib2.Request(json_url)
1955 self.report_extraction(mobj.group(1))
# NOTE(review): "info = None" and the enclosing "try:" (~1956-1957) are missing.
1958 urlh = urllib2.urlopen(request)
1959 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
# The URL already points at the media file; synthesize the info dict from it.
1960 basename = url.split('/')[-1]
1961 title,ext = os.path.splitext(basename)
1962 title = title.decode('UTF-8')
1963 ext = ext.replace('.', '')
1964 self.report_direct_download(title)
# Direct-download info dict (opening/closing and most entries are missing).
1969 'stitle': simplify_title(title),
1973 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1974 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1976 if info is None: # Regular URL
# NOTE(review): the "try:" before this read (~1977) is missing here.
1978 json_code = urlh.read()
1979 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1980 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
1984 json_data = json.loads(json_code)
# The JSON payload nests the record under 'Post' for some responses.
1985 if 'Post' in json_data:
1986 data = json_data['Post']
# NOTE(review): '%H:%M%p' mixes 24-hour %H with an AM/PM marker; %I is the
# usual pairing with %p — confirm against blip.tv's actual datestamp format.
1990 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
1991 video_url = data['media']['url']
1992 umobj = re.match(self._URL_EXT, video_url)
# NOTE(review): the "if umobj is None:" guard before this raise is missing.
1994 raise ValueError('Can not determine filename extension')
1995 ext = umobj.group(1)
# Info dict built from the JSON record (opening/closing lines are missing).
1998 'id': data['item_id'],
2000 'uploader': data['display_name'],
2001 'upload_date': upload_date,
2002 'title': data['title'],
2003 'stitle': simplify_title(data['title']),
2005 'format': data['media']['mimeType'],
2006 'thumbnail': data['thumbnailUrl'],
2007 'description': data['description'],
2008 'player_url': data['embedUrl']
2010 except (ValueError,KeyError), err:
2011 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2017 class MyVideoIE(InfoExtractor):
2018 """Information Extractor for myvideo.de."""
2020 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2021 IE_NAME = u'myvideo'
2023 def __init__(self, downloader=None):
2024 InfoExtractor.__init__(self, downloader)
2026 def report_download_webpage(self, video_id):
2027 """Report webpage download."""
2028 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2030 def report_extraction(self, video_id):
2031 """Report information extraction."""
2032 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2034 def _real_extract(self,url):
2035 mobj = re.match(self._VALID_URL, url)
2037 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2040 video_id = mobj.group(1)
2043 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2045 self.report_download_webpage(video_id)
2046 webpage = urllib2.urlopen(request).read()
2047 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2048 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2051 self.report_extraction(video_id)
2052 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2055 self._downloader.trouble(u'ERROR: unable to extract media URL')
2057 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2059 mobj = re.search('<title>([^<]+)</title>', webpage)
2061 self._downloader.trouble(u'ERROR: unable to extract title')
2064 video_title = mobj.group(1)
2065 video_title = sanitize_title(video_title)
2067 simple_title = simplify_title(video_title)
2073 'upload_date': u'NA',
2074 'title': video_title,
2075 'stitle': simple_title,
2081 class ComedyCentralIE(InfoExtractor):
2082 """Information extractor for The Daily Show and Colbert Report """
2084 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2085 IE_NAME = u'comedycentral'
2087 def report_extraction(self, episode_id):
2088 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2090 def report_config_download(self, episode_id):
2091 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2093 def report_index_download(self, episode_id):
2094 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2096 def report_player_url(self, episode_id):
2097 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2099 def _real_extract(self, url):
2100 mobj = re.match(self._VALID_URL, url)
2102 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2105 if mobj.group('shortname'):
2106 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2107 url = u'http://www.thedailyshow.com/full-episodes/'
2109 url = u'http://www.colbertnation.com/full-episodes/'
2110 mobj = re.match(self._VALID_URL, url)
2111 assert mobj is not None
2113 dlNewest = not mobj.group('episode')
2115 epTitle = mobj.group('showname')
2117 epTitle = mobj.group('episode')
2119 req = urllib2.Request(url)
2120 self.report_extraction(epTitle)
2122 htmlHandle = urllib2.urlopen(req)
2123 html = htmlHandle.read()
2124 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2125 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2128 url = htmlHandle.geturl()
2129 mobj = re.match(self._VALID_URL, url)
2131 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2133 if mobj.group('episode') == '':
2134 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2136 epTitle = mobj.group('episode')
2138 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2139 if len(mMovieParams) == 0:
2140 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2143 playerUrl_raw = mMovieParams[0][0]
2144 self.report_player_url(epTitle)
2146 urlHandle = urllib2.urlopen(playerUrl_raw)
2147 playerUrl = urlHandle.geturl()
2148 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2149 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2152 uri = mMovieParams[0][1]
2153 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2154 self.report_index_download(epTitle)
2156 indexXml = urllib2.urlopen(indexUrl).read()
2157 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2158 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2163 idoc = xml.etree.ElementTree.fromstring(indexXml)
2164 itemEls = idoc.findall('.//item')
2165 for itemEl in itemEls:
2166 mediaId = itemEl.findall('./guid')[0].text
2167 shortMediaId = mediaId.split(':')[-1]
2168 showId = mediaId.split(':')[-2].replace('.com', '')
2169 officialTitle = itemEl.findall('./title')[0].text
2170 officialDate = itemEl.findall('./pubDate')[0].text
2172 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2173 urllib.urlencode({'uri': mediaId}))
2174 configReq = urllib2.Request(configUrl)
2175 self.report_config_download(epTitle)
2177 configXml = urllib2.urlopen(configReq).read()
2178 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2179 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2182 cdoc = xml.etree.ElementTree.fromstring(configXml)
2184 for rendition in cdoc.findall('.//rendition'):
2185 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2189 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2192 # For now, just pick the highest bitrate
2193 format,video_url = turls[-1]
2195 effTitle = showId + u'-' + epTitle
2200 'upload_date': officialDate,
2202 'stitle': simplify_title(effTitle),
2206 'description': officialTitle,
2207 'player_url': playerUrl
2210 results.append(info)
2215 class EscapistIE(InfoExtractor):
2216 """Information extractor for The Escapist """
2218 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2219 IE_NAME = u'escapist'
2221 def report_extraction(self, showName):
2222 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2224 def report_config_download(self, showName):
2225 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2227 def _real_extract(self, url):
2228 mobj = re.match(self._VALID_URL, url)
2230 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2232 showName = mobj.group('showname')
2233 videoId = mobj.group('episode')
2235 self.report_extraction(showName)
2237 webPage = urllib2.urlopen(url).read()
2238 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2239 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2242 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2243 description = unescapeHTML(descMatch.group(1))
2244 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2245 imgUrl = unescapeHTML(imgMatch.group(1))
2246 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2247 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2248 configUrlMatch = re.search('config=(.*)$', playerUrl)
2249 configUrl = urllib2.unquote(configUrlMatch.group(1))
2251 self.report_config_download(showName)
2253 configJSON = urllib2.urlopen(configUrl).read()
2254 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2255 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2258 # Technically, it's JavaScript, not JSON
2259 configJSON = configJSON.replace("'", '"')
2262 config = json.loads(configJSON)
2263 except (ValueError,), err:
2264 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2267 playlist = config['playlist']
2268 videoUrl = playlist[1]['url']
2273 'uploader': showName,
2274 'upload_date': None,
2276 'stitle': simplify_title(showName),
2279 'thumbnail': imgUrl,
2280 'description': description,
2281 'player_url': playerUrl,
2287 class CollegeHumorIE(InfoExtractor):
2288 """Information extractor for collegehumor.com"""
2290 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2291 IE_NAME = u'collegehumor'
2293 def report_webpage(self, video_id):
2294 """Report information extraction."""
2295 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2297 def report_extraction(self, video_id):
2298 """Report information extraction."""
2299 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2301 def _real_extract(self, url):
2302 mobj = re.match(self._VALID_URL, url)
2304 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2306 video_id = mobj.group('videoid')
2308 self.report_webpage(video_id)
2309 request = urllib2.Request(url)
2311 webpage = urllib2.urlopen(request).read()
2312 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2313 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2316 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2318 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2320 internal_video_id = m.group('internalvideoid')
2324 'internal_id': internal_video_id,
2327 self.report_extraction(video_id)
2328 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2330 metaXml = urllib2.urlopen(xmlUrl).read()
2331 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2332 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2335 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2337 videoNode = mdoc.findall('./video')[0]
2338 info['description'] = videoNode.findall('./description')[0].text
2339 info['title'] = videoNode.findall('./caption')[0].text
2340 info['stitle'] = simplify_title(info['title'])
2341 info['url'] = videoNode.findall('./file')[0].text
2342 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2343 info['ext'] = info['url'].rpartition('.')[2]
2344 info['format'] = info['ext']
2346 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2352 class XVideosIE(InfoExtractor):
2353 """Information extractor for xvideos.com"""
2355 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2356 IE_NAME = u'xvideos'
2358 def report_webpage(self, video_id):
2359 """Report information extraction."""
2360 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2362 def report_extraction(self, video_id):
2363 """Report information extraction."""
2364 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2366 def _real_extract(self, url):
2367 mobj = re.match(self._VALID_URL, url)
2369 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2371 video_id = mobj.group(1).decode('utf-8')
2373 self.report_webpage(video_id)
2375 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2377 webpage = urllib2.urlopen(request).read()
2378 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2379 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2382 self.report_extraction(video_id)
2386 mobj = re.search(r'flv_url=(.+?)&', webpage)
2388 self._downloader.trouble(u'ERROR: unable to extract video url')
2390 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2394 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2396 self._downloader.trouble(u'ERROR: unable to extract video title')
2398 video_title = mobj.group(1).decode('utf-8')
2401 # Extract video thumbnail
2402 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
2404 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2406 video_thumbnail = mobj.group(1).decode('utf-8')
2412 'upload_date': None,
2413 'title': video_title,
2414 'stitle': simplify_title(video_title),
2417 'thumbnail': video_thumbnail,
2418 'description': None,
2425 class SoundcloudIE(InfoExtractor):
2426 """Information extractor for soundcloud.com
2427 To access the media, the uid of the song and a stream token
2428 must be extracted from the page source and the script must make
2429 a request to media.soundcloud.com/crossdomain.xml. Then
2430 the media can be grabbed by requesting from an url composed
2431 of the stream token and uid
2434 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2435 IE_NAME = u'soundcloud'
2437 def __init__(self, downloader=None):
2438 InfoExtractor.__init__(self, downloader)
2440 def report_webpage(self, video_id):
2441 """Report information extraction."""
2442 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2444 def report_extraction(self, video_id):
2445 """Report information extraction."""
2446 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2448 def _real_extract(self, url):
2449 mobj = re.match(self._VALID_URL, url)
2451 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2454 # extract uploader (which is in the url)
2455 uploader = mobj.group(1).decode('utf-8')
2456 # extract simple title (uploader + slug of song title)
2457 slug_title = mobj.group(2).decode('utf-8')
2458 simple_title = uploader + '-' + slug_title
2460 self.report_webpage('%s/%s' % (uploader, slug_title))
2462 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2464 webpage = urllib2.urlopen(request).read()
2465 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2466 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2469 self.report_extraction('%s/%s' % (uploader, slug_title))
2471 # extract uid and stream token that soundcloud hands out for access
2472 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2474 video_id = mobj.group(1)
2475 stream_token = mobj.group(2)
2477 # extract unsimplified title
2478 mobj = re.search('"title":"(.*?)",', webpage)
2480 title = mobj.group(1)
2482 # construct media url (with uid/token)
2483 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2484 mediaURL = mediaURL % (video_id, stream_token)
2487 description = u'No description available'
2488 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2490 description = mobj.group(1)
2494 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2497 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2498 except Exception, e:
2501 # for soundcloud, a request to a cross domain is required for cookies
2502 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2505 'id': video_id.decode('utf-8'),
2507 'uploader': uploader.decode('utf-8'),
2508 'upload_date': upload_date,
2509 'title': simple_title.decode('utf-8'),
2510 'stitle': simple_title.decode('utf-8'),
2514 'description': description.decode('utf-8')
2518 class InfoQIE(InfoExtractor):
2519 """Information extractor for infoq.com"""
2521 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2524 def report_webpage(self, video_id):
2525 """Report information extraction."""
2526 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2528 def report_extraction(self, video_id):
2529 """Report information extraction."""
2530 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2532 def _real_extract(self, url):
2533 mobj = re.match(self._VALID_URL, url)
2535 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2538 self.report_webpage(url)
2540 request = urllib2.Request(url)
2542 webpage = urllib2.urlopen(request).read()
2543 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2544 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2547 self.report_extraction(url)
2551 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2553 self._downloader.trouble(u'ERROR: unable to extract video url')
2555 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2559 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2561 self._downloader.trouble(u'ERROR: unable to extract video title')
2563 video_title = mobj.group(1).decode('utf-8')
2565 # Extract description
2566 video_description = u'No description available.'
2567 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2568 if mobj is not None:
2569 video_description = mobj.group(1).decode('utf-8')
2571 video_filename = video_url.split('/')[-1]
2572 video_id, extension = video_filename.split('.')
2578 'upload_date': None,
2579 'title': video_title,
2580 'stitle': simplify_title(video_title),
2582 'format': extension, # Extension is always(?) mp4, but seems to be flv
2584 'description': video_description,
2590 class MixcloudIE(InfoExtractor):
2591 """Information extractor for www.mixcloud.com"""
2592 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2593 IE_NAME = u'mixcloud'
2595 def __init__(self, downloader=None):
2596 InfoExtractor.__init__(self, downloader)
2598 def report_download_json(self, file_id):
2599 """Report JSON download."""
2600 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2602 def report_extraction(self, file_id):
2603 """Report information extraction."""
2604 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2606 def get_urls(self, jsonData, fmt, bitrate='best'):
2607 """Get urls from 'audio_formats' section in json"""
2610 bitrate_list = jsonData[fmt]
2611 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2612 bitrate = max(bitrate_list) # select highest
2614 url_list = jsonData[fmt][bitrate]
2615 except TypeError: # we have no bitrate info.
2616 url_list = jsonData[fmt]
2620 def check_urls(self, url_list):
2621 """Returns 1st active url from list"""
2622 for url in url_list:
2624 urllib2.urlopen(url)
2626 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2631 def _print_formats(self, formats):
2632 print 'Available formats:'
2633 for fmt in formats.keys():
2634 for b in formats[fmt]:
2636 ext = formats[fmt][b][0]
2637 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2638 except TypeError: # we have no bitrate info
2639 ext = formats[fmt][0]
2640 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2643 def _real_extract(self, url):
2644 mobj = re.match(self._VALID_URL, url)
2646 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2648 # extract uploader & filename from url
2649 uploader = mobj.group(1).decode('utf-8')
2650 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2652 # construct API request
2653 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2654 # retrieve .json file with links to files
2655 request = urllib2.Request(file_url)
2657 self.report_download_json(file_url)
2658 jsonData = urllib2.urlopen(request).read()
2659 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2660 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2664 json_data = json.loads(jsonData)
2665 player_url = json_data['player_swf_url']
2666 formats = dict(json_data['audio_formats'])
2668 req_format = self._downloader.params.get('format', None)
2671 if self._downloader.params.get('listformats', None):
2672 self._print_formats(formats)
2675 if req_format is None or req_format == 'best':
2676 for format_param in formats.keys():
2677 url_list = self.get_urls(formats, format_param)
2679 file_url = self.check_urls(url_list)
2680 if file_url is not None:
2683 if req_format not in formats.keys():
2684 self._downloader.trouble(u'ERROR: format is not available')
2687 url_list = self.get_urls(formats, req_format)
2688 file_url = self.check_urls(url_list)
2689 format_param = req_format
2692 'id': file_id.decode('utf-8'),
2693 'url': file_url.decode('utf-8'),
2694 'uploader': uploader.decode('utf-8'),
2695 'upload_date': u'NA',
2696 'title': json_data['name'],
2697 'stitle': simplify_title(json_data['name']),
2698 'ext': file_url.split('.')[-1].decode('utf-8'),
2699 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2700 'thumbnail': json_data['thumbnail_url'],
2701 'description': json_data['description'],
2702 'player_url': player_url.decode('utf-8'),
2705 class StanfordOpenClassroomIE(InfoExtractor):
2706 """Information extractor for Stanford's Open ClassRoom"""
2708 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2709 IE_NAME = u'stanfordoc'
2711 def report_download_webpage(self, objid):
2712 """Report information extraction."""
2713 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2715 def report_extraction(self, video_id):
2716 """Report information extraction."""
2717 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2719 def _real_extract(self, url):
2720 mobj = re.match(self._VALID_URL, url)
2722 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2725 if mobj.group('course') and mobj.group('video'): # A specific video
2726 course = mobj.group('course')
2727 video = mobj.group('video')
2729 'id': simplify_title(course + '_' + video),
2732 self.report_extraction(info['id'])
2733 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2734 xmlUrl = baseUrl + video + '.xml'
2736 metaXml = urllib2.urlopen(xmlUrl).read()
2737 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2738 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2740 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2742 info['title'] = mdoc.findall('./title')[0].text
2743 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2745 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2747 info['stitle'] = simplify_title(info['title'])
2748 info['ext'] = info['url'].rpartition('.')[2]
2749 info['format'] = info['ext']
2751 elif mobj.group('course'): # A course page
2752 course = mobj.group('course')
2754 'id': simplify_title(course),
2758 self.report_download_webpage(info['id'])
2760 coursepage = urllib2.urlopen(url).read()
2761 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2762 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2765 m = re.search('<h1>([^<]+)</h1>', coursepage)
2767 info['title'] = unescapeHTML(m.group(1))
2769 info['title'] = info['id']
2770 info['stitle'] = simplify_title(info['title'])
2772 m = re.search('<description>([^<]+)</description>', coursepage)
2774 info['description'] = unescapeHTML(m.group(1))
2776 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2779 'type': 'reference',
2780 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2784 for entry in info['list']:
2785 assert entry['type'] == 'reference'
2786 results += self.extract(entry['url'])
2791 'id': 'Stanford OpenClassroom',
2795 self.report_download_webpage(info['id'])
2796 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2798 rootpage = urllib2.urlopen(rootURL).read()
2799 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2800 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2803 info['title'] = info['id']
2804 info['stitle'] = simplify_title(info['title'])
2806 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2809 'type': 'reference',
2810 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2815 for entry in info['list']:
2816 assert entry['type'] == 'reference'
2817 results += self.extract(entry['url'])
2820 class MTVIE(InfoExtractor):
2821 """Information extractor for MTV.com"""
2823 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2826 def report_webpage(self, video_id):
2827 """Report information extraction."""
2828 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2830 def report_extraction(self, video_id):
2831 """Report information extraction."""
2832 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2834 def _real_extract(self, url):
2835 mobj = re.match(self._VALID_URL, url)
2837 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2839 if not mobj.group('proto'):
2840 url = 'http://' + url
2841 video_id = mobj.group('videoid')
2842 self.report_webpage(video_id)
2844 request = urllib2.Request(url)
2846 webpage = urllib2.urlopen(request).read()
2847 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2848 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2851 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2853 self._downloader.trouble(u'ERROR: unable to extract song name')
2855 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2856 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2858 self._downloader.trouble(u'ERROR: unable to extract performer')
2860 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2861 video_title = performer + ' - ' + song_name
2863 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2865 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2867 mtvn_uri = mobj.group(1)
2869 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2871 self._downloader.trouble(u'ERROR: unable to extract content id')
2873 content_id = mobj.group(1)
2875 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2876 self.report_extraction(video_id)
2877 request = urllib2.Request(videogen_url)
2879 metadataXml = urllib2.urlopen(request).read()
2880 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2881 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2884 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2885 renditions = mdoc.findall('.//rendition')
2887 # For now, always pick the highest quality.
2888 rendition = renditions[-1]
2891 _,_,ext = rendition.attrib['type'].partition('/')
2892 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2893 video_url = rendition.find('./src').text
2895 self._downloader.trouble('Invalid rendition field.')
2901 'uploader': performer,
2902 'title': video_title,
2903 'stitle': simplify_title(video_title),