2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
19 import cStringIO as StringIO
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Video title.
	ext:		Video filename extension.
	player_url:	SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods and define a _VALID_URL regexp.
	Probably, they should also be added to the list of extractors.
	"""

	# Whether _real_initialize() has already run for this instance.
	_ready = False
	# The FileDownloader this IE reports to (set via set_downloader()).
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		# Only run the (possibly expensive) real initialization once.
		if not self._ready:
			self._real_initialize()
			self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
95 class YoutubeIE(InfoExtractor):
96 """Information extractor for youtube.com."""
98 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|tube.majestyc.net/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
99 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
100 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
101 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
102 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
103 _NETRC_MACHINE = 'youtube'
104 # Listed in order of quality
105 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
106 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
107 _video_extensions = {
113 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
119 _video_dimensions = {
137 def report_lang(self):
138 """Report attempt to set language."""
139 self._downloader.to_screen(u'[youtube] Setting language')
141 def report_login(self):
142 """Report attempt to log in."""
143 self._downloader.to_screen(u'[youtube] Logging in')
145 def report_age_confirmation(self):
146 """Report attempt to confirm age."""
147 self._downloader.to_screen(u'[youtube] Confirming age')
149 def report_video_webpage_download(self, video_id):
150 """Report attempt to download video webpage."""
151 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
153 def report_video_info_webpage_download(self, video_id):
154 """Report attempt to download video info webpage."""
155 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
157 def report_video_subtitles_download(self, video_id):
158 """Report attempt to download video info webpage."""
159 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
161 def report_information_extraction(self, video_id):
162 """Report attempt to extract video information."""
163 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
165 def report_unavailable_format(self, video_id, format):
166 """Report extracted video URL."""
167 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
169 def report_rtmp_download(self):
170 """Indicate the download will use the RTMP protocol."""
171 self._downloader.to_screen(u'[youtube] RTMP download detected')
173 def _closed_captions_xml_to_srt(self, xml_string):
175 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
176 # TODO parse xml instead of regex
177 for n, (start, dur_tag, dur, caption) in enumerate(texts):
178 if not dur: dur = '4'
180 end = start + float(dur)
181 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
182 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
183 caption = unescapeHTML(caption)
184 caption = unescapeHTML(caption) # double cycle, intentional
185 srt += str(n+1) + '\n'
186 srt += start + ' --> ' + end + '\n'
187 srt += caption + '\n\n'
190 def _print_formats(self, formats):
191 print 'Available formats:'
193 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
195 def _real_initialize(self):
196 if self._downloader is None:
201 downloader_params = self._downloader.params
203 # Attempt to use provided username and password or .netrc data
204 if downloader_params.get('username', None) is not None:
205 username = downloader_params['username']
206 password = downloader_params['password']
207 elif downloader_params.get('usenetrc', False):
209 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
214 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
215 except (IOError, netrc.NetrcParseError), err:
216 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
220 request = urllib2.Request(self._LANG_URL)
223 urllib2.urlopen(request).read()
224 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
225 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
228 # No authentication to be performed
234 'current_form': 'loginForm',
236 'action_login': 'Log In',
237 'username': username,
238 'password': password,
240 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
243 login_results = urllib2.urlopen(request).read()
244 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
245 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
247 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
248 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
254 'action_confirm': 'Confirm',
256 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
258 self.report_age_confirmation()
259 age_results = urllib2.urlopen(request).read()
260 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
261 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
264 def _real_extract(self, url):
265 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
266 mobj = re.search(self._NEXT_URL_RE, url)
268 url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
270 # Extract video id from URL
271 mobj = re.match(self._VALID_URL, url)
273 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
275 video_id = mobj.group(2)
278 self.report_video_webpage_download(video_id)
279 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
281 video_webpage = urllib2.urlopen(request).read()
282 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
283 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
286 # Attempt to extract SWF player URL
287 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
289 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
294 self.report_video_info_webpage_download(video_id)
295 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
296 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
297 % (video_id, el_type))
298 request = urllib2.Request(video_info_url)
300 video_info_webpage = urllib2.urlopen(request).read()
301 video_info = parse_qs(video_info_webpage)
302 if 'token' in video_info:
304 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
305 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
307 if 'token' not in video_info:
308 if 'reason' in video_info:
309 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
311 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
314 # Check for "rental" videos
315 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
316 self._downloader.trouble(u'ERROR: "rental" videos not supported')
319 # Start extracting information
320 self.report_information_extraction(video_id)
323 if 'author' not in video_info:
324 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
326 video_uploader = urllib.unquote_plus(video_info['author'][0])
329 if 'title' not in video_info:
330 self._downloader.trouble(u'ERROR: unable to extract video title')
332 video_title = urllib.unquote_plus(video_info['title'][0])
333 video_title = video_title.decode('utf-8')
336 if 'thumbnail_url' not in video_info:
337 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
339 else: # don't panic if we can't find it
340 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
344 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
346 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
347 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
348 for expression in format_expressions:
350 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
355 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
356 if video_description: video_description = clean_html(video_description)
357 else: video_description = ''
360 video_subtitles = None
361 if self._downloader.params.get('writesubtitles', False):
363 self.report_video_subtitles_download(video_id)
364 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
366 srt_list = urllib2.urlopen(request).read()
367 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
368 raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
369 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
370 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
371 if not srt_lang_list:
372 raise Trouble(u'WARNING: video has no closed captions')
373 if self._downloader.params.get('subtitleslang', False):
374 srt_lang = self._downloader.params.get('subtitleslang')
375 elif 'en' in srt_lang_list:
378 srt_lang = srt_lang_list.keys()[0]
379 if not srt_lang in srt_lang_list:
380 raise Trouble(u'WARNING: no closed captions found in the specified language')
381 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
383 srt_xml = urllib2.urlopen(request).read()
384 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
385 raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
387 raise Trouble(u'WARNING: unable to download video subtitles')
388 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
389 except Trouble as trouble:
390 self._downloader.trouble(trouble[0])
393 video_token = urllib.unquote_plus(video_info['token'][0])
395 # Decide which formats to download
396 req_format = self._downloader.params.get('format', None)
398 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
399 self.report_rtmp_download()
400 video_url_list = [(None, video_info['conn'][0])]
401 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
402 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
403 url_data = [parse_qs(uds) for uds in url_data_strs]
404 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
405 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
407 format_limit = self._downloader.params.get('format_limit', None)
408 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
409 if format_limit is not None and format_limit in available_formats:
410 format_list = available_formats[available_formats.index(format_limit):]
412 format_list = available_formats
413 existing_formats = [x for x in format_list if x in url_map]
414 if len(existing_formats) == 0:
415 self._downloader.trouble(u'ERROR: no known formats available for video')
417 if self._downloader.params.get('listformats', None):
418 self._print_formats(existing_formats)
420 if req_format is None or req_format == 'best':
421 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
422 elif req_format == 'worst':
423 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
424 elif req_format in ('-1', 'all'):
425 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
427 # Specific formats. We pick the first in a slash-delimeted sequence.
428 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
429 req_formats = req_format.split('/')
430 video_url_list = None
431 for rf in req_formats:
433 video_url_list = [(rf, url_map[rf])]
435 if video_url_list is None:
436 self._downloader.trouble(u'ERROR: requested format not available')
439 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
443 for format_param, video_real_url in video_url_list:
445 video_extension = self._video_extensions.get(format_param, 'flv')
449 'id': video_id.decode('utf-8'),
450 'url': video_real_url.decode('utf-8'),
451 'uploader': video_uploader.decode('utf-8'),
452 'upload_date': upload_date,
453 'title': video_title,
454 'ext': video_extension.decode('utf-8'),
455 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
456 'thumbnail': video_thumbnail.decode('utf-8'),
457 'description': video_description,
458 'player_url': player_url,
459 'subtitles': video_subtitles
464 class MetacafeIE(InfoExtractor):
465 """Information Extractor for metacafe.com."""
467 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
468 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
469 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
470 IE_NAME = u'metacafe'
472 def __init__(self, downloader=None):
473 InfoExtractor.__init__(self, downloader)
475 def report_disclaimer(self):
476 """Report disclaimer retrieval."""
477 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
479 def report_age_confirmation(self):
480 """Report attempt to confirm age."""
481 self._downloader.to_screen(u'[metacafe] Confirming age')
483 def report_download_webpage(self, video_id):
484 """Report webpage download."""
485 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
487 def report_extraction(self, video_id):
488 """Report information extraction."""
489 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
491 def _real_initialize(self):
492 # Retrieve disclaimer
493 request = urllib2.Request(self._DISCLAIMER)
495 self.report_disclaimer()
496 disclaimer = urllib2.urlopen(request).read()
497 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
498 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
504 'submit': "Continue - I'm over 18",
506 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
508 self.report_age_confirmation()
509 disclaimer = urllib2.urlopen(request).read()
510 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
511 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
514 def _real_extract(self, url):
515 # Extract id and simplified title from URL
516 mobj = re.match(self._VALID_URL, url)
518 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
521 video_id = mobj.group(1)
523 # Check if video comes from YouTube
524 mobj2 = re.match(r'^yt-(.*)$', video_id)
525 if mobj2 is not None:
526 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
529 # Retrieve video webpage to extract further information
530 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
532 self.report_download_webpage(video_id)
533 webpage = urllib2.urlopen(request).read()
534 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
535 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
538 # Extract URL, uploader and title from webpage
539 self.report_extraction(video_id)
540 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
542 mediaURL = urllib.unquote(mobj.group(1))
543 video_extension = mediaURL[-3:]
545 # Extract gdaKey if available
546 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
550 gdaKey = mobj.group(1)
551 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
553 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
555 self._downloader.trouble(u'ERROR: unable to extract media URL')
557 vardict = parse_qs(mobj.group(1))
558 if 'mediaData' not in vardict:
559 self._downloader.trouble(u'ERROR: unable to extract media URL')
561 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
563 self._downloader.trouble(u'ERROR: unable to extract media URL')
565 mediaURL = mobj.group(1).replace('\\/', '/')
566 video_extension = mediaURL[-3:]
567 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
569 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
571 self._downloader.trouble(u'ERROR: unable to extract title')
573 video_title = mobj.group(1).decode('utf-8')
575 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
577 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
579 video_uploader = mobj.group(1)
583 'id': video_id.decode('utf-8'),
584 'url': video_url.decode('utf-8'),
585 'uploader': video_uploader.decode('utf-8'),
586 'upload_date': u'NA',
587 'title': video_title,
588 'ext': video_extension.decode('utf-8'),
594 class DailymotionIE(InfoExtractor):
595 """Information Extractor for Dailymotion"""
597 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
598 IE_NAME = u'dailymotion'
600 def __init__(self, downloader=None):
601 InfoExtractor.__init__(self, downloader)
603 def report_download_webpage(self, video_id):
604 """Report webpage download."""
605 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
607 def report_extraction(self, video_id):
608 """Report information extraction."""
609 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
611 def _real_extract(self, url):
612 # Extract id and simplified title from URL
613 mobj = re.match(self._VALID_URL, url)
615 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
618 video_id = mobj.group(1)
620 video_extension = 'flv'
622 # Retrieve video webpage to extract further information
623 request = urllib2.Request(url)
624 request.add_header('Cookie', 'family_filter=off')
626 self.report_download_webpage(video_id)
627 webpage = urllib2.urlopen(request).read()
628 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
629 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
632 # Extract URL, uploader and title from webpage
633 self.report_extraction(video_id)
634 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
636 self._downloader.trouble(u'ERROR: unable to extract media URL')
638 sequence = urllib.unquote(mobj.group(1))
639 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
641 self._downloader.trouble(u'ERROR: unable to extract media URL')
643 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
645 # if needed add http://www.dailymotion.com/ if relative URL
649 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
651 self._downloader.trouble(u'ERROR: unable to extract title')
653 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
655 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
657 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
659 video_uploader = mobj.group(1)
663 'id': video_id.decode('utf-8'),
664 'url': video_url.decode('utf-8'),
665 'uploader': video_uploader.decode('utf-8'),
666 'upload_date': u'NA',
667 'title': video_title,
668 'ext': video_extension.decode('utf-8'),
674 class GoogleIE(InfoExtractor):
675 """Information extractor for video.google.com."""
677 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
678 IE_NAME = u'video.google'
680 def __init__(self, downloader=None):
681 InfoExtractor.__init__(self, downloader)
683 def report_download_webpage(self, video_id):
684 """Report webpage download."""
685 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
687 def report_extraction(self, video_id):
688 """Report information extraction."""
689 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
691 def _real_extract(self, url):
692 # Extract id from URL
693 mobj = re.match(self._VALID_URL, url)
695 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
698 video_id = mobj.group(1)
700 video_extension = 'mp4'
702 # Retrieve video webpage to extract further information
703 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
705 self.report_download_webpage(video_id)
706 webpage = urllib2.urlopen(request).read()
707 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
708 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
711 # Extract URL, uploader, and title from webpage
712 self.report_extraction(video_id)
713 mobj = re.search(r"download_url:'([^']+)'", webpage)
715 video_extension = 'flv'
716 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
718 self._downloader.trouble(u'ERROR: unable to extract media URL')
720 mediaURL = urllib.unquote(mobj.group(1))
721 mediaURL = mediaURL.replace('\\x3d', '\x3d')
722 mediaURL = mediaURL.replace('\\x26', '\x26')
726 mobj = re.search(r'<title>(.*)</title>', webpage)
728 self._downloader.trouble(u'ERROR: unable to extract title')
730 video_title = mobj.group(1).decode('utf-8')
732 # Extract video description
733 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
735 self._downloader.trouble(u'ERROR: unable to extract video description')
737 video_description = mobj.group(1).decode('utf-8')
738 if not video_description:
739 video_description = 'No description available.'
741 # Extract video thumbnail
742 if self._downloader.params.get('forcethumbnail', False):
743 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
745 webpage = urllib2.urlopen(request).read()
746 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
747 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
749 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
751 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
753 video_thumbnail = mobj.group(1)
754 else: # we need something to pass to process_info
759 'id': video_id.decode('utf-8'),
760 'url': video_url.decode('utf-8'),
762 'upload_date': u'NA',
763 'title': video_title,
764 'ext': video_extension.decode('utf-8'),
770 class PhotobucketIE(InfoExtractor):
771 """Information extractor for photobucket.com."""
773 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
774 IE_NAME = u'photobucket'
776 def __init__(self, downloader=None):
777 InfoExtractor.__init__(self, downloader)
779 def report_download_webpage(self, video_id):
780 """Report webpage download."""
781 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
783 def report_extraction(self, video_id):
784 """Report information extraction."""
785 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
787 def _real_extract(self, url):
788 # Extract id from URL
789 mobj = re.match(self._VALID_URL, url)
791 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
794 video_id = mobj.group(1)
796 video_extension = 'flv'
798 # Retrieve video webpage to extract further information
799 request = urllib2.Request(url)
801 self.report_download_webpage(video_id)
802 webpage = urllib2.urlopen(request).read()
803 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
804 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
807 # Extract URL, uploader, and title from webpage
808 self.report_extraction(video_id)
809 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
811 self._downloader.trouble(u'ERROR: unable to extract media URL')
813 mediaURL = urllib.unquote(mobj.group(1))
817 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
819 self._downloader.trouble(u'ERROR: unable to extract title')
821 video_title = mobj.group(1).decode('utf-8')
823 video_uploader = mobj.group(2).decode('utf-8')
827 'id': video_id.decode('utf-8'),
828 'url': video_url.decode('utf-8'),
829 'uploader': video_uploader,
830 'upload_date': u'NA',
831 'title': video_title,
832 'ext': video_extension.decode('utf-8'),
838 class YahooIE(InfoExtractor):
839 """Information extractor for video.yahoo.com."""
841 # _VALID_URL matches all Yahoo! Video URLs
842 # _VPAGE_URL matches only the extractable '/watch/' URLs
843 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
844 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
845 IE_NAME = u'video.yahoo'
847 def __init__(self, downloader=None):
848 InfoExtractor.__init__(self, downloader)
850 def report_download_webpage(self, video_id):
851 """Report webpage download."""
852 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
854 def report_extraction(self, video_id):
855 """Report information extraction."""
856 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
858 def _real_extract(self, url, new_video=True):
859 # Extract ID from URL
860 mobj = re.match(self._VALID_URL, url)
862 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
865 video_id = mobj.group(2)
866 video_extension = 'flv'
868 # Rewrite valid but non-extractable URLs as
869 # extractable English language /watch/ URLs
870 if re.match(self._VPAGE_URL, url) is None:
871 request = urllib2.Request(url)
873 webpage = urllib2.urlopen(request).read()
874 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
875 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
878 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
880 self._downloader.trouble(u'ERROR: Unable to extract id field')
882 yahoo_id = mobj.group(1)
884 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
886 self._downloader.trouble(u'ERROR: Unable to extract vid field')
888 yahoo_vid = mobj.group(1)
890 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
891 return self._real_extract(url, new_video=False)
893 # Retrieve video webpage to extract further information
894 request = urllib2.Request(url)
896 self.report_download_webpage(video_id)
897 webpage = urllib2.urlopen(request).read()
898 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
899 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
902 # Extract uploader and title from webpage
903 self.report_extraction(video_id)
904 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
906 self._downloader.trouble(u'ERROR: unable to extract video title')
908 video_title = mobj.group(1).decode('utf-8')
910 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
912 self._downloader.trouble(u'ERROR: unable to extract video uploader')
914 video_uploader = mobj.group(1).decode('utf-8')
916 # Extract video thumbnail
917 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
919 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
921 video_thumbnail = mobj.group(1).decode('utf-8')
923 # Extract video description
924 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
926 self._downloader.trouble(u'ERROR: unable to extract video description')
928 video_description = mobj.group(1).decode('utf-8')
929 if not video_description:
930 video_description = 'No description available.'
932 # Extract video height and width
933 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
935 self._downloader.trouble(u'ERROR: unable to extract video height')
937 yv_video_height = mobj.group(1)
939 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
941 self._downloader.trouble(u'ERROR: unable to extract video width')
943 yv_video_width = mobj.group(1)
945 # Retrieve video playlist to extract media URL
946 # I'm not completely sure what all these options are, but we
947 # seem to need most of them, otherwise the server sends a 401.
948 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
949 yv_bitrate = '700' # according to Wikipedia this is hard-coded
950 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
951 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
952 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
954 self.report_download_webpage(video_id)
955 webpage = urllib2.urlopen(request).read()
956 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
957 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
960 # Extract media URL from playlist XML
961 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
963 self._downloader.trouble(u'ERROR: Unable to extract media URL')
965 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
966 video_url = unescapeHTML(video_url)
970 'id': video_id.decode('utf-8'),
972 'uploader': video_uploader,
973 'upload_date': u'NA',
974 'title': video_title,
975 'ext': video_extension.decode('utf-8'),
976 'thumbnail': video_thumbnail.decode('utf-8'),
977 'description': video_description,
978 'thumbnail': video_thumbnail,
983 class VimeoIE(InfoExtractor):
984 """Information extractor for vimeo.com."""
986 # _VALID_URL matches Vimeo URLs
987 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
990 def __init__(self, downloader=None):
991 InfoExtractor.__init__(self, downloader)
993 def report_download_webpage(self, video_id):
994 """Report webpage download."""
995 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
997 def report_extraction(self, video_id):
998 """Report information extraction."""
999 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1001 def _real_extract(self, url, new_video=True):
1002 # Extract ID from URL
1003 mobj = re.match(self._VALID_URL, url)
1005 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1008 video_id = mobj.group(1)
1010 # Retrieve video webpage to extract further information
1011 request = urllib2.Request(url, None, std_headers)
1013 self.report_download_webpage(video_id)
1014 webpage = urllib2.urlopen(request).read()
1015 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1016 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1019 # Now we begin extracting as much information as we can from what we
1020 # retrieved. First we extract the information common to all extractors,
1021 # and latter we extract those that are Vimeo specific.
1022 self.report_extraction(video_id)
1024 # Extract the config JSON
1025 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1027 config = json.loads(config)
1029 self._downloader.trouble(u'ERROR: unable to extract info section')
1033 video_title = config["video"]["title"]
1036 video_uploader = config["video"]["owner"]["name"]
1038 # Extract video thumbnail
1039 video_thumbnail = config["video"]["thumbnail"]
1041 # Extract video description
1042 video_description = get_element_by_id("description", webpage.decode('utf8'))
1043 if video_description: video_description = clean_html(video_description)
1044 else: video_description = ''
1046 # Extract upload date
1047 video_upload_date = u'NA'
1048 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1049 if mobj is not None:
1050 video_upload_date = mobj.group(1)
1052 # Vimeo specific: extract request signature and timestamp
1053 sig = config['request']['signature']
1054 timestamp = config['request']['timestamp']
1056 # Vimeo specific: extract video codec and quality information
1057 # TODO bind to format param
1058 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1059 for codec in codecs:
1060 if codec[0] in config["video"]["files"]:
1061 video_codec = codec[0]
1062 video_extension = codec[1]
1063 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1064 else: quality = 'sd'
1067 self._downloader.trouble(u'ERROR: no known codec found')
1070 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1071 %(video_id, sig, timestamp, quality, video_codec.upper())
1074 'provider': IE_NAME,
1077 'uploader': video_uploader,
1078 'upload_date': video_upload_date,
1079 'title': video_title,
1080 'ext': video_extension,
1081 'thumbnail': video_thumbnail,
1082 'description': video_description,
1087 class GenericIE(InfoExtractor):
1088 """Generic last-resort information extractor."""
1091 IE_NAME = u'generic'
1093 def __init__(self, downloader=None):
1094 InfoExtractor.__init__(self, downloader)
1096 def report_download_webpage(self, video_id):
1097 """Report webpage download."""
1098 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1099 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1101 def report_extraction(self, video_id):
1102 """Report information extraction."""
1103 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1105 def report_following_redirect(self, new_url):
1106 """Report information extraction."""
1107 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1109 def _test_redirect(self, url):
1110 """Check if it is a redirect, like url shorteners, in case restart chain."""
1111 class HeadRequest(urllib2.Request):
1112 def get_method(self):
1115 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1117 Subclass the HTTPRedirectHandler to make it use our
1118 HeadRequest also on the redirected URL
1120 def redirect_request(self, req, fp, code, msg, headers, newurl):
1121 if code in (301, 302, 303, 307):
1122 newurl = newurl.replace(' ', '%20')
1123 newheaders = dict((k,v) for k,v in req.headers.items()
1124 if k.lower() not in ("content-length", "content-type"))
1125 return HeadRequest(newurl,
1127 origin_req_host=req.get_origin_req_host(),
1130 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1132 class HTTPMethodFallback(urllib2.BaseHandler):
1134 Fallback to GET if HEAD is not allowed (405 HTTP error)
1136 def http_error_405(self, req, fp, code, msg, headers):
1140 newheaders = dict((k,v) for k,v in req.headers.items()
1141 if k.lower() not in ("content-length", "content-type"))
1142 return self.parent.open(urllib2.Request(req.get_full_url(),
1144 origin_req_host=req.get_origin_req_host(),
1148 opener = urllib2.OpenerDirector()
1149 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1150 HTTPMethodFallback, HEADRedirectHandler,
1151 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1152 opener.add_handler(handler())
1154 response = opener.open(HeadRequest(url))
1155 new_url = response.geturl()
1157 if url == new_url: return False
1159 self.report_following_redirect(new_url)
1160 self._downloader.download([new_url])
1163 def _real_extract(self, url):
1164 if self._test_redirect(url): return
1166 video_id = url.split('/')[-1]
1167 request = urllib2.Request(url)
1169 self.report_download_webpage(video_id)
1170 webpage = urllib2.urlopen(request).read()
1171 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1172 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1174 except ValueError, err:
1175 # since this is the last-resort InfoExtractor, if
1176 # this error is thrown, it'll be thrown here
1177 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1180 self.report_extraction(video_id)
1181 # Start with something easy: JW Player in SWFObject
1182 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1184 # Broaden the search a little bit
1185 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1187 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1190 # It's possible that one of the regexes
1191 # matched, but returned an empty group:
1192 if mobj.group(1) is None:
1193 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1196 video_url = urllib.unquote(mobj.group(1))
1197 video_id = os.path.basename(video_url)
1199 # here's a fun little line of code for you:
1200 video_extension = os.path.splitext(video_id)[1][1:]
1201 video_id = os.path.splitext(video_id)[0]
1203 # it's tempting to parse this further, but you would
1204 # have to take into account all the variations like
1205 # Video Title - Site Name
1206 # Site Name | Video Title
1207 # Video Title - Tagline | Site Name
1208 # and so on and so forth; it's just not practical
1209 mobj = re.search(r'<title>(.*)</title>', webpage)
1211 self._downloader.trouble(u'ERROR: unable to extract title')
1213 video_title = mobj.group(1).decode('utf-8')
1215 # video uploader is domain name
1216 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1218 self._downloader.trouble(u'ERROR: unable to extract title')
1220 video_uploader = mobj.group(1).decode('utf-8')
1223 'provider': IE_NAME,
1224 'id': video_id.decode('utf-8'),
1225 'url': video_url.decode('utf-8'),
1226 'uploader': video_uploader,
1227 'upload_date': u'NA',
1228 'title': video_title,
1229 'ext': video_extension.decode('utf-8'),
1235 class YoutubeSearchIE(InfoExtractor):
1236 """Information Extractor for YouTube search queries."""
1237 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1238 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1239 _max_youtube_results = 1000
1240 IE_NAME = u'youtube:search'
1242 def __init__(self, downloader=None):
1243 InfoExtractor.__init__(self, downloader)
1245 def report_download_page(self, query, pagenum):
1246 """Report attempt to download search page with given number."""
1247 query = query.decode(preferredencoding())
1248 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1250 def _real_extract(self, query):
1251 mobj = re.match(self._VALID_URL, query)
1253 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1256 prefix, query = query.split(':')
1258 query = query.encode('utf-8')
1260 self._download_n_results(query, 1)
1262 elif prefix == 'all':
1263 self._download_n_results(query, self._max_youtube_results)
1269 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1271 elif n > self._max_youtube_results:
1272 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1273 n = self._max_youtube_results
1274 self._download_n_results(query, n)
1276 except ValueError: # parsing prefix as integer fails
1277 self._download_n_results(query, 1)
1280 def _download_n_results(self, query, n):
1281 """Downloads a specified number of results for a query"""
1287 while (50 * pagenum) < limit:
1288 self.report_download_page(query, pagenum+1)
1289 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1290 request = urllib2.Request(result_url)
1292 data = urllib2.urlopen(request).read()
1293 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1294 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1296 api_response = json.loads(data)['data']
1298 new_ids = list(video['id'] for video in api_response['items'])
1299 video_ids += new_ids
1301 limit = min(n, api_response['totalItems'])
1304 if len(video_ids) > n:
1305 video_ids = video_ids[:n]
1306 for id in video_ids:
1307 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1311 class GoogleSearchIE(InfoExtractor):
1312 """Information Extractor for Google Video search queries."""
1313 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1314 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1315 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1316 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1317 _max_google_results = 1000
1318 IE_NAME = u'video.google:search'
1320 def __init__(self, downloader=None):
1321 InfoExtractor.__init__(self, downloader)
1323 def report_download_page(self, query, pagenum):
1324 """Report attempt to download playlist page with given number."""
1325 query = query.decode(preferredencoding())
1326 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1328 def _real_extract(self, query):
1329 mobj = re.match(self._VALID_URL, query)
1331 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1334 prefix, query = query.split(':')
1336 query = query.encode('utf-8')
1338 self._download_n_results(query, 1)
1340 elif prefix == 'all':
1341 self._download_n_results(query, self._max_google_results)
1347 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1349 elif n > self._max_google_results:
1350 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1351 n = self._max_google_results
1352 self._download_n_results(query, n)
1354 except ValueError: # parsing prefix as integer fails
1355 self._download_n_results(query, 1)
1358 def _download_n_results(self, query, n):
1359 """Downloads a specified number of results for a query"""
1365 self.report_download_page(query, pagenum)
1366 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1367 request = urllib2.Request(result_url)
1369 page = urllib2.urlopen(request).read()
1370 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1371 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1374 # Extract video identifiers
1375 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1376 video_id = mobj.group(1)
1377 if video_id not in video_ids:
1378 video_ids.append(video_id)
1379 if len(video_ids) == n:
1380 # Specified n videos reached
1381 for id in video_ids:
1382 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1385 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1386 for id in video_ids:
1387 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1390 pagenum = pagenum + 1
1393 class YahooSearchIE(InfoExtractor):
1394 """Information Extractor for Yahoo! Video search queries."""
1395 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1396 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1397 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1398 _MORE_PAGES_INDICATOR = r'\s*Next'
1399 _max_yahoo_results = 1000
1400 IE_NAME = u'video.yahoo:search'
1402 def __init__(self, downloader=None):
1403 InfoExtractor.__init__(self, downloader)
1405 def report_download_page(self, query, pagenum):
1406 """Report attempt to download playlist page with given number."""
1407 query = query.decode(preferredencoding())
1408 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1410 def _real_extract(self, query):
1411 mobj = re.match(self._VALID_URL, query)
1413 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1416 prefix, query = query.split(':')
1418 query = query.encode('utf-8')
1420 self._download_n_results(query, 1)
1422 elif prefix == 'all':
1423 self._download_n_results(query, self._max_yahoo_results)
1429 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1431 elif n > self._max_yahoo_results:
1432 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1433 n = self._max_yahoo_results
1434 self._download_n_results(query, n)
1436 except ValueError: # parsing prefix as integer fails
1437 self._download_n_results(query, 1)
1440 def _download_n_results(self, query, n):
1441 """Downloads a specified number of results for a query"""
1444 already_seen = set()
1448 self.report_download_page(query, pagenum)
1449 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1450 request = urllib2.Request(result_url)
1452 page = urllib2.urlopen(request).read()
1453 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1454 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1457 # Extract video identifiers
1458 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1459 video_id = mobj.group(1)
1460 if video_id not in already_seen:
1461 video_ids.append(video_id)
1462 already_seen.add(video_id)
1463 if len(video_ids) == n:
1464 # Specified n videos reached
1465 for id in video_ids:
1466 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1469 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1470 for id in video_ids:
1471 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1474 pagenum = pagenum + 1
1477 class YoutubePlaylistIE(InfoExtractor):
1478 """Information Extractor for YouTube playlists."""
1480 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1481 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1482 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=(PL)?%s&'
1483 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1484 IE_NAME = u'youtube:playlist'
1486 def __init__(self, downloader=None):
1487 InfoExtractor.__init__(self, downloader)
1489 def report_download_page(self, playlist_id, pagenum):
1490 """Report attempt to download playlist page with given number."""
1491 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1493 def _real_extract(self, url):
1494 # Extract playlist id
1495 mobj = re.match(self._VALID_URL, url)
1497 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1501 if mobj.group(3) is not None:
1502 self._downloader.download([mobj.group(3)])
1505 # Download playlist pages
1506 # prefix is 'p' as default for playlists but there are other types that need extra care
1507 playlist_prefix = mobj.group(1)
1508 if playlist_prefix == 'a':
1509 playlist_access = 'artist'
1511 playlist_prefix = 'p'
1512 playlist_access = 'view_play_list'
1513 playlist_id = mobj.group(2)
1518 self.report_download_page(playlist_id, pagenum)
1519 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1520 request = urllib2.Request(url)
1522 page = urllib2.urlopen(request).read()
1523 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1524 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1527 # Extract video identifiers
1529 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1530 if mobj.group(1) not in ids_in_page:
1531 ids_in_page.append(mobj.group(1))
1532 video_ids.extend(ids_in_page)
1534 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1536 pagenum = pagenum + 1
1538 playliststart = self._downloader.params.get('playliststart', 1) - 1
1539 playlistend = self._downloader.params.get('playlistend', -1)
1540 if playlistend == -1:
1541 video_ids = video_ids[playliststart:]
1543 video_ids = video_ids[playliststart:playlistend]
1545 for id in video_ids:
1546 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1550 class YoutubeUserIE(InfoExtractor):
1551 """Information Extractor for YouTube users."""
1553 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1554 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1555 _GDATA_PAGE_SIZE = 50
1556 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1557 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1558 IE_NAME = u'youtube:user'
1560 def __init__(self, downloader=None):
1561 InfoExtractor.__init__(self, downloader)
1563 def report_download_page(self, username, start_index):
1564 """Report attempt to download user page."""
1565 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1566 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1568 def _real_extract(self, url):
1570 mobj = re.match(self._VALID_URL, url)
1572 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1575 username = mobj.group(1)
1577 # Download video ids using YouTube Data API. Result size per
1578 # query is limited (currently to 50 videos) so we need to query
1579 # page by page until there are no video ids - it means we got
1586 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1587 self.report_download_page(username, start_index)
1589 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1592 page = urllib2.urlopen(request).read()
1593 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1594 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1597 # Extract video identifiers
1600 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1601 if mobj.group(1) not in ids_in_page:
1602 ids_in_page.append(mobj.group(1))
1604 video_ids.extend(ids_in_page)
1606 # A little optimization - if current page is not
1607 # "full", ie. does not contain PAGE_SIZE video ids then
1608 # we can assume that this page is the last one - there
1609 # are no more ids on further pages - no need to query
1612 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1617 all_ids_count = len(video_ids)
1618 playliststart = self._downloader.params.get('playliststart', 1) - 1
1619 playlistend = self._downloader.params.get('playlistend', -1)
1621 if playlistend == -1:
1622 video_ids = video_ids[playliststart:]
1624 video_ids = video_ids[playliststart:playlistend]
1626 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1627 (username, all_ids_count, len(video_ids)))
1629 for video_id in video_ids:
1630 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1633 class BlipTVUserIE(InfoExtractor):
1634 """Information Extractor for blip.tv users."""
1636 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1638 IE_NAME = u'blip.tv:user'
1640 def __init__(self, downloader=None):
1641 InfoExtractor.__init__(self, downloader)
1643 def report_download_page(self, username, pagenum):
1644 """Report attempt to download user page."""
1645 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1646 (self.IE_NAME, username, pagenum))
1648 def _real_extract(self, url):
1650 mobj = re.match(self._VALID_URL, url)
1652 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1655 username = mobj.group(1)
1657 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1659 request = urllib2.Request(url)
1662 page = urllib2.urlopen(request).read().decode('utf-8')
1663 mobj = re.search(r'data-users-id="([^"]+)"', page)
1664 page_base = page_base % mobj.group(1)
1665 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1666 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1670 # Download video ids using BlipTV Ajax calls. Result size per
1671 # query is limited (currently to 12 videos) so we need to query
1672 # page by page until there are no video ids - it means we got
1679 self.report_download_page(username, pagenum)
1681 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1684 page = urllib2.urlopen(request).read().decode('utf-8')
1685 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1686 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1689 # Extract video identifiers
1692 for mobj in re.finditer(r'href="/([^"]+)"', page):
1693 if mobj.group(1) not in ids_in_page:
1694 ids_in_page.append(unescapeHTML(mobj.group(1)))
1696 video_ids.extend(ids_in_page)
1698 # A little optimization - if current page is not
1699 # "full", ie. does not contain PAGE_SIZE video ids then
1700 # we can assume that this page is the last one - there
1701 # are no more ids on further pages - no need to query
1704 if len(ids_in_page) < self._PAGE_SIZE:
1709 all_ids_count = len(video_ids)
1710 playliststart = self._downloader.params.get('playliststart', 1) - 1
1711 playlistend = self._downloader.params.get('playlistend', -1)
1713 if playlistend == -1:
1714 video_ids = video_ids[playliststart:]
1716 video_ids = video_ids[playliststart:playlistend]
1718 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1719 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1721 for video_id in video_ids:
1722 self._downloader.download([u'http://blip.tv/'+video_id])
1725 class DepositFilesIE(InfoExtractor):
1726 """Information extractor for depositfiles.com"""
1728 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1729 IE_NAME = u'DepositFiles'
1731 def __init__(self, downloader=None):
1732 InfoExtractor.__init__(self, downloader)
1734 def report_download_webpage(self, file_id):
1735 """Report webpage download."""
1736 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1738 def report_extraction(self, file_id):
1739 """Report information extraction."""
1740 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1742 def _real_extract(self, url):
1743 file_id = url.split('/')[-1]
1744 # Rebuild url in english locale
1745 url = 'http://depositfiles.com/en/files/' + file_id
1747 # Retrieve file webpage with 'Free download' button pressed
1748 free_download_indication = { 'gateway_result' : '1' }
1749 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1751 self.report_download_webpage(file_id)
1752 webpage = urllib2.urlopen(request).read()
1753 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1754 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1757 # Search for the real file URL
1758 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1759 if (mobj is None) or (mobj.group(1) is None):
1760 # Try to figure out reason of the error.
1761 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1762 if (mobj is not None) and (mobj.group(1) is not None):
1763 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1764 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1766 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1769 file_url = mobj.group(1)
1770 file_extension = os.path.splitext(file_url)[1][1:]
1772 # Search for file title
1773 mobj = re.search(r'<b title="(.*?)">', webpage)
1775 self._downloader.trouble(u'ERROR: unable to extract title')
1777 file_title = mobj.group(1).decode('utf-8')
1780 'provider': IE_NAME,
1781 'id': file_id.decode('utf-8'),
1782 'url': file_url.decode('utf-8'),
1784 'upload_date': u'NA',
1785 'title': file_title,
1786 'ext': file_extension.decode('utf-8'),
1792 class FacebookIE(InfoExtractor):
1793 """Information Extractor for Facebook"""
1795 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1796 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1797 _NETRC_MACHINE = 'facebook'
1798 _available_formats = ['video', 'highqual', 'lowqual']
1799 _video_extensions = {
1804 IE_NAME = u'facebook'
1806 def __init__(self, downloader=None):
1807 InfoExtractor.__init__(self, downloader)
1809 def _reporter(self, message):
1810 """Add header and report message."""
1811 self._downloader.to_screen(u'[facebook] %s' % message)
1813 def report_login(self):
1814 """Report attempt to log in."""
1815 self._reporter(u'Logging in')
1817 def report_video_webpage_download(self, video_id):
1818 """Report attempt to download video webpage."""
1819 self._reporter(u'%s: Downloading video webpage' % video_id)
1821 def report_information_extraction(self, video_id):
1822 """Report attempt to extract video information."""
1823 self._reporter(u'%s: Extracting video information' % video_id)
1825 def _parse_page(self, video_webpage):
1826 """Extract video information from page"""
1828 data = {'title': r'\("video_title", "(.*?)"\)',
1829 'description': r'<div class="datawrap">(.*?)</div>',
1830 'owner': r'\("video_owner_name", "(.*?)"\)',
1831 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1834 for piece in data.keys():
1835 mobj = re.search(data[piece], video_webpage)
1836 if mobj is not None:
1837 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1841 for fmt in self._available_formats:
1842 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1843 if mobj is not None:
1844 # URL is in a Javascript segment inside an escaped Unicode format within
1845 # the generally utf-8 page
1846 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1847 video_info['video_urls'] = video_urls
1851 def _real_initialize(self):
1852 if self._downloader is None:
1857 downloader_params = self._downloader.params
1859 # Attempt to use provided username and password or .netrc data
1860 if downloader_params.get('username', None) is not None:
1861 useremail = downloader_params['username']
1862 password = downloader_params['password']
1863 elif downloader_params.get('usenetrc', False):
1865 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1866 if info is not None:
1870 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1871 except (IOError, netrc.NetrcParseError), err:
1872 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1875 if useremail is None:
1884 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1887 login_results = urllib2.urlopen(request).read()
1888 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1889 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1891 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1892 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1895 def _real_extract(self, url):
1896 mobj = re.match(self._VALID_URL, url)
1898 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1900 video_id = mobj.group('ID')
1903 self.report_video_webpage_download(video_id)
1904 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1906 page = urllib2.urlopen(request)
1907 video_webpage = page.read()
1908 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1909 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1912 # Start extracting information
1913 self.report_information_extraction(video_id)
1915 # Extract information
1916 video_info = self._parse_page(video_webpage)
1919 if 'owner' not in video_info:
1920 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1922 video_uploader = video_info['owner']
1925 if 'title' not in video_info:
1926 self._downloader.trouble(u'ERROR: unable to extract video title')
1928 video_title = video_info['title']
1929 video_title = video_title.decode('utf-8')
1932 if 'thumbnail' not in video_info:
1933 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1934 video_thumbnail = ''
1936 video_thumbnail = video_info['thumbnail']
1940 if 'upload_date' in video_info:
1941 upload_time = video_info['upload_date']
1942 timetuple = email.utils.parsedate_tz(upload_time)
1943 if timetuple is not None:
1945 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1950 video_description = video_info.get('description', 'No description available.')
1952 url_map = video_info['video_urls']
1953 if len(url_map.keys()) > 0:
1954 # Decide which formats to download
1955 req_format = self._downloader.params.get('format', None)
1956 format_limit = self._downloader.params.get('format_limit', None)
1958 if format_limit is not None and format_limit in self._available_formats:
1959 format_list = self._available_formats[self._available_formats.index(format_limit):]
1961 format_list = self._available_formats
1962 existing_formats = [x for x in format_list if x in url_map]
1963 if len(existing_formats) == 0:
1964 self._downloader.trouble(u'ERROR: no known formats available for video')
1966 if req_format is None:
1967 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1968 elif req_format == 'worst':
1969 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1970 elif req_format == '-1':
1971 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1974 if req_format not in url_map:
1975 self._downloader.trouble(u'ERROR: requested format not available')
1977 video_url_list = [(req_format, url_map[req_format])] # Specific format
1980 for format_param, video_real_url in video_url_list:
1982 video_extension = self._video_extensions.get(format_param, 'mp4')
1985 'provider': IE_NAME,
1986 'id': video_id.decode('utf-8'),
1987 'url': video_real_url.decode('utf-8'),
1988 'uploader': video_uploader.decode('utf-8'),
1989 'upload_date': upload_date,
1990 'title': video_title,
1991 'ext': video_extension.decode('utf-8'),
1992 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1993 'thumbnail': video_thumbnail.decode('utf-8'),
1994 'description': video_description.decode('utf-8'),
# NOTE(review): numbered, gap-ridden excerpt -- the leading "NNNN" on each line
# is an original-file line number; jumps in the numbering mean statements
# (blank lines, "try:"/"else:"/"return" lines, dict braces) are missing.
# Code is kept verbatim; only comments are added.
#
# BlipTVIE: extracts video metadata from blip.tv. Two paths: (1) the URL
# answers with Content-Type "video/*" -> treat as a direct media download;
# (2) otherwise re-request the page through blip.tv's JSON API
# (skin=json&version=2&no_wrap=1) and read the 'Post' record.
1999 class BlipTVIE(InfoExtractor):
2000 """Information extractor for blip.tv"""
2002 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2003 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2004 IE_NAME = u'blip.tv'
2006 def report_extraction(self, file_id):
2007 """Report information extraction."""
2008 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2010 def report_direct_download(self, title):
2011 """Report information extraction."""
2012 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2014 def _real_extract(self, url):
2015 mobj = re.match(self._VALID_URL, url)
# Missing line 2016 is presumably "if mobj is None:" guarding the error below.
2017 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Build the JSON-API variant of the page URL. `cchar` ('?' vs '&') is chosen
# on a line that falls in the numbering gap (2018-2023).
2024 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2025 request = urllib2.Request(json_url.encode('utf-8'))
2026 self.report_extraction(mobj.group(1))
2029 urlh = urllib2.urlopen(request)
2030 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2031 basename = url.split('/')[-1]
2032 title,ext = os.path.splitext(basename)
2033 title = title.decode('UTF-8')
2034 ext = ext.replace('.', '')
2035 self.report_direct_download(title)
# The rest of the direct-download info dict (url, title, ext, ...) falls in
# the gap 2038-2043.
2037 'provider': IE_NAME,
2044 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2045 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2047 if info is None: # Regular URL
2049 json_code = urlh.read()
2050 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2051 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2055 json_data = json.loads(json_code)
2056 if 'Post' in json_data:
2057 data = json_data['Post']
# NOTE(review): '%H:%M%p' mixes 24-hour %H with an am/pm marker %p;
# presumably it matches blip.tv's datestamp format -- confirm.
2061 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2062 video_url = data['media']['url']
2063 umobj = re.match(self._URL_EXT, video_url)
# Missing line 2064 is presumably "if umobj is None:".
2065 raise ValueError('Can not determine filename extension')
2066 ext = umobj.group(1)
2069 'provider': IE_NAME,
2070 'id': data['item_id'],
2072 'uploader': data['display_name'],
2073 'upload_date': upload_date,
2074 'title': data['title'],
2076 'format': data['media']['mimeType'],
2077 'thumbnail': data['thumbnailUrl'],
2078 'description': data['description'],
2079 'player_url': data['embedUrl']
2081 except (ValueError,KeyError), err:
2082 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# Mutates the module-global default headers so the subsequent media download
# identifies as iTunes; presumably blip.tv serves iTunes clients differently
# -- TODO confirm.
2085 std_headers['User-Agent'] = 'iTunes/10.6.1'
# NOTE(review): numbered, gap-ridden excerpt; leading "NNNN" tokens are
# original-file line numbers and several statements fall in numbering gaps.
# Code kept verbatim; only comments added.
#
# MyVideoIE: extracts the FLV URL for myvideo.de watch pages by scraping the
# image_src thumbnail link (whose path prefix doubles as the media base URL)
# and the page <title>.
2089 class MyVideoIE(InfoExtractor):
2090 """Information Extractor for myvideo.de."""
2092 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2093 IE_NAME = u'myvideo'
2095 def __init__(self, downloader=None):
2096 InfoExtractor.__init__(self, downloader)
2098 def report_download_webpage(self, video_id):
2099 """Report webpage download."""
2100 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2102 def report_extraction(self, video_id):
2103 """Report information extraction."""
2104 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2106 def _real_extract(self,url):
2107 mobj = re.match(self._VALID_URL, url)
# BUG(review): `self._download` is almost certainly a typo for
# `self._downloader`; as written this line raises AttributeError when the
# URL is invalid. Left untouched in this doc-only pass.
2109 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2112 video_id = mobj.group(1)
2115 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2117 self.report_download_webpage(video_id)
2118 webpage = urllib2.urlopen(request).read()
2119 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2120 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2123 self.report_extraction(video_id)
# The thumbnail link's directory prefix (group 1) is reused below as the base
# of the media URL.
2124 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2127 self._downloader.trouble(u'ERROR: unable to extract media URL')
2129 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2131 mobj = re.search('<title>([^<]+)</title>', webpage)
2133 self._downloader.trouble(u'ERROR: unable to extract title')
2136 video_title = mobj.group(1)
# Result dict is truncated by numbering gaps (2140-2142, 2145+): url, ext and
# format entries are not visible here.
2139 'provider': IE_NAME,
2143 'upload_date': u'NA',
2144 'title': video_title,
# NOTE(review): numbered, gap-ridden excerpt; leading "NNNN" tokens are
# original-file line numbers and statements fall in numbering gaps.
# Code kept verbatim; only comments added.
#
# ComedyCentralIE: handles The Daily Show / Colbert Report. Accepts shortname
# pseudo-URLs (":tds", ":colbert", ...) which redirect to the newest full
# episode, locates the Flash player URI in the page, downloads the show's
# MRSS index, then fetches a mediaGen config per item and picks a rendition.
2150 class ComedyCentralIE(InfoExtractor):
2151 """Information extractor for The Daily Show and Colbert Report """
2153 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2154 IE_NAME = u'comedycentral'
2156 def report_extraction(self, episode_id):
2157 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2159 def report_config_download(self, episode_id):
2160 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2162 def report_index_download(self, episode_id):
2163 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2165 def report_player_url(self, episode_id):
2166 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2168 def _real_extract(self, url):
2169 mobj = re.match(self._VALID_URL, url)
2171 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortname form: rewrite to the show's full-episodes landing page and
# re-match so the named groups are populated.
2174 if mobj.group('shortname'):
2175 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2176 url = u'http://www.thedailyshow.com/full-episodes/'
2178 url = u'http://www.colbertnation.com/full-episodes/'
2179 mobj = re.match(self._VALID_URL, url)
2180 assert mobj is not None
2182 dlNewest = not mobj.group('episode')
# `dlNewest` selects between the landing page (show name) and a specific
# episode title; the "if dlNewest:" / "else:" lines fall in numbering gaps.
2184 epTitle = mobj.group('showname')
2186 epTitle = mobj.group('episode')
2188 req = urllib2.Request(url)
2189 self.report_extraction(epTitle)
2191 htmlHandle = urllib2.urlopen(req)
2192 html = htmlHandle.read()
2193 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2194 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The landing page redirects to the newest episode; validate the final URL.
2197 url = htmlHandle.geturl()
2198 mobj = re.match(self._VALID_URL, url)
2200 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2202 if mobj.group('episode') == '':
2203 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2205 epTitle = mobj.group('episode')
# Find the mtvnservices Flash movie URL embedded either as a <param> or a
# "var url = ..." assignment.
2207 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2208 if len(mMovieParams) == 0:
2209 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
# Resolve redirects on the player URL so the final SWF address is recorded.
2212 playerUrl_raw = mMovieParams[0][0]
2213 self.report_player_url(epTitle)
2215 urlHandle = urllib2.urlopen(playerUrl_raw)
2216 playerUrl = urlHandle.geturl()
2217 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2218 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2221 uri = mMovieParams[0][1]
2222 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2223 self.report_index_download(epTitle)
2225 indexXml = urllib2.urlopen(indexUrl).read()
2226 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2227 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# One MRSS <item> per act/segment of the episode; each yields its own
# mediaGen config and hence its own result entry.
2232 idoc = xml.etree.ElementTree.fromstring(indexXml)
2233 itemEls = idoc.findall('.//item')
2234 for itemEl in itemEls:
2235 mediaId = itemEl.findall('./guid')[0].text
2236 shortMediaId = mediaId.split(':')[-1]
2237 showId = mediaId.split(':')[-2].replace('.com', '')
2238 officialTitle = itemEl.findall('./title')[0].text
2239 officialDate = itemEl.findall('./pubDate')[0].text
2241 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2242 urllib.urlencode({'uri': mediaId}))
2243 configReq = urllib2.Request(configUrl)
2244 self.report_config_download(epTitle)
2246 configXml = urllib2.urlopen(configReq).read()
2247 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2248 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# Collect (bitrate, src) pairs; `turls` is initialized in a numbering gap.
2251 cdoc = xml.etree.ElementTree.fromstring(configXml)
2253 for rendition in cdoc.findall('.//rendition'):
2254 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2258 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2261 # For now, just pick the highest bitrate
2262 format,video_url = turls[-1]
2264 effTitle = showId + u'-' + epTitle
2266 'provider': IE_NAME,
2270 'upload_date': officialDate,
2275 'description': officialTitle,
2276 'player_url': playerUrl
2279 results.append(info)
# NOTE(review): numbered, gap-ridden excerpt; leading "NNNN" tokens are
# original-file line numbers; some statements fall in numbering gaps.
# Code kept verbatim; only comments added.
#
# EscapistIE: scrapes escapistmagazine.com video pages. Reads OpenGraph meta
# tags for description/thumbnail/player, then downloads the player's config
# (JavaScript-flavored JSON) to obtain the actual media URL.
2284 class EscapistIE(InfoExtractor):
2285 """Information extractor for The Escapist """
2287 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2288 IE_NAME = u'escapist'
2290 def report_extraction(self, showName):
2291 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2293 def report_config_download(self, showName):
2294 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2296 def _real_extract(self, url):
2297 mobj = re.match(self._VALID_URL, url)
2299 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2301 showName = mobj.group('showname')
2302 videoId = mobj.group('episode')
2304 self.report_extraction(showName)
2306 webPage = urllib2.urlopen(url)
2307 webPageBytes = webPage.read()
# Decode using the charset announced in the Content-Type header, falling
# back to UTF-8 when none is given.
2308 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2309 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2310 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2311 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2314 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2315 description = unescapeHTML(descMatch.group(1))
2316 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2317 imgUrl = unescapeHTML(imgMatch.group(1))
2318 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2319 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL carries the config location in its "config=" query part.
2320 configUrlMatch = re.search('config=(.*)$', playerUrl)
2321 configUrl = urllib2.unquote(configUrlMatch.group(1))
2323 self.report_config_download(showName)
2325 configJSON = urllib2.urlopen(configUrl).read()
2326 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2327 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2330 # Technically, it's JavaScript, not JSON
# NOTE(review): blanket quote replacement breaks if any config value
# contains an apostrophe -- fragile, but presumably good enough for this
# site's feed at the time.
2331 configJSON = configJSON.replace("'", '"')
2334 config = json.loads(configJSON)
2335 except (ValueError,), err:
2336 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# playlist[1] holds the actual video entry (index 0 is presumably a preroll
# or thumbnail entry -- TODO confirm).
2339 playlist = config['playlist']
2340 videoUrl = playlist[1]['url']
2343 'provider': IE_NAME,
2346 'uploader': showName,
2347 'upload_date': None,
2351 'thumbnail': imgUrl,
2352 'description': description,
2353 'player_url': playerUrl,
# NOTE(review): numbered, gap-ridden excerpt; leading "NNNN" tokens are
# original-file line numbers; some statements fall in numbering gaps.
# Code kept verbatim; only comments added.
#
# CollegeHumorIE: maps the public page's numeric video id to the internal
# "video:NNN" id embedded in the page, then reads the moogaloop metadata XML
# for title, description, media URL and thumbnail.
2359 class CollegeHumorIE(InfoExtractor):
2360 """Information extractor for collegehumor.com"""
2362 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2363 IE_NAME = u'collegehumor'
2365 def report_webpage(self, video_id):
2366 """Report information extraction."""
2367 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2369 def report_extraction(self, video_id):
2370 """Report information extraction."""
2371 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2373 def _real_extract(self, url):
2374 mobj = re.match(self._VALID_URL, url)
2376 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2378 video_id = mobj.group('videoid')
2380 self.report_webpage(video_id)
2381 request = urllib2.Request(url)
2383 webpage = urllib2.urlopen(request).read()
2384 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2385 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2388 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2390 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2392 internal_video_id = m.group('internalvideoid')
2395 'provider': IE_NAME,
2397 'internal_id': internal_video_id,
2400 self.report_extraction(video_id)
2401 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2403 metaXml = urllib2.urlopen(xmlUrl).read()
2404 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2405 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# Any missing element raises IndexError on [0]; the except line for the
# trouble() below falls in a numbering gap.
2408 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2410 videoNode = mdoc.findall('./video')[0]
2411 info['description'] = videoNode.findall('./description')[0].text
2412 info['title'] = videoNode.findall('./caption')[0].text
2413 info['url'] = videoNode.findall('./file')[0].text
2414 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension is taken from the media URL's last dot-suffix.
2415 info['ext'] = info['url'].rpartition('.')[2]
2416 info['format'] = info['ext']
2418 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# NOTE(review): numbered, gap-ridden excerpt; leading "NNNN" tokens are
# original-file line numbers; some statements fall in numbering gaps.
# Code kept verbatim; only comments added.
#
# XVideosIE: scrapes xvideos.com watch pages for the percent-encoded
# "flv_url" parameter, the page title, and the thumbnail URL.
2424 class XVideosIE(InfoExtractor):
2425 """Information extractor for xvideos.com"""
2427 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2428 IE_NAME = u'xvideos'
2430 def report_webpage(self, video_id):
2431 """Report information extraction."""
2432 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2434 def report_extraction(self, video_id):
2435 """Report information extraction."""
2436 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2438 def _real_extract(self, url):
2439 mobj = re.match(self._VALID_URL, url)
2441 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2443 video_id = mobj.group(1).decode('utf-8')
2445 self.report_webpage(video_id)
2447 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2449 webpage = urllib2.urlopen(request).read()
2450 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2451 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2454 self.report_extraction(video_id)
# flv_url is percent-encoded inside the page's flash parameters.
2458 mobj = re.search(r'flv_url=(.+?)&', webpage)
2460 self._downloader.trouble(u'ERROR: unable to extract video url')
2462 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2466 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2468 self._downloader.trouble(u'ERROR: unable to extract video title')
2470 video_title = mobj.group(1).decode('utf-8')
2473 # Extract video thumbnail
2474 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2476 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0): the entire matched URL is the thumbnail address (group 1 is just
# the filename).
2478 mobj.group(0).decode('utf-8') is the thumbnail below
2478 video_thumbnail = mobj.group(0).decode('utf-8')
2481 'provider': IE_NAME,
2485 'upload_date': None,
2486 'title': video_title,
2489 'thumbnail': video_thumbnail,
2490 'description': None,
# NOTE(review): numbered, gap-ridden excerpt; leading "NNNN" tokens are
# original-file line numbers; some statements fall in numbering gaps.
# Code kept verbatim; only comments added.
2497 class SoundcloudIE(InfoExtractor):
2498 """Information extractor for soundcloud.com
2499 To access the media, the uid of the song and a stream token
2500 must be extracted from the page source and the script must make
2501 a request to media.soundcloud.com/crossdomain.xml. Then
2502 the media can be grabbed by requesting from an url composed
2503 of the stream token and uid
2506 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2507 IE_NAME = u'soundcloud'
2509 def __init__(self, downloader=None):
2510 InfoExtractor.__init__(self, downloader)
2512 def report_webpage(self, video_id):
2513 """Report information extraction."""
2514 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2516 def report_extraction(self, video_id):
2517 """Report information extraction."""
2518 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2520 def _real_extract(self, url):
2521 mobj = re.match(self._VALID_URL, url)
2523 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2526 # extract uploader (which is in the url)
2527 uploader = mobj.group(1).decode('utf-8')
2528 # extract simple title (uploader + slug of song title)
2529 slug_title = mobj.group(2).decode('utf-8')
2530 simple_title = uploader + u'-' + slug_title
2532 self.report_webpage('%s/%s' % (uploader, slug_title))
2534 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2536 webpage = urllib2.urlopen(request).read()
2537 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2538 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2541 self.report_extraction('%s/%s' % (uploader, slug_title))
2543 # extract uid and stream token that soundcloud hands out for access
2544 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2546 video_id = mobj.group(1)
2547 stream_token = mobj.group(2)
2549 # extract unsimplified title
2550 mobj = re.search('"title":"(.*?)",', webpage)
2552 title = mobj.group(1).decode('utf-8')
# Fallback to the URL-derived title when the page markup yields none
# (the "else:" line falls in a numbering gap).
2554 title = simple_title
2556 # construct media url (with uid/token)
2557 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2558 mediaURL = mediaURL % (video_id, stream_token)
2561 description = u'No description available'
2562 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2564 description = mobj.group(1)
# Upload date scraped from the "pretty-date" element; parse failures are
# logged to stderr and leave upload_date at its (gap-hidden) default.
2568 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2571 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2572 except Exception, e:
2573 self._downloader.to_stderr(str(e))
2575 # for soundcloud, a request to a cross domain is required for cookies
# NOTE(review): this Request is constructed but never opened in the visible
# lines -- either the urlopen falls in a numbering gap or the crossdomain
# fetch is effectively dead code; confirm against the full file.
2576 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2579 'provider': IE_NAME,
2580 'id': video_id.decode('utf-8'),
2582 'uploader': uploader.decode('utf-8'),
2583 'upload_date': upload_date,
2588 'description': description.decode('utf-8')
# NOTE(review): numbered, gap-ridden excerpt; leading "NNNN" tokens are
# original-file line numbers; some statements (including the IE_NAME
# assignment around line 2596) fall in numbering gaps.
# Code kept verbatim; only comments added.
#
# InfoQIE: extracts presentations from infoq.com. The media path is base64-
# encoded in the page's "jsclassref" attribute and is served over RTMPE.
2592 class InfoQIE(InfoExtractor):
2593 """Information extractor for infoq.com"""
2595 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2598 def report_webpage(self, video_id):
2599 """Report information extraction."""
2600 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2602 def report_extraction(self, video_id):
2603 """Report information extraction."""
2604 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2606 def _real_extract(self, url):
2607 mobj = re.match(self._VALID_URL, url)
2609 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2612 self.report_webpage(url)
2614 request = urllib2.Request(url)
2616 webpage = urllib2.urlopen(request).read()
2617 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2618 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2621 self.report_extraction(url)
2625 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2627 self._downloader.trouble(u'ERROR: unable to extract video url')
# Py2 str.decode('base64') decodes the attribute into the RTMPE stream path.
2629 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2633 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2635 self._downloader.trouble(u'ERROR: unable to extract video title')
2637 video_title = mobj.group(1).decode('utf-8')
2639 # Extract description
2640 video_description = u'No description available.'
2641 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2642 if mobj is not None:
2643 video_description = mobj.group(1).decode('utf-8')
# Derive id and extension from the last path component of the stream URL.
2645 video_filename = video_url.split('/')[-1]
2646 video_id, extension = video_filename.split('.')
2649 'provider': IE_NAME,
2653 'upload_date': None,
2654 'title': video_title,
2656 'format': extension, # Extension is always(?) mp4, but seems to be flv
2658 'description': video_description,
# NOTE(review): numbered, gap-ridden excerpt; leading "NNNN" tokens are
# original-file line numbers; some statements (try:/return/continue lines)
# fall in numbering gaps. Code kept verbatim; only comments added.
#
# MixcloudIE: uses mixcloud's public JSON API (api/1/cloudcast/...) to list
# available audio formats/bitrates, probes candidate URLs for a live one,
# and honors the downloader's 'format' / 'listformats' options.
2664 class MixcloudIE(InfoExtractor):
2665 """Information extractor for www.mixcloud.com"""
2666 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2667 IE_NAME = u'mixcloud'
2669 def __init__(self, downloader=None):
2670 InfoExtractor.__init__(self, downloader)
2672 def report_download_json(self, file_id):
2673 """Report JSON download."""
2674 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2676 def report_extraction(self, file_id):
2677 """Report information extraction."""
2678 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2680 def get_urls(self, jsonData, fmt, bitrate='best'):
2681 """Get urls from 'audio_formats' section in json"""
# For formats keyed by bitrate, 'best' (or an unknown bitrate) selects the
# highest available one; a TypeError means the format entry is a flat URL
# list with no bitrate level.
2684 bitrate_list = jsonData[fmt]
2685 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2686 bitrate = max(bitrate_list) # select highest
2688 url_list = jsonData[fmt][bitrate]
2689 except TypeError: # we have no bitrate info.
2690 url_list = jsonData[fmt]
2693 def check_urls(self, url_list):
2694 """Returns 1st active url from list"""
# Probes each candidate with a GET; the "return url" on success and the
# final fallthrough fall in numbering gaps.
2695 for url in url_list:
2697 urllib2.urlopen(url)
2699 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2704 def _print_formats(self, formats):
2705 print 'Available formats:'
2706 for fmt in formats.keys():
2707 for b in formats[fmt]:
2709 ext = formats[fmt][b][0]
2710 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2711 except TypeError: # we have no bitrate info
2712 ext = formats[fmt][0]
2713 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2716 def _real_extract(self, url):
2717 mobj = re.match(self._VALID_URL, url)
2719 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2721 # extract uploader & filename from url
2722 uploader = mobj.group(1).decode('utf-8')
2723 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2725 # construct API request
2726 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2727 # retrieve .json file with links to files
2728 request = urllib2.Request(file_url)
2730 self.report_download_json(file_url)
2731 jsonData = urllib2.urlopen(request).read()
2732 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2733 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2737 json_data = json.loads(jsonData)
2738 player_url = json_data['player_swf_url']
2739 formats = dict(json_data['audio_formats'])
2741 req_format = self._downloader.params.get('format', None)
2744 if self._downloader.params.get('listformats', None):
2745 self._print_formats(formats)
# 'best' (or no preference): try formats in dict order until one yields a
# live URL. Otherwise require the requested format to exist.
2748 if req_format is None or req_format == 'best':
2749 for format_param in formats.keys():
2750 url_list = self.get_urls(formats, format_param)
2752 file_url = self.check_urls(url_list)
2753 if file_url is not None:
2756 if req_format not in formats.keys():
2757 self._downloader.trouble(u'ERROR: format is not available')
2760 url_list = self.get_urls(formats, req_format)
2761 file_url = self.check_urls(url_list)
2762 format_param = req_format
2765 'provider': IE_NAME,
2766 'id': file_id.decode('utf-8'),
2767 'url': file_url.decode('utf-8'),
2768 'uploader': uploader.decode('utf-8'),
2769 'upload_date': u'NA',
2770 'title': json_data['name'],
2771 'ext': file_url.split('.')[-1].decode('utf-8'),
2772 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2773 'thumbnail': json_data['thumbnail_url'],
2774 'description': json_data['description'],
2775 'player_url': player_url.decode('utf-8'),
# NOTE(review): numbered, gap-ridden excerpt; leading "NNNN" tokens are
# original-file line numbers; some statements fall in numbering gaps.
# Code kept verbatim; only comments added.
#
# StanfordOpenClassroomIE: three URL shapes, three behaviors:
#   1. course + video  -> fetch the video's metadata XML, return one entry;
#   2. course only     -> scrape the course page for VideoPage links and
#                         recursively extract each one;
#   3. root/home page  -> scrape for CoursePage links and recurse likewise.
2778 class StanfordOpenClassroomIE(InfoExtractor):
2779 """Information extractor for Stanford's Open ClassRoom"""
2781 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2782 IE_NAME = u'stanfordoc'
2784 def report_download_webpage(self, objid):
2785 """Report information extraction."""
2786 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2788 def report_extraction(self, video_id):
2789 """Report information extraction."""
2790 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2792 def _real_extract(self, url):
2793 mobj = re.match(self._VALID_URL, url)
2795 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2798 if mobj.group('course') and mobj.group('video'): # A specific video
2799 course = mobj.group('course')
2800 video = mobj.group('video')
2802 'provider': IE_NAME,
2803 'id': course + '_' + video,
2806 self.report_extraction(info['id'])
2807 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2808 xmlUrl = baseUrl + video + '.xml'
2810 metaXml = urllib2.urlopen(xmlUrl).read()
2811 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2812 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2814 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# IndexError on a missing <title>/<videoFile> is reported as invalid XML
# (the except line falls in a numbering gap before 2819).
2816 info['title'] = mdoc.findall('./title')[0].text
2817 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2819 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2821 info['ext'] = info['url'].rpartition('.')[2]
2822 info['format'] = info['ext']
2824 elif mobj.group('course'): # A course page
2825 course = mobj.group('course')
2827 'provider': IE_NAME,
2832 self.report_download_webpage(info['id'])
2834 coursepage = urllib2.urlopen(url).read()
2835 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2836 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2839 m = re.search('<h1>([^<]+)</h1>', coursepage)
2841 info['title'] = unescapeHTML(m.group(1))
2843 info['title'] = info['id']
2845 m = re.search('<description>([^<]+)</description>', coursepage)
2847 info['description'] = unescapeHTML(m.group(1))
# orderedSet de-duplicates while preserving page order of the video links.
2849 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2852 'type': 'reference',
2853 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recursive extraction: each reference entry re-enters extract() with the
# concrete VideoPage URL.
2857 for entry in info['list']:
2858 assert entry['type'] == 'reference'
2859 results += self.extract(entry['url'])
2864 'provider': IE_NAME,
2865 'id': 'Stanford OpenClassroom',
2869 self.report_download_webpage(info['id'])
2870 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2872 rootpage = urllib2.urlopen(rootURL).read()
2873 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2874 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2877 info['title'] = info['id']
2879 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2882 'type': 'reference',
2883 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2888 for entry in info['list']:
2889 assert entry['type'] == 'reference'
2890 results += self.extract(entry['url'])
2893 class MTVIE(InfoExtractor):
2894 """Information extractor for MTV.com"""
2896 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2899 def report_webpage(self, video_id):
2900 """Report information extraction."""
2901 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2903 def report_extraction(self, video_id):
2904 """Report information extraction."""
2905 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2907 def _real_extract(self, url):
2908 mobj = re.match(self._VALID_URL, url)
2910 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2912 if not mobj.group('proto'):
2913 url = 'http://' + url
2914 video_id = mobj.group('videoid')
2915 self.report_webpage(video_id)
2917 request = urllib2.Request(url)
2919 webpage = urllib2.urlopen(request).read()
2920 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2921 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2924 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2926 self._downloader.trouble(u'ERROR: unable to extract song name')
2928 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2929 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2931 self._downloader.trouble(u'ERROR: unable to extract performer')
2933 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2934 video_title = performer + ' - ' + song_name
2936 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2938 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2940 mtvn_uri = mobj.group(1)
2942 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2944 self._downloader.trouble(u'ERROR: unable to extract content id')
2946 content_id = mobj.group(1)
2948 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2949 self.report_extraction(video_id)
2950 request = urllib2.Request(videogen_url)
2952 metadataXml = urllib2.urlopen(request).read()
2953 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2954 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2957 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2958 renditions = mdoc.findall('.//rendition')
2960 # For now, always pick the highest quality.
2961 rendition = renditions[-1]
2964 _,_,ext = rendition.attrib['type'].partition('/')
2965 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2966 video_url = rendition.find('./src').text
2968 self._downloader.trouble('Invalid rendition field.')
2972 'provider': IE_NAME,
2975 'uploader': performer,
2976 'title': video_title,