2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
19 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include

    uploader:    Nickname of the video uploader.
    ext:         Video filename extension.
    player_url:  SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is supplied by each concrete subclass.
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the `def initialize(self):` header is elided in this
    # excerpt; the two lines below are its body.
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the `_VALID_URL = r'''...` opener line is elided in this
    # excerpt; the lines below are the verbose-mode pattern body, matched
    # with re.VERBOSE in suitable() and _real_extract().
        (?:https?://)? # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/) # the various hostnames, with wildcard subdomains
        (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
        (?: # the various things that can precede the ID:
            (?:(?:v|embed|e)/) # v/ or embed/ or e/
            |(?: # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?) # the params delimiter ? or # or #!
                (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
            )? # optional -> youtube.com/xxxx is OK
        )? # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
        (?(1).+)? # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the original destination from a next_url redirection parameter.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension map (other entries elided in this excerpt).
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display-size map (entries elided in this excerpt).
    _video_dimensions = {
    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base class to pass re.VERBOSE, since this class'
        # _VALID_URL is written in verbose (commented) regex form.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None
    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube timedtext XML into SubRip (SRT) text.

        Each <text start="..." dur="...">caption</text> element becomes a
        numbered SRT cue with HH:MM:SS,mmm start/end timestamps.
        NOTE(review): the `srt = ''` accumulator initialization and the
        `start = float(start)` conversion are elided in this excerpt; `srt`
        is built up below and presumably returned at the end.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default cue length when dur="" is absent
            end = start + float(dur)
            # Render seconds-as-float into SRT "HH:MM:SS,mmm" form.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions (py2 print)."""
        print 'Available formats:'
        # NOTE(review): the `for x in formats:` loop header is elided in
        # this excerpt; the line below is its body.
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
    def _real_initialize(self):
        """Set language to English, optionally log in (options or .netrc),
        and confirm age, emitting WARNINGs on recoverable failures."""
        if self._downloader is None:
            # (early return elided in this excerpt)

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # (try: header elided in this excerpt)
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                # (credentials unpacking / else branch elided in this excerpt)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:  # py2 except syntax
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language
        request = urllib2.Request(self._LANG_URL)
        # (self.report_lang() / try: elided in this excerpt)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            # Non-fatal: continue without the language cookie.
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        # (username is None check / login_form dict opener elided in this excerpt)
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        # (self.report_login() / try: elided in this excerpt)
            login_results = urllib2.urlopen(request).read()
            # The login form re-appearing in the response means auth failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age
        # (age_form dict opener / next_url field elided in this excerpt)
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        # (try: header elided in this excerpt)
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            # Fatal for age-restricted videos, hence ERROR rather than WARNING.
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
    def _real_extract(self, url):
        """Download the watch page and get_video_info, extract metadata,
        optionally fetch subtitles, select formats, and build one result
        dict per selected format for the downloader."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        # (mobj None-check elided in this excerpt)
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # Group 2 is the ID capture of _VALID_URL (group 1 is the prefix group).
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        # (try: header elided in this excerpt)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # (mobj None-check elided in this excerpt)
            # Strip JS backslash-escaping (\/ -> /) from the matched URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            # (try: header elided in this excerpt)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    # (break elided in this excerpt)
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            # (else: elided in this excerpt)
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            # (fallback thumbnail assignment elided in this excerpt)
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scrape from the watch page and normalize to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        # (mobj None-check elided in this excerpt)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                # (try/except around strptime elided in this excerpt)
                upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            # (outer try: header elided in this excerpt)
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                # (inner try: header elided in this excerpt)
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)  # lang_code -> track name
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                # Language preference: explicit option > English > first available.
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    # ('en' branch body and else: elided in this excerpt)
                    srt_lang = srt_lang_list.keys()[0]  # py2: keys() returns a list
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                # (try: header elided in this excerpt)
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                # (empty-response check elided in this excerpt)
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # Subtitle failures are warnings; extraction continues.
                self._downloader.trouble(trouble[0])

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # RTMP streams expose a single connection URL, no itag.
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap quality at the limit; lists are ordered best-first.
                format_list = available_formats[available_formats.index(format_limit):]
            # (else: elided in this excerpt)
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                # (return elided in this excerpt)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # (else: elided in this excerpt)
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    # (membership check / break elided in this excerpt)
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
        # (else: elided in this excerpt)
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # NOTE(review): the results-list initialization is elided in this excerpt.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # (results.append({ opener elided in this excerpt)
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                # RTMP entries carry format_param=None, rendered as u'NA'.
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Groups: (1) video id, (2) simplified title slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter form so
        age-restricted videos are reachable for this session."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        # (try: header elided in this excerpt)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age
        # (disclaimer_form dict opener / filters field elided in this excerpt)
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        # (try: header elided in this excerpt)
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a Metacafe watch
        page; YouTube-hosted entries are delegated to the downloader."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Hand the embedded YouTube ID to the downloader (YoutubeIE handles it).
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            # (return elided in this excerpt)

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        # (try: header elided in this excerpt)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # (mobj not-None branch elided in this excerpt)
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            # (mobj None-check / plain-URL branch elided in this excerpt)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # (else: branch header elided in this excerpt — flashvars fallback)
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            # (mobj is None check elided in this excerpt)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            # (mobj is None check elided in this excerpt)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            # Undo JSON escaping of slashes.
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # (return [{ opener elided in this excerpt)
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Groups: (1) video id (before the underscore), (2) title slug.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the SD media URL, title and uploader from a Dailymotion
        video page (family filter disabled via cookie)."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-gated pages are served.
        request.add_header('Cookie', 'family_filter=off')
        # (try: header elided in this excerpt)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        sequence = urllib.unquote(mobj.group(1))
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL
        # NOTE(review): the `video_url = mediaURL` assignment is elided in
        # this excerpt; video_url is used in the result dict below.

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # (return [{ opener elided in this excerpt)
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # Group 1 is the (possibly negative) numeric docid.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract media URL (mp4 download link, falling back to flv stream),
        title, description and optional thumbnail from a Google Video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        # (try: header elided in this excerpt)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # (mobj is None branch header elided — fall back to flv stream URL)
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
            # (mobj is None check elided in this excerpt)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = urllib.unquote(mobj.group(1))
            # Decode JS hex escapes: \x3d -> '=' and \x26 -> '&'.
            mediaURL = mediaURL.replace('\\x3d', '\x3d')
            mediaURL = mediaURL.replace('\\x26', '\x26')
        # (else branch / video_url assignment elided in this excerpt)

        mobj = re.search(r'<title>(.*)</title>', webpage)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # The thumbnail only appears on the search results page.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            # (try: header elided in this excerpt)
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            # (mobj is None check elided in this excerpt)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info
            # (placeholder thumbnail assignment elided in this excerpt)

        # (return [{ opener elided in this excerpt)
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Group 1 is the .flv filename from the `current=` query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the flv media URL, title and uploader from a Photobucket
        video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # (try: header elided in this excerpt)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # NOTE(review): the `video_url = mediaURL` assignment is elided in
        # this excerpt; video_url is used in the result dict below.

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # (return [{ opener elided in this excerpt)
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
874 def _real_extract(self, url, new_video=True):
875 # Extract ID from URL
876 mobj = re.match(self._VALID_URL, url)
878 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
881 video_id = mobj.group(2)
882 video_extension = 'flv'
884 # Rewrite valid but non-extractable URLs as
885 # extractable English language /watch/ URLs
886 if re.match(self._VPAGE_URL, url) is None:
887 request = urllib2.Request(url)
889 webpage = urllib2.urlopen(request).read()
890 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
891 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
894 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
896 self._downloader.trouble(u'ERROR: Unable to extract id field')
898 yahoo_id = mobj.group(1)
900 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
902 self._downloader.trouble(u'ERROR: Unable to extract vid field')
904 yahoo_vid = mobj.group(1)
906 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
907 return self._real_extract(url, new_video=False)
909 # Retrieve video webpage to extract further information
910 request = urllib2.Request(url)
912 self.report_download_webpage(video_id)
913 webpage = urllib2.urlopen(request).read()
914 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
915 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
918 # Extract uploader and title from webpage
919 self.report_extraction(video_id)
920 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
922 self._downloader.trouble(u'ERROR: unable to extract video title')
924 video_title = mobj.group(1).decode('utf-8')
926 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
928 self._downloader.trouble(u'ERROR: unable to extract video uploader')
930 video_uploader = mobj.group(1).decode('utf-8')
932 # Extract video thumbnail
933 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
935 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
937 video_thumbnail = mobj.group(1).decode('utf-8')
939 # Extract video description
940 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
942 self._downloader.trouble(u'ERROR: unable to extract video description')
944 video_description = mobj.group(1).decode('utf-8')
945 if not video_description:
946 video_description = 'No description available.'
948 # Extract video height and width
949 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
951 self._downloader.trouble(u'ERROR: unable to extract video height')
953 yv_video_height = mobj.group(1)
955 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
957 self._downloader.trouble(u'ERROR: unable to extract video width')
959 yv_video_width = mobj.group(1)
961 # Retrieve video playlist to extract media URL
962 # I'm not completely sure what all these options are, but we
963 # seem to need most of them, otherwise the server sends a 401.
964 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
965 yv_bitrate = '700' # according to Wikipedia this is hard-coded
966 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
967 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
968 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
970 self.report_download_webpage(video_id)
971 webpage = urllib2.urlopen(request).read()
972 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
973 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
976 # Extract media URL from playlist XML
977 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
979 self._downloader.trouble(u'ERROR: Unable to extract media URL')
981 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
982 video_url = unescapeHTML(video_url)
985 'id': video_id.decode('utf-8'),
987 'uploader': video_uploader,
988 'upload_date': u'NA',
989 'title': video_title,
990 'ext': video_extension.decode('utf-8'),
991 'thumbnail': video_thumbnail.decode('utf-8'),
992 'description': video_description,
993 'thumbnail': video_thumbnail,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # NOTE(review): this excerpt elides scaffolding lines (`try:`,
    # `if mobj is None:`, `return`); indentation below is reconstructed.
    # Confirm control flow against the full file.

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        # Delegate to the base class, which stores the downloader reference.
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video URL, title, uploader and metadata for a Vimeo page."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # Reported when the URL fails to match _VALID_URL (guard elided in excerpt).
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url, None, std_headers)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's JavaScript.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        # Reported when the config JSON is absent or unparsable.
        self._downloader.trouble(u'ERROR: unable to extract info section')

        video_title = config["video"]["title"]
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description from the page HTML (not the config JSON).
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (optional; defaults to u'NA').
        video_upload_date = u'NA'
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # TODO bind to format param
        # Codec preference order: first match in config["video"]["files"] wins.
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        for codec in codecs:
            if codec[0] in config["video"]["files"]:
                video_codec = codec[0]
                video_extension = codec[1]
                if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
                else: quality = 'sd'
        # Reported when no codec in the preference list is available.
        self._downloader.trouble(u'ERROR: no known codec found')

        # Build the signed play_redirect URL that resolves to the media file.
        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, quality, video_codec.upper())

        # Result dictionary (opening `return [{` elided in excerpt).
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # NOTE(review): excerpt elides scaffolding (`try:`, `if mobj is None:`,
    # `return`, docstring quotes on nested classes); indentation reconstructed.

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn first: no site-specific extractor matched this URL.
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(urllib2.Request):
            # Request only headers, not the body (return value elided in excerpt).
            def get_method(self):

        class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop entity headers: a HEAD request carries no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                        if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                        origin_req_host=req.get_origin_req_host(),
                # Non-redirect codes are surfaced as HTTP errors.
                raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(urllib2.BaseHandler):
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                    if k.lower() not in ("content-length", "content-type"))
                # Retry the same URL with a plain (GET) request.
                return self.parent.open(urllib2.Request(req.get_full_url(),
                    origin_req_host=req.get_origin_req_host(),

        # Build a bare opener with exactly the handlers we need.
        opener = urllib2.OpenerDirector()
        for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
                HTTPMethodFallback, HEADRedirectHandler,
                urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL after following redirects: not a shortener.
        if url == new_url: return False

        self.report_following_redirect(new_url)
        # Restart the extraction chain on the resolved URL.
        self._downloader.download([new_url])

    def _real_extract(self, url):
        """Heuristically extract a direct media URL from an arbitrary page."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # Reported when neither pattern matched (guard elided in excerpt).
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1).decode('utf-8')

        # Result dictionary (opening `return [{` elided in excerpt).
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': u'NA',
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""

    # NOTE(review): excerpt elides scaffolding (`try:`, `if` guards, loop
    # initializers such as pagenum/video_ids); indentation reconstructed.

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # GData API returns at most 50 results per request (jsonc format).
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'ytsearchN:terms' query and dispatch N result downloads."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        # Split 'ytsearchN' prefix from the search terms.
        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix: default to a single result.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        # Numeric prefix path (int parse elided in excerpt).
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Page through the API in batches of 50 until `limit` is reached.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
            request = urllib2.Request(result_url)
            data = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request more than the API says exists.
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""

    # NOTE(review): excerpt elides scaffolding (`try:`, guards, initializers);
    # indentation reconstructed.

    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    # Matches result links; group(1) is the docid.
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # Presence of the "next page" pager element.
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'gvsearchN:terms' query and dispatch N result downloads."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix: default to a single result.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        # Numeric prefix path (int parse elided in excerpt).
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        self.report_download_page(query, pagenum)
        # `start` parameter steps 10 results per page.
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        # No more pages: flush everything collected so far.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # NOTE(review): excerpt elides scaffolding (`try:`, guards, initializers);
    # indentation reconstructed.

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    # Matches result links; group(1) is 'owner_id/video_id'.
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'yvsearchN:terms' query and dispatch N result downloads."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix: default to a single result.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        # Numeric prefix path (int parse elided in excerpt).
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Dedupe across pages; Yahoo result pages can repeat entries.
        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        # No more pages: flush everything collected so far.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # NOTE(review): excerpt elides scaffolding (`try:`, guards, `break`,
    # initializers); indentation reconstructed.

    # group(1): playlist type char (p/a/list); group(2): playlist id;
    # group(3): optional trailing video id.
    _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=(PL)?%s&'
    # Pager element present while more pages remain.
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids in a playlist and queue each for download."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # A trailing video id means the URL points at a single video.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        # Default/else branch (guard elided in excerpt).
        playlist_prefix = 'p'
        playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        self.report_download_page(playlist_id, pagenum)
        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
        request = urllib2.Request(url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # Stop paging when the pager element disappears (`break` elided).
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
        pagenum = pagenum + 1

        # Apply --playlist-start / --playlist-end slicing (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    # NOTE(review): excerpt elides scaffolding (`try:`, guards, `break`,
    # loop initializers); indentation reconstructed.

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData uploads feed caps results per request at 50.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all of a user's uploads via the GData API and queue them."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # GData start-index is 1-based.
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end slicing (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    # NOTE(review): excerpt elides scaffolding (`try:`, guards, `break`,
    # loop initializers, _PAGE_SIZE definition); indentation reconstructed.

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all of a blip.tv user's video paths and queue them."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Mobile episode-list endpoint; %s is filled with the numeric user id.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = urllib2.Request(url)

        # Fetch the profile page once to resolve username -> numeric user id.
        page = urllib2.urlopen(request).read().decode('utf-8')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        self.report_download_page(username, pagenum)

        request = urllib2.Request( page_base + "&page=" + str(pagenum) )

        page = urllib2.urlopen(request).read().decode('utf-8')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._PAGE_SIZE:

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end slicing (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    # NOTE(review): excerpt elides scaffolding (`try:`, guards, `return`);
    # indentation reconstructed.

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Resolve a depositfiles page to its direct file URL and title."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 simulates the button click).
        free_download_indication = { 'gateway_result' : '1' }
        request = urllib2.Request(url, urllib.urlencode(free_download_indication))
        self.report_download_webpage(file_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction notice.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Result dictionary (opening `return [{` elided in excerpt).
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': u'NA',
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # NOTE(review): excerpt elides scaffolding (`try:`, guards, `return`,
    # dict bodies such as _video_extensions and login_form); indentation
    # reconstructed.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Ordered best-first; used both for scraping and format selection.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # Field name -> regex that captures its value from inline JS/HTML.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Per-format media URLs, scraped from '("<fmt>_src", "...")' segments.
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc, if given."""
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # Non-fatal: continue without credentials.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # No credentials available: skip login (body elided in excerpt).
        if useremail is None:

        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))

        login_results = urllib2.urlopen(request).read()
        # The login form reappearing in the response means login failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

    def _real_extract(self, url):
        """Extract media URLs and metadata for one Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)

        page = urllib2.urlopen(request)
        video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (missing thumbnail is only a warning, not fatal)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        video_thumbnail = video_info['thumbnail']

        # upload date: RFC-2822 date string -> YYYYMMDD
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        # description (optional; has a fallback default)
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # Specific-format branch (else elided in excerpt).
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
            video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:
            # Extension should be 'mp4' if there is no format parameter in the map.
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # Result dictionary entry (opening elided in excerpt).
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Used to pull the filename extension out of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the server answered with the media itself (direct download)."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video metadata from a blip.tv URL.

        Asks the site for JSON metadata; if the response Content-Type is
        video/*, treats the URL as a direct download instead.
        """
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Request machine-readable metadata instead of the HTML page.
        # NOTE(review): cchar is presumably '?' or '&' chosen from whether
        # the URL already carries a query string — confirm against full file.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))
        urlh = urllib2.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            # Server returned the media file itself: derive title/ext from the URL.
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if info is None: # Regular URL
            json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))

            json_data = json.loads(json_code)
            # Some responses wrap the payload in a 'Post' envelope.
            if 'Post' in json_data:
                data = json_data['Post']

            # 'datestamp' is e.g. MM-DD-YY HH:MM(AM/PM); normalized to YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
                raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

                'id': data['item_id'],
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl']
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

        # NOTE(review): blip.tv apparently serves some media only to this
        # user agent; override applies to all later requests — confirm.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
2098 class MyVideoIE(InfoExtractor):
2099 """Information Extractor for myvideo.de."""
2101 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2102 IE_NAME = u'myvideo'
2104 def __init__(self, downloader=None):
2105 InfoExtractor.__init__(self, downloader)
2107 def report_download_webpage(self, video_id):
2108 """Report webpage download."""
2109 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2111 def report_extraction(self, video_id):
2112 """Report information extraction."""
2113 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2115 def _real_extract(self,url):
2116 mobj = re.match(self._VALID_URL, url)
2118 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2121 video_id = mobj.group(1)
2124 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2126 self.report_download_webpage(video_id)
2127 webpage = urllib2.urlopen(request).read()
2128 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2129 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2132 self.report_extraction(video_id)
2133 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2136 self._downloader.trouble(u'ERROR: unable to extract media URL')
2138 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2140 mobj = re.search('<title>([^<]+)</title>', webpage)
2142 self._downloader.trouble(u'ERROR: unable to extract title')
2145 video_title = mobj.group(1)
2151 'upload_date': u'NA',
2152 'title': video_title,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts short forms (":tds", ":colbert", ...) as well as full
    # full-episodes URLs on either show's site.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report the download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report the download of the episode's MRSS index feed."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report the resolution of the SWF player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        """Extract every video act of an episode; returns a list of info dicts."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Short forms map to the show's "newest full episode" landing page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No episode in the URL means "download the newest one".
        dlNewest = not mobj.group('episode')
            epTitle = mobj.group('showname')
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

        # The landing page redirects to the concrete episode URL.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        # The Flash embed URL doubles as the key into the MRSS index.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))

        # One <item> per act of the episode; each becomes one info dict.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

            # Each <rendition> is one (bitrate, stream URL) pair.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            effTitle = showId + u'-' + epTitle
                'upload_date': officialDate,
                'description': officialTitle,
                'player_url': playerUrl

            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report the download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract video data by following the og:video player's config JSON."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
            webPage = urllib2.urlopen(url)
            webPageBytes = webPage.read()
            # Decode using the charset advertised in Content-Type, else UTF-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))

        # Page metadata: description, thumbnail and the Flash player URL,
        # whose 'config=' query parameter points at the JSON config.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = urllib2.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
            configJSON = urllib2.urlopen(configUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

            config = json.loads(configJSON)
        except (ValueError,), err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))

        # Second playlist entry carries the actual media URL.
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

            'uploader': showName,
            'upload_date': None,
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract video data via the site's moogaloop metadata XML service."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        self.report_webpage(video_id)
        request = urllib2.Request(url)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # The page embeds an internal id used by the metadata service.
        m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
            self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
        internal_video_id = m.group('internalvideoid')

            'internal_id': internal_video_id,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
            metaXml = urllib2.urlopen(xmlUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
            # First <video> node carries description/caption/file/thumbnail.
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['url'] = videoNode.findall('./file')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            # Extension (and 'format') is taken from the media URL's suffix.
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract flv URL, title and thumbnail from an xvideos watch page."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(video_id)

        # Media URL is percent-encoded in the page's flv_url parameter.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))

        # Title is the <title> tag minus the site-name suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(0).decode('utf-8')

            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the stream URL and metadata for a soundcloud track."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + u'-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
            video_id = mobj.group(1)
            stream_token = mobj.group(2)

        # extract unsimplified title
        mobj = re.search('"title":"(.*?)",', webpage)
            title = mobj.group(1).decode('utf-8')
            # Fall back to the slug-based title when the page gives none.
            title = simple_title

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
            description = mobj.group(1)

        # Upload date, e.g. "November 28, 2011 20:39", normalized to YYYYMMDD;
        # parse failures are only logged, not fatal.
        mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
            upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
        except Exception, e:
            self._downloader.to_stderr(str(e))

        # for soundcloud, a request to a cross domain is required for cookies
        request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

            'id': video_id.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': upload_date,
            'description': description.decode('utf-8')
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMPE stream URL and metadata from an infoq page."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        self.report_webpage(url)

        request = urllib2.Request(url)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(url)

        # The stream path is base64-encoded in the jsclassref attribute.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
            self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))

        # Title comes from an inline JavaScript assignment.
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        # id and extension are both derived from the stream's filename.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

            'upload_date': None,
            'title': video_title,
            'format': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json

        With bitrate info the entry is a mapping bitrate -> url list;
        without it, the entry is the url list itself (hence the TypeError
        fallback below).
        """
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
                # Probe the URL; network failures just move to the next one.
                urllib2.urlopen(url)
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:

    def _print_formats(self, formats):
        # Human-readable listing for --list-formats: "fmt<TAB>bitrate<TAB>[ext]".
        print 'Available formats:'
        for fmt in formats.keys():
            for b in formats[fmt]:
                    ext = formats[fmt][b][0]
                    print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])

    def _real_extract(self, url):
        """Pick a working stream URL for the requested format and return its info."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = urllib2.Request(file_url)
            self.report_download_json(file_url)
            jsonData = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))

        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        # 'best' (or no preference): first format whose URL list yields a
        # live URL; otherwise honor the explicitly requested format.
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches the root page, a course page, or a specific video page,
    # discriminated via the 'course' and 'video' named groups.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract one video, or recursively expand a course/root page
        into its videos via self.extract() on reference entries."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
                'id': course + '_' + video,

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
                metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
                info['title'] = mdoc.findall('./title')[0].text
                # Media URL is relative to the course's videos/ directory.
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
        elif mobj.group('course'): # A course page
            course = mobj.group('course')

            self.report_download_webpage(info['id'])
                coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            m = re.search('<h1>([^<]+)</h1>', coursepage)
                info['title'] = unescapeHTML(m.group(1))
                # Fall back to the id when the page has no <h1> title.
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
                info['description'] = unescapeHTML(m.group(1))

            # Each VideoPage link becomes a reference entry, expanded below.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
                # Root page: enumerate every course and recurse into each.
                'id': 'Stanford OpenClassroom',

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
                rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
2891 class MTVIE(InfoExtractor):
2892 """Information extractor for MTV.com"""
2894 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2897 def report_webpage(self, video_id):
2898 """Report information extraction."""
2899 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2901 def report_extraction(self, video_id):
2902 """Report information extraction."""
2903 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2905 def _real_extract(self, url):
2906 mobj = re.match(self._VALID_URL, url)
2908 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2910 if not mobj.group('proto'):
2911 url = 'http://' + url
2912 video_id = mobj.group('videoid')
2913 self.report_webpage(video_id)
2915 request = urllib2.Request(url)
2917 webpage = urllib2.urlopen(request).read()
2918 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2919 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2922 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2924 self._downloader.trouble(u'ERROR: unable to extract song name')
2926 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2927 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2929 self._downloader.trouble(u'ERROR: unable to extract performer')
2931 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2932 video_title = performer + ' - ' + song_name
2934 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2936 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2938 mtvn_uri = mobj.group(1)
2940 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2942 self._downloader.trouble(u'ERROR: unable to extract content id')
2944 content_id = mobj.group(1)
2946 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2947 self.report_extraction(video_id)
2948 request = urllib2.Request(videogen_url)
2950 metadataXml = urllib2.urlopen(request).read()
2951 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2952 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2955 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2956 renditions = mdoc.findall('.//rendition')
2958 # For now, always pick the highest quality.
2959 rendition = renditions[-1]
2962 _,_,ext = rendition.attrib['type'].partition('/')
2963 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2964 video_url = rendition.find('./src').text
2966 self._downloader.trouble('Invalid rendition field.')
2972 'uploader': performer,
2973 'title': video_title,