2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
18 from urlparse import parse_qs
21 import cStringIO as StringIO
# Base class for all site-specific extractors (template-method pattern):
# subclasses override _real_initialize()/_real_extract() and define a
# _VALID_URL regexp. NOTE(review): this file is an excerpt — the leading
# numbers are the original file's line numbers and interior lines are elided.
28 class InfoExtractor(object):
29 """Information Extractor class.
31 Information extractors are the classes that, given a URL, extract
32 information from the video (or videos) the URL refers to. This
33 information includes the real video URL, the video title and simplified
34 title, author and others. The information is stored in a dictionary
35 which is then passed to the FileDownloader. The FileDownloader
36 processes this information possibly downloading the video to the file
37 system, among other possible outcomes. The dictionaries must include
42 uploader: Nickname of the video uploader.
44 ext: Video filename extension.
46 player_url: SWF Player URL (may be None).
48 The following fields are optional. Their primary purpose is to allow
49 youtube-dl to serve as the backend for a video search function, such
50 as the one in youtube2mp3. They are only used when their respective
51 forced printing functions are called:
53 thumbnail: Full URL to a video thumbnail image.
54 description: One-line video description.
56 Subclasses of this one should re-define the _real_initialize() and
57 _real_extract() methods and define a _VALID_URL regexp.
58 Probably, they should also be added to the list of extractors.
64 def __init__(self, downloader=None):
65 """Constructor. Receives an optional downloader."""
# Delegates to set_downloader() so construction and later injection of
# the downloader share a single code path.
67 self.set_downloader(downloader)
69 def suitable(self, url):
70 """Receives a URL and returns True if suitable for this IE."""
71 return re.match(self._VALID_URL, url) is not None
# NOTE(review): the `def initialize(self):` header this docstring belongs
# to is elided from this excerpt.
74 """Initializes an instance (authentication, etc)."""
76 self._real_initialize()
79 def extract(self, url):
80 """Extracts URL information and returns it in list of dicts."""
82 return self._real_extract(url)
84 def set_downloader(self, downloader):
85 """Sets the downloader for this IE."""
86 self._downloader = downloader
88 def _real_initialize(self):
89 """Real initialization process. Redefine in subclasses."""
92 def _real_extract(self, url):
93 """Real extraction process. Redefine in subclasses."""
97 class YoutubeIE(InfoExtractor):
98 """Information extractor for youtube.com."""
# Body of the multi-line, re.VERBOSE _VALID_URL regexp; the opening
# `_VALID_URL = r'''` assignment line is elided from this excerpt.
102 (?:https?://)? # http(s):// (optional)
103 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
104 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
105 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
106 (?: # the various things that can precede the ID:
107 (?:(?:v|embed|e)/) # v/ or embed/ or e/
108 |(?: # or the v= param in all its forms
109 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
110 (?:\?|\#!?) # the params delimiter ? or # or #!
111 (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
114 )? # optional -> youtube.com/xxxx is OK
115 )? # all until now is optional -> you can pass the naked ID
116 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
117 (?(1).+)? # if we found the ID, everything can follow
# Endpoints used by _real_initialize (language, login, age gate).
119 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
120 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
121 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
122 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
123 _NETRC_MACHINE = 'youtube'
124 # Listed in order of quality
125 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
126 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> container extension (most entries elided from this excerpt).
127 _video_extensions = {
133 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" strings shown by _print_formats (entries elided here).
139 _video_dimensions = {
157 def suitable(self, url):
158 """Receives a URL and returns True if suitable for this IE."""
# Overrides the base class because _VALID_URL is written with inline
# comments and needs the re.VERBOSE flag to compile.
159 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
161 def report_lang(self):
162 """Report attempt to set language."""
163 self._downloader.to_screen(u'[youtube] Setting language')
165 def report_login(self):
166 """Report attempt to log in."""
167 self._downloader.to_screen(u'[youtube] Logging in')
169 def report_age_confirmation(self):
170 """Report attempt to confirm age."""
171 self._downloader.to_screen(u'[youtube] Confirming age')
173 def report_video_webpage_download(self, video_id):
174 """Report attempt to download video webpage."""
175 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
177 def report_video_info_webpage_download(self, video_id):
178 """Report attempt to download video info webpage."""
179 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
181 def report_video_subtitles_download(self, video_id):
182 """Report attempt to download video subtitles."""
183 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
185 def report_information_extraction(self, video_id):
186 """Report attempt to extract video information."""
187 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
189 def report_unavailable_format(self, video_id, format):
190 """Report that the requested format is not available."""
191 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
193 def report_rtmp_download(self):
194 """Indicate the download will use the RTMP protocol."""
195 self._downloader.to_screen(u'[youtube] RTMP download detected')
197 def _closed_captions_xml_to_srt(self, xml_string):
"""Convert YouTube's closed-caption XML into SubRip (SRT) text."""
199 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
200 # TODO parse xml instead of regex
201 for n, (start, dur_tag, dur, caption) in enumerate(texts):
202 if not dur: dur = '4'  # default caption duration when no dur= attribute
# NOTE(review): `start` is still a str from re.findall here; a
# `start = float(start)` statement appears to be elided from this
# excerpt, otherwise the arithmetic below would raise TypeError.
204 end = start + float(dur)
# hh:mm:ss,mmm timestamps as required by the SRT format.
205 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
206 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
207 caption = unescapeHTML(caption)
208 caption = unescapeHTML(caption) # double cycle, intentional
# NOTE(review): the `srt = ''` initializer is elided from this excerpt;
# `srt` must be bound before this loop.
209 srt += str(n+1) + '\n'
210 srt += start + ' --> ' + end + '\n'
211 srt += caption + '\n\n'
214 def _print_formats(self, formats):
# Prints "itag : extension [WxH]" for each format in `formats`.
215 print 'Available formats:'
217 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
219 def _real_initialize(self):
# Sets the interface language, then optionally logs in using CLI
# credentials or .netrc, then confirms the age gate.
220 if self._downloader is None:
225 downloader_params = self._downloader.params
227 # Attempt to use provided username and password or .netrc data
228 if downloader_params.get('username', None) is not None:
229 username = downloader_params['username']
230 password = downloader_params['password']
231 elif downloader_params.get('usenetrc', False):
233 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
238 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
239 except (IOError, netrc.NetrcParseError), err:
240 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set language: best-effort, failure is only a warning.
244 request = urllib2.Request(self._LANG_URL)
247 urllib2.urlopen(request).read()
248 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
249 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
252 # No authentication to be performed
# Log in by POSTing the signup-page login form (dict opener elided).
258 'current_form': 'loginForm',
260 'action_login': 'Log In',
261 'username': username,
262 'password': password,
264 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
267 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, the credentials
# were rejected.
268 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
269 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
271 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
272 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Confirm the age gate; unlike the steps above this failure is fatal
# (trouble(), not to_stderr()).
278 'action_confirm': 'Confirm',
280 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
282 self.report_age_confirmation()
283 age_results = urllib2.urlopen(request).read()
284 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
285 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
288 def _real_extract(self, url):
289 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
290 mobj = re.search(self._NEXT_URL_RE, url)
292 url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
294 # Extract video id from URL
295 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
297 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Group 2 of _VALID_URL is the ID capture; group 1 is the optional
# prefix group referenced by the (?(1)...) conditional.
299 video_id = mobj.group(2)
# Download the watch page (has_verified=1 skips some interstitials).
302 self.report_video_webpage_download(video_id)
303 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
305 video_webpage = urllib2.urlopen(request).read()
306 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
307 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
310 # Attempt to extract SWF player URL
311 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Un-escape the JS-escaped URL (\/ -> /).
313 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Fetch get_video_info, trying several 'el' variants until one of them
# returns a 'token' field.
318 self.report_video_info_webpage_download(video_id)
319 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
320 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
321 % (video_id, el_type))
322 request = urllib2.Request(video_info_url)
324 video_info_webpage = urllib2.urlopen(request).read()
325 video_info = parse_qs(video_info_webpage)
326 if 'token' in video_info:
328 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
329 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
331 if 'token' not in video_info:
332 if 'reason' in video_info:
333 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
335 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
338 # Check for "rental" videos
339 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
340 self._downloader.trouble(u'ERROR: "rental" videos not supported')
343 # Start extracting information
344 self.report_information_extraction(video_id)
# Uploader nickname.
347 if 'author' not in video_info:
348 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
350 video_uploader = urllib.unquote_plus(video_info['author'][0])
# Title.
353 if 'title' not in video_info:
354 self._downloader.trouble(u'ERROR: unable to extract video title')
356 video_title = urllib.unquote_plus(video_info['title'][0])
357 video_title = video_title.decode('utf-8')
# Thumbnail: a missing thumbnail is only a warning, not fatal.
360 if 'thumbnail_url' not in video_info:
361 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
363 else: # don't panic if we can't find it
364 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date: scraped from the watch page and normalized to YYYYMMDD,
# trying several textual date formats.
368 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
370 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
371 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
372 for expression in format_expressions:
374 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# Description (HTML-cleaned; empty string if absent).
379 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
380 if video_description: video_description = clean_html(video_description)
381 else: video_description = ''
# Closed captions (only when --write-srt was requested).
384 video_subtitles = None
385 if self._downloader.params.get('writesubtitles', False):
387 self.report_video_subtitles_download(video_id)
388 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
390 srt_list = urllib2.urlopen(request).read()
391 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
392 raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
# Map language code -> track name for the available caption tracks.
393 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
394 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
395 if not srt_lang_list:
396 raise Trouble(u'WARNING: video has no closed captions')
# Language preference: requested language, else English, else any.
397 if self._downloader.params.get('subtitleslang', False):
398 srt_lang = self._downloader.params.get('subtitleslang')
399 elif 'en' in srt_lang_list:
402 srt_lang = srt_lang_list.keys()[0]
403 if not srt_lang in srt_lang_list:
404 raise Trouble(u'WARNING: no closed captions found in the specified language')
405 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
407 srt_xml = urllib2.urlopen(request).read()
408 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
409 raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
411 raise Trouble(u'WARNING: unable to download video subtitles')
412 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
# Trouble is used here as a non-fatal control-flow exception: it is
# reported and extraction continues without subtitles.
413 except Trouble as trouble:
414 self._downloader.trouble(trouble[0])
# Token (always present at this point; checked above).
417 video_token = urllib.unquote_plus(video_info['token'][0])
419 # Decide which formats to download
420 req_format = self._downloader.params.get('format', None)
422 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
423 self.report_rtmp_download()
424 video_url_list = [(None, video_info['conn'][0])]
425 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
426 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
427 url_data = [parse_qs(uds) for uds in url_data_strs]
428 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
# NOTE(review): ud['sig'] is read here but the filter above only
# requires 'itag' and 'url'; a stream entry without 'sig' would raise
# KeyError — confirm whether 'sig' is always present.
429 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
431 format_limit = self._downloader.params.get('format_limit', None)
432 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
433 if format_limit is not None and format_limit in available_formats:
434 format_list = available_formats[available_formats.index(format_limit):]
436 format_list = available_formats
437 existing_formats = [x for x in format_list if x in url_map]
438 if len(existing_formats) == 0:
439 self._downloader.trouble(u'ERROR: no known formats available for video')
441 if self._downloader.params.get('listformats', None):
442 self._print_formats(existing_formats)
444 if req_format is None or req_format == 'best':
445 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
446 elif req_format == 'worst':
447 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
448 elif req_format in ('-1', 'all'):
449 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
451 # Specific formats. We pick the first in a slash-delimeted sequence.
452 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
453 req_formats = req_format.split('/')
454 video_url_list = None
455 for rf in req_formats:
457 video_url_list = [(rf, url_map[rf])]
459 if video_url_list is None:
460 self._downloader.trouble(u'ERROR: requested format not available')
463 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Build one result dict per selected format.
467 for format_param, video_real_url in video_url_list:
# Container extension for this itag ('flv' when unknown).
469 video_extension = self._video_extensions.get(format_param, 'flv')
472 'id': video_id.decode('utf-8'),
473 'url': video_real_url.decode('utf-8'),
474 'uploader': video_uploader.decode('utf-8'),
475 'upload_date': upload_date,
476 'title': video_title,
477 'ext': video_extension.decode('utf-8'),
478 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),  # and/or idiom: u'NA' for RTMP, where format_param is None
479 'thumbnail': video_thumbnail.decode('utf-8'),
480 'description': video_description,
481 'player_url': player_url,
482 'subtitles': video_subtitles
487 class MetacafeIE(InfoExtractor):
488 """Information Extractor for metacafe.com."""
490 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
491 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
492 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
493 IE_NAME = u'metacafe'
495 def __init__(self, downloader=None):
496 InfoExtractor.__init__(self, downloader)
498 def report_disclaimer(self):
499 """Report disclaimer retrieval."""
500 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
502 def report_age_confirmation(self):
503 """Report attempt to confirm age."""
504 self._downloader.to_screen(u'[metacafe] Confirming age')
506 def report_download_webpage(self, video_id):
507 """Report webpage download."""
508 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
510 def report_extraction(self, video_id):
511 """Report information extraction."""
512 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
514 def _real_initialize(self):
515 # Retrieve disclaimer
516 request = urllib2.Request(self._DISCLAIMER)
518 self.report_disclaimer()
# The response body is never inspected; the request is made only for
# its session side effects.
519 disclaimer = urllib2.urlopen(request).read()
520 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
521 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# Confirm age by POSTing the family-filter form (dict opener elided).
527 'submit': "Continue - I'm over 18",
529 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
531 self.report_age_confirmation()
532 disclaimer = urllib2.urlopen(request).read()
533 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
534 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
537 def _real_extract(self, url):
538 # Extract id and simplified title from URL
539 mobj = re.match(self._VALID_URL, url)
541 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
544 video_id = mobj.group(1)
546 # Check if video comes from YouTube
547 mobj2 = re.match(r'^yt-(.*)$', video_id)
548 if mobj2 is not None:
# Delegate "yt-XXXX" IDs to the YouTube extractor via the downloader.
549 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
552 # Retrieve video webpage to extract further information
553 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
555 self.report_download_webpage(video_id)
556 webpage = urllib2.urlopen(request).read()
557 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
558 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
561 # Extract URL, uploader and title from webpage
562 self.report_extraction(video_id)
563 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
565 mediaURL = urllib.unquote(mobj.group(1))
# Extension is inferred from the last three characters of the URL.
566 video_extension = mediaURL[-3:]
568 # Extract gdaKey if available
569 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
573 gdaKey = mobj.group(1)
574 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: pull mediaURL/key out of the flashvars parameter.
576 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
578 self._downloader.trouble(u'ERROR: unable to extract media URL')
580 vardict = parse_qs(mobj.group(1))
581 if 'mediaData' not in vardict:
582 self._downloader.trouble(u'ERROR: unable to extract media URL')
584 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
586 self._downloader.trouble(u'ERROR: unable to extract media URL')
588 mediaURL = mobj.group(1).replace('\\/', '/')
589 video_extension = mediaURL[-3:]
590 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
592 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
594 self._downloader.trouble(u'ERROR: unable to extract title')
596 video_title = mobj.group(1).decode('utf-8')
598 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
600 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
602 video_uploader = mobj.group(1)
# Result dict (FileDownloader contract; see InfoExtractor docstring).
605 'id': video_id.decode('utf-8'),
606 'url': video_url.decode('utf-8'),
607 'uploader': video_uploader.decode('utf-8'),
608 'upload_date': u'NA',
609 'title': video_title,
610 'ext': video_extension.decode('utf-8'),
616 class DailymotionIE(InfoExtractor):
617 """Information Extractor for Dailymotion"""
619 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
620 IE_NAME = u'dailymotion'
622 def __init__(self, downloader=None):
623 InfoExtractor.__init__(self, downloader)
625 def report_download_webpage(self, video_id):
626 """Report webpage download."""
627 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
629 def report_extraction(self, video_id):
630 """Report information extraction."""
631 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
633 def _real_extract(self, url):
634 # Extract id and simplified title from URL
635 mobj = re.match(self._VALID_URL, url)
637 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
640 video_id = mobj.group(1)
642 video_extension = 'mp4'
644 # Retrieve video webpage to extract further information
645 request = urllib2.Request(url)
# Disable the family filter so age-restricted pages are served.
646 request.add_header('Cookie', 'family_filter=off')
648 self.report_download_webpage(video_id)
649 webpage = urllib2.urlopen(request).read()
650 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
651 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
654 # Extract URL, uploader and title from webpage
655 self.report_extraction(video_id)
656 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
658 self._downloader.trouble(u'ERROR: unable to extract media URL')
660 flashvars = urllib.unquote(mobj.group(1))
# Pick the highest quality present in flashvars: HQ > SD > LD.
661 if 'hqURL' in flashvars: max_quality = 'hqURL'
662 elif 'sdURL' in flashvars: max_quality = 'sdURL'
663 else: max_quality = 'ldURL'
664 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
666 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Un-escape the JSON-escaped URL (\/ -> /).
668 video_url = mobj.group(1).replace('\\/', '/')
670 # TODO: support choosing qualities
672 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
674 self._downloader.trouble(u'ERROR: unable to extract title')
676 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
678 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
680 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
682 video_uploader = mobj.group(1)
# Result dict (FileDownloader contract).
685 'id': video_id.decode('utf-8'),
686 'url': video_url.decode('utf-8'),
687 'uploader': video_uploader.decode('utf-8'),
688 'upload_date': u'NA',
689 'title': video_title,
690 'ext': video_extension.decode('utf-8'),
696 class GoogleIE(InfoExtractor):
697 """Information extractor for video.google.com."""
699 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
700 IE_NAME = u'video.google'
702 def __init__(self, downloader=None):
703 InfoExtractor.__init__(self, downloader)
705 def report_download_webpage(self, video_id):
706 """Report webpage download."""
707 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
709 def report_extraction(self, video_id):
710 """Report information extraction."""
711 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
713 def _real_extract(self, url):
714 # Extract id from URL
715 mobj = re.match(self._VALID_URL, url)
717 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
720 video_id = mobj.group(1)
722 video_extension = 'mp4'
724 # Retrieve video webpage to extract further information
725 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
727 self.report_download_webpage(video_id)
728 webpage = urllib2.urlopen(request).read()
729 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
730 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
733 # Extract URL, uploader, and title from webpage
734 self.report_extraction(video_id)
735 mobj = re.search(r"download_url:'([^']+)'", webpage)
# Fallback: no MP4 download URL, extract the FLV stream URL instead.
737 video_extension = 'flv'
738 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
740 self._downloader.trouble(u'ERROR: unable to extract media URL')
742 mediaURL = urllib.unquote(mobj.group(1))
# Decode the JS hex escapes left in the URL ('=' and '&').
743 mediaURL = mediaURL.replace('\\x3d', '\x3d')
744 mediaURL = mediaURL.replace('\\x26', '\x26')
748 mobj = re.search(r'<title>(.*)</title>', webpage)
750 self._downloader.trouble(u'ERROR: unable to extract title')
752 video_title = mobj.group(1).decode('utf-8')
754 # Extract video description
755 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
757 self._downloader.trouble(u'ERROR: unable to extract video description')
759 video_description = mobj.group(1).decode('utf-8')
760 if not video_description:
761 video_description = 'No description available.'
763 # Extract video thumbnail
764 if self._downloader.params.get('forcethumbnail', False):
# Thumbnail is fetched only on demand: search Google Video for the id.
765 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
767 webpage = urllib2.urlopen(request).read()
768 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
769 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
771 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
773 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
775 video_thumbnail = mobj.group(1)
776 else: # we need something to pass to process_info
# Result dict; no 'uploader' entry is visible in this excerpt.
780 'id': video_id.decode('utf-8'),
781 'url': video_url.decode('utf-8'),
783 'upload_date': u'NA',
784 'title': video_title,
785 'ext': video_extension.decode('utf-8'),
791 class PhotobucketIE(InfoExtractor):
792 """Information extractor for photobucket.com."""
794 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
795 IE_NAME = u'photobucket'
797 def __init__(self, downloader=None):
798 InfoExtractor.__init__(self, downloader)
800 def report_download_webpage(self, video_id):
801 """Report webpage download."""
802 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
804 def report_extraction(self, video_id):
805 """Report information extraction."""
806 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
808 def _real_extract(self, url):
809 # Extract id from URL
810 mobj = re.match(self._VALID_URL, url)
812 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
815 video_id = mobj.group(1)
817 video_extension = 'flv'
819 # Retrieve video webpage to extract further information
820 request = urllib2.Request(url)
822 self.report_download_webpage(video_id)
823 webpage = urllib2.urlopen(request).read()
824 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
825 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
828 # Extract URL, uploader, and title from webpage
829 self.report_extraction(video_id)
830 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
832 self._downloader.trouble(u'ERROR: unable to extract media URL')
834 mediaURL = urllib.unquote(mobj.group(1))
# NOTE(review): a `video_url = mediaURL` assignment appears to be
# elided from this excerpt; `video_url` is used in the result below.
838 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
840 self._downloader.trouble(u'ERROR: unable to extract title')
842 video_title = mobj.group(1).decode('utf-8')
# Group 2 of the <title> match is the uploader's name.
844 video_uploader = mobj.group(2).decode('utf-8')
# Result dict (FileDownloader contract).
847 'id': video_id.decode('utf-8'),
848 'url': video_url.decode('utf-8'),
849 'uploader': video_uploader,
850 'upload_date': u'NA',
851 'title': video_title,
852 'ext': video_extension.decode('utf-8'),
858 class YahooIE(InfoExtractor):
859 """Information extractor for video.yahoo.com."""
861 # _VALID_URL matches all Yahoo! Video URLs
862 # _VPAGE_URL matches only the extractable '/watch/' URLs
863 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
864 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
865 IE_NAME = u'video.yahoo'
867 def __init__(self, downloader=None):
868 InfoExtractor.__init__(self, downloader)
870 def report_download_webpage(self, video_id):
871 """Report webpage download."""
872 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
874 def report_extraction(self, video_id):
875 """Report information extraction."""
876 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
878 def _real_extract(self, url, new_video=True):
879 # Extract ID from URL
880 mobj = re.match(self._VALID_URL, url)
882 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
885 video_id = mobj.group(2)
886 video_extension = 'flv'
888 # Rewrite valid but non-extractable URLs as
889 # extractable English language /watch/ URLs
890 if re.match(self._VPAGE_URL, url) is None:
891 request = urllib2.Request(url)
893 webpage = urllib2.urlopen(request).read()
894 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
895 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
898 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
900 self._downloader.trouble(u'ERROR: Unable to extract id field')
902 yahoo_id = mobj.group(1)
904 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
906 self._downloader.trouble(u'ERROR: Unable to extract vid field')
908 yahoo_vid = mobj.group(1)
910 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
911 return self._real_extract(url, new_video=False)
913 # Retrieve video webpage to extract further information
914 request = urllib2.Request(url)
916 self.report_download_webpage(video_id)
917 webpage = urllib2.urlopen(request).read()
918 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
919 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
922 # Extract uploader and title from webpage
923 self.report_extraction(video_id)
924 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
926 self._downloader.trouble(u'ERROR: unable to extract video title')
928 video_title = mobj.group(1).decode('utf-8')
930 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
932 self._downloader.trouble(u'ERROR: unable to extract video uploader')
934 video_uploader = mobj.group(1).decode('utf-8')
936 # Extract video thumbnail
937 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
939 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
941 video_thumbnail = mobj.group(1).decode('utf-8')
943 # Extract video description
944 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
946 self._downloader.trouble(u'ERROR: unable to extract video description')
948 video_description = mobj.group(1).decode('utf-8')
949 if not video_description:
950 video_description = 'No description available.'
952 # Extract video height and width
953 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
955 self._downloader.trouble(u'ERROR: unable to extract video height')
957 yv_video_height = mobj.group(1)
959 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
961 self._downloader.trouble(u'ERROR: unable to extract video width')
963 yv_video_width = mobj.group(1)
965 # Retrieve video playlist to extract media URL
966 # I'm not completely sure what all these options are, but we
967 # seem to need most of them, otherwise the server sends a 401.
968 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
969 yv_bitrate = '700' # according to Wikipedia this is hard-coded
970 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
971 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
972 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
974 self.report_download_webpage(video_id)
975 webpage = urllib2.urlopen(request).read()
976 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
977 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
980 # Extract media URL from playlist XML
981 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
983 self._downloader.trouble(u'ERROR: Unable to extract media URL')
985 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
986 video_url = unescapeHTML(video_url)
989 'id': video_id.decode('utf-8'),
991 'uploader': video_uploader,
992 'upload_date': u'NA',
993 'title': video_title,
994 'ext': video_extension.decode('utf-8'),
995 'thumbnail': video_thumbnail.decode('utf-8'),
996 'description': video_description,
997 'thumbnail': video_thumbnail,
1002 class VimeoIE(InfoExtractor):
1003 """Information extractor for vimeo.com."""
1005 # _VALID_URL matches Vimeo URLs
1006 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1009 def __init__(self, downloader=None):
1010 InfoExtractor.__init__(self, downloader)
1012 def report_download_webpage(self, video_id):
1013 """Report webpage download."""
1014 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1016 def report_extraction(self, video_id):
1017 """Report information extraction."""
1018 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1020 def _real_extract(self, url, new_video=True):
1021 # Extract ID from URL
1022 mobj = re.match(self._VALID_URL, url)
1024 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1027 video_id = mobj.group(1)
1029 # Retrieve video webpage to extract further information
1030 request = urllib2.Request(url, None, std_headers)
1032 self.report_download_webpage(video_id)
1033 webpage = urllib2.urlopen(request).read()
1034 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1035 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1038 # Now we begin extracting as much information as we can from what we
1039 # retrieved. First we extract the information common to all extractors,
1040 # and latter we extract those that are Vimeo specific.
1041 self.report_extraction(video_id)
1043 # Extract the config JSON
1044 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1046 config = json.loads(config)
1048 self._downloader.trouble(u'ERROR: unable to extract info section')
1052 video_title = config["video"]["title"]
1055 video_uploader = config["video"]["owner"]["name"]
1057 # Extract video thumbnail
1058 video_thumbnail = config["video"]["thumbnail"]
1060 # Extract video description
1061 video_description = get_element_by_id("description", webpage.decode('utf8'))
1062 if video_description: video_description = clean_html(video_description)
1063 else: video_description = ''
1065 # Extract upload date
1066 video_upload_date = u'NA'
1067 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1068 if mobj is not None:
1069 video_upload_date = mobj.group(1)
1071 # Vimeo specific: extract request signature and timestamp
1072 sig = config['request']['signature']
1073 timestamp = config['request']['timestamp']
1075 # Vimeo specific: extract video codec and quality information
1076 # TODO bind to format param
1077 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1078 for codec in codecs:
1079 if codec[0] in config["video"]["files"]:
1080 video_codec = codec[0]
1081 video_extension = codec[1]
1082 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1083 else: quality = 'sd'
1086 self._downloader.trouble(u'ERROR: no known codec found')
1089 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1090 %(video_id, sig, timestamp, quality, video_codec.upper())
1095 'uploader': video_uploader,
1096 'upload_date': video_upload_date,
1097 'title': video_title,
1098 'ext': video_extension,
1099 'thumbnail': video_thumbnail,
1100 'description': video_description,
1105 class GenericIE(InfoExtractor):
1106 """Generic last-resort information extractor."""
1109 IE_NAME = u'generic'
1111 def __init__(self, downloader=None):
1112 InfoExtractor.__init__(self, downloader)
1114 def report_download_webpage(self, video_id):
1115 """Report webpage download."""
1116 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1117 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1119 def report_extraction(self, video_id):
1120 """Report information extraction."""
1121 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1123 def report_following_redirect(self, new_url):
1124 """Report information extraction."""
1125 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1127 def _test_redirect(self, url):
1128 """Check if it is a redirect, like url shorteners, in case restart chain."""
1129 class HeadRequest(urllib2.Request):
1130 def get_method(self):
1133 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1135 Subclass the HTTPRedirectHandler to make it use our
1136 HeadRequest also on the redirected URL
1138 def redirect_request(self, req, fp, code, msg, headers, newurl):
1139 if code in (301, 302, 303, 307):
1140 newurl = newurl.replace(' ', '%20')
1141 newheaders = dict((k,v) for k,v in req.headers.items()
1142 if k.lower() not in ("content-length", "content-type"))
1143 return HeadRequest(newurl,
1145 origin_req_host=req.get_origin_req_host(),
1148 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1150 class HTTPMethodFallback(urllib2.BaseHandler):
1152 Fallback to GET if HEAD is not allowed (405 HTTP error)
1154 def http_error_405(self, req, fp, code, msg, headers):
1158 newheaders = dict((k,v) for k,v in req.headers.items()
1159 if k.lower() not in ("content-length", "content-type"))
1160 return self.parent.open(urllib2.Request(req.get_full_url(),
1162 origin_req_host=req.get_origin_req_host(),
1166 opener = urllib2.OpenerDirector()
1167 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1168 HTTPMethodFallback, HEADRedirectHandler,
1169 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1170 opener.add_handler(handler())
1172 response = opener.open(HeadRequest(url))
1173 new_url = response.geturl()
1175 if url == new_url: return False
1177 self.report_following_redirect(new_url)
1178 self._downloader.download([new_url])
1181 def _real_extract(self, url):
1182 if self._test_redirect(url): return
1184 video_id = url.split('/')[-1]
1185 request = urllib2.Request(url)
1187 self.report_download_webpage(video_id)
1188 webpage = urllib2.urlopen(request).read()
1189 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1190 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1192 except ValueError, err:
1193 # since this is the last-resort InfoExtractor, if
1194 # this error is thrown, it'll be thrown here
1195 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1198 self.report_extraction(video_id)
1199 # Start with something easy: JW Player in SWFObject
1200 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1202 # Broaden the search a little bit
1203 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1205 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1208 # It's possible that one of the regexes
1209 # matched, but returned an empty group:
1210 if mobj.group(1) is None:
1211 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1214 video_url = urllib.unquote(mobj.group(1))
1215 video_id = os.path.basename(video_url)
1217 # here's a fun little line of code for you:
1218 video_extension = os.path.splitext(video_id)[1][1:]
1219 video_id = os.path.splitext(video_id)[0]
1221 # it's tempting to parse this further, but you would
1222 # have to take into account all the variations like
1223 # Video Title - Site Name
1224 # Site Name | Video Title
1225 # Video Title - Tagline | Site Name
1226 # and so on and so forth; it's just not practical
1227 mobj = re.search(r'<title>(.*)</title>', webpage)
1229 self._downloader.trouble(u'ERROR: unable to extract title')
1231 video_title = mobj.group(1).decode('utf-8')
1233 # video uploader is domain name
1234 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1236 self._downloader.trouble(u'ERROR: unable to extract title')
1238 video_uploader = mobj.group(1).decode('utf-8')
1241 'id': video_id.decode('utf-8'),
1242 'url': video_url.decode('utf-8'),
1243 'uploader': video_uploader,
1244 'upload_date': u'NA',
1245 'title': video_title,
1246 'ext': video_extension.decode('utf-8'),
1252 class YoutubeSearchIE(InfoExtractor):
1253 """Information Extractor for YouTube search queries."""
1254 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1255 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1256 _max_youtube_results = 1000
1257 IE_NAME = u'youtube:search'
1259 def __init__(self, downloader=None):
1260 InfoExtractor.__init__(self, downloader)
1262 def report_download_page(self, query, pagenum):
1263 """Report attempt to download search page with given number."""
1264 query = query.decode(preferredencoding())
1265 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1267 def _real_extract(self, query):
1268 mobj = re.match(self._VALID_URL, query)
1270 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1273 prefix, query = query.split(':')
1275 query = query.encode('utf-8')
1277 self._download_n_results(query, 1)
1279 elif prefix == 'all':
1280 self._download_n_results(query, self._max_youtube_results)
1286 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1288 elif n > self._max_youtube_results:
1289 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1290 n = self._max_youtube_results
1291 self._download_n_results(query, n)
1293 except ValueError: # parsing prefix as integer fails
1294 self._download_n_results(query, 1)
1297 def _download_n_results(self, query, n):
1298 """Downloads a specified number of results for a query"""
1304 while (50 * pagenum) < limit:
1305 self.report_download_page(query, pagenum+1)
1306 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1307 request = urllib2.Request(result_url)
1309 data = urllib2.urlopen(request).read()
1310 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1311 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1313 api_response = json.loads(data)['data']
1315 new_ids = list(video['id'] for video in api_response['items'])
1316 video_ids += new_ids
1318 limit = min(n, api_response['totalItems'])
1321 if len(video_ids) > n:
1322 video_ids = video_ids[:n]
1323 for id in video_ids:
1324 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1328 class GoogleSearchIE(InfoExtractor):
1329 """Information Extractor for Google Video search queries."""
1330 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1331 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1332 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1333 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1334 _max_google_results = 1000
1335 IE_NAME = u'video.google:search'
1337 def __init__(self, downloader=None):
1338 InfoExtractor.__init__(self, downloader)
1340 def report_download_page(self, query, pagenum):
1341 """Report attempt to download playlist page with given number."""
1342 query = query.decode(preferredencoding())
1343 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1345 def _real_extract(self, query):
1346 mobj = re.match(self._VALID_URL, query)
1348 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1351 prefix, query = query.split(':')
1353 query = query.encode('utf-8')
1355 self._download_n_results(query, 1)
1357 elif prefix == 'all':
1358 self._download_n_results(query, self._max_google_results)
1364 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1366 elif n > self._max_google_results:
1367 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1368 n = self._max_google_results
1369 self._download_n_results(query, n)
1371 except ValueError: # parsing prefix as integer fails
1372 self._download_n_results(query, 1)
1375 def _download_n_results(self, query, n):
1376 """Downloads a specified number of results for a query"""
1382 self.report_download_page(query, pagenum)
1383 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1384 request = urllib2.Request(result_url)
1386 page = urllib2.urlopen(request).read()
1387 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1388 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1391 # Extract video identifiers
1392 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1393 video_id = mobj.group(1)
1394 if video_id not in video_ids:
1395 video_ids.append(video_id)
1396 if len(video_ids) == n:
1397 # Specified n videos reached
1398 for id in video_ids:
1399 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1402 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1403 for id in video_ids:
1404 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1407 pagenum = pagenum + 1
1410 class YahooSearchIE(InfoExtractor):
1411 """Information Extractor for Yahoo! Video search queries."""
1412 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1413 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1414 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1415 _MORE_PAGES_INDICATOR = r'\s*Next'
1416 _max_yahoo_results = 1000
1417 IE_NAME = u'video.yahoo:search'
1419 def __init__(self, downloader=None):
1420 InfoExtractor.__init__(self, downloader)
1422 def report_download_page(self, query, pagenum):
1423 """Report attempt to download playlist page with given number."""
1424 query = query.decode(preferredencoding())
1425 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1427 def _real_extract(self, query):
1428 mobj = re.match(self._VALID_URL, query)
1430 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1433 prefix, query = query.split(':')
1435 query = query.encode('utf-8')
1437 self._download_n_results(query, 1)
1439 elif prefix == 'all':
1440 self._download_n_results(query, self._max_yahoo_results)
1446 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1448 elif n > self._max_yahoo_results:
1449 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1450 n = self._max_yahoo_results
1451 self._download_n_results(query, n)
1453 except ValueError: # parsing prefix as integer fails
1454 self._download_n_results(query, 1)
1457 def _download_n_results(self, query, n):
1458 """Downloads a specified number of results for a query"""
1461 already_seen = set()
1465 self.report_download_page(query, pagenum)
1466 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1467 request = urllib2.Request(result_url)
1469 page = urllib2.urlopen(request).read()
1470 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1471 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1474 # Extract video identifiers
1475 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1476 video_id = mobj.group(1)
1477 if video_id not in already_seen:
1478 video_ids.append(video_id)
1479 already_seen.add(video_id)
1480 if len(video_ids) == n:
1481 # Specified n videos reached
1482 for id in video_ids:
1483 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1486 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1487 for id in video_ids:
1488 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1491 pagenum = pagenum + 1
1494 class YoutubePlaylistIE(InfoExtractor):
1495 """Information Extractor for YouTube playlists."""
1497 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1498 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1499 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=.*?%s'
1500 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1501 IE_NAME = u'youtube:playlist'
1503 def __init__(self, downloader=None):
1504 InfoExtractor.__init__(self, downloader)
1506 def report_download_page(self, playlist_id, pagenum):
1507 """Report attempt to download playlist page with given number."""
1508 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1510 def _real_extract(self, url):
1511 # Extract playlist id
1512 mobj = re.match(self._VALID_URL, url)
1514 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1518 if mobj.group(3) is not None:
1519 self._downloader.download([mobj.group(3)])
1522 # Download playlist pages
1523 # prefix is 'p' as default for playlists but there are other types that need extra care
1524 playlist_prefix = mobj.group(1)
1525 if playlist_prefix == 'a':
1526 playlist_access = 'artist'
1528 playlist_prefix = 'p'
1529 playlist_access = 'view_play_list'
1530 playlist_id = mobj.group(2)
1535 self.report_download_page(playlist_id, pagenum)
1536 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1537 request = urllib2.Request(url)
1539 page = urllib2.urlopen(request).read()
1540 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1541 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1544 # Extract video identifiers
1546 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1547 if mobj.group(1) not in ids_in_page:
1548 ids_in_page.append(mobj.group(1))
1549 video_ids.extend(ids_in_page)
1551 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1553 pagenum = pagenum + 1
1555 playliststart = self._downloader.params.get('playliststart', 1) - 1
1556 playlistend = self._downloader.params.get('playlistend', -1)
1557 if playlistend == -1:
1558 video_ids = video_ids[playliststart:]
1560 video_ids = video_ids[playliststart:playlistend]
1562 for id in video_ids:
1563 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1567 class YoutubeUserIE(InfoExtractor):
1568 """Information Extractor for YouTube users."""
1570 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1571 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1572 _GDATA_PAGE_SIZE = 50
1573 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1574 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1575 IE_NAME = u'youtube:user'
1577 def __init__(self, downloader=None):
1578 InfoExtractor.__init__(self, downloader)
1580 def report_download_page(self, username, start_index):
1581 """Report attempt to download user page."""
1582 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1583 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1585 def _real_extract(self, url):
1587 mobj = re.match(self._VALID_URL, url)
1589 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1592 username = mobj.group(1)
1594 # Download video ids using YouTube Data API. Result size per
1595 # query is limited (currently to 50 videos) so we need to query
1596 # page by page until there are no video ids - it means we got
1603 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1604 self.report_download_page(username, start_index)
1606 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1609 page = urllib2.urlopen(request).read()
1610 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1611 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1614 # Extract video identifiers
1617 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1618 if mobj.group(1) not in ids_in_page:
1619 ids_in_page.append(mobj.group(1))
1621 video_ids.extend(ids_in_page)
1623 # A little optimization - if current page is not
1624 # "full", ie. does not contain PAGE_SIZE video ids then
1625 # we can assume that this page is the last one - there
1626 # are no more ids on further pages - no need to query
1629 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1634 all_ids_count = len(video_ids)
1635 playliststart = self._downloader.params.get('playliststart', 1) - 1
1636 playlistend = self._downloader.params.get('playlistend', -1)
1638 if playlistend == -1:
1639 video_ids = video_ids[playliststart:]
1641 video_ids = video_ids[playliststart:playlistend]
1643 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1644 (username, all_ids_count, len(video_ids)))
1646 for video_id in video_ids:
1647 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1650 class BlipTVUserIE(InfoExtractor):
1651 """Information Extractor for blip.tv users."""
1653 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1655 IE_NAME = u'blip.tv:user'
1657 def __init__(self, downloader=None):
1658 InfoExtractor.__init__(self, downloader)
1660 def report_download_page(self, username, pagenum):
1661 """Report attempt to download user page."""
1662 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1663 (self.IE_NAME, username, pagenum))
1665 def _real_extract(self, url):
1667 mobj = re.match(self._VALID_URL, url)
1669 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1672 username = mobj.group(1)
1674 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1676 request = urllib2.Request(url)
1679 page = urllib2.urlopen(request).read().decode('utf-8')
1680 mobj = re.search(r'data-users-id="([^"]+)"', page)
1681 page_base = page_base % mobj.group(1)
1682 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1683 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1687 # Download video ids using BlipTV Ajax calls. Result size per
1688 # query is limited (currently to 12 videos) so we need to query
1689 # page by page until there are no video ids - it means we got
1696 self.report_download_page(username, pagenum)
1698 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1701 page = urllib2.urlopen(request).read().decode('utf-8')
1702 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1703 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1706 # Extract video identifiers
1709 for mobj in re.finditer(r'href="/([^"]+)"', page):
1710 if mobj.group(1) not in ids_in_page:
1711 ids_in_page.append(unescapeHTML(mobj.group(1)))
1713 video_ids.extend(ids_in_page)
1715 # A little optimization - if current page is not
1716 # "full", ie. does not contain PAGE_SIZE video ids then
1717 # we can assume that this page is the last one - there
1718 # are no more ids on further pages - no need to query
1721 if len(ids_in_page) < self._PAGE_SIZE:
1726 all_ids_count = len(video_ids)
1727 playliststart = self._downloader.params.get('playliststart', 1) - 1
1728 playlistend = self._downloader.params.get('playlistend', -1)
1730 if playlistend == -1:
1731 video_ids = video_ids[playliststart:]
1733 video_ids = video_ids[playliststart:playlistend]
1735 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1736 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1738 for video_id in video_ids:
1739 self._downloader.download([u'http://blip.tv/'+video_id])
1742 class DepositFilesIE(InfoExtractor):
1743 """Information extractor for depositfiles.com"""
1745 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1746 IE_NAME = u'DepositFiles'
1748 def __init__(self, downloader=None):
1749 InfoExtractor.__init__(self, downloader)
1751 def report_download_webpage(self, file_id):
1752 """Report webpage download."""
1753 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1755 def report_extraction(self, file_id):
1756 """Report information extraction."""
1757 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1759 def _real_extract(self, url):
1760 file_id = url.split('/')[-1]
1761 # Rebuild url in english locale
1762 url = 'http://depositfiles.com/en/files/' + file_id
1764 # Retrieve file webpage with 'Free download' button pressed
1765 free_download_indication = { 'gateway_result' : '1' }
1766 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1768 self.report_download_webpage(file_id)
1769 webpage = urllib2.urlopen(request).read()
1770 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1771 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1774 # Search for the real file URL
1775 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1776 if (mobj is None) or (mobj.group(1) is None):
1777 # Try to figure out reason of the error.
1778 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1779 if (mobj is not None) and (mobj.group(1) is not None):
1780 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1781 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1783 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1786 file_url = mobj.group(1)
1787 file_extension = os.path.splitext(file_url)[1][1:]
1789 # Search for file title
1790 mobj = re.search(r'<b title="(.*?)">', webpage)
1792 self._downloader.trouble(u'ERROR: unable to extract title')
1794 file_title = mobj.group(1).decode('utf-8')
1797 'id': file_id.decode('utf-8'),
1798 'url': file_url.decode('utf-8'),
1800 'upload_date': u'NA',
1801 'title': file_title,
1802 'ext': file_extension.decode('utf-8'),
1808 class FacebookIE(InfoExtractor):
1809 """Information Extractor for Facebook"""
1811 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1812 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1813 _NETRC_MACHINE = 'facebook'
1814 _available_formats = ['video', 'highqual', 'lowqual']
1815 _video_extensions = {
1820 IE_NAME = u'facebook'
1822 def __init__(self, downloader=None):
1823 InfoExtractor.__init__(self, downloader)
1825 def _reporter(self, message):
1826 """Add header and report message."""
1827 self._downloader.to_screen(u'[facebook] %s' % message)
1829 def report_login(self):
1830 """Report attempt to log in."""
1831 self._reporter(u'Logging in')
1833 def report_video_webpage_download(self, video_id):
1834 """Report attempt to download video webpage."""
1835 self._reporter(u'%s: Downloading video webpage' % video_id)
1837 def report_information_extraction(self, video_id):
1838 """Report attempt to extract video information."""
1839 self._reporter(u'%s: Extracting video information' % video_id)
1841 def _parse_page(self, video_webpage):
1842 """Extract video information from page"""
1844 data = {'title': r'\("video_title", "(.*?)"\)',
1845 'description': r'<div class="datawrap">(.*?)</div>',
1846 'owner': r'\("video_owner_name", "(.*?)"\)',
1847 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1850 for piece in data.keys():
1851 mobj = re.search(data[piece], video_webpage)
1852 if mobj is not None:
1853 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1857 for fmt in self._available_formats:
1858 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1859 if mobj is not None:
1860 # URL is in a Javascript segment inside an escaped Unicode format within
1861 # the generally utf-8 page
1862 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1863 video_info['video_urls'] = video_urls
1867 def _real_initialize(self):
1868 if self._downloader is None:
1873 downloader_params = self._downloader.params
1875 # Attempt to use provided username and password or .netrc data
1876 if downloader_params.get('username', None) is not None:
1877 useremail = downloader_params['username']
1878 password = downloader_params['password']
1879 elif downloader_params.get('usenetrc', False):
1881 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1882 if info is not None:
1886 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1887 except (IOError, netrc.NetrcParseError), err:
1888 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1891 if useremail is None:
1900 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1903 login_results = urllib2.urlopen(request).read()
1904 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1905 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1907 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1908 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1911 def _real_extract(self, url):
1912 mobj = re.match(self._VALID_URL, url)
1914 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1916 video_id = mobj.group('ID')
1919 self.report_video_webpage_download(video_id)
1920 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1922 page = urllib2.urlopen(request)
1923 video_webpage = page.read()
1924 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1925 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1928 # Start extracting information
1929 self.report_information_extraction(video_id)
1931 # Extract information
1932 video_info = self._parse_page(video_webpage)
1935 if 'owner' not in video_info:
1936 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1938 video_uploader = video_info['owner']
1941 if 'title' not in video_info:
1942 self._downloader.trouble(u'ERROR: unable to extract video title')
1944 video_title = video_info['title']
1945 video_title = video_title.decode('utf-8')
1948 if 'thumbnail' not in video_info:
1949 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1950 video_thumbnail = ''
1952 video_thumbnail = video_info['thumbnail']
1956 if 'upload_date' in video_info:
1957 upload_time = video_info['upload_date']
1958 timetuple = email.utils.parsedate_tz(upload_time)
1959 if timetuple is not None:
1961 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1966 video_description = video_info.get('description', 'No description available.')
1968 url_map = video_info['video_urls']
1969 if len(url_map.keys()) > 0:
1970 # Decide which formats to download
1971 req_format = self._downloader.params.get('format', None)
1972 format_limit = self._downloader.params.get('format_limit', None)
1974 if format_limit is not None and format_limit in self._available_formats:
1975 format_list = self._available_formats[self._available_formats.index(format_limit):]
1977 format_list = self._available_formats
1978 existing_formats = [x for x in format_list if x in url_map]
1979 if len(existing_formats) == 0:
1980 self._downloader.trouble(u'ERROR: no known formats available for video')
1982 if req_format is None:
1983 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1984 elif req_format == 'worst':
1985 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1986 elif req_format == '-1':
1987 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1990 if req_format not in url_map:
1991 self._downloader.trouble(u'ERROR: requested format not available')
1993 video_url_list = [(req_format, url_map[req_format])] # Specific format
1996 for format_param, video_real_url in video_url_list:
1998 video_extension = self._video_extensions.get(format_param, 'mp4')
2001 'id': video_id.decode('utf-8'),
2002 'url': video_real_url.decode('utf-8'),
2003 'uploader': video_uploader.decode('utf-8'),
2004 'upload_date': upload_date,
2005 'title': video_title,
2006 'ext': video_extension.decode('utf-8'),
2007 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2008 'thumbnail': video_thumbnail.decode('utf-8'),
2009 'description': video_description.decode('utf-8'),
# NOTE(review): this listing is de-indented and carries original line numbers as
# literal prefixes; intermediate lines appear to be missing. Code kept verbatim.
2014 class BlipTVIE(InfoExtractor):
2015 """Information extractor for blip.tv"""
# group(1) of _VALID_URL captures the URL path, used as the display id below.
2017 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Pulls the lowercase filename extension off the end of a media URL.
2018 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2019 IE_NAME = u'blip.tv'
2021 def report_extraction(self, file_id):
2022 """Report information extraction."""
2023 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2025 def report_direct_download(self, title):
2026 """Report information extraction."""
2027 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2029 def _real_extract(self, url):
2030 mobj = re.match(self._VALID_URL, url)
2032 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Request blip.tv's machine-readable metadata via the skin=json API.
2039 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2040 request = urllib2.Request(json_url.encode('utf-8'))
2041 self.report_extraction(mobj.group(1))
2044 urlh = urllib2.urlopen(request)
# A video/* Content-Type means the URL already points at the media file itself.
2045 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2046 basename = url.split('/')[-1]
2047 title,ext = os.path.splitext(basename)
2048 title = title.decode('UTF-8')
2049 ext = ext.replace('.', '')
2050 self.report_direct_download(title)
2058 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2059 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
# Regular page: read and parse the JSON metadata payload.
2061 if info is None: # Regular URL
2063 json_code = urlh.read()
2064 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2065 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2069 json_data = json.loads(json_code)
2070 if 'Post' in json_data:
2071 data = json_data['Post']
# 'datestamp' arrives as 'mm-dd-yy HH:MMam/pm'; normalised to YYYYMMDD.
2075 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2076 video_url = data['media']['url']
2077 umobj = re.match(self._URL_EXT, video_url)
2079 raise ValueError('Can not determine filename extension')
2080 ext = umobj.group(1)
2083 'id': data['item_id'],
2085 'uploader': data['display_name'],
2086 'upload_date': upload_date,
2087 'title': data['title'],
2089 'format': data['media']['mimeType'],
2090 'thumbnail': data['thumbnailUrl'],
2091 'description': data['description'],
2092 'player_url': data['embedUrl']
2094 except (ValueError,KeyError), err:
2095 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# Presumably blip.tv serves renditions to the iTunes UA — TODO confirm still needed.
2098 std_headers['User-Agent'] = 'iTunes/10.6.1'
2102 class MyVideoIE(InfoExtractor):
2103 """Information Extractor for myvideo.de."""
2105 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2106 IE_NAME = u'myvideo'
2108 def __init__(self, downloader=None):
2109 InfoExtractor.__init__(self, downloader)
2111 def report_download_webpage(self, video_id):
2112 """Report webpage download."""
2113 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2115 def report_extraction(self, video_id):
2116 """Report information extraction."""
2117 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2119 def _real_extract(self,url):
2120 mobj = re.match(self._VALID_URL, url)
2122 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2125 video_id = mobj.group(1)
2128 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2130 self.report_download_webpage(video_id)
2131 webpage = urllib2.urlopen(request).read()
2132 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2133 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2136 self.report_extraction(video_id)
2137 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2140 self._downloader.trouble(u'ERROR: unable to extract media URL')
2142 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2144 mobj = re.search('<title>([^<]+)</title>', webpage)
2146 self._downloader.trouble(u'ERROR: unable to extract title')
2149 video_title = mobj.group(1)
2155 'upload_date': u'NA',
2156 'title': video_title,
# NOTE(review): listing is de-indented with original line numbers as prefixes;
# intermediate lines appear missing. Code kept verbatim, comments only added.
2162 class ComedyCentralIE(InfoExtractor):
2163 """Information extractor for The Daily Show and Colbert Report """
# Accepts either a ':shortname' shorthand or a full-episodes URL; named groups
# 'shortname', 'showname' and 'episode' drive the branches below.
2165 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2166 IE_NAME = u'comedycentral'
2168 def report_extraction(self, episode_id):
2169 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2171 def report_config_download(self, episode_id):
2172 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2174 def report_index_download(self, episode_id):
2175 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2177 def report_player_url(self, episode_id):
2178 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2180 def _real_extract(self, url):
2181 mobj = re.match(self._VALID_URL, url)
2183 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shorthand forms (':tds' etc.) are rewritten to the show's full-episodes page.
2186 if mobj.group('shortname'):
2187 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2188 url = u'http://www.thedailyshow.com/full-episodes/'
2190 url = u'http://www.colbertnation.com/full-episodes/'
2191 mobj = re.match(self._VALID_URL, url)
2192 assert mobj is not None
# No episode component means "download the newest episode".
2194 dlNewest = not mobj.group('episode')
2196 epTitle = mobj.group('showname')
2198 epTitle = mobj.group('episode')
2200 req = urllib2.Request(url)
2201 self.report_extraction(epTitle)
2203 htmlHandle = urllib2.urlopen(req)
2204 html = htmlHandle.read()
2205 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2206 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The site redirects the bare full-episodes URL to a concrete episode page.
2209 url = htmlHandle.geturl()
2210 mobj = re.match(self._VALID_URL, url)
2212 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2214 if mobj.group('episode') == '':
2215 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2217 epTitle = mobj.group('episode')
# Locate the Flash player URL and its mtvnservices media URI in the page.
2219 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2220 if len(mMovieParams) == 0:
2221 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2224 playerUrl_raw = mMovieParams[0][0]
2225 self.report_player_url(epTitle)
# Follow redirects to learn the canonical player URL.
2227 urlHandle = urllib2.urlopen(playerUrl_raw)
2228 playerUrl = urlHandle.geturl()
2229 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2230 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Fetch the MRSS index listing every media item of the episode.
2233 uri = mMovieParams[0][1]
2234 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2235 self.report_index_download(epTitle)
2237 indexXml = urllib2.urlopen(indexUrl).read()
2238 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2239 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2244 idoc = xml.etree.ElementTree.fromstring(indexXml)
2245 itemEls = idoc.findall('.//item')
2246 for itemEl in itemEls:
# <guid> looks like 'mgid:...:<show>.com:<shortId>' — split on ':' accordingly.
2247 mediaId = itemEl.findall('./guid')[0].text
2248 shortMediaId = mediaId.split(':')[-1]
2249 showId = mediaId.split(':')[-2].replace('.com', '')
2250 officialTitle = itemEl.findall('./title')[0].text
2251 officialDate = itemEl.findall('./pubDate')[0].text
# Per-item mediaGen config XML lists the actual renditions.
2253 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2254 urllib.urlencode({'uri': mediaId}))
2255 configReq = urllib2.Request(configUrl)
2256 self.report_config_download(epTitle)
2258 configXml = urllib2.urlopen(configReq).read()
2259 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2260 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2263 cdoc = xml.etree.ElementTree.fromstring(configXml)
2265 for rendition in cdoc.findall('.//rendition'):
2266 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2270 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2273 # For now, just pick the highest bitrate
2274 format,video_url = turls[-1]
2276 effTitle = showId + u'-' + epTitle
2281 'upload_date': officialDate,
2286 'description': officialTitle,
2287 'player_url': playerUrl
2290 results.append(info)
# NOTE(review): de-indented, line-numbered listing; code kept verbatim.
2295 class EscapistIE(InfoExtractor):
2296 """Information extractor for The Escapist """
2298 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2299 IE_NAME = u'escapist'
2301 def report_extraction(self, showName):
2302 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2304 def report_config_download(self, showName):
2305 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2307 def _real_extract(self, url):
2308 mobj = re.match(self._VALID_URL, url)
2310 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2312 showName = mobj.group('showname')
2313 videoId = mobj.group('episode')
2315 self.report_extraction(showName)
2317 webPage = urllib2.urlopen(url)
2318 webPageBytes = webPage.read()
# Decode the body using the charset advertised in Content-Type, utf-8 otherwise.
2319 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2320 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2321 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2322 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Metadata comes from standard <meta> / OpenGraph tags in the page head.
2325 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2326 description = unescapeHTML(descMatch.group(1))
2327 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2328 imgUrl = unescapeHTML(imgMatch.group(1))
2329 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2330 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL embeds a percent-encoded config URL in its query string.
2331 configUrlMatch = re.search('config=(.*)$', playerUrl)
2332 configUrl = urllib2.unquote(configUrlMatch.group(1))
2334 self.report_config_download(showName)
2336 configJSON = urllib2.urlopen(configUrl).read()
2337 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2338 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2341 # Technically, it's JavaScript, not JSON
# Crude single→double quote swap so json.loads accepts it; breaks if the
# payload ever contains embedded quotes — NOTE(review): fragile by design.
2342 configJSON = configJSON.replace("'", '"')
2345 config = json.loads(configJSON)
2346 except (ValueError,), err:
2347 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# playlist[1] is taken as the actual video entry — TODO confirm index meaning.
2350 playlist = config['playlist']
2351 videoUrl = playlist[1]['url']
2356 'uploader': showName,
2357 'upload_date': None,
2361 'thumbnail': imgUrl,
2362 'description': description,
2363 'player_url': playerUrl,
# NOTE(review): de-indented, line-numbered listing; code kept verbatim.
2369 class CollegeHumorIE(InfoExtractor):
2370 """Information extractor for collegehumor.com"""
2372 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2373 IE_NAME = u'collegehumor'
2375 def report_webpage(self, video_id):
2376 """Report information extraction."""
2377 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2379 def report_extraction(self, video_id):
2380 """Report information extraction."""
2381 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2383 def _real_extract(self, url):
2384 mobj = re.match(self._VALID_URL, url)
2386 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2388 video_id = mobj.group('videoid')
2390 self.report_webpage(video_id)
2391 request = urllib2.Request(url)
2393 webpage = urllib2.urlopen(request).read()
2394 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2395 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page id ('videoid' in the URL) differs from the internal player id,
# which is embedded as id="video:<n>" and needed for the moogaloop XML feed.
2398 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2400 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2402 internal_video_id = m.group('internalvideoid')
2406 'internal_id': internal_video_id,
2409 self.report_extraction(video_id)
2410 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2412 metaXml = urllib2.urlopen(xmlUrl).read()
2413 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2414 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# Metadata XML: <video><description/><caption/><file/><thumbnail/></video>.
2417 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2419 videoNode = mdoc.findall('./video')[0]
2420 info['description'] = videoNode.findall('./description')[0].text
2421 info['title'] = videoNode.findall('./caption')[0].text
2422 info['url'] = videoNode.findall('./file')[0].text
2423 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension is whatever follows the last '.' of the media URL.
2424 info['ext'] = info['url'].rpartition('.')[2]
2425 info['format'] = info['ext']
2427 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# NOTE(review): de-indented, line-numbered listing; code kept verbatim.
2433 class XVideosIE(InfoExtractor):
2434 """Information extractor for xvideos.com"""
2436 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2437 IE_NAME = u'xvideos'
2439 def report_webpage(self, video_id):
2440 """Report information extraction."""
2441 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2443 def report_extraction(self, video_id):
2444 """Report information extraction."""
2445 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2447 def _real_extract(self, url):
2448 mobj = re.match(self._VALID_URL, url)
2450 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2452 video_id = mobj.group(1).decode('utf-8')
2454 self.report_webpage(video_id)
2456 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2458 webpage = urllib2.urlopen(request).read()
2459 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2460 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2463 self.report_extraction(video_id)
# The FLV URL is percent-encoded in the page's flash parameters.
2467 mobj = re.search(r'flv_url=(.+?)&', webpage)
2469 self._downloader.trouble(u'ERROR: unable to extract video url')
2471 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Page <title> is '<name> - XVID...'; keep only the name part.
2475 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2477 self._downloader.trouble(u'ERROR: unable to extract video title')
2479 video_title = mobj.group(1).decode('utf-8')
2482 # Extract video thumbnail
2483 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2485 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail address.
2487 video_thumbnail = mobj.group(0).decode('utf-8')
2493 'upload_date': None,
2494 'title': video_title,
2497 'thumbnail': video_thumbnail,
2498 'description': None,
# NOTE(review): de-indented, line-numbered listing; code kept verbatim.
2505 class SoundcloudIE(InfoExtractor):
2506 """Information extractor for soundcloud.com
2507 To access the media, the uid of the song and a stream token
2508 must be extracted from the page source and the script must make
2509 a request to media.soundcloud.com/crossdomain.xml. Then
2510 the media can be grabbed by requesting from an url composed
2511 of the stream token and uid
# group(1) = uploader slug, group(2) = track slug.
2514 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2515 IE_NAME = u'soundcloud'
2517 def __init__(self, downloader=None):
2518 InfoExtractor.__init__(self, downloader)
2520 def report_webpage(self, video_id):
2521 """Report information extraction."""
2522 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2524 def report_extraction(self, video_id):
2525 """Report information extraction."""
2526 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2528 def _real_extract(self, url):
2529 mobj = re.match(self._VALID_URL, url)
2531 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2534 # extract uploader (which is in the url)
2535 uploader = mobj.group(1).decode('utf-8')
2536 # extract simple title (uploader + slug of song title)
2537 slug_title = mobj.group(2).decode('utf-8')
# Fallback title built from URL parts, used when the page yields none.
2538 simple_title = uploader + u'-' + slug_title
2540 self.report_webpage('%s/%s' % (uploader, slug_title))
2542 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2544 webpage = urllib2.urlopen(request).read()
2545 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2546 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2549 self.report_extraction('%s/%s' % (uploader, slug_title))
2551 # extract uid and stream token that soundcloud hands out for access
2552 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2554 video_id = mobj.group(1)
2555 stream_token = mobj.group(2)
2557 # extract unsimplified title
2558 mobj = re.search('"title":"(.*?)",', webpage)
2560 title = mobj.group(1).decode('utf-8')
2562 title = simple_title
2564 # construct media url (with uid/token)
2565 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2566 mediaURL = mediaURL % (video_id, stream_token)
2569 description = u'No description available'
2570 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2572 description = mobj.group(1)
# Upload date shown as e.g. 'November 3, 2011 14:05'; converted to YYYYMMDD.
2576 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2579 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2580 except Exception, e:
# Best-effort: a malformed date is reported but does not abort extraction.
2581 self._downloader.to_stderr(str(e))
2583 # for soundcloud, a request to a cross domain is required for cookies
2584 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2587 'id': video_id.decode('utf-8'),
2589 'uploader': uploader.decode('utf-8'),
2590 'upload_date': upload_date,
2595 'description': description.decode('utf-8')
# NOTE(review): de-indented, line-numbered listing; code kept verbatim.
2599 class InfoQIE(InfoExtractor):
2600 """Information extractor for infoq.com"""
2602 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2605 def report_webpage(self, video_id):
2606 """Report information extraction."""
2607 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2609 def report_extraction(self, video_id):
2610 """Report information extraction."""
2611 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2613 def _real_extract(self, url):
2614 mobj = re.match(self._VALID_URL, url)
2616 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2619 self.report_webpage(url)
2621 request = urllib2.Request(url)
2623 webpage = urllib2.urlopen(request).read()
2624 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2625 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2628 self.report_extraction(url)
# jsclassref holds a base64-encoded path appended to the RTMPE stream root.
2632 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2634 self._downloader.trouble(u'ERROR: unable to extract video url')
2636 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2640 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2642 self._downloader.trouble(u'ERROR: unable to extract video title')
2644 video_title = mobj.group(1).decode('utf-8')
2646 # Extract description
2647 video_description = u'No description available.'
2648 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2649 if mobj is not None:
2650 video_description = mobj.group(1).decode('utf-8')
# Derive id and extension from the stream URL's final path component.
2652 video_filename = video_url.split('/')[-1]
2653 video_id, extension = video_filename.split('.')
2659 'upload_date': None,
2660 'title': video_title,
2662 'format': extension, # Extension is always(?) mp4, but seems to be flv
2664 'description': video_description,
# NOTE(review): de-indented, line-numbered listing; code kept verbatim.
2670 class MixcloudIE(InfoExtractor):
2671 """Information extractor for www.mixcloud.com"""
# group(1) = uploader slug, group(2) = cloudcast slug.
2672 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2673 IE_NAME = u'mixcloud'
2675 def __init__(self, downloader=None):
2676 InfoExtractor.__init__(self, downloader)
2678 def report_download_json(self, file_id):
2679 """Report JSON download."""
2680 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2682 def report_extraction(self, file_id):
2683 """Report information extraction."""
2684 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2686 def get_urls(self, jsonData, fmt, bitrate='best'):
2687 """Get urls from 'audio_formats' section in json"""
2690 bitrate_list = jsonData[fmt]
# 'best' (or an unknown bitrate) falls back to the highest available one.
2691 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2692 bitrate = max(bitrate_list) # select highest
2694 url_list = jsonData[fmt][bitrate]
# Some formats carry a flat URL list with no per-bitrate nesting.
2695 except TypeError: # we have no bitrate info.
2696 url_list = jsonData[fmt]
2699 def check_urls(self, url_list):
2700 """Returns 1st active url from list"""
2701 for url in url_list:
# Probe each candidate; network failure means try the next mirror.
2703 urllib2.urlopen(url)
2705 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2710 def _print_formats(self, formats):
2711 print 'Available formats:'
2712 for fmt in formats.keys():
2713 for b in formats[fmt]:
2715 ext = formats[fmt][b][0]
2716 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2717 except TypeError: # we have no bitrate info
2718 ext = formats[fmt][0]
2719 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2722 def _real_extract(self, url):
2723 mobj = re.match(self._VALID_URL, url)
2725 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2727 # extract uploader & filename from url
2728 uploader = mobj.group(1).decode('utf-8')
2729 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2731 # construct API request
# API path reuses the last two URL path components: /api/1/cloudcast/<u>/<t>.json
2732 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2733 # retrieve .json file with links to files
2734 request = urllib2.Request(file_url)
2736 self.report_download_json(file_url)
2737 jsonData = urllib2.urlopen(request).read()
2738 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2739 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2743 json_data = json.loads(jsonData)
2744 player_url = json_data['player_swf_url']
2745 formats = dict(json_data['audio_formats'])
2747 req_format = self._downloader.params.get('format', None)
2750 if self._downloader.params.get('listformats', None):
2751 self._print_formats(formats)
# No/best format requested: take the first format whose URL probe succeeds.
2754 if req_format is None or req_format == 'best':
2755 for format_param in formats.keys():
2756 url_list = self.get_urls(formats, format_param)
2758 file_url = self.check_urls(url_list)
2759 if file_url is not None:
2762 if req_format not in formats.keys():
2763 self._downloader.trouble(u'ERROR: format is not available')
2766 url_list = self.get_urls(formats, req_format)
2767 file_url = self.check_urls(url_list)
2768 format_param = req_format
2771 'id': file_id.decode('utf-8'),
2772 'url': file_url.decode('utf-8'),
2773 'uploader': uploader.decode('utf-8'),
2774 'upload_date': u'NA',
2775 'title': json_data['name'],
2776 'ext': file_url.split('.')[-1].decode('utf-8'),
2777 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2778 'thumbnail': json_data['thumbnail_url'],
2779 'description': json_data['description'],
2780 'player_url': player_url.decode('utf-8'),
# NOTE(review): de-indented, line-numbered listing; code kept verbatim.
2783 class StanfordOpenClassroomIE(InfoExtractor):
2784 """Information extractor for Stanford's Open ClassRoom"""
# Three URL shapes: a single video (course+video), a course page (course only),
# and the site root (neither) — handled by the three branches below.
2786 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2787 IE_NAME = u'stanfordoc'
2789 def report_download_webpage(self, objid):
2790 """Report information extraction."""
2791 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2793 def report_extraction(self, video_id):
2794 """Report information extraction."""
2795 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2797 def _real_extract(self, url):
2798 mobj = re.match(self._VALID_URL, url)
2800 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2803 if mobj.group('course') and mobj.group('video'): # A specific video
2804 course = mobj.group('course')
2805 video = mobj.group('video')
2807 'id': course + '_' + video,
2810 self.report_extraction(info['id'])
# Per-video XML metadata lives next to the course's videos directory.
2811 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2812 xmlUrl = baseUrl + video + '.xml'
2814 metaXml = urllib2.urlopen(xmlUrl).read()
2815 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2816 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2818 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2820 info['title'] = mdoc.findall('./title')[0].text
2821 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2823 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2825 info['ext'] = info['url'].rpartition('.')[2]
2826 info['format'] = info['ext']
2828 elif mobj.group('course'): # A course page
2829 course = mobj.group('course')
2835 self.report_download_webpage(info['id'])
2837 coursepage = urllib2.urlopen(url).read()
2838 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2839 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2842 m = re.search('<h1>([^<]+)</h1>', coursepage)
2844 info['title'] = unescapeHTML(m.group(1))
2846 info['title'] = info['id']
2848 m = re.search('<description>([^<]+)</description>', coursepage)
2850 info['description'] = unescapeHTML(m.group(1))
# Collect deduplicated links to every VideoPage in the course, then recurse
# through self.extract() on each reference.
2852 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2855 'type': 'reference',
2856 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2860 for entry in info['list']:
2861 assert entry['type'] == 'reference'
2862 results += self.extract(entry['url'])
# Root page: enumerate every course and recurse the same way.
2867 'id': 'Stanford OpenClassroom',
2871 self.report_download_webpage(info['id'])
2872 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2874 rootpage = urllib2.urlopen(rootURL).read()
2875 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2876 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2879 info['title'] = info['id']
2881 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2884 'type': 'reference',
2885 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2890 for entry in info['list']:
2891 assert entry['type'] == 'reference'
2892 results += self.extract(entry['url'])
2895 class MTVIE(InfoExtractor):
2896 """Information extractor for MTV.com"""
2898 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2901 def report_webpage(self, video_id):
2902 """Report information extraction."""
2903 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2905 def report_extraction(self, video_id):
2906 """Report information extraction."""
2907 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2909 def _real_extract(self, url):
2910 mobj = re.match(self._VALID_URL, url)
2912 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2914 if not mobj.group('proto'):
2915 url = 'http://' + url
2916 video_id = mobj.group('videoid')
2917 self.report_webpage(video_id)
2919 request = urllib2.Request(url)
2921 webpage = urllib2.urlopen(request).read()
2922 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2923 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2926 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2928 self._downloader.trouble(u'ERROR: unable to extract song name')
2930 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2931 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2933 self._downloader.trouble(u'ERROR: unable to extract performer')
2935 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2936 video_title = performer + ' - ' + song_name
2938 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2940 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2942 mtvn_uri = mobj.group(1)
2944 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2946 self._downloader.trouble(u'ERROR: unable to extract content id')
2948 content_id = mobj.group(1)
2950 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2951 self.report_extraction(video_id)
2952 request = urllib2.Request(videogen_url)
2954 metadataXml = urllib2.urlopen(request).read()
2955 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2956 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2959 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2960 renditions = mdoc.findall('.//rendition')
2962 # For now, always pick the highest quality.
2963 rendition = renditions[-1]
2966 _,_,ext = rendition.attrib['type'].partition('/')
2967 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2968 video_url = rendition.find('./src').text
2970 self._downloader.trouble('Invalid rendition field.')
2976 'uploader': performer,
2977 'title': video_title,
# NOTE(review): de-indented, line-numbered listing; code kept verbatim.
2985 class YoukuIE(InfoExtractor):
2987 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
2990 def __init__(self, downloader=None):
2991 InfoExtractor.__init__(self, downloader)
2993 def report_download_webpage(self, file_id):
2994 """Report webpage download."""
2995 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
2997 def report_extraction(self, file_id):
2998 """Report information extraction."""
2999 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
# Session id: millisecond timestamp concatenated with two random components.
3002 nowTime = int(time.time() * 1000)
3003 random1 = random.randint(1000,1998)
3004 random2 = random.randint(1000,9999)
3006 return "%d%d%d" %(nowTime,random1,random2)
3008 def _get_file_ID_mix_string(self, seed):
# Deterministic shuffle of the alphabet driven by the server-provided seed
# (linear-congruential step); used as a substitution table for file ids.
3010 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3012 for i in range(len(source)):
3013 seed = (seed * 211 + 30031 ) % 65536
3014 index = math.floor(seed / 65536 * len(source) )
3015 mixed.append(source[int(index)])
3016 source.remove(source[int(index)])
3017 #return ''.join(mixed)
3020 def _get_file_id(self, fileId, seed):
# Decode the '*'-separated fileId by mapping each index through the mix table.
3021 mixed = self._get_file_ID_mix_string(seed)
3022 ids = fileId.split('*')
3026 realId.append(mixed[int(ch)])
3027 return ''.join(realId)
3029 def _real_extract(self, url):
3030 mobj = re.match(self._VALID_URL, url)
3032 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3034 video_id = mobj.group('ID')
3036 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3038 request = urllib2.Request(info_url, None, std_headers)
3040 self.report_download_webpage(video_id)
3041 jsondata = urllib2.urlopen(request).read()
# NOTE(review): Py3-style 'as err' here while the rest of the file uses the
# Py2 ', err' form — inconsistent; left as-is pending the file's target version.
3042 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
3043 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3046 self.report_extraction(video_id)
3048 config = json.loads(jsondata)
3050 video_title = config['data'][0]['title']
3051 seed = config['data'][0]['seed']
# Pick a stream format: 'best' prefers hd2 when offered; 'worst' the opposite.
3053 format = self._downloader.params.get('format', None)
3054 supported_format = config['data'][0]['streamfileids'].keys()
3056 if format is None or format == 'best':
3057 if 'hd2' in supported_format:
3062 elif format == 'worst':
# Videos are served as numbered segments; collect the per-segment keys.
3070 fileid = config['data'][0]['streamfileids'][format]
3071 seg_number = len(config['data'][0]['segs'][format])
3074 for i in xrange(seg_number):
3075 keys.append(config['data'][0]['segs'][format][i]['k'])
3078 #youku only could be viewed from mainland china
3080 self._downloader.trouble(u'ERROR: unable to extract info section')
3084 sid = self._gen_sid()
3085 fileid = self._get_file_id(fileid, seed)
3087 #column 8,9 of fileid represent the segment number
3088 #fileid[7:9] should be changed
3089 for index, key in enumerate(keys):
# Splice the segment index (2-digit hex) into the decoded fileid.
3091 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3092 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3095 'id': '%s_part%02d' % (video_id, index),
3096 'url': download_url,
3098 'title': video_title,
3102 files_info.append(info)
3107 class XNXXIE(InfoExtractor):
3108 """Information extractor for xnxx.com"""
# NOTE(review): gaps in the listing numbers mean lines are elided here —
# ~3111 presumably defines the IE_NAME attribute used by the report_* methods.
# Group 1 of _VALID_URL is the numeric video id (used as video_id below).
3110 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: flv URL, <title> text, and big-thumbnail URL are
# extracted from the watch page with non-greedy capture groups.
3112 VIDEO_URL_RE = r'flv_url=(.*?)&'
3113 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3114 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3116 def report_webpage(self, video_id):
3117 """Report information extraction"""
# Status line goes through the shared FileDownloader; self.IE_NAME appears to
# be a class attribute defined on an elided line (~3111) — TODO confirm.
3118 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3120 def report_extraction(self, video_id):
3121 """Report information extraction"""
# Mirrors report_webpage: same "[IE_NAME] id: message" format via the downloader.
3122 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3124 def _real_extract(self, url):
# Scrapes the xnxx.com watch page for the flv URL, title and thumbnail.
# NOTE(review): this method continues past the end of the visible chunk
# (the `info` dict literal is not closed here) and several guard lines
# are elided, per the gaps in the listing numbers.
3125 mobj = re.match(self._VALID_URL, url)
# An elided `if mobj is None:` guard (~3126) precedes this error path.
3127 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3129 video_id = mobj.group(1).decode('utf-8')
3131 self.report_webpage(video_id)
3133 # Get webpage content
# An elided `try:` (~3134) wraps the fetch below.
3135 webpage = urllib2.urlopen(url).read()
3136 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3137 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
# Each re.search below is followed by an elided `if result is None:` guard
# that leads into the corresponding trouble() call.
3140 result = re.search(self.VIDEO_URL_RE, webpage)
3142 self._downloader.trouble(u'ERROR: unable to extract video url')
# flv_url is percent-encoded in the page source; unquote before use.
3144 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3146 result = re.search(self.VIDEO_TITLE_RE, webpage)
3148 self._downloader.trouble(u'ERROR: unable to extract video title')
3150 video_title = result.group(1).decode('utf-8')
3152 result = re.search(self.VIDEO_THUMB_RE, webpage)
3154 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3156 video_thumbnail = result.group(1).decode('utf-8')
# Assembled info dict (truncated in this view; more keys follow past ~3166).
3158 info = {'id': video_id,
3161 'upload_date': None,
3162 'title': video_title,
3165 'thumbnail': video_thumbnail,
3166 'description': None,