2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
18 from urlparse import parse_qs
21 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include

    uploader:   Nickname of the video uploader.
    ext:        Video filename extension.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is supplied by each subclass.
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the enclosing `def initialize(self):` header for the next
    # two lines is elided in this excerpt of the file.
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # Template method: subclasses implement _real_extract().
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this is a sampled excerpt of the original file -- the
    # `_VALID_URL = r"""` opener for the verbose regex below, several dict
    # entries/closers, and many guard/try/except/return lines are elided.
    # Indentation of the surviving lines reflects their original nesting.
        (?:https?://)?                                       # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
        tube\.majestyc\.net/)                                # the various hostnames, with wildcard subdomains
        (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
        (?:                                                  # the various things that can precede the ID:
            (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
            |(?:                                             # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
            )?                                               # optional -> youtube.com/xxxx is OK
        )?                                                   # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
        (?(1).+)?                                            # if we found the ID, everything can follow

    # Forcing hl=en/gl=US keeps the scraped page text (dates, labels) in English.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'  # machine name looked up in ~/.netrc
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension (most entries elided in this excerpt).
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display-size string (entries and closing brace elided in this excerpt).
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE comments.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's closed-caption XML into SubRip (.srt) text."""
        # (initialization of the `srt` accumulator is elided in this excerpt)
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default duration when the dur= attribute is absent
            # (conversion of `start` to float is elided in this excerpt)
            end = start + float(dur)
            # Format both endpoints as SRT "HH:MM:SS,mmm" timestamps.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print 'Available formats:'
        # (the loop header over `formats` binding `x` is elided in this excerpt)
            print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        # Nothing to do without a downloader: it holds the user's params.
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # .netrc problems are non-fatal: warn and continue anonymously.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language: hit the hl=en page so later scraping sees English text.
        request = urllib2.Request(self._LANG_URL)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed

        # Log in (the login_form dict opener is elided in this excerpt).
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
            login_results = urllib2.urlopen(request).read()
            # If the login form re-appears in the response, the login failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age (the age_form dict opener is elided in this excerpt).
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            # Unlike the warnings above, failing age confirmation is reported
            # through trouble() (an error), not to_stderr (a warning).
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Download the watch page (has_verified=1 skips the age interstitial).
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Unescape the JS-escaped URL (\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several &el= variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
                video_info_webpage = urllib2.urlopen(request).read()
                # get_video_info responses are urlencoded key=value pairs.
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail (optional: only a warning when missing)
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the watch page and normalized to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            # Collapse separators/whitespace so strptime patterns can match.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description (page is forced to English above, so the id is stable)
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions (optional; all failures are downgraded to warnings)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                # Build a lang_code -> track-name map from the track-list XML.
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                # Language preference: user-selected, then English, then first available.
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = srt_lang_list.keys()[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # Subtitle problems are warnings, never fatal to the extraction.
                self._downloader.trouble(trouble[0])

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            # NOTE(review): unlike 'itag'/'url' above, 'sig' is indexed without
            # a presence check -- a stream entry lacking it raises KeyError.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap quality at the requested limit (lists are quality-ordered).
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension depends on the itag; 'flv' is the historical default.
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Result dict (surrounding literal/append syntax elided in this excerpt).
                'id':       video_id.decode('utf-8'),
                'url':      video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension.decode('utf-8'),
                'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): sampled excerpt -- guard/try/except/return lines are
    # elided throughout; indentation reflects the surviving lines' nesting.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age: POST the family-filter form (dict opener elided here).
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # "yt-" ids are YouTube embeds: hand them to the downloader,
            # which will dispatch to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            # NOTE(review): message reads "unable retrieve" (missing "to") --
            # a runtime string, left untouched here.
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            # Direct mediaURL present in the page.
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

            # Fallback: pull the media URL out of the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            # Undo the JSON escaping of slashes.
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Result dict (surrounding return/list syntax elided in this excerpt).
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): sampled excerpt -- guard/try/except/return lines are
    # elided throughout; indentation reflects the surviving lines' nesting.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable Dailymotion's family filter so flagged pages still load.
        request.add_header('Cookie', 'family_filter=off')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = urllib.unquote(mobj.group(1))
        # Pick the best quality present in flashvars: HQ > SD > LD.
        if 'hqURL' in flashvars: max_quality = 'hqURL'
        elif 'sdURL' in flashvars: max_quality = 'sdURL'
        else: max_quality = 'ldURL'
        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            # Fallback: older pages expose "video_url" in the unquoted page.
            mobj = re.search(r'"video_url":"(.*?)",', urllib.unquote(webpage))
                self._downloader.trouble(u'ERROR: unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Result dict (surrounding return/list syntax elided in this excerpt).
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # NOTE(review): sampled excerpt -- guard/try/except/return lines are
    # elided throughout; indentation reflects the surviving lines' nesting.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
            # No direct mp4 download URL: fall back to the flv stream URL,
            # which is hex-escaped (\x3d '=', \x26 '&') in the page source.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = urllib.unquote(mobj.group(1))
            mediaURL = mediaURL.replace('\\x3d', '\x3d')
            mediaURL = mediaURL.replace('\\x26', '\x26')
        # (the assignment of video_url from mediaURL is elided in this excerpt)

        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # The watch page carries no thumbnail; query the search page.
            # (docids can be negative, hence abs() for the search query.)
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info

        # Result dict (surrounding return/list syntax elided in this excerpt).
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): sampled excerpt -- guard/try/except/return lines are
    # elided throughout; indentation reflects the surviving lines' nesting.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # (the assignment of video_url from mediaURL is elided in this excerpt)

        # Title and uploader come from the same <title> match: group(1) is
        # the title, group(2) the uploader.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # Result dict (surrounding return/list syntax elided in this excerpt).
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): sampled excerpt -- guard/try/except/return lines are
    # elided throughout; indentation reflects the surviving lines' nesting.

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse once on the canonical /watch/ URL; new_video=False
            # marks the rewritten call.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) is the '(people|profile)' path segment of the
        # regex above; the uploader name is captured by group(2). This looks
        # like a wrong-group bug -- verify against live page markup.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # Result dict (surrounding return/list syntax elided in this excerpt).
        # NOTE(review): 'thumbnail' appears twice below; in a dict literal the
        # second, non-decoded value silently overrides the first.
            'id':       video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
            'thumbnail':    video_thumbnail,
1004 class VimeoIE(InfoExtractor):
1005 """Information extractor for vimeo.com."""
1007 # _VALID_URL matches Vimeo URLs
1008 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1011 def __init__(self, downloader=None):
1012 InfoExtractor.__init__(self, downloader)
1014 def report_download_webpage(self, video_id):
1015 """Report webpage download."""
1016 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1018 def report_extraction(self, video_id):
1019 """Report information extraction."""
1020 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# NOTE(review): several intervening lines (error guards, try: statements,
# return statements) are elided in this excerpt of the file.
1022 def _real_extract(self, url, new_video=True):
1023 # Extract ID from URL
1024 mobj = re.match(self._VALID_URL, url)
1026 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1029 video_id = mobj.group(1)
1031 # Retrieve video webpage to extract further information
1032 request = urllib2.Request(url, None, std_headers)
1034 self.report_download_webpage(video_id)
1035 webpage = urllib2.urlopen(request).read()
1036 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1037 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1040 # Now we begin extracting as much information as we can from what we
1041 # retrieved. First we extract the information common to all extractors,
1042 # and latter we extract those that are Vimeo specific.
1043 self.report_extraction(video_id)
1045 # Extract the config JSON
# The config JSON is embedded in the page between ' = {config:' and ',assets:';
# this string-splitting is brittle against page-layout changes.
1046 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1048 config = json.loads(config)
1050 self._downloader.trouble(u'ERROR: unable to extract info section')
1054 video_title = config["video"]["title"]
1057 video_uploader = config["video"]["owner"]["name"]
1059 # Extract video thumbnail
1060 video_thumbnail = config["video"]["thumbnail"]
1062 # Extract video description
1063 video_description = get_element_by_id("description", webpage.decode('utf8'))
1064 if video_description: video_description = clean_html(video_description)
1065 else: video_description = ''
1067 # Extract upload date
1068 video_upload_date = u'NA'
1069 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1070 if mobj is not None:
1071 video_upload_date = mobj.group(1)
1073 # Vimeo specific: extract request signature and timestamp
1074 sig = config['request']['signature']
1075 timestamp = config['request']['timestamp']
1077 # Vimeo specific: extract video codec and quality information
1078 # TODO bind to format param
# Preference order: h264/mp4 first, then vp8 and vp6 (both flv containers).
# The first codec present in config["video"]["files"] wins.
1079 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1080 for codec in codecs:
1081 if codec[0] in config["video"]["files"]:
1082 video_codec = codec[0]
1083 video_extension = codec[1]
1084 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1085 else: quality = 'sd'
1088 self._downloader.trouble(u'ERROR: no known codec found')
# Build the player redirect URL from the signature/timestamp pair above.
1091 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1092 %(video_id, sig, timestamp, quality, video_codec.upper())
1097 'uploader': video_uploader,
1098 'upload_date': video_upload_date,
1099 'title': video_title,
1100 'ext': video_extension,
1101 'thumbnail': video_thumbnail,
1102 'description': video_description,
1107 class GenericIE(InfoExtractor):
1108 """Generic last-resort information extractor."""
1111 IE_NAME = u'generic'
1113 def __init__(self, downloader=None):
1114 InfoExtractor.__init__(self, downloader)
1116 def report_download_webpage(self, video_id):
1117 """Report webpage download."""
# Warn loudly: the generic extractor is only reached when no site-specific
# extractor claimed the URL.
1118 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1119 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1121 def report_extraction(self, video_id):
1122 """Report information extraction."""
1123 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1125 def report_following_redirect(self, new_url):
1126 """Report information extraction."""
1127 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
# NOTE(review): several intervening lines (guards, try: statements, returns)
# are elided in this excerpt of the file.
1129 def _test_redirect(self, url):
1130 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Issue a HEAD request so the redirect target can be discovered without
# downloading the response body.
1131 class HeadRequest(urllib2.Request):
1132 def get_method(self):
1135 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1137 Subclass the HTTPRedirectHandler to make it use our
1138 HeadRequest also on the redirected URL
1140 def redirect_request(self, req, fp, code, msg, headers, newurl):
1141 if code in (301, 302, 303, 307):
1142 newurl = newurl.replace(' ', '%20')
# Drop entity headers that no longer apply to the redirected HEAD request.
1143 newheaders = dict((k,v) for k,v in req.headers.items()
1144 if k.lower() not in ("content-length", "content-type"))
1145 return HeadRequest(newurl,
1147 origin_req_host=req.get_origin_req_host(),
1150 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1152 class HTTPMethodFallback(urllib2.BaseHandler):
1154 Fallback to GET if HEAD is not allowed (405 HTTP error)
1156 def http_error_405(self, req, fp, code, msg, headers):
1160 newheaders = dict((k,v) for k,v in req.headers.items()
1161 if k.lower() not in ("content-length", "content-type"))
1162 return self.parent.open(urllib2.Request(req.get_full_url(),
1164 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with only the handlers needed for the HEAD probe.
1168 opener = urllib2.OpenerDirector()
1169 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1170 HTTPMethodFallback, HEADRedirectHandler,
1171 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1172 opener.add_handler(handler())
1174 response = opener.open(HeadRequest(url))
1175 new_url = response.geturl()
# No redirect happened: let the normal extraction continue.
1177 if url == new_url: return False
1179 self.report_following_redirect(new_url)
# Restart the extractor chain on the redirect target.
1180 self._downloader.download([new_url])
1183 def _real_extract(self, url):
1184 if self._test_redirect(url): return
1186 video_id = url.split('/')[-1]
1187 request = urllib2.Request(url)
1189 self.report_download_webpage(video_id)
1190 webpage = urllib2.urlopen(request).read()
1191 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1192 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1194 except ValueError, err:
1195 # since this is the last-resort InfoExtractor, if
1196 # this error is thrown, it'll be thrown here
1197 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1200 self.report_extraction(video_id)
1201 # Start with something easy: JW Player in SWFObject
1202 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1204 # Broaden the search a little bit
1205 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1207 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1210 # It's possible that one of the regexes
1211 # matched, but returned an empty group:
1212 if mobj.group(1) is None:
1213 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1216 video_url = urllib.unquote(mobj.group(1))
1217 video_id = os.path.basename(video_url)
1219 # here's a fun little line of code for you:
1220 video_extension = os.path.splitext(video_id)[1][1:]
1221 video_id = os.path.splitext(video_id)[0]
1223 # it's tempting to parse this further, but you would
1224 # have to take into account all the variations like
1225 # Video Title - Site Name
1226 # Site Name | Video Title
1227 # Video Title - Tagline | Site Name
1228 # and so on and so forth; it's just not practical
1229 mobj = re.search(r'<title>(.*)</title>', webpage)
1231 self._downloader.trouble(u'ERROR: unable to extract title')
1233 video_title = mobj.group(1).decode('utf-8')
1235 # video uploader is domain name
1236 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1238 self._downloader.trouble(u'ERROR: unable to extract title')
1240 video_uploader = mobj.group(1).decode('utf-8')
1243 'id': video_id.decode('utf-8'),
1244 'url': video_url.decode('utf-8'),
1245 'uploader': video_uploader,
1246 'upload_date': u'NA',
1247 'title': video_title,
1248 'ext': video_extension.decode('utf-8'),
1254 class YoutubeSearchIE(InfoExtractor):
1255 """Information Extractor for YouTube search queries."""
# Matches 'ytsearch:QUERY', 'ytsearchN:QUERY' and 'ytsearchall:QUERY'.
1256 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1257 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1258 _max_youtube_results = 1000
1259 IE_NAME = u'youtube:search'
1261 def __init__(self, downloader=None):
1262 InfoExtractor.__init__(self, downloader)
1264 def report_download_page(self, query, pagenum):
1265 """Report attempt to download search page with given number."""
1266 query = query.decode(preferredencoding())
1267 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# NOTE(review): intervening guard/try/return lines are elided in this excerpt.
1269 def _real_extract(self, query):
1270 mobj = re.match(self._VALID_URL, query)
1272 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# prefix is the optional count ('', a number, or 'all') before the colon.
1275 prefix, query = query.split(':')
1277 query = query.encode('utf-8')
1279 self._download_n_results(query, 1)
1281 elif prefix == 'all':
1282 self._download_n_results(query, self._max_youtube_results)
1288 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1290 elif n > self._max_youtube_results:
1291 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1292 n = self._max_youtube_results
1293 self._download_n_results(query, n)
1295 except ValueError: # parsing prefix as integer fails
1296 self._download_n_results(query, 1)
1299 def _download_n_results(self, query, n):
1300 """Downloads a specified number of results for a query"""
# The GData API serves at most 50 results per page, so page until either
# n ids are collected or the API reports no more items.
1306 while (50 * pagenum) < limit:
1307 self.report_download_page(query, pagenum+1)
1308 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1309 request = urllib2.Request(result_url)
1311 data = urllib2.urlopen(request).read()
1312 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1313 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1315 api_response = json.loads(data)['data']
1317 new_ids = list(video['id'] for video in api_response['items'])
1318 video_ids += new_ids
# Cap the limit at what the API says is actually available.
1320 limit = min(n, api_response['totalItems'])
1323 if len(video_ids) > n:
1324 video_ids = video_ids[:n]
1325 for id in video_ids:
1326 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1330 class GoogleSearchIE(InfoExtractor):
1331 """Information Extractor for Google Video search queries."""
# Matches 'gvsearch:QUERY', 'gvsearchN:QUERY' and 'gvsearchall:QUERY'.
1332 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1333 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1334 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1335 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1336 _max_google_results = 1000
1337 IE_NAME = u'video.google:search'
1339 def __init__(self, downloader=None):
1340 InfoExtractor.__init__(self, downloader)
1342 def report_download_page(self, query, pagenum):
1343 """Report attempt to download playlist page with given number."""
1344 query = query.decode(preferredencoding())
1345 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
# NOTE(review): intervening guard/try/return lines are elided in this excerpt.
1347 def _real_extract(self, query):
1348 mobj = re.match(self._VALID_URL, query)
1350 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1353 prefix, query = query.split(':')
1355 query = query.encode('utf-8')
1357 self._download_n_results(query, 1)
1359 elif prefix == 'all':
1360 self._download_n_results(query, self._max_google_results)
1366 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1368 elif n > self._max_google_results:
1369 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1370 n = self._max_google_results
1371 self._download_n_results(query, n)
1373 except ValueError: # parsing prefix as integer fails
1374 self._download_n_results(query, 1)
1377 def _download_n_results(self, query, n):
1378 """Downloads a specified number of results for a query"""
# Scrape result pages until n ids are found or no "next page" link remains.
1384 self.report_download_page(query, pagenum)
1385 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1386 request = urllib2.Request(result_url)
1388 page = urllib2.urlopen(request).read()
1389 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1390 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1393 # Extract video identifiers
1394 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1395 video_id = mobj.group(1)
1396 if video_id not in video_ids:
1397 video_ids.append(video_id)
1398 if len(video_ids) == n:
1399 # Specified n videos reached
1400 for id in video_ids:
1401 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No pagination link on this page: flush whatever was collected and stop.
1404 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1405 for id in video_ids:
1406 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1409 pagenum = pagenum + 1
1412 class YahooSearchIE(InfoExtractor):
1413 """Information Extractor for Yahoo! Video search queries."""
# Matches 'yvsearch:QUERY', 'yvsearchN:QUERY' and 'yvsearchall:QUERY'.
1414 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1415 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1416 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1417 _MORE_PAGES_INDICATOR = r'\s*Next'
1418 _max_yahoo_results = 1000
1419 IE_NAME = u'video.yahoo:search'
1421 def __init__(self, downloader=None):
1422 InfoExtractor.__init__(self, downloader)
1424 def report_download_page(self, query, pagenum):
1425 """Report attempt to download playlist page with given number."""
1426 query = query.decode(preferredencoding())
1427 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
# NOTE(review): intervening guard/try/return lines are elided in this excerpt.
1429 def _real_extract(self, query):
1430 mobj = re.match(self._VALID_URL, query)
1432 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1435 prefix, query = query.split(':')
1437 query = query.encode('utf-8')
1439 self._download_n_results(query, 1)
1441 elif prefix == 'all':
1442 self._download_n_results(query, self._max_yahoo_results)
1448 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1450 elif n > self._max_yahoo_results:
1451 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1452 n = self._max_yahoo_results
1453 self._download_n_results(query, n)
1455 except ValueError: # parsing prefix as integer fails
1456 self._download_n_results(query, 1)
1459 def _download_n_results(self, query, n):
1460 """Downloads a specified number of results for a query"""
# already_seen deduplicates ids across result pages; video_ids keeps order.
1463 already_seen = set()
1467 self.report_download_page(query, pagenum)
1468 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1469 request = urllib2.Request(result_url)
1471 page = urllib2.urlopen(request).read()
1472 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1473 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1476 # Extract video identifiers
1477 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1478 video_id = mobj.group(1)
1479 if video_id not in already_seen:
1480 video_ids.append(video_id)
1481 already_seen.add(video_id)
1482 if len(video_ids) == n:
1483 # Specified n videos reached
1484 for id in video_ids:
1485 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "Next" link: flush what was collected and stop paging.
1488 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1489 for id in video_ids:
1490 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1493 pagenum = pagenum + 1
1496 class YoutubePlaylistIE(InfoExtractor):
1497 """Information Extractor for YouTube playlists."""
# Group 1: playlist type char ('p', 'a', 'list'); group 2: playlist id;
# group 3 (optional): a single video id inside the playlist URL.
1499 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1500 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1501 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=.*?%s'
1502 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1503 IE_NAME = u'youtube:playlist'
1505 def __init__(self, downloader=None):
1506 InfoExtractor.__init__(self, downloader)
1508 def report_download_page(self, playlist_id, pagenum):
1509 """Report attempt to download playlist page with given number."""
1510 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
# NOTE(review): intervening guard/try/return lines are elided in this excerpt.
1512 def _real_extract(self, url):
1513 # Extract playlist id
1514 mobj = re.match(self._VALID_URL, url)
1516 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# A single video embedded in the playlist URL takes precedence over the list.
1520 if mobj.group(3) is not None:
1521 self._downloader.download([mobj.group(3)])
1524 # Download playlist pages
1525 # prefix is 'p' as default for playlists but there are other types that need extra care
1526 playlist_prefix = mobj.group(1)
1527 if playlist_prefix == 'a':
1528 playlist_access = 'artist'
1530 playlist_prefix = 'p'
1531 playlist_access = 'view_play_list'
1532 playlist_id = mobj.group(2)
1537 self.report_download_page(playlist_id, pagenum)
1538 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1539 request = urllib2.Request(url)
1541 page = urllib2.urlopen(request).read()
1542 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1543 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1546 # Extract video identifiers
1548 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1549 if mobj.group(1) not in ids_in_page:
1550 ids_in_page.append(mobj.group(1))
1551 video_ids.extend(ids_in_page)
# Stop paging when the "next" pager control disappears.
1553 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1555 pagenum = pagenum + 1
# Honor --playlist-start / --playlist-end (1-based options, 0-based slice).
1557 playliststart = self._downloader.params.get('playliststart', 1) - 1
1558 playlistend = self._downloader.params.get('playlistend', -1)
1559 if playlistend == -1:
1560 video_ids = video_ids[playliststart:]
1562 video_ids = video_ids[playliststart:playlistend]
1564 for id in video_ids:
1565 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1569 class YoutubeUserIE(InfoExtractor):
1570 """Information Extractor for YouTube users."""
1572 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1573 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps results per request; pages are fetched _GDATA_PAGE_SIZE at a time.
1574 _GDATA_PAGE_SIZE = 50
1575 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1576 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1577 IE_NAME = u'youtube:user'
1579 def __init__(self, downloader=None):
1580 InfoExtractor.__init__(self, downloader)
1582 def report_download_page(self, username, start_index):
1583 """Report attempt to download user page."""
1584 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1585 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
# NOTE(review): intervening guard/try/return lines are elided in this excerpt.
1587 def _real_extract(self, url):
1589 mobj = re.match(self._VALID_URL, url)
1591 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1594 username = mobj.group(1)
1596 # Download video ids using YouTube Data API. Result size per
1597 # query is limited (currently to 50 videos) so we need to query
1598 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
1605 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1606 self.report_download_page(username, start_index)
1608 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1611 page = urllib2.urlopen(request).read()
1612 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1613 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1616 # Extract video identifiers
1619 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1620 if mobj.group(1) not in ids_in_page:
1621 ids_in_page.append(mobj.group(1))
1623 video_ids.extend(ids_in_page)
1625 # A little optimization - if current page is not
1626 # "full", ie. does not contain PAGE_SIZE video ids then
1627 # we can assume that this page is the last one - there
1628 # are no more ids on further pages - no need to query
1631 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1636 all_ids_count = len(video_ids)
# Honor --playlist-start / --playlist-end (1-based options, 0-based slice).
1637 playliststart = self._downloader.params.get('playliststart', 1) - 1
1638 playlistend = self._downloader.params.get('playlistend', -1)
1640 if playlistend == -1:
1641 video_ids = video_ids[playliststart:]
1643 video_ids = video_ids[playliststart:playlistend]
1645 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1646 (username, all_ids_count, len(video_ids)))
1648 for video_id in video_ids:
1649 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1652 class BlipTVUserIE(InfoExtractor):
1653 """Information Extractor for blip.tv users."""
1655 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1657 IE_NAME = u'blip.tv:user'
1659 def __init__(self, downloader=None):
1660 InfoExtractor.__init__(self, downloader)
1662 def report_download_page(self, username, pagenum):
1663 """Report attempt to download user page."""
1664 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1665 (self.IE_NAME, username, pagenum))
# NOTE(review): intervening guard/try/return lines are elided in this excerpt;
# _PAGE_SIZE is referenced below but its definition is not visible here —
# presumably a class attribute declared on an elided line. TODO confirm.
1667 def _real_extract(self, url):
1669 mobj = re.match(self._VALID_URL, url)
1671 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1674 username = mobj.group(1)
# Mobile endpoint that lists a user's full episode list, keyed by users_id.
1676 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1678 request = urllib2.Request(url)
1681 page = urllib2.urlopen(request).read().decode('utf-8')
# The numeric users_id is scraped from the user's HTML page first.
1682 mobj = re.search(r'data-users-id="([^"]+)"', page)
1683 page_base = page_base % mobj.group(1)
1684 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1685 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1689 # Download video ids using BlipTV Ajax calls. Result size per
1690 # query is limited (currently to 12 videos) so we need to query
1691 # page by page until there are no video ids - it means we got
1698 self.report_download_page(username, pagenum)
1700 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1703 page = urllib2.urlopen(request).read().decode('utf-8')
1704 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1705 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1708 # Extract video identifiers
1711 for mobj in re.finditer(r'href="/([^"]+)"', page):
1712 if mobj.group(1) not in ids_in_page:
1713 ids_in_page.append(unescapeHTML(mobj.group(1)))
1715 video_ids.extend(ids_in_page)
1717 # A little optimization - if current page is not
1718 # "full", ie. does not contain PAGE_SIZE video ids then
1719 # we can assume that this page is the last one - there
1720 # are no more ids on further pages - no need to query
1723 if len(ids_in_page) < self._PAGE_SIZE:
1728 all_ids_count = len(video_ids)
# Honor --playlist-start / --playlist-end (1-based options, 0-based slice).
1729 playliststart = self._downloader.params.get('playliststart', 1) - 1
1730 playlistend = self._downloader.params.get('playlistend', -1)
1732 if playlistend == -1:
1733 video_ids = video_ids[playliststart:]
1735 video_ids = video_ids[playliststart:playlistend]
1737 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1738 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1740 for video_id in video_ids:
1741 self._downloader.download([u'http://blip.tv/'+video_id])
1744 class DepositFilesIE(InfoExtractor):
1745 """Information extractor for depositfiles.com"""
1747 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1748 IE_NAME = u'DepositFiles'
1750 def __init__(self, downloader=None):
1751 InfoExtractor.__init__(self, downloader)
1753 def report_download_webpage(self, file_id):
1754 """Report webpage download."""
1755 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1757 def report_extraction(self, file_id):
1758 """Report information extraction."""
1759 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
# NOTE(review): intervening guard/try/return lines are elided in this excerpt.
1761 def _real_extract(self, url):
1762 file_id = url.split('/')[-1]
1763 # Rebuild url in english locale
1764 url = 'http://depositfiles.com/en/files/' + file_id
1766 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the 'Free download' button.
1767 free_download_indication = { 'gateway_result' : '1' }
1768 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1770 self.report_download_webpage(file_id)
1771 webpage = urllib2.urlopen(request).read()
1772 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1773 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1776 # Search for the real file URL
1777 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1778 if (mobj is None) or (mobj.group(1) is None):
1779 # Try to figure out reason of the error.
# Site restriction messages (quota, region, etc.) come wrapped in <strong>.
1780 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1781 if (mobj is not None) and (mobj.group(1) is not None):
1782 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1783 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1785 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1788 file_url = mobj.group(1)
1789 file_extension = os.path.splitext(file_url)[1][1:]
1791 # Search for file title
1792 mobj = re.search(r'<b title="(.*?)">', webpage)
1794 self._downloader.trouble(u'ERROR: unable to extract title')
1796 file_title = mobj.group(1).decode('utf-8')
1799 'id': file_id.decode('utf-8'),
1800 'url': file_url.decode('utf-8'),
1802 'upload_date': u'NA',
1803 'title': file_title,
1804 'ext': file_extension.decode('utf-8'),
1810 class FacebookIE(InfoExtractor):
1811 """Information Extractor for Facebook"""
1813 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1814 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1815 _NETRC_MACHINE = 'facebook'
# Formats in decreasing quality order; list order drives format selection below.
1816 _available_formats = ['video', 'highqual', 'lowqual']
1817 _video_extensions = {
1822 IE_NAME = u'facebook'
1824 def __init__(self, downloader=None):
1825 InfoExtractor.__init__(self, downloader)
1827 def _reporter(self, message):
1828 """Add header and report message."""
1829 self._downloader.to_screen(u'[facebook] %s' % message)
1831 def report_login(self):
1832 """Report attempt to log in."""
1833 self._reporter(u'Logging in')
1835 def report_video_webpage_download(self, video_id):
1836 """Report attempt to download video webpage."""
1837 self._reporter(u'%s: Downloading video webpage' % video_id)
1839 def report_information_extraction(self, video_id):
1840 """Report attempt to extract video information."""
1841 self._reporter(u'%s: Extracting video information' % video_id)
# NOTE(review): intervening guard/try/return lines are elided in this excerpt.
1843 def _parse_page(self, video_webpage):
1844 """Extract video information from page"""
# Each field is located with its own regex against inlined JS calls / markup.
1846 data = {'title': r'\("video_title", "(.*?)"\)',
1847 'description': r'<div class="datawrap">(.*?)</div>',
1848 'owner': r'\("video_owner_name", "(.*?)"\)',
1849 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1852 for piece in data.keys():
1853 mobj = re.search(data[piece], video_webpage)
1854 if mobj is not None:
1855 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1859 for fmt in self._available_formats:
1860 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1861 if mobj is not None:
1862 # URL is in a Javascript segment inside an escaped Unicode format within
1863 # the generally utf-8 page
1864 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1865 video_info['video_urls'] = video_urls
1869 def _real_initialize(self):
1870 if self._downloader is None:
1875 downloader_params = self._downloader.params
1877 # Attempt to use provided username and password or .netrc data
1878 if downloader_params.get('username', None) is not None:
1879 useremail = downloader_params['username']
1880 password = downloader_params['password']
1881 elif downloader_params.get('usenetrc', False):
1883 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1884 if info is not None:
1888 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1889 except (IOError, netrc.NetrcParseError), err:
1890 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1893 if useremail is None:
# Login failures are warnings, not fatal: extraction of public videos may
# still work without a session.
1902 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1905 login_results = urllib2.urlopen(request).read()
1906 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1907 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1909 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1910 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1913 def _real_extract(self, url):
1914 mobj = re.match(self._VALID_URL, url)
1916 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1918 video_id = mobj.group('ID')
1921 self.report_video_webpage_download(video_id)
1922 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1924 page = urllib2.urlopen(request)
1925 video_webpage = page.read()
1926 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1927 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1930 # Start extracting information
1931 self.report_information_extraction(video_id)
1933 # Extract information
1934 video_info = self._parse_page(video_webpage)
1937 if 'owner' not in video_info:
1938 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1940 video_uploader = video_info['owner']
1943 if 'title' not in video_info:
1944 self._downloader.trouble(u'ERROR: unable to extract video title')
1946 video_title = video_info['title']
1947 video_title = video_title.decode('utf-8')
# Missing thumbnail is non-fatal, unlike owner/title above.
1950 if 'thumbnail' not in video_info:
1951 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1952 video_thumbnail = ''
1954 video_thumbnail = video_info['thumbnail']
1958 if 'upload_date' in video_info:
1959 upload_time = video_info['upload_date']
1960 timetuple = email.utils.parsedate_tz(upload_time)
1961 if timetuple is not None:
1963 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1968 video_description = video_info.get('description', 'No description available.')
1970 url_map = video_info['video_urls']
1971 if len(url_map.keys()) > 0:
1972 # Decide which formats to download
1973 req_format = self._downloader.params.get('format', None)
1974 format_limit = self._downloader.params.get('format_limit', None)
# format_limit trims the candidate list to formats at or below the limit.
1976 if format_limit is not None and format_limit in self._available_formats:
1977 format_list = self._available_formats[self._available_formats.index(format_limit):]
1979 format_list = self._available_formats
1980 existing_formats = [x for x in format_list if x in url_map]
1981 if len(existing_formats) == 0:
1982 self._downloader.trouble(u'ERROR: no known formats available for video')
1984 if req_format is None:
1985 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1986 elif req_format == 'worst':
1987 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1988 elif req_format == '-1':
1989 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1992 if req_format not in url_map:
1993 self._downloader.trouble(u'ERROR: requested format not available')
1995 video_url_list = [(req_format, url_map[req_format])] # Specific format
1998 for format_param, video_real_url in video_url_list:
2000 video_extension = self._video_extensions.get(format_param, 'mp4')
2003 'id': video_id.decode('utf-8'),
2004 'url': video_real_url.decode('utf-8'),
2005 'uploader': video_uploader.decode('utf-8'),
2006 'upload_date': upload_date,
2007 'title': video_title,
2008 'ext': video_extension.decode('utf-8'),
2009 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2010 'thumbnail': video_thumbnail.decode('utf-8'),
2011 'description': video_description.decode('utf-8'),
# Extractor for blip.tv. Appends skin=json parameters to the page URL and reads
# metadata from the JSON 'Post' object; if the server answers with a video/*
# Content-Type the URL is treated as a direct media download instead.
# NOTE(review): this listing is elided — several control-flow lines (try:,
# return, if-None guards, dict literals) are missing between the numbered lines.
2016 class BlipTVIE(InfoExtractor):
2017 """Information extractor for blip.tv"""
2019 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Regex used to pull the filename extension off the media URL.
2020 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2021 IE_NAME = u'blip.tv'
2023 def report_extraction(self, file_id):
2024 """Report information extraction."""
2025 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2027 def report_direct_download(self, title):
2028 """Report information extraction."""
2029 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2031 def _real_extract(self, url):
2032 mobj = re.match(self._VALID_URL, url)
2034 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar ('?' or '&') is chosen on an elided line depending on whether the URL
# already carries a query string.
2041 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2042 request = urllib2.Request(json_url.encode('utf-8'))
2043 self.report_extraction(mobj.group(1))
2046 urlh = urllib2.urlopen(request)
# Direct download: title/extension come from the URL basename, no JSON needed.
2047 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2048 basename = url.split('/')[-1]
2049 title,ext = os.path.splitext(basename)
2050 title = title.decode('UTF-8')
2051 ext = ext.replace('.', '')
2052 self.report_direct_download(title)
2060 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2061 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2063 if info is None: # Regular URL
2065 json_code = urlh.read()
2066 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2067 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2071 json_data = json.loads(json_code)
2072 if 'Post' in json_data:
2073 data = json_data['Post']
# NOTE(review): '%H:%M%p' mixes 24-hour %H with AM/PM %p, so the AM/PM marker
# has no effect; harmless for a date-only result, but worth confirming.
2077 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2078 video_url = data['media']['url']
2079 umobj = re.match(self._URL_EXT, video_url)
2081 raise ValueError('Can not determine filename extension')
2082 ext = umobj.group(1)
2085 'id': data['item_id'],
2087 'uploader': data['display_name'],
2088 'upload_date': upload_date,
2089 'title': data['title'],
2091 'format': data['media']['mimeType'],
2092 'thumbnail': data['thumbnailUrl'],
2093 'description': data['description'],
2094 'player_url': data['embedUrl']
# ValueError/KeyError from the metadata lookups above are reported, not raised.
2096 except (ValueError,KeyError), err:
2097 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# NOTE(review): User-Agent is forced to iTunes here — presumably required by
# blip.tv's servers; confirm against the download path that uses std_headers.
2100 std_headers['User-Agent'] = 'iTunes/10.6.1'
2104 class MyVideoIE(InfoExtractor):
2105 """Information Extractor for myvideo.de."""
2107 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2108 IE_NAME = u'myvideo'
2110 def __init__(self, downloader=None):
2111 InfoExtractor.__init__(self, downloader)
2113 def report_download_webpage(self, video_id):
2114 """Report webpage download."""
2115 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2117 def report_extraction(self, video_id):
2118 """Report information extraction."""
2119 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2121 def _real_extract(self,url):
2122 mobj = re.match(self._VALID_URL, url)
2124 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2127 video_id = mobj.group(1)
2130 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2132 self.report_download_webpage(video_id)
2133 webpage = urllib2.urlopen(request).read()
2134 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2135 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2138 self.report_extraction(video_id)
2139 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2142 self._downloader.trouble(u'ERROR: unable to extract media URL')
2144 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2146 mobj = re.search('<title>([^<]+)</title>', webpage)
2148 self._downloader.trouble(u'ERROR: unable to extract title')
2151 video_title = mobj.group(1)
2157 'upload_date': u'NA',
2158 'title': video_title,
# Extractor for The Daily Show / Colbert Report full episodes. Accepts short
# aliases (":tds", ":colbert", ...) as well as full-episode page URLs, follows
# the page redirect, finds the Flash player URI, downloads the MRSS show index
# and then one mediaGen config per <item>, picking the highest bitrate.
# NOTE(review): this listing is elided — try:/return/else: lines and the
# per-item info dict are missing between the numbered lines.
2164 class ComedyCentralIE(InfoExtractor):
2165 """Information extractor for The Daily Show and Colbert Report """
# First alternative matches the ":shortname" aliases, second matches real URLs.
2167 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2168 IE_NAME = u'comedycentral'
2170 def report_extraction(self, episode_id):
2171 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2173 def report_config_download(self, episode_id):
2174 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2176 def report_index_download(self, episode_id):
2177 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2179 def report_player_url(self, episode_id):
2180 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2182 def _real_extract(self, url):
2183 mobj = re.match(self._VALID_URL, url)
2185 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Rewrite shortname aliases to the show's full-episodes landing page, then
# re-match so the named groups are populated.
2188 if mobj.group('shortname'):
2189 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2190 url = u'http://www.thedailyshow.com/full-episodes/'
2192 url = u'http://www.colbertnation.com/full-episodes/'
2193 mobj = re.match(self._VALID_URL, url)
2194 assert mobj is not None
# No episode in the URL means "download the newest episode".
2196 dlNewest = not mobj.group('episode')
2198 epTitle = mobj.group('showname')
2200 epTitle = mobj.group('episode')
2202 req = urllib2.Request(url)
2203 self.report_extraction(epTitle)
2205 htmlHandle = urllib2.urlopen(req)
2206 html = htmlHandle.read()
2207 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2208 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The landing page redirects to a concrete episode; re-validate the final URL.
2211 url = htmlHandle.geturl()
2212 mobj = re.match(self._VALID_URL, url)
2214 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2216 if mobj.group('episode') == '':
2217 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2219 epTitle = mobj.group('episode')
# group 0 = full player URL, group 1 = the mtvnservices URI embedded in it.
2221 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2222 if len(mMovieParams) == 0:
2223 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2226 playerUrl_raw = mMovieParams[0][0]
2227 self.report_player_url(epTitle)
# Resolve the raw player URL through its redirect to get the final SWF URL.
2229 urlHandle = urllib2.urlopen(playerUrl_raw)
2230 playerUrl = urlHandle.geturl()
2231 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2232 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2235 uri = mMovieParams[0][1]
2236 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2237 self.report_index_download(epTitle)
2239 indexXml = urllib2.urlopen(indexUrl).read()
2240 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2241 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# One MRSS <item> per episode act; each act is fetched and emitted separately.
2246 idoc = xml.etree.ElementTree.fromstring(indexXml)
2247 itemEls = idoc.findall('.//item')
2248 for itemEl in itemEls:
2249 mediaId = itemEl.findall('./guid')[0].text
2250 shortMediaId = mediaId.split(':')[-1]
2251 showId = mediaId.split(':')[-2].replace('.com', '')
2252 officialTitle = itemEl.findall('./title')[0].text
2253 officialDate = itemEl.findall('./pubDate')[0].text
2255 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2256 urllib.urlencode({'uri': mediaId}))
2257 configReq = urllib2.Request(configUrl)
2258 self.report_config_download(epTitle)
2260 configXml = urllib2.urlopen(configReq).read()
2261 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2262 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2265 cdoc = xml.etree.ElementTree.fromstring(configXml)
# turls (built on elided lines) collects (bitrate, src) pairs per rendition.
2267 for rendition in cdoc.findall('.//rendition'):
2268 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2272 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2275 # For now, just pick the highest bitrate
2276 format,video_url = turls[-1]
2278 effTitle = showId + u'-' + epTitle
2283 'upload_date': officialDate,
2288 'description': officialTitle,
2289 'player_url': playerUrl
2292 results.append(info)
# Extractor for escapistmagazine.com. Scrapes description/thumbnail/player URL
# from the page's meta tags, pulls the player's config= parameter, downloads
# that configuration (JS object literal, massaged into JSON) and takes the
# second playlist entry as the video URL.
# NOTE(review): listing is elided — try:/return guards are missing between the
# numbered lines.
2297 class EscapistIE(InfoExtractor):
2298 """Information extractor for The Escapist """
2300 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2301 IE_NAME = u'escapist'
2303 def report_extraction(self, showName):
2304 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2306 def report_config_download(self, showName):
2307 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2309 def _real_extract(self, url):
2310 mobj = re.match(self._VALID_URL, url)
2312 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2314 showName = mobj.group('showname')
2315 videoId = mobj.group('episode')
2317 self.report_extraction(showName)
# Decode the page using the charset advertised in Content-Type, falling back
# to UTF-8 when the header carries no recognizable charset.
2319 webPage = urllib2.urlopen(url)
2320 webPageBytes = webPage.read()
2321 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2322 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2323 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2324 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2327 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2328 description = unescapeHTML(descMatch.group(1))
2329 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2330 imgUrl = unescapeHTML(imgMatch.group(1))
2331 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2332 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL embeds the (percent-encoded) config URL as a query parameter.
2333 configUrlMatch = re.search('config=(.*)$', playerUrl)
2334 configUrl = urllib2.unquote(configUrlMatch.group(1))
2336 self.report_config_download(showName)
2338 configJSON = urllib2.urlopen(configUrl).read()
2339 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2340 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2343 # Technically, it's JavaScript, not JSON
# Crude JS->JSON fixup: swap single quotes for double quotes before parsing.
2344 configJSON = configJSON.replace("'", '"')
2347 config = json.loads(configJSON)
2348 except (ValueError,), err:
2349 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# playlist[1] is used as the actual video entry (playlist[0] is skipped).
2352 playlist = config['playlist']
2353 videoUrl = playlist[1]['url']
2358 'uploader': showName,
2359 'upload_date': None,
2363 'thumbnail': imgUrl,
2364 'description': description,
2365 'player_url': playerUrl,
# Extractor for collegehumor.com. Reads the internal video ID from the page,
# then fetches the moogaloop metadata XML and fills the info dict from its
# <video> node.
# NOTE(review): listing is elided — try:/return guards and parts of the info
# dict are missing between the numbered lines.
2371 class CollegeHumorIE(InfoExtractor):
2372 """Information extractor for collegehumor.com"""
2374 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2375 IE_NAME = u'collegehumor'
2377 def report_webpage(self, video_id):
2378 """Report information extraction."""
2379 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2381 def report_extraction(self, video_id):
2382 """Report information extraction."""
2383 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2385 def _real_extract(self, url):
2386 mobj = re.match(self._VALID_URL, url)
2388 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2390 video_id = mobj.group('videoid')
2392 self.report_webpage(video_id)
2393 request = urllib2.Request(url)
2395 webpage = urllib2.urlopen(request).read()
2396 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2397 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page's numeric URL id differs from the internal id used by the XML feed.
2400 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2402 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2404 internal_video_id = m.group('internalvideoid')
2408 'internal_id': internal_video_id,
2411 self.report_extraction(video_id)
2412 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2414 metaXml = urllib2.urlopen(xmlUrl).read()
2415 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2416 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# Populate the info dict from the metadata XML; an elided except-clause around
# these lookups reports 'Invalid metadata XML file' (line 2429 below).
2419 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2421 videoNode = mdoc.findall('./video')[0]
2422 info['description'] = videoNode.findall('./description')[0].text
2423 info['title'] = videoNode.findall('./caption')[0].text
2424 info['url'] = videoNode.findall('./file')[0].text
2425 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension is whatever follows the last '.' of the media URL.
2426 info['ext'] = info['url'].rpartition('.')[2]
2427 info['format'] = info['ext']
2429 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# Extractor for xvideos.com. Scrapes the flv_url query parameter, the page
# <title> and the thumbnail URL directly out of the watch-page HTML.
# NOTE(review): listing is elided — try:/return guards and the result dict
# opening are missing between the numbered lines.
2435 class XVideosIE(InfoExtractor):
2436 """Information extractor for xvideos.com"""
2438 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2439 IE_NAME = u'xvideos'
2441 def report_webpage(self, video_id):
2442 """Report information extraction."""
2443 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2445 def report_extraction(self, video_id):
2446 """Report information extraction."""
2447 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2449 def _real_extract(self, url):
2450 mobj = re.match(self._VALID_URL, url)
2452 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2454 video_id = mobj.group(1).decode('utf-8')
2456 self.report_webpage(video_id)
# Canonical watch URL is rebuilt from the id rather than reusing `url`.
2458 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2460 webpage = urllib2.urlopen(request).read()
2461 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2462 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2465 self.report_extraction(video_id)
# The media URL is percent-encoded inside a flv_url=...& query fragment.
2469 mobj = re.search(r'flv_url=(.+?)&', webpage)
2471 self._downloader.trouble(u'ERROR: unable to extract video url')
2473 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2477 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2479 self._downloader.trouble(u'ERROR: unable to extract video title')
2481 video_title = mobj.group(1).decode('utf-8')
2484 # Extract video thumbnail
2485 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2487 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail, not the captured basename.
2489 video_thumbnail = mobj.group(0).decode('utf-8')
2495 'upload_date': None,
2496 'title': video_title,
2499 'thumbnail': video_thumbnail,
2500 'description': None,
# Extractor for soundcloud.com tracks. Scrapes the track uid and stream token
# from the page source, builds the media.soundcloud.com stream URL from them,
# and best-effort extracts title/description/upload date.
# NOTE(review): listing is elided — try:/return and if-None guards are missing
# between the numbered lines.
2507 class SoundcloudIE(InfoExtractor):
2508 """Information extractor for soundcloud.com
2509 To access the media, the uid of the song and a stream token
2510 must be extracted from the page source and the script must make
2511 a request to media.soundcloud.com/crossdomain.xml. Then
2512 the media can be grabbed by requesting from an url composed
2513 of the stream token and uid
# Groups: (1) uploader slug, (2) track slug.
2516 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2517 IE_NAME = u'soundcloud'
2519 def __init__(self, downloader=None):
2520 InfoExtractor.__init__(self, downloader)
2522 def report_webpage(self, video_id):
2523 """Report information extraction."""
2524 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2526 def report_extraction(self, video_id):
2527 """Report information extraction."""
2528 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2530 def _real_extract(self, url):
2531 mobj = re.match(self._VALID_URL, url)
2533 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2536 # extract uploader (which is in the url)
2537 uploader = mobj.group(1).decode('utf-8')
2538 # extract simple title (uploader + slug of song title)
2539 slug_title = mobj.group(2).decode('utf-8')
2540 simple_title = uploader + u'-' + slug_title
2542 self.report_webpage('%s/%s' % (uploader, slug_title))
2544 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2546 webpage = urllib2.urlopen(request).read()
2547 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2548 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2551 self.report_extraction('%s/%s' % (uploader, slug_title))
2553 # extract uid and stream token that soundcloud hands out for access
2554 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2556 video_id = mobj.group(1)
2557 stream_token = mobj.group(2)
2559 # extract unsimplified title
# Fall back to the uploader-slug title when the JSON title is not found.
2560 mobj = re.search('"title":"(.*?)",', webpage)
2562 title = mobj.group(1).decode('utf-8')
2564 title = simple_title
2566 # construct media url (with uid/token)
2567 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2568 mediaURL = mediaURL % (video_id, stream_token)
# Description and upload date are best-effort; failures only log to stderr.
2571 description = u'No description available'
2572 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2574 description = mobj.group(1)
2578 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2581 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2582 except Exception, e:
2583 self._downloader.to_stderr(str(e))
2585 # for soundcloud, a request to a cross domain is required for cookies
# NOTE(review): only the Request object is constructed here; the urlopen call,
# if any, is on an elided line — verify against the full source.
2586 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2589 'id': video_id.decode('utf-8'),
2591 'uploader': uploader.decode('utf-8'),
2592 'upload_date': upload_date,
2597 'description': description.decode('utf-8')
# Extractor for infoq.com talks. Decodes the base64-encoded jsclassref page
# attribute into an rtmpe stream path and scrapes title/description from the
# page HTML.
# NOTE(review): listing is elided — try:/return guards and the result dict
# opening are missing between the numbered lines.
2601 class InfoQIE(InfoExtractor):
2602 """Information extractor for infoq.com"""
2604 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2607 def report_webpage(self, video_id):
2608 """Report information extraction."""
2609 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2611 def report_extraction(self, video_id):
2612 """Report information extraction."""
2613 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2615 def _real_extract(self, url):
2616 mobj = re.match(self._VALID_URL, url)
2618 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2621 self.report_webpage(url)
2623 request = urllib2.Request(url)
2625 webpage = urllib2.urlopen(request).read()
2626 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2627 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2630 self.report_extraction(url)
# jsclassref holds a base64-encoded, percent-encoded RTMP stream name.
2634 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2636 self._downloader.trouble(u'ERROR: unable to extract video url')
2638 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2642 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2644 self._downloader.trouble(u'ERROR: unable to extract video title')
2646 video_title = mobj.group(1).decode('utf-8')
2648 # Extract description
2649 video_description = u'No description available.'
2650 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2651 if mobj is not None:
2652 video_description = mobj.group(1).decode('utf-8')
# The id/extension are derived from the stream URL's basename ("<id>.<ext>").
2654 video_filename = video_url.split('/')[-1]
2655 video_id, extension = video_filename.split('.')
2661 'upload_date': None,
2662 'title': video_title,
2664 'format': extension, # Extension is always(?) mp4, but seems to be flv
2666 'description': video_description,
# Extractor for mixcloud.com. Queries the site's JSON cloudcast API for the
# available audio formats (optionally keyed by bitrate), probes candidate URLs
# until one responds, and returns the first working one.
# NOTE(review): listing is elided — try:/return statements and some branch
# lines are missing between the numbered lines.
2672 class MixcloudIE(InfoExtractor):
2673 """Information extractor for www.mixcloud.com"""
2674 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2675 IE_NAME = u'mixcloud'
2677 def __init__(self, downloader=None):
2678 InfoExtractor.__init__(self, downloader)
2680 def report_download_json(self, file_id):
2681 """Report JSON download."""
2682 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2684 def report_extraction(self, file_id):
2685 """Report information extraction."""
2686 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Returns the URL list for `fmt`; formats may map bitrate -> urls, or be a
# plain url list (TypeError path) when no bitrate info exists.
2688 def get_urls(self, jsonData, fmt, bitrate='best'):
2689 """Get urls from 'audio_formats' section in json"""
2692 bitrate_list = jsonData[fmt]
2693 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2694 bitrate = max(bitrate_list) # select highest
2696 url_list = jsonData[fmt][bitrate]
2697 except TypeError: # we have no bitrate info.
2698 url_list = jsonData[fmt]
2701 def check_urls(self, url_list):
2702 """Returns 1st active url from list"""
# Probe each candidate with a GET; network errors skip to the next candidate.
2703 for url in url_list:
2705 urllib2.urlopen(url)
2707 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2712 def _print_formats(self, formats):
2713 print 'Available formats:'
2714 for fmt in formats.keys():
2715 for b in formats[fmt]:
2717 ext = formats[fmt][b][0]
2718 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2719 except TypeError: # we have no bitrate info
2720 ext = formats[fmt][0]
2721 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2724 def _real_extract(self, url):
2725 mobj = re.match(self._VALID_URL, url)
2727 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2729 # extract uploader & filename from url
2730 uploader = mobj.group(1).decode('utf-8')
2731 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2733 # construct API request
# The API path reuses the uploader/track segments of the original URL.
2734 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2735 # retrieve .json file with links to files
2736 request = urllib2.Request(file_url)
2738 self.report_download_json(file_url)
2739 jsonData = urllib2.urlopen(request).read()
2740 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2741 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2745 json_data = json.loads(jsonData)
2746 player_url = json_data['player_swf_url']
2747 formats = dict(json_data['audio_formats'])
2749 req_format = self._downloader.params.get('format', None)
2752 if self._downloader.params.get('listformats', None):
2753 self._print_formats(formats)
# 'best' (or no preference): take the first format whose URLs actually work.
2756 if req_format is None or req_format == 'best':
2757 for format_param in formats.keys():
2758 url_list = self.get_urls(formats, format_param)
2760 file_url = self.check_urls(url_list)
2761 if file_url is not None:
2764 if req_format not in formats.keys():
2765 self._downloader.trouble(u'ERROR: format is not available')
2768 url_list = self.get_urls(formats, req_format)
2769 file_url = self.check_urls(url_list)
2770 format_param = req_format
2773 'id': file_id.decode('utf-8'),
2774 'url': file_url.decode('utf-8'),
2775 'uploader': uploader.decode('utf-8'),
2776 'upload_date': u'NA',
2777 'title': json_data['name'],
2778 'ext': file_url.split('.')[-1].decode('utf-8'),
2779 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2780 'thumbnail': json_data['thumbnail_url'],
2781 'description': json_data['description'],
2782 'player_url': player_url.decode('utf-8'),
# Extractor for Stanford Open ClassRoom. Handles three URL shapes: a specific
# video (course+video), a course page (list of VideoPage links), and the site
# root (list of CoursePage links). List pages are expanded recursively via
# self.extract() on 'reference' entries.
# NOTE(review): listing is elided — try:/return lines, info-dict openings and
# list-append lines are missing between the numbered lines.
2785 class StanfordOpenClassroomIE(InfoExtractor):
2786 """Information extractor for Stanford's Open ClassRoom"""
2788 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2789 IE_NAME = u'stanfordoc'
2791 def report_download_webpage(self, objid):
2792 """Report information extraction."""
2793 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2795 def report_extraction(self, video_id):
2796 """Report information extraction."""
2797 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2799 def _real_extract(self, url):
2800 mobj = re.match(self._VALID_URL, url)
2802 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Case 1: a single video — fetch its per-video metadata XML.
2805 if mobj.group('course') and mobj.group('video'): # A specific video
2806 course = mobj.group('course')
2807 video = mobj.group('video')
2809 'id': course + '_' + video,
2812 self.report_extraction(info['id'])
2813 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2814 xmlUrl = baseUrl + video + '.xml'
2816 metaXml = urllib2.urlopen(xmlUrl).read()
2817 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2818 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2820 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2822 info['title'] = mdoc.findall('./title')[0].text
2823 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2825 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2827 info['ext'] = info['url'].rpartition('.')[2]
2828 info['format'] = info['ext']
# Case 2: a course page — collect its VideoPage links as references.
2830 elif mobj.group('course'): # A course page
2831 course = mobj.group('course')
2837 self.report_download_webpage(info['id'])
2839 coursepage = urllib2.urlopen(url).read()
2840 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2841 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2844 m = re.search('<h1>([^<]+)</h1>', coursepage)
2846 info['title'] = unescapeHTML(m.group(1))
2848 info['title'] = info['id']
2850 m = re.search('<description>([^<]+)</description>', coursepage)
2852 info['description'] = unescapeHTML(m.group(1))
# orderedSet keeps each link once while preserving page order.
2854 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2857 'type': 'reference',
2858 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recurse: each reference entry is re-fed through the extractor machinery.
2862 for entry in info['list']:
2863 assert entry['type'] == 'reference'
2864 results += self.extract(entry['url'])
# Case 3: the site root — collect CoursePage links and recurse the same way.
2869 'id': 'Stanford OpenClassroom',
2873 self.report_download_webpage(info['id'])
2874 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2876 rootpage = urllib2.urlopen(rootURL).read()
2877 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2878 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2881 info['title'] = info['id']
2883 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2886 'type': 'reference',
2887 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2892 for entry in info['list']:
2893 assert entry['type'] == 'reference'
2894 results += self.extract(entry['url'])
2897 class MTVIE(InfoExtractor):
2898 """Information extractor for MTV.com"""
2900 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2903 def report_webpage(self, video_id):
2904 """Report information extraction."""
2905 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2907 def report_extraction(self, video_id):
2908 """Report information extraction."""
2909 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2911 def _real_extract(self, url):
2912 mobj = re.match(self._VALID_URL, url)
2914 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2916 if not mobj.group('proto'):
2917 url = 'http://' + url
2918 video_id = mobj.group('videoid')
2919 self.report_webpage(video_id)
2921 request = urllib2.Request(url)
2923 webpage = urllib2.urlopen(request).read()
2924 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2925 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2928 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2930 self._downloader.trouble(u'ERROR: unable to extract song name')
2932 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2933 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2935 self._downloader.trouble(u'ERROR: unable to extract performer')
2937 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2938 video_title = performer + ' - ' + song_name
2940 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2942 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2944 mtvn_uri = mobj.group(1)
2946 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2948 self._downloader.trouble(u'ERROR: unable to extract content id')
2950 content_id = mobj.group(1)
2952 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2953 self.report_extraction(video_id)
2954 request = urllib2.Request(videogen_url)
2956 metadataXml = urllib2.urlopen(request).read()
2957 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2958 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2961 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2962 renditions = mdoc.findall('.//rendition')
2964 # For now, always pick the highest quality.
2965 rendition = renditions[-1]
2968 _,_,ext = rendition.attrib['type'].partition('/')
2969 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2970 video_url = rendition.find('./src').text
2972 self._downloader.trouble('Invalid rendition field.')
2978 'uploader': performer,
2979 'title': video_title,
# Extractor for v.youku.com. Fetches the getPlayList JSON, then reconstructs
# the real segment file ids by decrypting the 'streamfileids' value with a
# seed-driven character shuffle, emitting one info dict per video segment.
# NOTE(review): listing is elided — try:/return lines, format-selection
# branches and parts of the per-segment dict are missing between the numbered
# lines. Note the `except ... as err` at line 3044 vs the older `, err` form
# used elsewhere in the file.
2987 class YoukuIE(InfoExtractor):
2989 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
2992 def __init__(self, downloader=None):
2993 InfoExtractor.__init__(self, downloader)
2995 def report_download_webpage(self, file_id):
2996 """Report webpage download."""
2997 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
2999 def report_extraction(self, file_id):
3000 """Report information extraction."""
3001 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
# Session id: millisecond timestamp plus two random components.
3004 nowTime = int(time.time() * 1000)
3005 random1 = random.randint(1000,1998)
3006 random2 = random.randint(1000,9999)
3008 return "%d%d%d" %(nowTime,random1,random2)
# Deterministic shuffle of the alphabet keyed by `seed`; used to map the
# numeric fileId tokens back to characters.
3010 def _get_file_ID_mix_string(self, seed):
3012 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3014 for i in range(len(source)):
3015 seed = (seed * 211 + 30031 ) % 65536
3016 index = math.floor(seed / 65536 * len(source) )
3017 mixed.append(source[int(index)])
3018 source.remove(source[int(index)])
3019 #return ''.join(mixed)
# fileId is a '*'-separated list of indexes into the mixed alphabet.
3022 def _get_file_id(self, fileId, seed):
3023 mixed = self._get_file_ID_mix_string(seed)
3024 ids = fileId.split('*')
3028 realId.append(mixed[int(ch)])
3029 return ''.join(realId)
3031 def _real_extract(self, url):
3032 mobj = re.match(self._VALID_URL, url)
3034 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3036 video_id = mobj.group('ID')
3038 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3040 request = urllib2.Request(info_url, None, std_headers)
3042 self.report_download_webpage(video_id)
3043 jsondata = urllib2.urlopen(request).read()
3044 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
3045 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3048 self.report_extraction(video_id)
3050 config = json.loads(jsondata)
3052 video_title = config['data'][0]['title']
3053 seed = config['data'][0]['seed']
# Map the user's requested format onto one of the site's stream ids; the
# intermediate branches (hd2/mp4/flv choices) are on elided lines.
3055 format = self._downloader.params.get('format', None)
3056 supported_format = config['data'][0]['streamfileids'].keys()
3058 if format is None or format == 'best':
3059 if 'hd2' in supported_format:
3064 elif format == 'worst':
3072 fileid = config['data'][0]['streamfileids'][format]
3073 seg_number = len(config['data'][0]['segs'][format])
# One access key per segment, consumed when building each segment URL below.
3076 for i in xrange(seg_number):
3077 keys.append(config['data'][0]['segs'][format][i]['k'])
3080 #youku only could be viewed from mainland china
3082 self._downloader.trouble(u'ERROR: unable to extract info section')
3086 sid = self._gen_sid()
3087 fileid = self._get_file_id(fileid, seed)
3089 #column 8,9 of fileid represent the segment number
3090 #fileid[7:9] should be changed
3091 for index, key in enumerate(keys):
3093 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3094 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3097 'id': '%s_part%02d' % (video_id, index),
3098 'url': download_url,
3100 'title': video_title,
3104 files_info.append(info)
3109 class XNXXIE(InfoExtractor):
3110 """Information extractor for xnxx.com"""
3112 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3114 VIDEO_URL_RE = r'flv_url=(.*?)&'
3115 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3116 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3118 def report_webpage(self, video_id):
3119 """Report information extraction"""
3120 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3122 def report_extraction(self, video_id):
3123 """Report information extraction"""
3124 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3126 def _real_extract(self, url):
3127 mobj = re.match(self._VALID_URL, url)
3129 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3131 video_id = mobj.group(1).decode('utf-8')
3133 self.report_webpage(video_id)
3135 # Get webpage content
3137 webpage = urllib2.urlopen(url).read()
3138 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3139 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3142 result = re.search(self.VIDEO_URL_RE, webpage)
3144 self._downloader.trouble(u'ERROR: unable to extract video url')
3146 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3148 result = re.search(self.VIDEO_TITLE_RE, webpage)
3150 self._downloader.trouble(u'ERROR: unable to extract video title')
3152 video_title = result.group(1).decode('utf-8')
3154 result = re.search(self.VIDEO_THUMB_RE, webpage)
3156 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3158 video_thumbnail = result.group(1).decode('utf-8')
3160 info = {'id': video_id,
3163 'upload_date': None,
3164 'title': video_title,
3167 'thumbnail': video_thumbnail,
3168 'description': None,