2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
18 from urlparse import parse_qs
21 import cStringIO as StringIO
class InfoExtractor(object):
	"""Information Extractor base class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

		id:		Video identifier.
		url:		Final video URL.
		uploader:	Nickname of the video uploader.
		title:		Video title.
		ext:		Video filename extension.
		player_url:	SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

		thumbnail:	Full URL to a video thumbnail image.
		description:	One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods and define a _VALID_URL regexp.
	Probably, they should also be added to the list of extractors.
	"""

	_ready = False		# guards one-time initialization (see initialize())
	_downloader = None	# FileDownloader instance, set via set_downloader()

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Initializes an instance (authentication, etc). Idempotent:
		the real work in _real_initialize() runs at most once."""
		if not self._ready:
			self._real_initialize()
			self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# NOTE(review): this excerpt is elided in places; "# [gap]" comments mark
	# spots where source lines are missing from this view.

	# [gap] _VALID_URL verbose-regex raw-string literal opens here
		(?:https?://)?                                       # http(s):// (optional)
		(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
		   tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
		(?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
		(?:                                                  # the various things that can precede the ID:
			(?:(?:v|embed|e)/) # v/ or embed/ or e/
			|(?:                                             # or the v= param in all its forms
				(?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
				(?:\?|\#!?) # the params delimiter ? or # or #!
				(?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
			# [gap] v= itself elided here
			)? # optional -> youtube.com/xxxx is OK
		)? # all until now is optional -> you can pass the naked ID
		([0-9A-Za-z_-]+) # here is it! the YouTube video ID
		(?(1).+)? # if we found the ID, everything can follow
	# [gap] regex literal closes here
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	_available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
	_video_extensions = {
		# [gap] itag -> extension entries elided
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
	# [gap] dict literal closes here
	_video_dimensions = {
	# [gap] itag -> "WxH" entries and closing brace elided

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		# Overrides the base: _VALID_URL is a verbose-mode regex.
		return re.match(self._VALID_URL, url, re.VERBOSE) is not None

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_video_subtitles_download(self, video_id):
		"""Report attempt to download video subtitles."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report extracted video URL."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')

	def _closed_captions_xml_to_srt(self, xml_string):
		"""Convert YouTube's timedtext XML into SubRip (.srt) text."""
		# [gap] srt accumulator string is initialized here
		texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
		# TODO parse xml instead of regex
		for n, (start, dur_tag, dur, caption) in enumerate(texts):
			if not dur: dur = '4'  # default caption duration when no dur= attribute
			# [gap] start is converted to float here
			end = start + float(dur)
			# Format as SRT timestamps: HH:MM:SS,mmm
			start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
			end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
			caption = unescapeHTML(caption)
			caption = unescapeHTML(caption) # double cycle, intentional
			srt += str(n+1) + '\n'
			srt += start + ' --> ' + end + '\n'
			srt += caption + '\n\n'
		# [gap] return srt

	def _print_formats(self, formats):
		"""List each available format with its extension and dimensions."""
		print 'Available formats:'
		# [gap] for x in formats: loop header elided
		print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

	def _real_initialize(self):
		"""Set language, log in (params or .netrc) and confirm age."""
		if self._downloader is None:
			# [gap] early return elided
		# [gap] username/password defaults elided
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			# [gap] try: opens here
			info = netrc.netrc().authenticators(self._NETRC_MACHINE)
			# [gap] credential unpacking / else-branch elided
			raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				# [gap] return elided

		# Set language (forces English pages so date/format parsing works)
		request = urllib2.Request(self._LANG_URL)
		# [gap] try: and report_lang() elided
		urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			# [gap] return elided

		# No authentication to be performed
		# [gap] username check and login_form dict opening elided
		'current_form': 'loginForm',
		'action_login': 'Log In',
		'username': username,
		'password': password,
		# [gap] dict literal closes here
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		# [gap] try: and report_login() elided
		login_results = urllib2.urlopen(request).read()
		# A loginForm in the response means the credentials were rejected
		if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
			self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
			# [gap] return elided
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			# [gap] return elided

		# Confirm age
		# [gap] age_form dict opening elided
		'action_confirm': 'Confirm',
		# [gap] dict literal closes here
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		# [gap] try: opens here
		self.report_age_confirmation()
		age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			# [gap] return elided

	def _real_extract(self, url):
		"""Download the watch page + get_video_info, pick format(s), return info dicts."""
		# Extract original video URL from URL with redirection, like age verification, using next_url parameter
		mobj = re.search(self._NEXT_URL_RE, url)
		# [gap] if mobj is not None: guard elided
		url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url, re.VERBOSE)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		# [gap] return elided
		video_id = mobj.group(2)

		# Get video webpage (has_verified=1 skips the age interstitial)
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
		# [gap] try: opens here
		video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			# [gap] return elided

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		# [gap] if mobj is not None: guard elided
		player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))  # un-escape \/ etc.
		# [gap] else: player_url = None elided

		# Get video info: try several &el= variants until one yields a token
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
			# [gap] try: opens here
			video_info_webpage = urllib2.urlopen(request).read()
			video_info = parse_qs(video_info_webpage)
			if 'token' in video_info:
				# [gap] break elided
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				# [gap] return elided
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			# [gap] else: elided
			self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			# [gap] return elided

		# Check for "rental" videos
		if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
			self._downloader.trouble(u'ERROR: "rental" videos not supported')
			# [gap] return elided

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			# [gap] return elided
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			# [gap] return elided
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')

		# thumbnail
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			# [gap] fallback thumbnail assignment elided
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date: scraped from the watch page, tried against several formats
		# [gap] upload_date default elided
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		# [gap] if mobj is not None: guard elided
		upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
		format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
		for expression in format_expressions:
			# [gap] try: opens here
			upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
			# [gap] except ValueError handling elided

		# description
		video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
		if video_description: video_description = clean_html(video_description)
		else: video_description = ''

		# closed captions: Trouble exceptions below downgrade to warnings in
		# the except Trouble handler at the end of this section
		video_subtitles = None
		if self._downloader.params.get('writesubtitles', False):
			# [gap] try: opens here
			self.report_video_subtitles_download(video_id)
			request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
			# [gap] try: opens here
			srt_list = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
			srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
			srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)  # lang_code -> track name
			if not srt_lang_list:
				raise Trouble(u'WARNING: video has no closed captions')
			if self._downloader.params.get('subtitleslang', False):
				srt_lang = self._downloader.params.get('subtitleslang')
			elif 'en' in srt_lang_list:
				# [gap] srt_lang = 'en' and else: elided; fallback is an arbitrary language
				srt_lang = srt_lang_list.keys()[0]
			if not srt_lang in srt_lang_list:
				raise Trouble(u'WARNING: no closed captions found in the specified language')
			request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
			# [gap] try: opens here
			srt_xml = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
			# [gap] empty-response guard elided
			raise Trouble(u'WARNING: unable to download video subtitles')
			video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
			except Trouble as trouble:
				self._downloader.trouble(trouble[0])

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

			format_limit = self._downloader.params.get('format_limit', None)
			available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
			if format_limit is not None and format_limit in available_formats:
				format_list = available_formats[available_formats.index(format_limit):]
			# [gap] else: elided
			format_list = available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				# [gap] return elided
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
				# [gap] return elided
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			# [gap] else: elided
				# Specific formats. We pick the first in a slash-delimeted sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
					# [gap] if rf in url_map: guard elided
					video_url_list = [(rf, url_map[rf])]
					# [gap] break elided
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
					# [gap] return elided
		# [gap] else: elided
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
			# [gap] return elided

		# [gap] results accumulator elided
		for format_param, video_real_url in video_url_list:
			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# [gap] results.append({ elided
			'id': video_id.decode('utf-8'),
			'url': video_real_url.decode('utf-8'),
			'uploader': video_uploader.decode('utf-8'),
			'upload_date': upload_date,
			'title': video_title,
			'ext': video_extension.decode('utf-8'),
			'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
			'thumbnail': video_thumbnail.decode('utf-8'),
			'description': video_description,
			'player_url': player_url,
			'subtitles': video_subtitles
			# [gap] }) and return of the results list elided
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# NOTE(review): elided excerpt; "# [gap]" comments mark missing source lines.

	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	IE_NAME = u'metacafe'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the family-filter disclaimer and confirm age once per session."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		# [gap] try: opens here
		self.report_disclaimer()
		disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			# [gap] return elided

		# Confirm age
		# [gap] disclaimer_form dict opening elided
		'submit': "Continue - I'm over 18",
		# [gap] dict literal closes here
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		# [gap] try: opens here
		self.report_age_confirmation()
		disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			# [gap] return elided

	def _real_extract(self, url):
		"""Extract media URL, title and uploader from a metacafe watch page."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		# [gap] return elided

		video_id = mobj.group(1)

		# Check if video comes from YouTube: ids prefixed "yt-" are delegated
		# to the YouTube extractor via the downloader
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
			# [gap] return elided

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		# [gap] try: opens here
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			# [gap] return elided

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		# [gap] if mobj is not None: guard elided
		mediaURL = urllib.unquote(mobj.group(1))
		video_extension = mediaURL[-3:]  # extension taken from the URL tail

		# Extract gdaKey if available
		mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
		# [gap] if mobj is None: fallback (video_url = mediaURL) elided
		gdaKey = mobj.group(1)
		video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		# [gap] else: branch for flashvars-based extraction opens here
		mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: unable to extract media URL')
		# [gap] return elided
		vardict = parse_qs(mobj.group(1))
		if 'mediaData' not in vardict:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			# [gap] return elided
		mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: unable to extract media URL')
		# [gap] return elided
		mediaURL = mobj.group(1).replace('\\/', '/')
		video_extension = mediaURL[-3:]
		video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: unable to extract title')
		# [gap] return elided
		video_title = mobj.group(1).decode('utf-8')

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		# [gap] return elided
		video_uploader = mobj.group(1)

		# [gap] return [{ elided
		'id': video_id.decode('utf-8'),
		'url': video_url.decode('utf-8'),
		'uploader': video_uploader.decode('utf-8'),
		'upload_date': u'NA',
		'title': video_title,
		'ext': video_extension.decode('utf-8'),
		# [gap] }] and remaining keys elided
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	# NOTE(review): elided excerpt; "# [gap]" comments mark missing source lines.

	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract the best-quality media URL, title and uploader from a page."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		# [gap] return elided

		# Strip title/query suffixes from the id path segment
		video_id = mobj.group(1).split('_')[0].split('?')[0]

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		request.add_header('Cookie', 'family_filter=off')  # disable the family filter
		# [gap] try: opens here
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			# [gap] return elided

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'\s*var flashvars = (.*)', webpage)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: unable to extract media URL')
		# [gap] return elided
		flashvars = urllib.unquote(mobj.group(1))
		# Prefer highest available quality: HQ > SD > LD
		if 'hqURL' in flashvars: max_quality = 'hqURL'
		elif 'sdURL' in flashvars: max_quality = 'sdURL'
		else: max_quality = 'ldURL'
		mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
		# [gap] if mobj is None: fallback guard elided
		mobj = re.search(r'"video_url":"(.*?)",', flashvars)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: unable to extract media URL')
		# [gap] return elided
		video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')

		# TODO: support choosing qualities

		mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: unable to extract title')
		# [gap] return elided
		video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		# [gap] return elided
		video_uploader = mobj.group(1)

		video_upload_date = u'NA'
		mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
		# [gap] if mobj is not None: guard elided
		video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)  # DD-MM-YYYY -> YYYYMMDD

		# [gap] return [{ elided
		'id': video_id.decode('utf-8'),
		'url': video_url.decode('utf-8'),
		'uploader': video_uploader.decode('utf-8'),
		'upload_date': video_upload_date,
		'title': video_title,
		'ext': video_extension.decode('utf-8'),
		# [gap] }] and remaining keys elided
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	# NOTE(review): elided excerpt; "# [gap]" comments mark missing source lines.

	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
	IE_NAME = u'video.google'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract media URL, title, description (and optionally thumbnail)."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
		# [gap] return elided

		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		# [gap] try: opens here
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			# [gap] return elided

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		# [gap] if mobj is None: branch elided — fall back to the flv stream
		video_extension = 'flv'
		mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: unable to extract media URL')
		# [gap] return elided
		mediaURL = urllib.unquote(mobj.group(1))
		# Un-escape the \xNN sequences embedded in the page's JS
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')
		# [gap] video_url assignment (both branches) elided

		mobj = re.search(r'<title>(.*)</title>', webpage)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: unable to extract title')
		# [gap] return elided
		video_title = mobj.group(1).decode('utf-8')

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: unable to extract video description')
		# [gap] return elided
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail: requires a second request, so only done on demand
		if self._downloader.params.get('forcethumbnail', False):
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			# [gap] try: opens here
			webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				# [gap] return elided
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			# [gap] if mobj is None: guard elided
			self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
			# [gap] return elided
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			# [gap] placeholder thumbnail assignment elided

		# [gap] return [{ elided
		'id': video_id.decode('utf-8'),
		'url': video_url.decode('utf-8'),
		# [gap] 'uploader' entry elided
		'upload_date': u'NA',
		'title': video_title,
		'ext': video_extension.decode('utf-8'),
		# [gap] }] and remaining keys elided
class PhotobucketIE(InfoExtractor):
	"""Information extractor for photobucket.com."""

	# NOTE(review): elided excerpt; "# [gap]" comments mark missing source lines.

	# The id group captures the .flv filename from the ?current= query param
	_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
	IE_NAME = u'photobucket'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract media URL, title and uploader from a photobucket page."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
		# [gap] return elided

		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# [gap] try: opens here
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			# [gap] return elided

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: unable to extract media URL')
		# [gap] return elided
		mediaURL = urllib.unquote(mobj.group(1))
		# [gap] video_url = mediaURL assignment elided

		mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: unable to extract title')
		# [gap] return elided
		video_title = mobj.group(1).decode('utf-8')

		video_uploader = mobj.group(2).decode('utf-8')

		# [gap] return [{ elided
		'id': video_id.decode('utf-8'),
		'url': video_url.decode('utf-8'),
		'uploader': video_uploader,
		'upload_date': u'NA',
		'title': video_title,
		'ext': video_extension.decode('utf-8'),
		# [gap] }] and remaining keys elided
class YahooIE(InfoExtractor):
	"""Information extractor for video.yahoo.com."""

	# NOTE(review): elided excerpt; "# [gap]" comments mark missing source lines.

	# _VALID_URL matches all Yahoo! Video URLs
	# _VPAGE_URL matches only the extractable '/watch/' URLs
	_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
	_VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
	IE_NAME = u'video.yahoo'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

	def _real_extract(self, url, new_video=True):
		"""Extract media URL and metadata; may recurse once after rewriting
		a non-/watch/ URL into the canonical /watch/ form."""
		# Extract ID from URL
		mobj = re.match(self._VALID_URL, url)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
		# [gap] return elided

		video_id = mobj.group(2)
		video_extension = 'flv'

		# Rewrite valid but non-extractable URLs as
		# extractable English language /watch/ URLs
		if re.match(self._VPAGE_URL, url) is None:
			request = urllib2.Request(url)
			# [gap] try: opens here
			webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				# [gap] return elided

			mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
			# [gap] if mobj is None: guard elided
			self._downloader.trouble(u'ERROR: Unable to extract id field')
			# [gap] return elided
			yahoo_id = mobj.group(1)

			mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
			# [gap] if mobj is None: guard elided
			self._downloader.trouble(u'ERROR: Unable to extract vid field')
			# [gap] return elided
			yahoo_vid = mobj.group(1)

			url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
			return self._real_extract(url, new_video=False)  # recurse once on canonical URL

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# [gap] try: opens here
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			# [gap] return elided

		# Extract uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: unable to extract video title')
		# [gap] return elided
		video_title = mobj.group(1).decode('utf-8')

		mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: unable to extract video uploader')
		# [gap] return elided
		# NOTE(review): group(1) is the (people|profile) path segment; the
		# uploader name looks like group(2) — verify against a live page.
		video_uploader = mobj.group(1).decode('utf-8')

		# Extract video thumbnail
		mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
		# [gap] return elided
		video_thumbnail = mobj.group(1).decode('utf-8')

		# Extract video description
		mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: unable to extract video description')
		# [gap] return elided
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video height and width (needed for the playlist request below)
		mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: unable to extract video height')
		# [gap] return elided
		yv_video_height = mobj.group(1)

		mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: unable to extract video width')
		# [gap] return elided
		yv_video_width = mobj.group(1)

		# Retrieve video playlist to extract media URL
		# I'm not completely sure what all these options are, but we
		# seem to need most of them, otherwise the server sends a 401.
		yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
		yv_bitrate = '700'  # according to Wikipedia this is hard-coded
		request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
				'&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
				'&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
		# [gap] try: opens here
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			# [gap] return elided

		# Extract media URL from playlist XML
		mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
		# [gap] if mobj is None: guard elided
		self._downloader.trouble(u'ERROR: Unable to extract media URL')
		# [gap] return elided
		video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
		video_url = unescapeHTML(video_url)

		# [gap] return [{ elided
		'id': video_id.decode('utf-8'),
		# [gap] 'url' entry elided
		'uploader': video_uploader,
		'upload_date': u'NA',
		'title': video_title,
		'ext': video_extension.decode('utf-8'),
		'thumbnail': video_thumbnail.decode('utf-8'),
		'description': video_description,
		# NOTE(review): duplicate 'thumbnail' key — this later entry silently
		# overrides the decoded one above; probably unintended.
		'thumbnail': video_thumbnail,
		# [gap] }] and remaining keys elided
1009 class VimeoIE(InfoExtractor):
1010 """Information extractor for vimeo.com."""
1012 # _VALID_URL matches Vimeo URLs
1013 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1016 def __init__(self, downloader=None):
1017 InfoExtractor.__init__(self, downloader)
1019 def report_download_webpage(self, video_id):
1020 """Report webpage download."""
1021 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1023 def report_extraction(self, video_id):
1024 """Report information extraction."""
1025 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1027 def _real_extract(self, url, new_video=True):
1028 # Extract ID from URL
1029 mobj = re.match(self._VALID_URL, url)
1031 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1034 video_id = mobj.group(1)
1036 # Retrieve video webpage to extract further information
1037 request = urllib2.Request(url, None, std_headers)
1039 self.report_download_webpage(video_id)
1040 webpage = urllib2.urlopen(request).read()
1041 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1042 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1045 # Now we begin extracting as much information as we can from what we
1046 # retrieved. First we extract the information common to all extractors,
1047 # and latter we extract those that are Vimeo specific.
1048 self.report_extraction(video_id)
1050 # Extract the config JSON
1051 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1053 config = json.loads(config)
1055 self._downloader.trouble(u'ERROR: unable to extract info section')
1059 video_title = config["video"]["title"]
1062 video_uploader = config["video"]["owner"]["name"]
1064 # Extract video thumbnail
1065 video_thumbnail = config["video"]["thumbnail"]
1067 # Extract video description
1068 video_description = get_element_by_id("description", webpage.decode('utf8'))
1069 if video_description: video_description = clean_html(video_description)
1070 else: video_description = ''
1072 # Extract upload date
1073 video_upload_date = u'NA'
1074 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1075 if mobj is not None:
1076 video_upload_date = mobj.group(1)
1078 # Vimeo specific: extract request signature and timestamp
1079 sig = config['request']['signature']
1080 timestamp = config['request']['timestamp']
1082 # Vimeo specific: extract video codec and quality information
1083 # TODO bind to format param
1084 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1085 for codec in codecs:
1086 if codec[0] in config["video"]["files"]:
1087 video_codec = codec[0]
1088 video_extension = codec[1]
1089 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1090 else: quality = 'sd'
1093 self._downloader.trouble(u'ERROR: no known codec found')
1096 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1097 %(video_id, sig, timestamp, quality, video_codec.upper())
1102 'uploader': video_uploader,
1103 'upload_date': video_upload_date,
1104 'title': video_title,
1105 'ext': video_extension,
1106 'thumbnail': video_thumbnail,
1107 'description': video_description,
# NOTE(review): line-number-prefixed listing with missing interior lines;
# comments describe only what is visible.
1112 class GenericIE(InfoExtractor):
# Last-resort extractor: follows URL-shortener redirects, then scrapes the
# page for a JW-Player-style `file=`/`source=` media URL.
1113 """Generic last-resort information extractor."""
1116 IE_NAME = u'generic'
1118 def __init__(self, downloader=None):
1119 InfoExtractor.__init__(self, downloader)
1121 def report_download_webpage(self, video_id):
1122 """Report webpage download."""
# Warn the user first: the generic fallback is best-effort only.
1123 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1124 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1126 def report_extraction(self, video_id):
1127 """Report information extraction."""
1128 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1130 def report_following_redirect(self, new_url):
1131 """Report information extraction."""
1132 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
# Issue a HEAD request; if the server redirects to a different URL, restart
# the whole download chain on the new URL and report True.
1134 def _test_redirect(self, url):
1135 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Local Request subclass forcing the HEAD method.
1136 class HeadRequest(urllib2.Request):
1137 def get_method(self):
# NOTE(review): the `return "HEAD"` body is missing from the listing.
1140 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
# NOTE(review): docstring delimiters around these two lines are missing.
1142 Subclass the HTTPRedirectHandler to make it use our
1143 HeadRequest also on the redirected URL
1145 def redirect_request(self, req, fp, code, msg, headers, newurl):
1146 if code in (301, 302, 303, 307):
1147 newurl = newurl.replace(' ', '%20')
# Drop body-describing headers when replaying the request against the new URL.
1148 newheaders = dict((k,v) for k,v in req.headers.items()
1149 if k.lower() not in ("content-length", "content-type"))
1150 return HeadRequest(newurl,
1152 origin_req_host=req.get_origin_req_host(),
# Non-redirect codes propagate as HTTPError.
1155 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1157 class HTTPMethodFallback(urllib2.BaseHandler):
# NOTE(review): docstring delimiters around this line are missing.
1159 Fallback to GET if HEAD is not allowed (405 HTTP error)
1161 def http_error_405(self, req, fp, code, msg, headers):
1165 newheaders = dict((k,v) for k,v in req.headers.items()
1166 if k.lower() not in ("content-length", "content-type"))
# Re-open the same URL with a plain (GET) Request.
1167 return self.parent.open(urllib2.Request(req.get_full_url(),
1169 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener with only the handlers needed for the HEAD probe.
1173 opener = urllib2.OpenerDirector()
1174 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1175 HTTPMethodFallback, HEADRedirectHandler,
1176 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1177 opener.add_handler(handler())
1179 response = opener.open(HeadRequest(url))
1180 new_url = response.geturl()
# Same URL back means no redirect happened.
1182 if url == new_url: return False
1184 self.report_following_redirect(new_url)
# Restart the download chain on the redirect target.
1185 self._downloader.download([new_url])
# NOTE(review): the `return True` is missing from the listing.
1188 def _real_extract(self, url):
1189 if self._test_redirect(url): return
# Fallback video id: last path component of the URL.
1191 video_id = url.split('/')[-1]
1192 request = urllib2.Request(url)
# NOTE(review): the `try:` opening this block is missing from the listing.
1194 self.report_download_webpage(video_id)
1195 webpage = urllib2.urlopen(request).read()
1196 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1197 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1199 except ValueError, err:
1200 # since this is the last-resort InfoExtractor, if
1201 # this error is thrown, it'll be thrown here
1202 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1205 self.report_extraction(video_id)
1206 # Start with something easy: JW Player in SWFObject
1207 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
# NOTE(review): an `if mobj is None:` guard before this broadened search is
# missing from the listing.
1209 # Broaden the search a little bit
1210 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1212 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1215 # It's possible that one of the regexes
1216 # matched, but returned an empty group:
1217 if mobj.group(1) is None:
1218 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1221 video_url = urllib.unquote(mobj.group(1))
1222 video_id = os.path.basename(video_url)
1224 # here's a fun little line of code for you:
# Extension = suffix of the basename; id = basename without the suffix.
1225 video_extension = os.path.splitext(video_id)[1][1:]
1226 video_id = os.path.splitext(video_id)[0]
1228 # it's tempting to parse this further, but you would
1229 # have to take into account all the variations like
1230 # Video Title - Site Name
1231 # Site Name | Video Title
1232 # Video Title - Tagline | Site Name
1233 # and so on and so forth; it's just not practical
1234 mobj = re.search(r'<title>(.*)</title>', webpage)
# NOTE(review): `if mobj is None:` guard missing from the listing.
1236 self._downloader.trouble(u'ERROR: unable to extract title')
1238 video_title = mobj.group(1).decode('utf-8')
1240 # video uploader is domain name
1241 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): `if mobj is None:` guard missing; the error message reads
# "unable to extract title" although this step extracts the uploader.
1243 self._downloader.trouble(u'ERROR: unable to extract title')
1245 video_uploader = mobj.group(1).decode('utf-8')
# NOTE(review): the `return [{` opening of the result dict is missing.
1248 'id': video_id.decode('utf-8'),
1249 'url': video_url.decode('utf-8'),
1250 'uploader': video_uploader,
1251 'upload_date': u'NA',
1252 'title': video_title,
1253 'ext': video_extension.decode('utf-8'),
# NOTE(review): line-number-prefixed listing with missing interior lines;
# comments describe only what is visible.
1259 class YoutubeSearchIE(InfoExtractor):
# Handles "ytsearchN:query" / "ytsearchall:query" pseudo-URLs by querying the
# GData API and dispatching each result to the YouTube watch-page extractor.
1260 """Information Extractor for YouTube search queries."""
1261 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
# GData JSON-C endpoint; pages of up to 50 results via start-index.
1262 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
# Hard cap imposed by the ytsearch API.
1263 _max_youtube_results = 1000
1264 IE_NAME = u'youtube:search'
1266 def __init__(self, downloader=None):
1267 InfoExtractor.__init__(self, downloader)
1269 def report_download_page(self, query, pagenum):
1270 """Report attempt to download search page with given number."""
1271 query = query.decode(preferredencoding())
1272 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix (count / 'all' / empty) and delegate to _download_n_results.
1274 def _real_extract(self, query):
1275 mobj = re.match(self._VALID_URL, query)
# NOTE(review): `if mobj is None:` guard and `return` are missing here.
1277 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1280 prefix, query = query.split(':')
1282 query = query.encode('utf-8')
# NOTE(review): the `if prefix == '':` branch line is missing; empty prefix
# downloads a single result.
1284 self._download_n_results(query, 1)
1286 elif prefix == 'all':
1287 self._download_n_results(query, self._max_youtube_results)
# NOTE(review): the `else:` / `n = long(prefix)` / `if n <= 0:` lines are
# missing before this error report.
1293 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1295 elif n > self._max_youtube_results:
# Clamp oversized requests to the API maximum, with a warning.
1296 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1297 n = self._max_youtube_results
1298 self._download_n_results(query, n)
1300 except ValueError: # parsing prefix as integer fails
1301 self._download_n_results(query, 1)
1304 def _download_n_results(self, query, n):
1305 """Downloads a specified number of results for a query"""
# NOTE(review): initialization of video_ids/pagenum/limit is missing from the
# listing before this loop.
1311 while (50 * pagenum) < limit:
1312 self.report_download_page(query, pagenum+1)
# GData start-index is 1-based.
1313 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1314 request = urllib2.Request(result_url)
# NOTE(review): `try:` opening this block is missing.
1316 data = urllib2.urlopen(request).read()
1317 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1318 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1320 api_response = json.loads(data)['data']
1322 new_ids = list(video['id'] for video in api_response['items'])
1323 video_ids += new_ids
# Never ask for more than the API reports as available.
1325 limit = min(n, api_response['totalItems'])
1328 if len(video_ids) > n:
1329 video_ids = video_ids[:n]
1330 for id in video_ids:
1331 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): line-number-prefixed listing with missing interior lines;
# comments describe only what is visible.
1335 class GoogleSearchIE(InfoExtractor):
# Handles "gvsearchN:query" / "gvsearchall:query" by scraping Google Video
# search result pages and dispatching each docid to the Google Video extractor.
1336 """Information Extractor for Google Video search queries."""
1337 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1338 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
# Regex that pulls the docid out of each result link.
1339 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next page" pager element.
1340 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1341 _max_google_results = 1000
1342 IE_NAME = u'video.google:search'
1344 def __init__(self, downloader=None):
1345 InfoExtractor.__init__(self, downloader)
1347 def report_download_page(self, query, pagenum):
1348 """Report attempt to download playlist page with given number."""
1349 query = query.decode(preferredencoding())
1350 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix (count / 'all' / empty) and delegate to _download_n_results.
1352 def _real_extract(self, query):
1353 mobj = re.match(self._VALID_URL, query)
# NOTE(review): `if mobj is None:` guard and `return` are missing here.
1355 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1358 prefix, query = query.split(':')
1360 query = query.encode('utf-8')
# NOTE(review): `if prefix == '':` branch line is missing; empty prefix
# downloads a single result.
1362 self._download_n_results(query, 1)
1364 elif prefix == 'all':
1365 self._download_n_results(query, self._max_google_results)
# NOTE(review): `else:` / `n = long(prefix)` / `if n <= 0:` lines missing.
1371 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1373 elif n > self._max_google_results:
1374 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1375 n = self._max_google_results
1376 self._download_n_results(query, n)
1378 except ValueError: # parsing prefix as integer fails
1379 self._download_n_results(query, 1)
1382 def _download_n_results(self, query, n):
1383 """Downloads a specified number of results for a query"""
# NOTE(review): video_ids/pagenum initialization and the loop header are
# missing from the listing before this report call.
1389 self.report_download_page(query, pagenum)
# Google paginates 10 results per page via the `start` offset.
1390 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1391 request = urllib2.Request(result_url)
# NOTE(review): `try:` opening this block is missing.
1393 page = urllib2.urlopen(request).read()
1394 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1395 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1398 # Extract video identifiers
1399 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1400 video_id = mobj.group(1)
# Deduplicate while preserving discovery order.
1401 if video_id not in video_ids:
1402 video_ids.append(video_id)
1403 if len(video_ids) == n:
1404 # Specified n videos reached
1405 for id in video_ids:
1406 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No "next page" marker: flush whatever was collected and stop.
1409 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1410 for id in video_ids:
1411 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1414 pagenum = pagenum + 1
# NOTE(review): line-number-prefixed listing with missing interior lines;
# comments describe only what is visible.
1417 class YahooSearchIE(InfoExtractor):
# Handles "yvsearchN:query" / "yvsearchall:query" by scraping Yahoo! Video
# search result pages and dispatching each watch id to the Yahoo extractor.
1418 """Information Extractor for Yahoo! Video search queries."""
1419 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1420 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1421 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1422 _MORE_PAGES_INDICATOR = r'\s*Next'
1423 _max_yahoo_results = 1000
1424 IE_NAME = u'video.yahoo:search'
1426 def __init__(self, downloader=None):
1427 InfoExtractor.__init__(self, downloader)
1429 def report_download_page(self, query, pagenum):
1430 """Report attempt to download playlist page with given number."""
1431 query = query.decode(preferredencoding())
1432 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix (count / 'all' / empty) and delegate to _download_n_results.
1434 def _real_extract(self, query):
1435 mobj = re.match(self._VALID_URL, query)
# NOTE(review): `if mobj is None:` guard and `return` are missing here.
1437 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1440 prefix, query = query.split(':')
1442 query = query.encode('utf-8')
# NOTE(review): `if prefix == '':` branch line is missing; empty prefix
# downloads a single result.
1444 self._download_n_results(query, 1)
1446 elif prefix == 'all':
1447 self._download_n_results(query, self._max_yahoo_results)
# NOTE(review): `else:` / `n = long(prefix)` / `if n <= 0:` lines missing.
1453 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1455 elif n > self._max_yahoo_results:
1456 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1457 n = self._max_yahoo_results
1458 self._download_n_results(query, n)
1460 except ValueError: # parsing prefix as integer fails
1461 self._download_n_results(query, 1)
1464 def _download_n_results(self, query, n):
1465 """Downloads a specified number of results for a query"""
# Seen-set for O(1) duplicate checks alongside the ordered video_ids list.
1468 already_seen = set()
# NOTE(review): video_ids/pagenum initialization and the loop header are
# missing from the listing before this report call.
1472 self.report_download_page(query, pagenum)
1473 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1474 request = urllib2.Request(result_url)
# NOTE(review): `try:` opening this block is missing.
1476 page = urllib2.urlopen(request).read()
1477 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1478 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1481 # Extract video identifiers
1482 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1483 video_id = mobj.group(1)
1484 if video_id not in already_seen:
1485 video_ids.append(video_id)
1486 already_seen.add(video_id)
1487 if len(video_ids) == n:
1488 # Specified n videos reached
1489 for id in video_ids:
1490 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "Next" marker: flush whatever was collected and stop.
1493 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1494 for id in video_ids:
1495 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1498 pagenum = pagenum + 1
# NOTE(review): line-number-prefixed listing with missing interior lines;
# comments describe only what is visible.
1501 class YoutubePlaylistIE(InfoExtractor):
# Walks a YouTube playlist/artist/course page by page, collects video ids, and
# dispatches each watch URL to the downloader (honoring playliststart/end).
1502 """Information Extractor for YouTube playlists."""
# Group 1: list-type query key (p/a/list); group 2: playlist id;
# optional group 3: a direct video id embedded in the URL.
1504 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1505 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
# %s is filled with the playlist id so only this playlist's links match.
1506 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=.*?%s'
1507 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1508 IE_NAME = u'youtube:playlist'
1510 def __init__(self, downloader=None):
1511 InfoExtractor.__init__(self, downloader)
1513 def report_download_page(self, playlist_id, pagenum):
1514 """Report attempt to download playlist page with given number."""
1515 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1517 def _real_extract(self, url):
1518 # Extract playlist id
1519 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard and `return` are missing here.
1521 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Single-video case: the URL pointed at one video inside the playlist.
1525 if mobj.group(3) is not None:
1526 self._downloader.download([mobj.group(3)])
# NOTE(review): the `return` after dispatching the single video is missing.
1529 # Download playlist pages
1530 # prefix is 'p' as default for playlists but there are other types that need extra care
1531 playlist_prefix = mobj.group(1)
1532 if playlist_prefix == 'a':
1533 playlist_access = 'artist'
# NOTE(review): the `else:` line before these defaults is missing.
1535 playlist_prefix = 'p'
1536 playlist_access = 'view_play_list'
1537 playlist_id = mobj.group(2)
# NOTE(review): video_ids/pagenum initialization and the `while True:` loop
# header are missing before this report call.
1542 self.report_download_page(playlist_id, pagenum)
1543 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1544 request = urllib2.Request(url)
# NOTE(review): `try:` opening this block is missing.
1546 page = urllib2.urlopen(request).read()
1547 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1548 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1551 # Extract video identifiers
# NOTE(review): `ids_in_page = []` initialization line is missing.
1553 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1554 if mobj.group(1) not in ids_in_page:
1555 ids_in_page.append(mobj.group(1))
1556 video_ids.extend(ids_in_page)
# Stop paging when the "next" pager element disappears.
1558 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1560 pagenum = pagenum + 1
# Apply the user's playliststart/playlistend window (1-based start).
1562 playliststart = self._downloader.params.get('playliststart', 1) - 1
1563 playlistend = self._downloader.params.get('playlistend', -1)
1564 if playlistend == -1:
1565 video_ids = video_ids[playliststart:]
# NOTE(review): the `else:` line before this slice is missing.
1567 video_ids = video_ids[playliststart:playlistend]
1569 for id in video_ids:
1570 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): line-number-prefixed listing with missing interior lines;
# comments describe only what is visible.
1574 class YoutubeUserIE(InfoExtractor):
# Collects all upload ids for a YouTube user via the paged GData API and
# dispatches each watch URL (honoring playliststart/end).
1575 """Information Extractor for YouTube users."""
1577 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1578 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps results per query; we page with start-index.
1579 _GDATA_PAGE_SIZE = 50
1580 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1581 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1582 IE_NAME = u'youtube:user'
1584 def __init__(self, downloader=None):
1585 InfoExtractor.__init__(self, downloader)
1587 def report_download_page(self, username, start_index):
1588 """Report attempt to download user page."""
1589 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1590 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1592 def _real_extract(self, url):
# NOTE(review): `# Extract username` comment line is missing here.
1594 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard and `return` are missing here.
1596 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1599 username = mobj.group(1)
1601 # Download video ids using YouTube Data API. Result size per
1602 # query is limited (currently to 50 videos) so we need to query
1603 # page by page until there are no video ids - it means we got
# NOTE(review): the rest of this comment, video_ids/pagenum initialization and
# the loop header are missing from the listing.
# GData start-index is 1-based.
1610 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1611 self.report_download_page(username, start_index)
1613 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
# NOTE(review): `try:` opening this block is missing.
1616 page = urllib2.urlopen(request).read()
1617 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1618 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1621 # Extract video identifiers
# NOTE(review): `ids_in_page = []` initialization line is missing.
1624 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1625 if mobj.group(1) not in ids_in_page:
1626 ids_in_page.append(mobj.group(1))
1628 video_ids.extend(ids_in_page)
1630 # A little optimization - if current page is not
1631 # "full", ie. does not contain PAGE_SIZE video ids then
1632 # we can assume that this page is the last one - there
1633 # are no more ids on further pages - no need to query
# A short page means we've reached the end of the user's uploads.
1636 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# NOTE(review): the `break` and `pagenum += 1` lines are missing.
1641 all_ids_count = len(video_ids)
# Apply the user's playliststart/playlistend window (1-based start).
1642 playliststart = self._downloader.params.get('playliststart', 1) - 1
1643 playlistend = self._downloader.params.get('playlistend', -1)
1645 if playlistend == -1:
1646 video_ids = video_ids[playliststart:]
# NOTE(review): the `else:` line before this slice is missing.
1648 video_ids = video_ids[playliststart:playlistend]
1650 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1651 (username, all_ids_count, len(video_ids)))
1653 for video_id in video_ids:
1654 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
# NOTE(review): line-number-prefixed listing with missing interior lines;
# comments describe only what is visible.
1657 class BlipTVUserIE(InfoExtractor):
# Resolves a blip.tv user page to its numeric users_id, then pages through the
# mobile episode-list endpoint collecting video paths to dispatch.
1658 """Information Extractor for blip.tv users."""
1660 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
# NOTE(review): the `_PAGE_SIZE` class attribute referenced below is missing
# from the listing.
1662 IE_NAME = u'blip.tv:user'
1664 def __init__(self, downloader=None):
1665 InfoExtractor.__init__(self, downloader)
1667 def report_download_page(self, username, pagenum):
1668 """Report attempt to download user page."""
1669 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1670 (self.IE_NAME, username, pagenum))
1672 def _real_extract(self, url):
# NOTE(review): `# Extract username` comment line is missing here.
1674 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard and `return` are missing here.
1676 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1679 username = mobj.group(1)
# AJAX episode-list endpoint, parameterized by the numeric users_id below.
1681 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1683 request = urllib2.Request(url)
# NOTE(review): `try:` opening this block is missing.
1686 page = urllib2.urlopen(request).read().decode('utf-8')
# The user page embeds its numeric id as data-users-id.
1687 mobj = re.search(r'data-users-id="([^"]+)"', page)
1688 page_base = page_base % mobj.group(1)
1689 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1690 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1694 # Download video ids using BlipTV Ajax calls. Result size per
1695 # query is limited (currently to 12 videos) so we need to query
1696 # page by page until there are no video ids - it means we got
# NOTE(review): the rest of this comment, video_ids/pagenum initialization and
# the loop header are missing from the listing.
1703 self.report_download_page(username, pagenum)
1705 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
# NOTE(review): `try:` opening this block is missing.
1708 page = urllib2.urlopen(request).read().decode('utf-8')
1709 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1710 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1713 # Extract video identifiers
# NOTE(review): `ids_in_page = []` initialization line is missing.
1716 for mobj in re.finditer(r'href="/([^"]+)"', page):
1717 if mobj.group(1) not in ids_in_page:
# HTML-unescape the captured path before storing it.
1718 ids_in_page.append(unescapeHTML(mobj.group(1)))
1720 video_ids.extend(ids_in_page)
1722 # A little optimization - if current page is not
1723 # "full", ie. does not contain PAGE_SIZE video ids then
1724 # we can assume that this page is the last one - there
1725 # are no more ids on further pages - no need to query
# A short page means we've reached the end of the user's videos.
1728 if len(ids_in_page) < self._PAGE_SIZE:
# NOTE(review): the `break` and `pagenum += 1` lines are missing.
1733 all_ids_count = len(video_ids)
# Apply the user's playliststart/playlistend window (1-based start).
1734 playliststart = self._downloader.params.get('playliststart', 1) - 1
1735 playlistend = self._downloader.params.get('playlistend', -1)
1737 if playlistend == -1:
1738 video_ids = video_ids[playliststart:]
# NOTE(review): the `else:` line before this slice is missing.
1740 video_ids = video_ids[playliststart:playlistend]
1742 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1743 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1745 for video_id in video_ids:
1746 self._downloader.download([u'http://blip.tv/'+video_id])
# NOTE(review): line-number-prefixed listing with missing interior lines;
# comments describe only what is visible.
1749 class DepositFilesIE(InfoExtractor):
# Simulates pressing "Free download" on a depositfiles.com file page and
# extracts the real fileshare URL and title.
1750 """Information extractor for depositfiles.com"""
1752 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1753 IE_NAME = u'DepositFiles'
1755 def __init__(self, downloader=None):
1756 InfoExtractor.__init__(self, downloader)
1758 def report_download_webpage(self, file_id):
1759 """Report webpage download."""
1760 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1762 def report_extraction(self, file_id):
1763 """Report information extraction."""
1764 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1766 def _real_extract(self, url):
1767 file_id = url.split('/')[-1]
1768 # Rebuild url in english locale
# Force the /en/ locale so the scraped strings below are predictable.
1769 url = 'http://depositfiles.com/en/files/' + file_id
1771 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 emulates clicking the free-download button.
1772 free_download_indication = { 'gateway_result' : '1' }
1773 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
# NOTE(review): `try:` opening this block is missing.
1775 self.report_download_webpage(file_id)
1776 webpage = urllib2.urlopen(request).read()
1777 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1778 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1781 # Search for the real file URL
1782 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1783 if (mobj is None) or (mobj.group(1) is None):
1784 # Try to figure out reason of the error.
# The site explains restrictions in an <strong>Attention...</strong> block.
1785 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1786 if (mobj is not None) and (mobj.group(1) is not None):
# Collapse whitespace in the multi-line restriction text.
1787 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1788 self._downloader.trouble(u'ERROR: %s' % restriction_message)
# NOTE(review): the `else:` branch before this generic error is missing.
1790 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1793 file_url = mobj.group(1)
1794 file_extension = os.path.splitext(file_url)[1][1:]
1796 # Search for file title
1797 mobj = re.search(r'<b title="(.*?)">', webpage)
# NOTE(review): `if mobj is None:` guard and `return` are missing here.
1799 self._downloader.trouble(u'ERROR: unable to extract title')
1801 file_title = mobj.group(1).decode('utf-8')
# NOTE(review): the `return [{` opening of the result dict (and its
# 'uploader' key) is missing from the listing.
1804 'id': file_id.decode('utf-8'),
1805 'url': file_url.decode('utf-8'),
1807 'upload_date': u'NA',
1808 'title': file_title,
1809 'ext': file_extension.decode('utf-8'),
# NOTE(review): line-number-prefixed listing with missing interior lines;
# comments describe only what is visible.
1815 class FacebookIE(InfoExtractor):
# Logs in with user-supplied or .netrc credentials, downloads a Facebook video
# page, and extracts title/owner/thumbnail/format URLs from inline Javascript.
1816 """Information Extractor for Facebook"""
1818 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1819 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1820 _NETRC_MACHINE = 'facebook'
# Format names, best-first; used for quality selection below.
1821 _available_formats = ['video', 'highqual', 'lowqual']
# NOTE(review): the dict entries mapping formats to extensions are missing
# from the listing.
1822 _video_extensions = {
1827 IE_NAME = u'facebook'
1829 def __init__(self, downloader=None):
1830 InfoExtractor.__init__(self, downloader)
1832 def _reporter(self, message):
1833 """Add header and report message."""
1834 self._downloader.to_screen(u'[facebook] %s' % message)
1836 def report_login(self):
1837 """Report attempt to log in."""
1838 self._reporter(u'Logging in')
1840 def report_video_webpage_download(self, video_id):
1841 """Report attempt to download video webpage."""
1842 self._reporter(u'%s: Downloading video webpage' % video_id)
1844 def report_information_extraction(self, video_id):
1845 """Report attempt to extract video information."""
1846 self._reporter(u'%s: Extracting video information' % video_id)
# Scrape metadata and per-format URLs out of the page's inline Javascript.
1848 def _parse_page(self, video_webpage):
1849 """Extract video information from page"""
# NOTE(review): `video_info = {}` initialization is missing from the listing.
1851 data = {'title': r'\("video_title", "(.*?)"\)',
1852 'description': r'<div class="datawrap">(.*?)</div>',
1853 'owner': r'\("video_owner_name", "(.*?)"\)',
1854 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1857 for piece in data.keys():
1858 mobj = re.search(data[piece], video_webpage)
1859 if mobj is not None:
# Values are JS-escaped Unicode and URL-quoted; undo both.
1860 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# NOTE(review): `video_urls = {}` initialization and a `# video urls` comment
# are missing before this loop.
1864 for fmt in self._available_formats:
1865 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1866 if mobj is not None:
1867 # URL is in a Javascript segment inside an escaped Unicode format within
1868 # the generally utf-8 page
1869 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1870 video_info['video_urls'] = video_urls
# NOTE(review): the `return video_info` is missing from the listing.
# Authenticate with explicit --username/--password or the facebook .netrc entry.
1874 def _real_initialize(self):
1875 if self._downloader is None:
# NOTE(review): the `return` body and useremail/password defaults are missing.
1880 downloader_params = self._downloader.params
1882 # Attempt to use provided username and password or .netrc data
1883 if downloader_params.get('username', None) is not None:
1884 useremail = downloader_params['username']
1885 password = downloader_params['password']
1886 elif downloader_params.get('usenetrc', False):
# NOTE(review): the `try:` opening this .netrc block is missing.
1888 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1889 if info is not None:
# NOTE(review): the assignments from `info` and the `else:` line are missing
# before this raise.
1893 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1894 except (IOError, netrc.NetrcParseError), err:
# Missing/broken .netrc is non-fatal: warn and proceed anonymously.
1895 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1898 if useremail is None:
# NOTE(review): the `return`, the login_form construction and the
# report_login() call are missing from the listing.
1907 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
# NOTE(review): `try:` opening this block is missing.
1910 login_results = urllib2.urlopen(request).read()
# The login form being echoed back means authentication failed.
1911 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1912 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1914 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1915 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1918 def _real_extract(self, url):
1919 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard and `return` are missing here.
1921 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1923 video_id = mobj.group('ID')
# NOTE(review): a `# Get video webpage` comment line is missing here.
1926 self.report_video_webpage_download(video_id)
1927 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
# NOTE(review): `try:` opening this block is missing.
1929 page = urllib2.urlopen(request)
1930 video_webpage = page.read()
1931 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1932 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1935 # Start extracting information
1936 self.report_information_extraction(video_id)
1938 # Extract information
1939 video_info = self._parse_page(video_webpage)
# NOTE(review): a `# uploader` comment line is missing here.
1942 if 'owner' not in video_info:
1943 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1945 video_uploader = video_info['owner']
# NOTE(review): a `# title` comment line is missing here.
1948 if 'title' not in video_info:
1949 self._downloader.trouble(u'ERROR: unable to extract video title')
1951 video_title = video_info['title']
1952 video_title = video_title.decode('utf-8')
# NOTE(review): a `# thumbnail image` comment line is missing here.
1955 if 'thumbnail' not in video_info:
# Missing thumbnail is only a warning; fall back to an empty string.
1956 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1957 video_thumbnail = ''
# NOTE(review): the `else:` line before this assignment is missing.
1959 video_thumbnail = video_info['thumbnail']
# NOTE(review): `# upload date` comment and `upload_date = u'NA'` default are
# missing before this block.
1963 if 'upload_date' in video_info:
1964 upload_time = video_info['upload_date']
# Parse the RFC-2822-style date and reformat as YYYYMMDD.
1965 timetuple = email.utils.parsedate_tz(upload_time)
1966 if timetuple is not None:
# NOTE(review): presumably wrapped in try/except — those lines are missing.
1968 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
# NOTE(review): a `# description` comment line is missing here.
1973 video_description = video_info.get('description', 'No description available.')
# NOTE(review): a `url_map`-related comment line is missing here.
1975 url_map = video_info['video_urls']
1976 if len(url_map.keys()) > 0:
1977 # Decide which formats to download
1978 req_format = self._downloader.params.get('format', None)
1979 format_limit = self._downloader.params.get('format_limit', None)
# Restrict the candidate list to formats at or below the requested limit.
1981 if format_limit is not None and format_limit in self._available_formats:
1982 format_list = self._available_formats[self._available_formats.index(format_limit):]
# NOTE(review): the `else:` line before this default is missing.
1984 format_list = self._available_formats
1985 existing_formats = [x for x in format_list if x in url_map]
1986 if len(existing_formats) == 0:
1987 self._downloader.trouble(u'ERROR: no known formats available for video')
# NOTE(review): the `return` after this error is missing.
1989 if req_format is None:
1990 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1991 elif req_format == 'worst':
1992 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1993 elif req_format == '-1':
1994 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
# NOTE(review): the `else:` line before this specific-format branch is missing.
1997 if req_format not in url_map:
1998 self._downloader.trouble(u'ERROR: requested format not available')
2000 video_url_list = [(req_format, url_map[req_format])] # Specific format
# NOTE(review): `results = []` initialization is presumably missing — confirm.
2003 for format_param, video_real_url in video_url_list:
2005 video_extension = self._video_extensions.get(format_param, 'mp4')
# NOTE(review): the `results.append({`/`return [{` opening of the result dict
# is missing from the listing.
2008 'id': video_id.decode('utf-8'),
2009 'url': video_real_url.decode('utf-8'),
2010 'uploader': video_uploader.decode('utf-8'),
2011 'upload_date': upload_date,
2012 'title': video_title,
2013 'ext': video_extension.decode('utf-8'),
2014 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2015 'thumbnail': video_thumbnail.decode('utf-8'),
2016 'description': video_description.decode('utf-8'),
# Extractor for blip.tv: fetches the JSON API description of a page, unless
# the URL turns out to point straight at a media file (direct download).
# NOTE(review): numbered listing — gaps in the embedded line numbers mean
# guard/try/return lines were elided; code is documented as-is.
2021 class BlipTVIE(InfoExtractor):
2022 """Information extractor for blip.tv"""
2024 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension off the media URL.
2025 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2026 IE_NAME = u'blip.tv'
2028 def report_extraction(self, file_id):
2029 """Report information extraction."""
2030 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2032 def report_direct_download(self, title):
2033 """Report information extraction."""
2034 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2036 def _real_extract(self, url):
2037 mobj = re.match(self._VALID_URL, url)
# Reached via an elided "if mobj is None:" guard.
2039 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar ('?' or '&') is chosen in elided lines depending on the URL shape.
2046 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2047 request = urllib2.Request(json_url.encode('utf-8'))
2048 self.report_extraction(mobj.group(1))
# Inside an elided "try:"; a video/* Content-Type means we were handed the
# media itself, so synthesize the info dict from the filename alone.
2051 urlh = urllib2.urlopen(request)
2052 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2053 basename = url.split('/')[-1]
2054 title,ext = os.path.splitext(basename)
2055 title = title.decode('UTF-8')
2056 ext = ext.replace('.', '')
2057 self.report_direct_download(title)
2065 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2066 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
# Regular (non-direct) path: read and parse the JSON API response.
2068 if info is None: # Regular URL
2070 json_code = urlh.read()
2071 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2072 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2076 json_data = json.loads(json_code)
# The API sometimes nests the payload under a 'Post' key.
2077 if 'Post' in json_data:
2078 data = json_data['Post']
# datestamp example shape: "%m-%d-%y %H:%M%p" -> normalized to YYYYMMDD.
2082 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2083 video_url = data['media']['url']
2084 umobj = re.match(self._URL_EXT, video_url)
2086 raise ValueError('Can not determine filename extension')
2087 ext = umobj.group(1)
2090 'id': data['item_id'],
2092 'uploader': data['display_name'],
2093 'upload_date': upload_date,
2094 'title': data['title'],
2096 'format': data['media']['mimeType'],
2097 'thumbnail': data['thumbnailUrl'],
2098 'description': data['description'],
2099 'player_url': data['embedUrl']
# KeyError/ValueError from the JSON shape are reported, not propagated.
2101 except (ValueError,KeyError), err:
2102 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# NOTE(review): mutates the module-global std_headers dict, affecting every
# later request from any extractor — presumably needed so blip.tv serves the
# media to an iTunes user agent; verify this side effect is intended.
2105 std_headers['User-Agent'] = 'iTunes/10.6.1'
2109 class MyVideoIE(InfoExtractor):
2110 """Information Extractor for myvideo.de."""
2112 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2113 IE_NAME = u'myvideo'
2115 def __init__(self, downloader=None):
2116 InfoExtractor.__init__(self, downloader)
2118 def report_download_webpage(self, video_id):
2119 """Report webpage download."""
2120 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2122 def report_extraction(self, video_id):
2123 """Report information extraction."""
2124 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2126 def _real_extract(self,url):
2127 mobj = re.match(self._VALID_URL, url)
2129 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2132 video_id = mobj.group(1)
2135 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2137 self.report_download_webpage(video_id)
2138 webpage = urllib2.urlopen(request).read()
2139 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2140 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2143 self.report_extraction(video_id)
2144 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2147 self._downloader.trouble(u'ERROR: unable to extract media URL')
2149 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2151 mobj = re.search('<title>([^<]+)</title>', webpage)
2153 self._downloader.trouble(u'ERROR: unable to extract title')
2156 video_title = mobj.group(1)
2162 'upload_date': u'NA',
2163 'title': video_title,
# Extractor for Daily Show / Colbert Report full episodes: resolves shortcut
# URLs, finds the Flash player URI in the page, downloads an MRSS index of
# the episode's parts, then a per-part config XML listing renditions.
# NOTE(review): numbered listing — gaps in the embedded line numbers mean
# guard/try/return/loop-setup lines were elided; code is documented as-is.
2169 class ComedyCentralIE(InfoExtractor):
2170 """Information extractor for The Daily Show and Colbert Report """
# Accepts ":tds"/":colbert"-style shortcuts as well as full episode URLs.
2172 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2173 IE_NAME = u'comedycentral'
2175 def report_extraction(self, episode_id):
2176 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2178 def report_config_download(self, episode_id):
2179 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2181 def report_index_download(self, episode_id):
2182 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2184 def report_player_url(self, episode_id):
2185 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2187 def _real_extract(self, url):
2188 mobj = re.match(self._VALID_URL, url)
# Reached via an elided "if mobj is None:" guard.
2190 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Rewrite shortcut forms to the canonical full-episodes URL and re-match.
2193 if mobj.group('shortname'):
2194 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2195 url = u'http://www.thedailyshow.com/full-episodes/'
2197 url = u'http://www.colbertnation.com/full-episodes/'
2198 mobj = re.match(self._VALID_URL, url)
2199 assert mobj is not None
# No episode slug => the site will redirect to the newest episode.
2201 dlNewest = not mobj.group('episode')
2203 epTitle = mobj.group('showname')
2205 epTitle = mobj.group('episode')
2207 req = urllib2.Request(url)
2208 self.report_extraction(epTitle)
2210 htmlHandle = urllib2.urlopen(req)
2211 html = htmlHandle.read()
2212 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2213 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# Follow the redirect (dlNewest path) and validate the final URL.
2216 url = htmlHandle.geturl()
2217 mobj = re.match(self._VALID_URL, url)
2219 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2221 if mobj.group('episode') == '':
2222 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2224 epTitle = mobj.group('episode')
# The mtvnservices movie param embeds the mgid-style URI we need.
2226 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2227 if len(mMovieParams) == 0:
2228 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
# Resolve the player URL through its redirect chain.
2231 playerUrl_raw = mMovieParams[0][0]
2232 self.report_player_url(epTitle)
2234 urlHandle = urllib2.urlopen(playerUrl_raw)
2235 playerUrl = urlHandle.geturl()
2236 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2237 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Download the MRSS index listing the episode's <item> parts.
2240 uri = mMovieParams[0][1]
2241 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2242 self.report_index_download(epTitle)
2244 indexXml = urllib2.urlopen(indexUrl).read()
2245 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2246 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2251 idoc = xml.etree.ElementTree.fromstring(indexXml)
2252 itemEls = idoc.findall('.//item')
2253 for itemEl in itemEls:
# guid looks like mgid:...:<show>.com:<id>; split out show and media ids.
2254 mediaId = itemEl.findall('./guid')[0].text
2255 shortMediaId = mediaId.split(':')[-1]
2256 showId = mediaId.split(':')[-2].replace('.com', '')
2257 officialTitle = itemEl.findall('./title')[0].text
2258 officialDate = itemEl.findall('./pubDate')[0].text
2260 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2261 urllib.urlencode({'uri': mediaId}))
2262 configReq = urllib2.Request(configUrl)
2263 self.report_config_download(epTitle)
2265 configXml = urllib2.urlopen(configReq).read()
2266 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2267 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2270 cdoc = xml.etree.ElementTree.fromstring(configXml)
# turls collects (bitrate, url) pairs; its initialization and append are
# in elided lines.
2272 for rendition in cdoc.findall('.//rendition'):
2273 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2277 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2280 # For now, just pick the highest bitrate
2281 format,video_url = turls[-1]
2283 effTitle = showId + u'-' + epTitle
2288 'upload_date': officialDate,
2293 'description': officialTitle,
2294 'player_url': playerUrl
2297 results.append(info)
# Extractor for The Escapist: scrapes OpenGraph meta tags for description,
# thumbnail and player URL, then fetches the player's JS config to get the
# actual media URL from its playlist.
# NOTE(review): numbered listing — gaps in the embedded line numbers mean
# guard/try/return lines were elided; code is documented as-is.
2302 class EscapistIE(InfoExtractor):
2303 """Information extractor for The Escapist """
2305 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2306 IE_NAME = u'escapist'
2308 def report_extraction(self, showName):
2309 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2311 def report_config_download(self, showName):
2312 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2314 def _real_extract(self, url):
2315 mobj = re.match(self._VALID_URL, url)
# Reached via an elided "if mobj is None:" guard.
2317 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2319 showName = mobj.group('showname')
2320 videoId = mobj.group('episode')
2322 self.report_extraction(showName)
# Decode the page using the charset advertised in Content-Type, falling
# back to UTF-8 (inside an elided "try:").
2324 webPage = urllib2.urlopen(url)
2325 webPageBytes = webPage.read()
2326 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2327 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2328 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2329 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# OpenGraph/meta scraping; each *Match is used unguarded, so a missing tag
# would raise AttributeError here.
2332 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2333 description = unescapeHTML(descMatch.group(1))
2334 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2335 imgUrl = unescapeHTML(imgMatch.group(1))
2336 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2337 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL carries "config=<urlencoded config url>".
2338 configUrlMatch = re.search('config=(.*)$', playerUrl)
2339 configUrl = urllib2.unquote(configUrlMatch.group(1))
2341 self.report_config_download(showName)
2343 configJSON = urllib2.urlopen(configUrl).read()
2344 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2345 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2348 # Technically, it's JavaScript, not JSON
# Naive quote swap to coerce the JS object into parseable JSON; breaks if
# any value legitimately contains a single quote.
2349 configJSON = configJSON.replace("'", '"')
2352 config = json.loads(configJSON)
2353 except (ValueError,), err:
2354 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# playlist[1] is the actual video entry — presumably playlist[0] is an ad
# or intro; TODO confirm against the site config format.
2357 playlist = config['playlist']
2358 videoUrl = playlist[1]['url']
2363 'uploader': showName,
2364 'upload_date': None,
2368 'thumbnail': imgUrl,
2369 'description': description,
2370 'player_url': playerUrl,
# Extractor for collegehumor.com: the public page embeds an internal video
# id, which keys a moogaloop XML endpoint with the real metadata.
# NOTE(review): numbered listing — gaps in the embedded line numbers mean
# guard/try/return and dict-literal lines were elided; code documented as-is.
2376 class CollegeHumorIE(InfoExtractor):
2377 """Information extractor for collegehumor.com"""
2379 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2380 IE_NAME = u'collegehumor'
2382 def report_webpage(self, video_id):
2383 """Report information extraction."""
2384 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2386 def report_extraction(self, video_id):
2387 """Report information extraction."""
2388 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2390 def _real_extract(self, url):
2391 mobj = re.match(self._VALID_URL, url)
# Reached via an elided "if mobj is None:" guard.
2393 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2395 video_id = mobj.group('videoid')
2397 self.report_webpage(video_id)
2398 request = urllib2.Request(url)
2400 webpage = urllib2.urlopen(request).read()
2401 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2402 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds id="video:<internal id>" which keys the XML endpoint.
2405 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2407 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2409 internal_video_id = m.group('internalvideoid')
# info dict literal starts in an elided line above this entry.
2413 'internal_id': internal_video_id,
2416 self.report_extraction(video_id)
2417 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2419 metaXml = urllib2.urlopen(xmlUrl).read()
2420 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2421 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2424 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# Inside an elided "try:" — any missing element raises IndexError, caught
# by the except at 2434 as an invalid metadata file.
2426 videoNode = mdoc.findall('./video')[0]
2427 info['description'] = videoNode.findall('./description')[0].text
2428 info['title'] = videoNode.findall('./caption')[0].text
2429 info['url'] = videoNode.findall('./file')[0].text
2430 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension taken from the last dot of the media URL.
2431 info['ext'] = info['url'].rpartition('.')[2]
2432 info['format'] = info['ext']
2434 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# Extractor for xvideos.com: pulls flv_url, <title> and the thumbnail URL
# straight out of the watch page with regexes.
# NOTE(review): numbered listing — gaps in the embedded line numbers mean
# guard/try/return and dict-literal lines were elided; code documented as-is.
2440 class XVideosIE(InfoExtractor):
2441 """Information extractor for xvideos.com"""
2443 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2444 IE_NAME = u'xvideos'
2446 def report_webpage(self, video_id):
2447 """Report information extraction."""
2448 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2450 def report_extraction(self, video_id):
2451 """Report information extraction."""
2452 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2454 def _real_extract(self, url):
2455 mobj = re.match(self._VALID_URL, url)
# Reached via an elided "if mobj is None:" guard.
2457 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2459 video_id = mobj.group(1).decode('utf-8')
2461 self.report_webpage(video_id)
2463 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2465 webpage = urllib2.urlopen(request).read()
2466 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2467 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2470 self.report_extraction(video_id)
# Media URL is URL-encoded inside a flv_url= query parameter.
2474 mobj = re.search(r'flv_url=(.+?)&', webpage)
2476 self._downloader.trouble(u'ERROR: unable to extract video url')
2478 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title is the page <title> with the site suffix stripped.
2482 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2484 self._downloader.trouble(u'ERROR: unable to extract video title')
2486 video_title = mobj.group(1).decode('utf-8')
2489 # Extract video thumbnail
2490 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2492 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0) = the whole matched thumbnail URL, not just the filename group.
2494 video_thumbnail = mobj.group(0).decode('utf-8')
2500 'upload_date': None,
2501 'title': video_title,
2504 'thumbnail': video_thumbnail,
2505 'description': None,
# Extractor for soundcloud.com: scrapes uid + stream_token from the track
# page and composes the media URL from them.
# NOTE(review): numbered listing — gaps in the embedded line numbers mean
# guard/try/return and dict-literal lines were elided; code documented as-is.
2512 class SoundcloudIE(InfoExtractor):
2513 """Information extractor for soundcloud.com
2514 To access the media, the uid of the song and a stream token
2515 must be extracted from the page source and the script must make
2516 a request to media.soundcloud.com/crossdomain.xml. Then
2517 the media can be grabbed by requesting from an url composed
2518 of the stream token and uid
2521 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2522 IE_NAME = u'soundcloud'
2524 def __init__(self, downloader=None):
2525 InfoExtractor.__init__(self, downloader)
2527 def report_webpage(self, video_id):
2528 """Report information extraction."""
2529 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2531 def report_extraction(self, video_id):
2532 """Report information extraction."""
2533 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2535 def _real_extract(self, url):
2536 mobj = re.match(self._VALID_URL, url)
# Reached via an elided "if mobj is None:" guard.
2538 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2541 # extract uploader (which is in the url)
2542 uploader = mobj.group(1).decode('utf-8')
2543 # extract simple title (uploader + slug of song title)
2544 slug_title = mobj.group(2).decode('utf-8')
2545 simple_title = uploader + u'-' + slug_title
2547 self.report_webpage('%s/%s' % (uploader, slug_title))
2549 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2551 webpage = urllib2.urlopen(request).read()
2552 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2553 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2556 self.report_extraction('%s/%s' % (uploader, slug_title))
2558 # extract uid and stream token that soundcloud hands out for access
2559 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2561 video_id = mobj.group(1)
2562 stream_token = mobj.group(2)
2564 # extract unsimplified title
2565 mobj = re.search('"title":"(.*?)",', webpage)
2567 title = mobj.group(1).decode('utf-8')
# Fallback when the JSON title is not found in the page.
2569 title = simple_title
2571 # construct media url (with uid/token)
2572 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2573 mediaURL = mediaURL % (video_id, stream_token)
2576 description = u'No description available'
2577 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2579 description = mobj.group(1)
# Upload date scraped from the human-readable "pretty-date" label and
# normalized to YYYYMMDD; parse failures are only logged, not fatal.
2583 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2586 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2587 except Exception, e:
2588 self._downloader.to_stderr(str(e))
2590 # for soundcloud, a request to a cross domain is required for cookies
# NOTE(review): std_headers is passed as the second positional argument of
# urllib2.Request, which is the POST *data* parameter, not headers= — this
# looks like a bug; also no urlopen of this request is visible here
# (possibly elided). Verify against the full source.
2591 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2594 'id': video_id.decode('utf-8'),
2596 'uploader': uploader.decode('utf-8'),
2597 'upload_date': upload_date,
2602 'description': description.decode('utf-8')
# Extractor for infoq.com talks: the page embeds a base64-encoded RTMP path
# in a jsclassref attribute; the real URL is rtmpe://video.infoq.com/cfx/st/
# plus that decoded path.
# NOTE(review): numbered listing — gaps in the embedded line numbers mean
# guard/try/return and dict-literal lines were elided; code documented as-is.
2606 class InfoQIE(InfoExtractor):
2607 """Information extractor for infoq.com"""
2609 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2612 def report_webpage(self, video_id):
2613 """Report information extraction."""
2614 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2616 def report_extraction(self, video_id):
2617 """Report information extraction."""
2618 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2620 def _real_extract(self, url):
2621 mobj = re.match(self._VALID_URL, url)
# Reached via an elided "if mobj is None:" guard.
2623 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2626 self.report_webpage(url)
2628 request = urllib2.Request(url)
2630 webpage = urllib2.urlopen(request).read()
2631 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2632 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2635 self.report_extraction(url)
# jsclassref holds a base64- and URL-encoded RTMP path (Py2 str.decode
# with the 'base64' codec).
2639 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2641 self._downloader.trouble(u'ERROR: unable to extract video url')
2643 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2647 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2649 self._downloader.trouble(u'ERROR: unable to extract video title')
2651 video_title = mobj.group(1).decode('utf-8')
2653 # Extract description
2654 video_description = u'No description available.'
2655 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2656 if mobj is not None:
2657 video_description = mobj.group(1).decode('utf-8')
# The id and extension come from the last path component of the RTMP URL;
# split('.') would raise ValueError if it had more or fewer than one dot.
2659 video_filename = video_url.split('/')[-1]
2660 video_id, extension = video_filename.split('.')
2666 'upload_date': None,
2667 'title': video_title,
2669 'format': extension, # Extension is always(?) mp4, but seems to be flv
2671 'description': video_description,
# Extractor for mixcloud.com: uses the site's JSON API, which lists several
# audio formats, each possibly keyed by bitrate, each with a list of mirror
# URLs; the first URL that answers a HEAD-style open is used.
# NOTE(review): numbered listing — gaps in the embedded line numbers mean
# guard/try/return lines were elided; code is documented as-is.
2677 class MixcloudIE(InfoExtractor):
2678 """Information extractor for www.mixcloud.com"""
2679 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2680 IE_NAME = u'mixcloud'
2682 def __init__(self, downloader=None):
2683 InfoExtractor.__init__(self, downloader)
2685 def report_download_json(self, file_id):
2686 """Report JSON download."""
2687 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2689 def report_extraction(self, file_id):
2690 """Report information extraction."""
2691 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2693 def get_urls(self, jsonData, fmt, bitrate='best'):
2694 """Get urls from 'audio_formats' section in json"""
# Inside an elided "try:"; formats without per-bitrate sub-dicts raise
# TypeError below and fall back to the flat URL list.
2697 bitrate_list = jsonData[fmt]
2698 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2699 bitrate = max(bitrate_list) # select highest
2701 url_list = jsonData[fmt][bitrate]
2702 except TypeError: # we have no bitrate info.
2703 url_list = jsonData[fmt]
# (elided) returns url_list
2706 def check_urls(self, url_list):
2707 """Returns 1st active url from list"""
2708 for url in url_list:
# Probe each mirror; elided lines return the first url that opens and
# continue on network errors.
2710 urllib2.urlopen(url)
2712 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2717 def _print_formats(self, formats):
2718 print 'Available formats:'
2719 for fmt in formats.keys():
2720 for b in formats[fmt]:
# formats[fmt] may be a dict (bitrate -> urls) or a flat list; the
# TypeError fallback below handles the flat-list case.
2722 ext = formats[fmt][b][0]
2723 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2724 except TypeError: # we have no bitrate info
2725 ext = formats[fmt][0]
2726 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2729 def _real_extract(self, url):
2730 mobj = re.match(self._VALID_URL, url)
# Reached via an elided "if mobj is None:" guard.
2732 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2734 # extract uploader & filename from url
2735 uploader = mobj.group(1).decode('utf-8')
2736 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2738 # construct API request
2739 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2740 # retrieve .json file with links to files
2741 request = urllib2.Request(file_url)
2743 self.report_download_json(file_url)
2744 jsonData = urllib2.urlopen(request).read()
2745 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2746 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2750 json_data = json.loads(jsonData)
2751 player_url = json_data['player_swf_url']
2752 formats = dict(json_data['audio_formats'])
2754 req_format = self._downloader.params.get('format', None)
# -F / --list-formats short-circuits extraction after printing.
2757 if self._downloader.params.get('listformats', None):
2758 self._print_formats(formats)
# Default: walk every format and take the first with a live mirror.
2761 if req_format is None or req_format == 'best':
2762 for format_param in formats.keys():
2763 url_list = self.get_urls(formats, format_param)
2765 file_url = self.check_urls(url_list)
2766 if file_url is not None:
2769 if req_format not in formats.keys():
2770 self._downloader.trouble(u'ERROR: format is not available')
2773 url_list = self.get_urls(formats, req_format)
2774 file_url = self.check_urls(url_list)
2775 format_param = req_format
2778 'id': file_id.decode('utf-8'),
2779 'url': file_url.decode('utf-8'),
2780 'uploader': uploader.decode('utf-8'),
2781 'upload_date': u'NA',
2782 'title': json_data['name'],
2783 'ext': file_url.split('.')[-1].decode('utf-8'),
2784 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2785 'thumbnail': json_data['thumbnail_url'],
2786 'description': json_data['description'],
2787 'player_url': player_url.decode('utf-8'),
# Extractor for Stanford Open Classroom. Handles three URL shapes:
# a specific video (course+video), a course page (course only, yields
# references to its videos), and the root page (yields course references);
# reference entries are recursively resolved via self.extract().
# NOTE(review): numbered listing — gaps in the embedded line numbers mean
# guard/try/return and dict-literal lines were elided; code documented as-is.
# NOTE(review): the hostname dots in _VALID_URL are unescaped — harmless in
# practice but '.' matches any character.
2790 class StanfordOpenClassroomIE(InfoExtractor):
2791 """Information extractor for Stanford's Open ClassRoom"""
2793 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2794 IE_NAME = u'stanfordoc'
2796 def report_download_webpage(self, objid):
2797 """Report information extraction."""
2798 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2800 def report_extraction(self, video_id):
2801 """Report information extraction."""
2802 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2804 def _real_extract(self, url):
2805 mobj = re.match(self._VALID_URL, url)
# Reached via an elided "if mobj is None:" guard.
2807 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# --- Case 1: a single video page ---
2810 if mobj.group('course') and mobj.group('video'): # A specific video
2811 course = mobj.group('course')
2812 video = mobj.group('video')
2814 'id': course + '_' + video,
2817 self.report_extraction(info['id'])
2818 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2819 xmlUrl = baseUrl + video + '.xml'
2821 metaXml = urllib2.urlopen(xmlUrl).read()
2822 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2823 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2825 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# Inside an elided "try:" — a missing element raises IndexError, caught
# by the except feeding the trouble() call below.
2827 info['title'] = mdoc.findall('./title')[0].text
2828 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2830 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2832 info['ext'] = info['url'].rpartition('.')[2]
2833 info['format'] = info['ext']
# --- Case 2: a course page; collect VideoPage references ---
2835 elif mobj.group('course'): # A course page
2836 course = mobj.group('course')
2842 self.report_download_webpage(info['id'])
2844 coursepage = urllib2.urlopen(url).read()
2845 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2846 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2849 m = re.search('<h1>([^<]+)</h1>', coursepage)
2851 info['title'] = unescapeHTML(m.group(1))
# Fallback: use the synthesized id as the title.
2853 info['title'] = info['id']
2855 m = re.search('<description>([^<]+)</description>', coursepage)
2857 info['description'] = unescapeHTML(m.group(1))
2859 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2862 'type': 'reference',
2863 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recursively extract every referenced video page.
2867 for entry in info['list']:
2868 assert entry['type'] == 'reference'
2869 results += self.extract(entry['url'])
# --- Case 3: the root page; collect CoursePage references ---
2874 'id': 'Stanford OpenClassroom',
2878 self.report_download_webpage(info['id'])
2879 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2881 rootpage = urllib2.urlopen(rootURL).read()
2882 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2883 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2886 info['title'] = info['id']
2888 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2891 'type': 'reference',
2892 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2897 for entry in info['list']:
2898 assert entry['type'] == 'reference'
2899 results += self.extract(entry['url'])
# Extractor for MTV.com: scrapes mtv_vt/mtv_an/mtvn_uri meta tags plus the
# player's playlist id, then asks the mediaGen endpoint for renditions and
# picks the last (highest-quality) one.
# NOTE(review): numbered listing — gaps in the embedded line numbers mean
# guard/try/return and dict-literal lines were elided; code documented as-is.
2902 class MTVIE(InfoExtractor):
2903 """Information extractor for MTV.com"""
2905 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2908 def report_webpage(self, video_id):
2909 """Report information extraction."""
2910 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2912 def report_extraction(self, video_id):
2913 """Report information extraction."""
2914 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2916 def _real_extract(self, url):
2917 mobj = re.match(self._VALID_URL, url)
# Reached via an elided "if mobj is None:" guard.
2919 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Normalize scheme-less URLs before requesting them.
2921 if not mobj.group('proto'):
2922 url = 'http://' + url
2923 video_id = mobj.group('videoid')
2924 self.report_webpage(video_id)
2926 request = urllib2.Request(url)
2928 webpage = urllib2.urlopen(request).read()
2929 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2930 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# Song name and performer come from mtv_vt / mtv_an meta tags (latin-1).
2933 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2935 self._downloader.trouble(u'ERROR: unable to extract song name')
2937 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2938 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2940 self._downloader.trouble(u'ERROR: unable to extract performer')
2942 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2943 video_title = performer + ' - ' + song_name
2945 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message below is missing the word "extract"
# ("unable to mtvn_uri") — cannot be fixed in a comments-only edit.
2947 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2949 mtvn_uri = mobj.group(1)
2951 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2953 self._downloader.trouble(u'ERROR: unable to extract content id')
2955 content_id = mobj.group(1)
# mediaGen endpoint returns the rendition list for this uri/id pair.
2957 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2958 self.report_extraction(video_id)
2959 request = urllib2.Request(videogen_url)
2961 metadataXml = urllib2.urlopen(request).read()
2962 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2963 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2966 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2967 renditions = mdoc.findall('.//rendition')
2969 # For now, always pick the highest quality.
2970 rendition = renditions[-1]
# e.g. type="video/mp4" -> ext "mp4"; format string is ext-WxH_bitrate.
2973 _,_,ext = rendition.attrib['type'].partition('/')
2974 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2975 video_url = rendition.find('./src').text
2977 self._downloader.trouble('Invalid rendition field.')
2983 'uploader': performer,
2984 'title': video_title,
# Extractor for v.youku.com: fetches the getPlayList JSON, descrambles the
# segment file id with a seeded shuffle of a fixed alphabet, then emits one
# info dict per video segment.
# NOTE(review): numbered listing — gaps in the embedded line numbers mean
# def/guard/try/return lines were elided (e.g. the "def _gen_sid(self):"
# header before 3009); code is documented as-is.
2992 class YoukuIE(InfoExtractor):
2994 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
2997 def __init__(self, downloader=None):
2998 InfoExtractor.__init__(self, downloader)
3000 def report_download_webpage(self, file_id):
3001 """Report webpage download."""
3002 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
3004 def report_extraction(self, file_id):
3005 """Report information extraction."""
3006 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
# _gen_sid (def line elided): session id = epoch-millis + two random ints.
3009 nowTime = int(time.time() * 1000)
3010 random1 = random.randint(1000,1998)
3011 random2 = random.randint(1000,9999)
3013 return "%d%d%d" %(nowTime,random1,random2)
3015 def _get_file_ID_mix_string(self, seed):
# Deterministic pseudo-random shuffle of the alphabet, driven by 'seed'
# from the playlist JSON (linear-congruential style update).
3017 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3019 for i in range(len(source)):
3020 seed = (seed * 211 + 30031 ) % 65536
3021 index = math.floor(seed / 65536 * len(source) )
3022 mixed.append(source[int(index)])
3023 source.remove(source[int(index)])
3024 #return ''.join(mixed)
3027 def _get_file_id(self, fileId, seed):
# Map each '*'-separated numeric token of fileId through the mixed
# alphabet to recover the real file id.
3028 mixed = self._get_file_ID_mix_string(seed)
3029 ids = fileId.split('*')
3033 realId.append(mixed[int(ch)])
3034 return ''.join(realId)
3036 def _real_extract(self, url):
3037 mobj = re.match(self._VALID_URL, url)
# Reached via an elided "if mobj is None:" guard.
3039 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3041 video_id = mobj.group('ID')
3043 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3045 request = urllib2.Request(info_url, None, std_headers)
3047 self.report_download_webpage(video_id)
3048 jsondata = urllib2.urlopen(request).read()
# NOTE(review): "as err" is Python 3 style while the rest of this file
# uses the Python 2 ", err" form — inconsistent, though legal in 2.6+.
3049 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
3050 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3053 self.report_extraction(video_id)
3055 config = json.loads(jsondata)
3057 video_title = config['data'][0]['title']
3058 seed = config['data'][0]['seed']
# Format resolution: 'best' prefers hd2, 'worst' picks the lowest; the
# branch bodies are in elided lines.
3060 format = self._downloader.params.get('format', None)
3061 supported_format = config['data'][0]['streamfileids'].keys()
3063 if format is None or format == 'best':
3064 if 'hd2' in supported_format:
3069 elif format == 'worst':
3077 fileid = config['data'][0]['streamfileids'][format]
3078 seg_number = len(config['data'][0]['segs'][format])
# Collect the per-segment access keys ('k').
3081 for i in xrange(seg_number):
3082 keys.append(config['data'][0]['segs'][format][i]['k'])
3085 #youku only could be viewed from mainland china
3087 self._downloader.trouble(u'ERROR: unable to extract info section')
3091 sid = self._gen_sid()
3092 fileid = self._get_file_id(fileid, seed)
3094 #column 8,9 of fileid represent the segment number
3095 #fileid[7:9] should be changed
3096 for index, key in enumerate(keys):
3098 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3099 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3102 'id': '%s_part%02d' % (video_id, index),
3103 'url': download_url,
3105 'title': video_title,
3109 files_info.append(info)
3114 class XNXXIE(InfoExtractor):
3115 """Information extractor for xnxx.com"""
3117 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3119 VIDEO_URL_RE = r'flv_url=(.*?)&'
3120 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3121 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3123 def report_webpage(self, video_id):
3124 """Report information extraction"""
3125 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3127 def report_extraction(self, video_id):
3128 """Report information extraction"""
3129 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3131 def _real_extract(self, url):
3132 mobj = re.match(self._VALID_URL, url)
3134 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3136 video_id = mobj.group(1).decode('utf-8')
3138 self.report_webpage(video_id)
3140 # Get webpage content
3142 webpage = urllib2.urlopen(url).read()
3143 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3144 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3147 result = re.search(self.VIDEO_URL_RE, webpage)
3149 self._downloader.trouble(u'ERROR: unable to extract video url')
3151 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3153 result = re.search(self.VIDEO_TITLE_RE, webpage)
3155 self._downloader.trouble(u'ERROR: unable to extract video title')
3157 video_title = result.group(1).decode('utf-8')
3159 result = re.search(self.VIDEO_THUMB_RE, webpage)
3161 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3163 video_thumbnail = result.group(1).decode('utf-8')
3165 info = {'id': video_id,
3168 'upload_date': None,
3169 'title': video_title,
3172 'thumbnail': video_thumbnail,
3173 'description': None,