2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
18 from urlparse import parse_qs
21 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Video title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:  Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    # NOTE(review): this copy of the class was missing the initialize()
    # definition line, the _ready guard and the 'pass' bodies; they are
    # restored here so the class is usable again.
    _ready = False          # True once _real_initialize() has been run
    _downloader = None      # FileDownloader instance (may remain None)

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Run the (possibly expensive) real initialization only once.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this copy is incomplete -- the '_VALID_URL = r"""' opener
    # and closing '"""' of the verbose pattern, two of its alternatives, and
    # most entries (plus the closing braces) of the two dictionaries below
    # are missing.  Surviving text is reproduced unchanged; restore it from
    # the full file before executing.
        (?:https?://)? # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
        tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
        (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
        (?: # the various things that can precede the ID:
        (?:(?:v|embed|e)/) # v/ or embed/ or e/
        |(?: # or the v= param in all its forms
        (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
        (?:\?|\#!?) # the params delimiter ? or # or #!
        (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
        )? # optional -> youtube.com/xxxx is OK
        )? # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
        (?(1).+)? # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> file-extension map; most entries and the closing brace are
    # missing in this copy.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display-dimension map; all entries and the closing brace are
    # missing in this copy.
    _video_dimensions = {
157 def suitable(self, url):
158 """Receives a URL and returns True if suitable for this IE."""
159 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
    # --- progress-reporting helpers: each one only prints a status line ---

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')
197 def _closed_captions_xml_to_srt(self, xml_string):
199 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
200 # TODO parse xml instead of regex
201 for n, (start, dur_tag, dur, caption) in enumerate(texts):
202 if not dur: dur = '4'
204 end = start + float(dur)
205 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
206 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
207 caption = unescapeHTML(caption)
208 caption = unescapeHTML(caption) # double cycle, intentional
209 srt += str(n+1) + '\n'
210 srt += start + ' --> ' + end + '\n'
211 srt += caption + '\n\n'
214 def _print_formats(self, formats):
215 print 'Available formats:'
217 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
219 def _real_initialize(self):
220 if self._downloader is None:
225 downloader_params = self._downloader.params
227 # Attempt to use provided username and password or .netrc data
228 if downloader_params.get('username', None) is not None:
229 username = downloader_params['username']
230 password = downloader_params['password']
231 elif downloader_params.get('usenetrc', False):
233 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
238 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
239 except (IOError, netrc.NetrcParseError), err:
240 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
244 request = urllib2.Request(self._LANG_URL)
247 urllib2.urlopen(request).read()
248 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
249 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
252 # No authentication to be performed
258 'current_form': 'loginForm',
260 'action_login': 'Log In',
261 'username': username,
262 'password': password,
264 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
267 login_results = urllib2.urlopen(request).read()
268 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
269 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
271 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
272 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
278 'action_confirm': 'Confirm',
280 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
282 self.report_age_confirmation()
283 age_results = urllib2.urlopen(request).read()
284 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
285 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
    def _real_extract(self, url):
        """Extract the real video URL(s) and metadata for a YouTube page.

        NOTE(review): this copy is missing many lines ('try:', guards,
        'return's, the results-list scaffolding).  Surviving statements
        are reproduced unchanged at their original indentation; the gaps
        are flagged inline -- restore from the full file before executing.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        # NOTE(review): missing 'if mobj is not None:' guard above this line
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): missing 'if mobj is None:' guard and trailing 'return'
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Get video webpage (has_verified=1 skips some age gates)
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        # NOTE(review): missing 'try:' (and a 'return' inside the except)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # NOTE(review): missing 'if mobj is not None:' / 'else: player_url = None'
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' values until one returns a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            # NOTE(review): missing 'try:' and the 'break' once a token is found
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            # NOTE(review): missing 'else:' above and 'return' after trouble()
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date, scraped from the watch page and normalized to YYYYMMDD
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        # NOTE(review): missing "upload_date = u'NA'" default and 'if mobj is not None:' guard
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                # NOTE(review): missing 'try:' / 'except ValueError: pass' around strptime
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions (optional, written only with --write-subtitles)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            # NOTE(review): this whole section originally ran inside a 'try:'
            # matching the 'except Trouble' at the bottom; that line and the
            # inner 'try:' lines are missing from this copy
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    # NOTE(review): missing "srt_lang = 'en'" body and the final 'else:'
                    srt_lang = srt_lang_list.keys()[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                # NOTE(review): missing 'if not srt_xml:' guard above this raise
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                self._downloader.trouble(trouble[0])

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            # NOTE(review): missing 'else:' above this line
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                # NOTE(review): missing 'return' after listing the formats
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # NOTE(review): missing 'else:' introducing the specific-format branch
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    # NOTE(review): missing 'if rf in url_map:' guard and 'break'
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
        # NOTE(review): missing 'else:' before this error (and a 'return')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # NOTE(review): missing 'results = []' initialization
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # NOTE(review): missing 'results.append({' opener, the closing
            # '})' and the final 'return results'
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): several 'try:' / 'if mobj is None:' / 'return' lines of
    # this class are missing from this copy; the surviving statements are
    # reproduced unchanged and the larger gaps are flagged inline.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and POST the over-18
        confirmation so later requests are not filtered."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        # NOTE(review): missing 'try:' (and 'return' in the except clause)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age
        # NOTE(review): missing 'disclaimer_form = {' opener and 'filters' field
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        # NOTE(review): missing 'try:'
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract the media URL and metadata for a metacafe watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): missing 'if mobj is None:' guard and 'return'
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so delegate to YoutubeIE
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            # NOTE(review): missing 'return' after delegating

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        # NOTE(review): missing 'try:'
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # NOTE(review): missing 'if mobj is not None:' guard (direct-URL path)
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            # NOTE(review): missing 'if mobj is None:' branch (plain mediaURL)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # NOTE(review): missing 'else:' opening the flashvars fallback path
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            # NOTE(review): missing 'if mobj is None:' guard and 'return'
                self._downloader.trouble(u'ERROR: unable to extract media URL')

            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            # NOTE(review): missing guard and 'return'
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # NOTE(review): missing guard and 'return'
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        # NOTE(review): missing guard and 'return'
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): missing 'return [{' opener and closing '}]'
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): several 'try:' / guard / 'return' lines are missing from
    # this copy; surviving statements reproduced unchanged, gaps flagged.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best available media URL and metadata for a
        Dailymotion video page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): missing 'if mobj is None:' guard and 'return'
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        # NOTE(review): missing 'try:'
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        # NOTE(review): missing guard and 'return'
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = urllib.unquote(mobj.group(1))

        # Pick the best quality key present in flashvars
        # NOTE(review): the 'if key in flashvars: max_quality = key; break'
        # body and the 'else:' of this loop are missing
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        # NOTE(review): missing guard and 'return'
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # NOTE(review): missing guard and 'return'
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        video_uploader = u'NA'
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        # NOTE(review): missing 'if mobj is None:' / 'else:' structure
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        video_upload_date = u'NA'
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # NOTE(review): missing 'if mobj is not None:' guard
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): missing 'return [{' opener and closing '}]'
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # NOTE(review): several 'try:' / guard / assignment lines are missing
    # from this copy; surviving statements reproduced unchanged.

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title, description and (optionally)
        thumbnail for a Google Video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): missing 'if mobj is None:' guard and 'return'
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        # NOTE(review): missing 'try:'
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # NOTE(review): missing 'if mobj is None:' branch -- the lines below
        # are the Flash fallback that runs when no download_url is present
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
            # NOTE(review): missing guard and 'return'
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = urllib.unquote(mobj.group(1))
            # Undo the JavaScript hex escaping of '=' and '&'
            mediaURL = mediaURL.replace('\\x3d', '\x3d')
            mediaURL = mediaURL.replace('\\x26', '\x26')
        # NOTE(review): missing 'video_url = mediaURL' style assignment(s)

        mobj = re.search(r'<title>(.*)</title>', webpage)
        # NOTE(review): missing guard and 'return'
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        # NOTE(review): missing guard and 'return'
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            # NOTE(review): missing 'try:'
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            # NOTE(review): missing guard and 'return'
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info
            # NOTE(review): missing placeholder assignment (empty thumbnail)

        # NOTE(review): missing 'return [{' opener, 'uploader' entry and '}]'
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): several 'try:' / guard / assignment lines are missing
    # from this copy; surviving statements reproduced unchanged.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the FLV media URL, title and uploader for a Photobucket
        video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): missing 'if mobj is None:' guard and 'return'
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # NOTE(review): missing 'try:'
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        # NOTE(review): missing guard and 'return'
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # NOTE(review): missing 'video_url = mediaURL' assignment

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        # NOTE(review): missing guard and 'return'
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): missing 'return [{' opener and closing '}]'
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): several 'try:' / guard / 'return' lines are missing
    # from this copy; surviving statements reproduced unchanged.

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract media URL and metadata for a Yahoo! Video page; non-/watch/
        URLs are first rewritten to the canonical /watch/ form and recursed
        once (new_video=False)."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): missing 'if mobj is None:' guard and 'return'
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
            # NOTE(review): missing 'try:'
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            # NOTE(review): missing guard and 'return'
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            # NOTE(review): missing guard and 'return'
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # NOTE(review): missing 'try:'
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        # NOTE(review): missing guard and 'return'
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        # NOTE(review): missing guard and 'return'; also note the uploader
        # is taken from group(1) here although group(2) holds the name --
        # confirm against the original file
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        # NOTE(review): missing guard and 'return'
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        # NOTE(review): missing guard and 'return'
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        # NOTE(review): missing guard and 'return'
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        # NOTE(review): missing guard and 'return'
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        # NOTE(review): missing 'try:'
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        # NOTE(review): missing guard and 'return'
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # NOTE(review): missing 'return [{' opener, 'url' entry and '}]'.
        # Also note 'thumbnail' appears TWICE in this dict (the second,
        # un-decoded entry wins) -- likely a copy/paste defect to fix.
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            'thumbnail': video_thumbnail,
1016 class VimeoIE(InfoExtractor):
1017 """Information extractor for vimeo.com."""
1019 # _VALID_URL matches Vimeo URLs
1020 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1023 def __init__(self, downloader=None):
1024 InfoExtractor.__init__(self, downloader)
1026 def report_download_webpage(self, video_id):
1027 """Report webpage download."""
1028 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1030 def report_extraction(self, video_id):
1031 """Report information extraction."""
1032 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1034 def _real_extract(self, url, new_video=True):
1035 # Extract ID from URL
1036 mobj = re.match(self._VALID_URL, url)
1038 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1041 video_id = mobj.group(1)
1043 # Retrieve video webpage to extract further information
1044 request = urllib2.Request(url, None, std_headers)
1046 self.report_download_webpage(video_id)
1047 webpage = urllib2.urlopen(request).read()
1048 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1049 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1052 # Now we begin extracting as much information as we can from what we
1053 # retrieved. First we extract the information common to all extractors,
1054 # and latter we extract those that are Vimeo specific.
1055 self.report_extraction(video_id)
1057 # Extract the config JSON
1058 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1060 config = json.loads(config)
1062 self._downloader.trouble(u'ERROR: unable to extract info section')
1066 video_title = config["video"]["title"]
1069 video_uploader = config["video"]["owner"]["name"]
1071 # Extract video thumbnail
1072 video_thumbnail = config["video"]["thumbnail"]
1074 # Extract video description
1075 video_description = get_element_by_id("description", webpage.decode('utf8'))
1076 if video_description: video_description = clean_html(video_description)
1077 else: video_description = ''
1079 # Extract upload date
1080 video_upload_date = u'NA'
1081 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1082 if mobj is not None:
1083 video_upload_date = mobj.group(1)
1085 # Vimeo specific: extract request signature and timestamp
1086 sig = config['request']['signature']
1087 timestamp = config['request']['timestamp']
1089 # Vimeo specific: extract video codec and quality information
1090 # TODO bind to format param
1091 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1092 for codec in codecs:
1093 if codec[0] in config["video"]["files"]:
1094 video_codec = codec[0]
1095 video_extension = codec[1]
1096 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1097 else: quality = 'sd'
1100 self._downloader.trouble(u'ERROR: no known codec found')
1103 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1104 %(video_id, sig, timestamp, quality, video_codec.upper())
1109 'uploader': video_uploader,
1110 'upload_date': video_upload_date,
1111 'title': video_title,
1112 'ext': video_extension,
1113 'thumbnail': video_thumbnail,
1114 'description': video_description,
1119 class GenericIE(InfoExtractor):
1120 """Generic last-resort information extractor."""
1123 IE_NAME = u'generic'
1125 def __init__(self, downloader=None):
1126 InfoExtractor.__init__(self, downloader)
1128 def report_download_webpage(self, video_id):
1129 """Report webpage download."""
1130 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1131 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1133 def report_extraction(self, video_id):
1134 """Report information extraction."""
1135 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1137 def report_following_redirect(self, new_url):
1138 """Report information extraction."""
1139 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1141 def _test_redirect(self, url):
1142 """Check if it is a redirect, like url shorteners, in case restart chain."""
1143 class HeadRequest(urllib2.Request):
1144 def get_method(self):
1147 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1149 Subclass the HTTPRedirectHandler to make it use our
1150 HeadRequest also on the redirected URL
1152 def redirect_request(self, req, fp, code, msg, headers, newurl):
1153 if code in (301, 302, 303, 307):
1154 newurl = newurl.replace(' ', '%20')
1155 newheaders = dict((k,v) for k,v in req.headers.items()
1156 if k.lower() not in ("content-length", "content-type"))
1157 return HeadRequest(newurl,
1159 origin_req_host=req.get_origin_req_host(),
1162 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1164 class HTTPMethodFallback(urllib2.BaseHandler):
1166 Fallback to GET if HEAD is not allowed (405 HTTP error)
1168 def http_error_405(self, req, fp, code, msg, headers):
1172 newheaders = dict((k,v) for k,v in req.headers.items()
1173 if k.lower() not in ("content-length", "content-type"))
1174 return self.parent.open(urllib2.Request(req.get_full_url(),
1176 origin_req_host=req.get_origin_req_host(),
1180 opener = urllib2.OpenerDirector()
1181 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1182 HTTPMethodFallback, HEADRedirectHandler,
1183 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1184 opener.add_handler(handler())
1186 response = opener.open(HeadRequest(url))
1187 new_url = response.geturl()
1189 if url == new_url: return False
1191 self.report_following_redirect(new_url)
1192 self._downloader.download([new_url])
1195 def _real_extract(self, url):
1196 if self._test_redirect(url): return
1198 video_id = url.split('/')[-1]
1199 request = urllib2.Request(url)
1201 self.report_download_webpage(video_id)
1202 webpage = urllib2.urlopen(request).read()
1203 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1204 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1206 except ValueError, err:
1207 # since this is the last-resort InfoExtractor, if
1208 # this error is thrown, it'll be thrown here
1209 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1212 self.report_extraction(video_id)
1213 # Start with something easy: JW Player in SWFObject
1214 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1216 # Broaden the search a little bit
1217 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1219 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1222 # It's possible that one of the regexes
1223 # matched, but returned an empty group:
1224 if mobj.group(1) is None:
1225 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1228 video_url = urllib.unquote(mobj.group(1))
1229 video_id = os.path.basename(video_url)
1231 # here's a fun little line of code for you:
1232 video_extension = os.path.splitext(video_id)[1][1:]
1233 video_id = os.path.splitext(video_id)[0]
1235 # it's tempting to parse this further, but you would
1236 # have to take into account all the variations like
1237 # Video Title - Site Name
1238 # Site Name | Video Title
1239 # Video Title - Tagline | Site Name
1240 # and so on and so forth; it's just not practical
1241 mobj = re.search(r'<title>(.*)</title>', webpage)
1243 self._downloader.trouble(u'ERROR: unable to extract title')
1245 video_title = mobj.group(1).decode('utf-8')
1247 # video uploader is domain name
1248 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1250 self._downloader.trouble(u'ERROR: unable to extract title')
1252 video_uploader = mobj.group(1).decode('utf-8')
1255 'id': video_id.decode('utf-8'),
1256 'url': video_url.decode('utf-8'),
1257 'uploader': video_uploader,
1258 'upload_date': u'NA',
1259 'title': video_title,
1260 'ext': video_extension.decode('utf-8'),
1266 class YoutubeSearchIE(InfoExtractor):
1267 """Information Extractor for YouTube search queries."""
1268 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1269 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1270 _max_youtube_results = 1000
1271 IE_NAME = u'youtube:search'
1273 def __init__(self, downloader=None):
1274 InfoExtractor.__init__(self, downloader)
1276 def report_download_page(self, query, pagenum):
1277 """Report attempt to download search page with given number."""
1278 query = query.decode(preferredencoding())
1279 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1281 def _real_extract(self, query):
1282 mobj = re.match(self._VALID_URL, query)
1284 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1287 prefix, query = query.split(':')
1289 query = query.encode('utf-8')
1291 self._download_n_results(query, 1)
1293 elif prefix == 'all':
1294 self._download_n_results(query, self._max_youtube_results)
1300 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1302 elif n > self._max_youtube_results:
1303 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1304 n = self._max_youtube_results
1305 self._download_n_results(query, n)
1307 except ValueError: # parsing prefix as integer fails
1308 self._download_n_results(query, 1)
1311 def _download_n_results(self, query, n):
1312 """Downloads a specified number of results for a query"""
1318 while (50 * pagenum) < limit:
1319 self.report_download_page(query, pagenum+1)
1320 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1321 request = urllib2.Request(result_url)
1323 data = urllib2.urlopen(request).read()
1324 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1325 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1327 api_response = json.loads(data)['data']
1329 new_ids = list(video['id'] for video in api_response['items'])
1330 video_ids += new_ids
1332 limit = min(n, api_response['totalItems'])
1335 if len(video_ids) > n:
1336 video_ids = video_ids[:n]
1337 for id in video_ids:
1338 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1342 class GoogleSearchIE(InfoExtractor):
1343 """Information Extractor for Google Video search queries."""
1344 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1345 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1346 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1347 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1348 _max_google_results = 1000
1349 IE_NAME = u'video.google:search'
1351 def __init__(self, downloader=None):
1352 InfoExtractor.__init__(self, downloader)
1354 def report_download_page(self, query, pagenum):
1355 """Report attempt to download playlist page with given number."""
1356 query = query.decode(preferredencoding())
1357 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1359 def _real_extract(self, query):
1360 mobj = re.match(self._VALID_URL, query)
1362 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1365 prefix, query = query.split(':')
1367 query = query.encode('utf-8')
1369 self._download_n_results(query, 1)
1371 elif prefix == 'all':
1372 self._download_n_results(query, self._max_google_results)
1378 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1380 elif n > self._max_google_results:
1381 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1382 n = self._max_google_results
1383 self._download_n_results(query, n)
1385 except ValueError: # parsing prefix as integer fails
1386 self._download_n_results(query, 1)
1389 def _download_n_results(self, query, n):
1390 """Downloads a specified number of results for a query"""
1396 self.report_download_page(query, pagenum)
1397 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1398 request = urllib2.Request(result_url)
1400 page = urllib2.urlopen(request).read()
1401 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1402 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1405 # Extract video identifiers
1406 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1407 video_id = mobj.group(1)
1408 if video_id not in video_ids:
1409 video_ids.append(video_id)
1410 if len(video_ids) == n:
1411 # Specified n videos reached
1412 for id in video_ids:
1413 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1416 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1417 for id in video_ids:
1418 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1421 pagenum = pagenum + 1
1424 class YahooSearchIE(InfoExtractor):
1425 """Information Extractor for Yahoo! Video search queries."""
1426 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1427 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1428 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1429 _MORE_PAGES_INDICATOR = r'\s*Next'
1430 _max_yahoo_results = 1000
1431 IE_NAME = u'video.yahoo:search'
1433 def __init__(self, downloader=None):
1434 InfoExtractor.__init__(self, downloader)
1436 def report_download_page(self, query, pagenum):
1437 """Report attempt to download playlist page with given number."""
1438 query = query.decode(preferredencoding())
1439 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1441 def _real_extract(self, query):
1442 mobj = re.match(self._VALID_URL, query)
1444 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1447 prefix, query = query.split(':')
1449 query = query.encode('utf-8')
1451 self._download_n_results(query, 1)
1453 elif prefix == 'all':
1454 self._download_n_results(query, self._max_yahoo_results)
1460 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1462 elif n > self._max_yahoo_results:
1463 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1464 n = self._max_yahoo_results
1465 self._download_n_results(query, n)
1467 except ValueError: # parsing prefix as integer fails
1468 self._download_n_results(query, 1)
1471 def _download_n_results(self, query, n):
1472 """Downloads a specified number of results for a query"""
1475 already_seen = set()
1479 self.report_download_page(query, pagenum)
1480 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1481 request = urllib2.Request(result_url)
1483 page = urllib2.urlopen(request).read()
1484 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1485 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1488 # Extract video identifiers
1489 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1490 video_id = mobj.group(1)
1491 if video_id not in already_seen:
1492 video_ids.append(video_id)
1493 already_seen.add(video_id)
1494 if len(video_ids) == n:
1495 # Specified n videos reached
1496 for id in video_ids:
1497 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1500 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1501 for id in video_ids:
1502 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1505 pagenum = pagenum + 1
1508 class YoutubePlaylistIE(InfoExtractor):
1509 """Information Extractor for YouTube playlists."""
1511 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1512 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1513 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1514 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1515 IE_NAME = u'youtube:playlist'
1517 def __init__(self, downloader=None):
1518 InfoExtractor.__init__(self, downloader)
1520 def report_download_page(self, playlist_id, pagenum):
1521 """Report attempt to download playlist page with given number."""
1522 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1524 def _real_extract(self, url):
1525 # Extract playlist id
1526 mobj = re.match(self._VALID_URL, url)
1528 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1532 if mobj.group(3) is not None:
1533 self._downloader.download([mobj.group(3)])
1536 # Download playlist pages
1537 # prefix is 'p' as default for playlists but there are other types that need extra care
1538 playlist_prefix = mobj.group(1)
1539 if playlist_prefix == 'a':
1540 playlist_access = 'artist'
1542 playlist_prefix = 'p'
1543 playlist_access = 'view_play_list'
1544 playlist_id = mobj.group(2)
1549 self.report_download_page(playlist_id, pagenum)
1550 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1551 request = urllib2.Request(url)
1553 page = urllib2.urlopen(request).read()
1554 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1555 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1558 # Extract video identifiers
1560 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1561 if mobj.group(1) not in ids_in_page:
1562 ids_in_page.append(mobj.group(1))
1563 video_ids.extend(ids_in_page)
1565 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1567 pagenum = pagenum + 1
1569 playliststart = self._downloader.params.get('playliststart', 1) - 1
1570 playlistend = self._downloader.params.get('playlistend', -1)
1571 if playlistend == -1:
1572 video_ids = video_ids[playliststart:]
1574 video_ids = video_ids[playliststart:playlistend]
1576 for id in video_ids:
1577 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1581 class YoutubeUserIE(InfoExtractor):
1582 """Information Extractor for YouTube users."""
1584 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1585 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1586 _GDATA_PAGE_SIZE = 50
1587 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1588 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1589 IE_NAME = u'youtube:user'
1591 def __init__(self, downloader=None):
1592 InfoExtractor.__init__(self, downloader)
1594 def report_download_page(self, username, start_index):
1595 """Report attempt to download user page."""
1596 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1597 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1599 def _real_extract(self, url):
1601 mobj = re.match(self._VALID_URL, url)
1603 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1606 username = mobj.group(1)
1608 # Download video ids using YouTube Data API. Result size per
1609 # query is limited (currently to 50 videos) so we need to query
1610 # page by page until there are no video ids - it means we got
1617 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1618 self.report_download_page(username, start_index)
1620 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1623 page = urllib2.urlopen(request).read()
1624 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1625 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1628 # Extract video identifiers
1631 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1632 if mobj.group(1) not in ids_in_page:
1633 ids_in_page.append(mobj.group(1))
1635 video_ids.extend(ids_in_page)
1637 # A little optimization - if current page is not
1638 # "full", ie. does not contain PAGE_SIZE video ids then
1639 # we can assume that this page is the last one - there
1640 # are no more ids on further pages - no need to query
1643 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1648 all_ids_count = len(video_ids)
1649 playliststart = self._downloader.params.get('playliststart', 1) - 1
1650 playlistend = self._downloader.params.get('playlistend', -1)
1652 if playlistend == -1:
1653 video_ids = video_ids[playliststart:]
1655 video_ids = video_ids[playliststart:playlistend]
1657 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1658 (username, all_ids_count, len(video_ids)))
1660 for video_id in video_ids:
1661 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1664 class BlipTVUserIE(InfoExtractor):
1665 """Information Extractor for blip.tv users."""
1667 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1669 IE_NAME = u'blip.tv:user'
1671 def __init__(self, downloader=None):
1672 InfoExtractor.__init__(self, downloader)
1674 def report_download_page(self, username, pagenum):
1675 """Report attempt to download user page."""
1676 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1677 (self.IE_NAME, username, pagenum))
1679 def _real_extract(self, url):
1681 mobj = re.match(self._VALID_URL, url)
1683 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1686 username = mobj.group(1)
1688 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1690 request = urllib2.Request(url)
1693 page = urllib2.urlopen(request).read().decode('utf-8')
1694 mobj = re.search(r'data-users-id="([^"]+)"', page)
1695 page_base = page_base % mobj.group(1)
1696 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1697 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1701 # Download video ids using BlipTV Ajax calls. Result size per
1702 # query is limited (currently to 12 videos) so we need to query
1703 # page by page until there are no video ids - it means we got
1710 self.report_download_page(username, pagenum)
1712 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1715 page = urllib2.urlopen(request).read().decode('utf-8')
1716 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1717 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1720 # Extract video identifiers
1723 for mobj in re.finditer(r'href="/([^"]+)"', page):
1724 if mobj.group(1) not in ids_in_page:
1725 ids_in_page.append(unescapeHTML(mobj.group(1)))
1727 video_ids.extend(ids_in_page)
1729 # A little optimization - if current page is not
1730 # "full", ie. does not contain PAGE_SIZE video ids then
1731 # we can assume that this page is the last one - there
1732 # are no more ids on further pages - no need to query
1735 if len(ids_in_page) < self._PAGE_SIZE:
1740 all_ids_count = len(video_ids)
1741 playliststart = self._downloader.params.get('playliststart', 1) - 1
1742 playlistend = self._downloader.params.get('playlistend', -1)
1744 if playlistend == -1:
1745 video_ids = video_ids[playliststart:]
1747 video_ids = video_ids[playliststart:playlistend]
1749 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1750 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1752 for video_id in video_ids:
1753 self._downloader.download([u'http://blip.tv/'+video_id])
1756 class DepositFilesIE(InfoExtractor):
1757 """Information extractor for depositfiles.com"""
1759 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1760 IE_NAME = u'DepositFiles'
1762 def __init__(self, downloader=None):
1763 InfoExtractor.__init__(self, downloader)
1765 def report_download_webpage(self, file_id):
1766 """Report webpage download."""
1767 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1769 def report_extraction(self, file_id):
1770 """Report information extraction."""
1771 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1773 def _real_extract(self, url):
1774 file_id = url.split('/')[-1]
1775 # Rebuild url in english locale
1776 url = 'http://depositfiles.com/en/files/' + file_id
1778 # Retrieve file webpage with 'Free download' button pressed
1779 free_download_indication = { 'gateway_result' : '1' }
1780 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1782 self.report_download_webpage(file_id)
1783 webpage = urllib2.urlopen(request).read()
1784 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1785 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1788 # Search for the real file URL
1789 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1790 if (mobj is None) or (mobj.group(1) is None):
1791 # Try to figure out reason of the error.
1792 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1793 if (mobj is not None) and (mobj.group(1) is not None):
1794 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1795 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1797 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1800 file_url = mobj.group(1)
1801 file_extension = os.path.splitext(file_url)[1][1:]
1803 # Search for file title
1804 mobj = re.search(r'<b title="(.*?)">', webpage)
1806 self._downloader.trouble(u'ERROR: unable to extract title')
1808 file_title = mobj.group(1).decode('utf-8')
1811 'id': file_id.decode('utf-8'),
1812 'url': file_url.decode('utf-8'),
1814 'upload_date': u'NA',
1815 'title': file_title,
1816 'ext': file_extension.decode('utf-8'),
1822 class FacebookIE(InfoExtractor):
1823 """Information Extractor for Facebook"""
1825 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1826 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1827 _NETRC_MACHINE = 'facebook'
1828 _available_formats = ['video', 'highqual', 'lowqual']
1829 _video_extensions = {
1834 IE_NAME = u'facebook'
1836 def __init__(self, downloader=None):
1837 InfoExtractor.__init__(self, downloader)
1839 def _reporter(self, message):
1840 """Add header and report message."""
1841 self._downloader.to_screen(u'[facebook] %s' % message)
1843 def report_login(self):
1844 """Report attempt to log in."""
1845 self._reporter(u'Logging in')
1847 def report_video_webpage_download(self, video_id):
1848 """Report attempt to download video webpage."""
1849 self._reporter(u'%s: Downloading video webpage' % video_id)
1851 def report_information_extraction(self, video_id):
1852 """Report attempt to extract video information."""
1853 self._reporter(u'%s: Extracting video information' % video_id)
1855 def _parse_page(self, video_webpage):
1856 """Extract video information from page"""
1858 data = {'title': r'\("video_title", "(.*?)"\)',
1859 'description': r'<div class="datawrap">(.*?)</div>',
1860 'owner': r'\("video_owner_name", "(.*?)"\)',
1861 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1864 for piece in data.keys():
1865 mobj = re.search(data[piece], video_webpage)
1866 if mobj is not None:
1867 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1871 for fmt in self._available_formats:
1872 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1873 if mobj is not None:
1874 # URL is in a Javascript segment inside an escaped Unicode format within
1875 # the generally utf-8 page
1876 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1877 video_info['video_urls'] = video_urls
1881 def _real_initialize(self):
1882 if self._downloader is None:
1887 downloader_params = self._downloader.params
1889 # Attempt to use provided username and password or .netrc data
1890 if downloader_params.get('username', None) is not None:
1891 useremail = downloader_params['username']
1892 password = downloader_params['password']
1893 elif downloader_params.get('usenetrc', False):
1895 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1896 if info is not None:
1900 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1901 except (IOError, netrc.NetrcParseError), err:
1902 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1905 if useremail is None:
1914 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1917 login_results = urllib2.urlopen(request).read()
1918 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1919 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1921 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1922 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
def _real_extract(self, url):
    """Extract uploader, title, thumbnail and format URLs from a
    Facebook video page and hand one info dict per chosen format
    to the downloader.

    NOTE(review): this excerpt elides several original lines
    (``if mobj is None`` guards, ``return`` statements, ``try:`` lines,
    dict braces); comments annotate only the visible code.
    """
    mobj = re.match(self._VALID_URL, url)
    # Emitted when the URL does not match _VALID_URL (guard elided).
    self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
    video_id = mobj.group('ID')

    # Download the video webpage over HTTPS.
    self.report_video_webpage_download(video_id)
    request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
    page = urllib2.urlopen(request)
    video_webpage = page.read()
    # Network failures abort extraction with a user-visible error.
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

    # Start extracting information
    self.report_information_extraction(video_id)

    # Extract information
    # _parse_page is defined elsewhere in this class; presumably returns a
    # dict with keys like 'owner', 'title', 'video_urls' — TODO confirm.
    video_info = self._parse_page(video_webpage)

    # Uploader nickname is mandatory.
    if 'owner' not in video_info:
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
    video_uploader = video_info['owner']

    # Title is mandatory.
    if 'title' not in video_info:
        self._downloader.trouble(u'ERROR: unable to extract video title')
    video_title = video_info['title']
    video_title = video_title.decode('utf-8')

    # Thumbnail is optional: warn and fall back to the empty string.
    if 'thumbnail' not in video_info:
        self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        video_thumbnail = ''
    video_thumbnail = video_info['thumbnail']

    # Upload date: parse the RFC-2822-style date with email.utils and
    # reformat as YYYYMMDD.
    if 'upload_date' in video_info:
        upload_time = video_info['upload_date']
        timetuple = email.utils.parsedate_tz(upload_time)
        if timetuple is not None:
            upload_date = time.strftime('%Y%m%d', timetuple[0:9])

    video_description = video_info.get('description', 'No description available.')

    url_map = video_info['video_urls']
    if len(url_map.keys()) > 0:
        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)

        # Respect --format-limit by truncating the preference-ordered
        # format list at the limit (else-branch line elided).
        if format_limit is not None and format_limit in self._available_formats:
            format_list = self._available_formats[self._available_formats.index(format_limit):]
        format_list = self._available_formats
        # Keep only formats the page actually offers.
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
        if req_format is None:
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
        elif req_format == '-1':
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        # Fall-through: a specific format was requested by name.
        if req_format not in url_map:
            self._downloader.trouble(u'ERROR: requested format not available')
        video_url_list = [(req_format, url_map[req_format])] # Specific format

    # Build one info dict per selected format (dict/append lines elided).
    for format_param, video_real_url in video_url_list:
        # Extension for this format
        video_extension = self._video_extensions.get(format_param, 'mp4')
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            # Python 2 and/or idiom: u'NA' when no format was chosen.
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Fetches metadata through blip.tv's JSON API (``skin=json``) unless the
    URL already serves the media directly (Content-Type ``video/*``).

    NOTE(review): this excerpt elides several original lines (guards,
    ``try:``/``return`` lines, dict braces); comments annotate visible
    code only.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pulls the filename extension off the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Build the info dict for a blip.tv URL."""
        mobj = re.match(self._VALID_URL, url)
        # Emitted when the URL does not match (guard elided).
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # cchar ('?' or '&') is chosen in elided lines depending on whether
        # the URL already carries a query string.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))
        urlh = urllib2.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            # The URL serves the media itself; derive title/ext from the path.
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if info is None: # Regular URL
            # Read and parse the JSON payload returned by the API.
            json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
            json_data = json.loads(json_code)
            # Some responses wrap the payload in a 'Post' envelope.
            if 'Post' in json_data:
                data = json_data['Post']
            # blip.tv datestamps look like '%m-%d-%y %H:%M%p'; normalize to YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # Raised when the media URL has no recognizable extension
            # (the 'if umobj is None:' guard is elided in this excerpt).
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)
                'id': data['item_id'],
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl']
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
        # blip.tv refuses some default user agents; impersonate iTunes.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
2116 class MyVideoIE(InfoExtractor):
2117 """Information Extractor for myvideo.de."""
2119 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2120 IE_NAME = u'myvideo'
2122 def __init__(self, downloader=None):
2123 InfoExtractor.__init__(self, downloader)
2125 def report_download_webpage(self, video_id):
2126 """Report webpage download."""
2127 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2129 def report_extraction(self, video_id):
2130 """Report information extraction."""
2131 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2133 def _real_extract(self,url):
2134 mobj = re.match(self._VALID_URL, url)
2136 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2139 video_id = mobj.group(1)
2142 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2144 self.report_download_webpage(video_id)
2145 webpage = urllib2.urlopen(request).read()
2146 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2147 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2150 self.report_extraction(video_id)
2151 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2154 self._downloader.trouble(u'ERROR: unable to extract media URL')
2156 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2158 mobj = re.search('<title>([^<]+)</title>', webpage)
2160 self._downloader.trouble(u'ERROR: unable to extract title')
2163 video_title = mobj.group(1)
2169 'upload_date': u'NA',
2170 'title': video_title,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report

    Accepts either a shortname pseudo-URL (":tds", ":colbert", ...) or a
    full-episodes URL, resolves redirects to a concrete episode, then walks
    the show's MRSS index and a per-media config XML to find stream URLs.

    NOTE(review): this excerpt elides several original lines (guards,
    ``try:``/``return`` lines, dict braces); comments annotate visible
    code only.
    """

    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of a media config XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the show's MRSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the SWF player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        """Extract one info dict per media item of the episode."""
        mobj = re.match(self._VALID_URL, url)
        # Emitted when the URL does not match (guard elided).
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Map a shortname to the corresponding full-episodes landing page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            # (else-branch line elided)
            url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No episode slug means "download the newest episode".
        dlNewest = not mobj.group('episode')
        epTitle = mobj.group('showname')
        epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = urllib2.urlopen(req)
        html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

        # Landing pages redirect to the newest episode; re-match the final URL.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        # Find the Flash player URL(s) embedded in the page.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)

        # Resolve redirects on the raw player URL to get the final SWF URL.
        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        urlHandle = urllib2.urlopen(playerUrl_raw)
        playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))

        # Fetch the MRSS index listing every media item in the episode.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # guid looks like a colon-separated URI; last segment is the
            # media id, second-to-last names the show.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Per-media config XML lists the renditions (bitrate/src pairs).
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # turls accumulation lines are elided in this excerpt.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

            # Emitted when no renditions were found (guard elided).
            self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            effTitle = showId + u'-' + epTitle
                'upload_date': officialDate,
                'description': officialTitle,
                'player_url': playerUrl
            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist

    Reads the Open Graph meta tags of the video page, follows the
    ``config=`` parameter of the og:video player URL to a JavaScript-style
    config blob, and takes the media URL from its playlist.

    NOTE(review): this excerpt elides several original lines (guards,
    ``try:``/``return`` lines, dict braces); comments annotate visible
    code only.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the media URL and metadata for an Escapist video page."""
        mobj = re.match(self._VALID_URL, url)
        # Emitted when the URL does not match (guard elided).
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = urllib2.urlopen(url)
        webPageBytes = webPage.read()
        # Decode the page with the charset declared in Content-Type,
        # defaulting to UTF-8 when none is declared.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))

        # Scrape metadata from <meta> tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config location as a query parameter.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = urllib2.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = urllib2.urlopen(configUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,), err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))

        playlist = config['playlist']
        # Index 1 holds the actual media entry — TODO confirm against
        # current config layout.
        videoUrl = playlist[1]['url']
            'uploader': showName,
            'upload_date': None,
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com

    Resolves the page's internal video id, then reads title/description/
    file URL from the moogaloop metadata XML.

    NOTE(review): this excerpt elides several original lines (guards,
    ``try:``/``return`` lines); comments annotate visible code only.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the media URL and metadata for a CollegeHumor video."""
        mobj = re.match(self._VALID_URL, url)
        # Emitted when the URL does not match (guard elided).
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        self.report_webpage(video_id)
        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # The page embeds an internal numeric id distinct from the URL id.
        m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
        self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
        internal_video_id = m.group('internalvideoid')

            'internal_id': internal_video_id,

        # Fetch the moogaloop metadata XML keyed by the internal id.
        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
        metaXml = urllib2.urlopen(xmlUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # IndexError from findall(...)[0] on missing nodes is handled by
        # an elided except clause reporting invalid metadata.
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['url'] = videoNode.findall('./file')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        # Derive the extension from the media URL's final suffix.
        info['ext'] = info['url'].rpartition('.')[2]
        info['format'] = info['ext']
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com

    NOTE(review): this excerpt elides several original lines (guards,
    ``try:``/``return`` lines, dict braces); comments annotate visible
    code only.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the FLV URL, title and thumbnail from an xvideos page."""
        mobj = re.match(self._VALID_URL, url)
        # Emitted when the URL does not match (guard elided).
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        # Canonicalize the page URL from the extracted id before fetching.
        request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(video_id)

        # Video URL is URL-encoded in the page's flv_url parameter.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))

        # Title precedes the " - XVID" suffix in the <title> tag.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        # group(0): the whole matched URL, not just the filename group.
        video_thumbnail = mobj.group(0).decode('utf-8')

            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid

    NOTE(review): this excerpt elides several original lines (guards,
    ``try:``/``return`` lines, dict braces); comments annotate visible
    code only.
    """

    # Group 1: uploader slug, group 2: track slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the stream URL and metadata for a SoundCloud track."""
        mobj = re.match(self._VALID_URL, url)
        # Emitted when the URL does not match (guard elided).
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + u'-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
        video_id = mobj.group(1)
        stream_token = mobj.group(2)

        # extract unsimplified title
        mobj = re.search('"title":"(.*?)",', webpage)
        title = mobj.group(1).decode('utf-8')
        # Fallback used when the title is not found in the page source.
        title = simple_title

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        # Description is optional; default when the page has none.
        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
        description = mobj.group(1)

        # Upload date in the form 'Month DD, YYYY HH:MM' -> YYYYMMDD;
        # parse failures are logged to stderr rather than aborting.
        mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
        upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
        except Exception, e:
            self._downloader.to_stderr(str(e))

        # for soundcloud, a request to a cross domain is required for cookies
        request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

            'id': video_id.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': upload_date,
            'description': description.decode('utf-8')
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com

    The RTMPE media path is stored base64-encoded in the page's
    ``jsclassref`` attribute.

    NOTE(review): this excerpt elides several original lines (guards,
    ``try:``/``return`` lines, dict braces, the IE_NAME assignment);
    comments annotate visible code only.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMPE URL and metadata for an InfoQ presentation."""
        mobj = re.match(self._VALID_URL, url)
        # Emitted when the URL does not match (guard elided).
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        self.report_webpage(url)

        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(url)

        # jsclassref holds the base64-encoded, URL-quoted media path.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))

        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        # Derive id/extension from the final path component of the media URL.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

            'upload_date': None,
            'title': video_title,
            'format': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com

    Uses Mixcloud's JSON API; each format may map to a dict of
    bitrate -> url-list, or directly to a url-list when no bitrate
    info exists.

    NOTE(review): this excerpt elides several original lines (guards,
    ``try:``/``return`` lines, dict braces); comments annotate visible
    code only.
    """

    # Group 1: uploader slug, group 2: cloudcast slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # The try: line is elided in this excerpt; TypeError below is the
        # matching handler for formats without bitrate sub-dicts.
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest
        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        # (return url_list elided)

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # Probe each candidate; the first that opens wins.
            urllib2.urlopen(url)
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                # Dead URL: fall through to the next candidate.

    def _print_formats(self, formats):
        """Print the available format/bitrate/extension table to stdout."""
        print 'Available formats:'
        for fmt in formats.keys():
            for b in formats[fmt]:
                ext = formats[fmt][b][0]
                print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])

    def _real_extract(self, url):
        """Extract the audio URL and metadata for a Mixcloud cloudcast."""
        mobj = re.match(self._VALID_URL, url)
        # Emitted when the URL does not match (guard elided).
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = urllib2.Request(file_url)
        self.report_download_json(file_url)
        jsonData = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        # 'best' (or unset): take the first format whose URL list yields
        # a live URL.
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if format_param is not None:
                    # (break/else structure elided)
        if req_format not in formats.keys():
            self._downloader.trouble(u'ERROR: format is not available')

        url_list = self.get_urls(formats, req_format)
        file_url = self.check_urls(url_list)
        format_param = req_format

            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': json_data['name'],
            # Extension taken from the resolved file URL's suffix.
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom

    Handles three URL shapes: a specific video (course+video), a course
    page (list of video references), and the site root (list of course
    references). List entries are re-dispatched through self.extract().

    NOTE(review): this excerpt elides several original lines (guards,
    ``try:``/``return`` lines, dict braces); comments annotate visible
    code only.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract a video, or a reference list for a course/root page."""
        mobj = re.match(self._VALID_URL, url)
        # Emitted when the URL does not match (guard elided).
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
                'id': course + '_' + video,

            self.report_extraction(info['id'])
            # Each video has a sidecar XML with title and file name.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            # Reported when the XML lacks title/videoFile nodes
            # (except IndexError handler elided).
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
        elif mobj.group('course'): # A course page
            course = mobj.group('course')

            self.report_download_webpage(info['id'])
            coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            info['title'] = unescapeHTML(m.group(1))
            # Fallback when no <h1> title is found (else branch elided).
            info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            info['description'] = unescapeHTML(m.group(1))

            # Build deduplicated references to every VideoPage link.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            # Recursively extract each referenced video.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
        else: # Root page (else line elided in excerpt)
                'id': 'Stanford OpenClassroom',

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            info['title'] = info['id']

            # Build deduplicated references to every CoursePage link.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            # Recursively extract each referenced course.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com

    Reads song/performer/uri/content-id from the page's <meta> tags, then
    fetches the mediaGen XML and picks the last (highest-quality) rendition.

    NOTE(review): this excerpt elides several original lines (guards,
    ``try:``/``return`` lines, dict braces, the IE_NAME assignment);
    comments annotate visible code only.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the media URL and metadata for an MTV.com video."""
        mobj = re.match(self._VALID_URL, url)
        # Emitted when the URL does not match (guard elided).
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # _VALID_URL tolerates a missing scheme; normalize before fetching.
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Page metadata is ISO-8859-1 encoded and HTML-escaped.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract song name')
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        # NOTE(review): message is missing the verb — should read
        # u'ERROR: unable to extract mtvn_uri'.
        self._downloader.trouble(u'ERROR: unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract content id')
        content_id = mobj.group(1)

        # mediaGen XML enumerates the available renditions for this video.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = urllib2.Request(videogen_url)
        metadataXml = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # Build a descriptive format string like 'mp4-640x360_800'.
        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        # Reported when rendition attributes/src are missing
        # (except handler line elided).
        self._downloader.trouble('Invalid rendition field.')

            'uploader': performer,
            'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku obfuscates its file ids: the real id is recovered by indexing a
    seed-shuffled alphabet, and segment URLs carry per-segment keys.

    NOTE(review): this excerpt elides several original lines (guards,
    ``try:``/``return`` lines, format-selection branches, IE_NAME, the
    ``def _gen_sid`` line); comments annotate visible code only.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

        # --- _gen_sid body (its def line is elided in this excerpt) ---
        # Session id: millisecond timestamp plus two random components.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seed-shuffled alphabet used to decode file ids."""
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        # Linear-congruential shuffle: pick-and-remove one char per step.
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        """Decode the obfuscated fileId via the shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        # fileId is '*'-separated indices into the mixed alphabet.
        ids = fileId.split('*')
            realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Extract one info dict per segment of a Youku video."""
        mobj = re.match(self._VALID_URL, url)
        # Emitted when the URL does not match (guard elided).
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # JSON playlist endpoint keyed by the visible video id.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = urllib2.Request(info_url, None, std_headers)
        self.report_download_webpage(video_id)
        jsondata = urllib2.urlopen(request).read()
        # NOTE(review): Python 3 'as err' syntax — inconsistent with the
        # Python 2 'except ..., err' style used everywhere else in this file.
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        self.report_extraction(video_id)
        config = json.loads(jsondata)

        video_title = config['data'][0]['title']
        seed = config['data'][0]['seed']

        # Format selection: prefer hd2 for 'best'; branch bodies elided.
        format = self._downloader.params.get('format', None)
        supported_format = config['data'][0]['streamfileids'].keys()

        if format is None or format == 'best':
            if 'hd2' in supported_format:
        elif format == 'worst':

        fileid = config['data'][0]['streamfileids'][format]
        seg_number = len(config['data'][0]['segs'][format])

        # Collect the per-segment access keys.
        for i in xrange(seg_number):
            keys.append(config['data'][0]['segs'][format][i]['k'])

        #youku only could be viewed from mainland china
        self._downloader.trouble(u'ERROR: unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            # Splice the segment number (hex, 2 digits) into the file id.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'title': video_title,
            files_info.append(info)
3121 class XNXXIE(InfoExtractor):
3122 """Information extractor for xnxx.com"""
3124 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3126 VIDEO_URL_RE = r'flv_url=(.*?)&'
3127 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3128 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3130 def report_webpage(self, video_id):
3131 """Report information extraction"""
3132 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3134 def report_extraction(self, video_id):
3135 """Report information extraction"""
3136 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3138 def _real_extract(self, url):
3139 mobj = re.match(self._VALID_URL, url)
3141 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3143 video_id = mobj.group(1).decode('utf-8')
3145 self.report_webpage(video_id)
3147 # Get webpage content
3149 webpage = urllib2.urlopen(url).read()
3150 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3151 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3154 result = re.search(self.VIDEO_URL_RE, webpage)
3156 self._downloader.trouble(u'ERROR: unable to extract video url')
3158 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3160 result = re.search(self.VIDEO_TITLE_RE, webpage)
3162 self._downloader.trouble(u'ERROR: unable to extract video title')
3164 video_title = result.group(1).decode('utf-8')
3166 result = re.search(self.VIDEO_THUMB_RE, webpage)
3168 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3170 video_thumbnail = result.group(1).decode('utf-8')
3172 info = {'id': video_id,
3175 'upload_date': None,
3176 'title': video_title,
3179 'thumbnail': video_thumbnail,
3180 'description': None,
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    # Group 1 is the numeric user id, group 2 the post id; the scheme is
    # optional so bare 'plus.google.com/...' URLs also match.
    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)
3195 def report_extract_entry(self, url):
3196 """Report downloading extry"""
3197 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3199 def report_date(self, upload_date):
3200 """Report downloading extry"""
3201 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3203 def report_uploader(self, uploader):
3204 """Report downloading extry"""
3205 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3207 def report_title(self, video_title):
3208 """Report downloading extry"""
3209 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3211 def report_extract_vid_page(self, video_page):
3212 """Report information extraction."""
3213 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3215 def _real_extract(self, url):
3216 # Extract id from URL
3217 mobj = re.match(self._VALID_URL, url)
3219 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3222 post_url = mobj.group(0)
3223 video_id = mobj.group(2)
3225 video_extension = 'flv'
3227 # Step 1, Retrieve post webpage to extract further information
3228 self.report_extract_entry(post_url)
3229 request = urllib2.Request(post_url)
3231 webpage = urllib2.urlopen(request).read()
3232 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3233 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % str(err))
3236 # Extract update date
3238 pattern = 'title="Timestamp">(.*?)</a>'
3239 mobj = re.search(pattern, webpage)
3241 upload_date = mobj.group(1)
3242 # Convert timestring to a format suitable for filename
3243 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3244 upload_date = upload_date.strftime('%Y%m%d')
3245 self.report_date(upload_date)
3249 pattern = r'rel\="author".*?>(.*?)</a>'
3250 mobj = re.search(pattern, webpage)
3252 uploader = mobj.group(1)
3253 self.report_uploader(uploader)
3256 # Get the first line for title
3258 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3259 mobj = re.search(pattern, webpage)
3261 video_title = mobj.group(1)
3262 self.report_title(video_title)
3264 # Step 2, Stimulate clicking the image box to launch video
3265 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3266 mobj = re.search(pattern, webpage)
3268 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3270 video_page = mobj.group(1)
3271 request = urllib2.Request(video_page)
3273 webpage = urllib2.urlopen(request).read()
3274 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3275 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3277 self.report_extract_vid_page(video_page)
3280 # Extract video links on video page
3281 """Extract video links of all sizes"""
3282 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3283 mobj = re.findall(pattern, webpage)
3285 self._downloader.trouble(u'ERROR: unable to extract video links')
3287 # Sort in resolution
3288 links = sorted(mobj)
3290 # Choose the lowest of the sort, i.e. highest resolution
3291 video_url = links[-1]
3292 # Only get the url. The resolution part in the tuple has no use anymore
3293 video_url = video_url[-1]
3294 # Treat escaped \u0026 style hex
3295 video_url = unicode(video_url, "unicode_escape")
3299 'id': video_id.decode('utf-8'),
3301 'uploader': uploader.decode('utf-8'),
3302 'upload_date': upload_date.decode('utf-8'),
3303 'title': video_title.decode('utf-8'),
3304 'ext': video_extension.decode('utf-8'),