2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
18 from urlparse import parse_qs
21 import cStringIO as StringIO
28 class InfoExtractor(object):
29 """Information Extractor class.
31 Information extractors are the classes that, given a URL, extract
32 information from the video (or videos) the URL refers to. This
33 information includes the real video URL, the video title and simplified
34 title, author and others. The information is stored in a dictionary
35 which is then passed to the FileDownloader. The FileDownloader
36 processes this information possibly downloading the video to the file
37 system, among other possible outcomes. The dictionaries must include
42 uploader: Nickname of the video uploader.
44 ext: Video filename extension.
46 player_url: SWF Player URL (may be None).
48 The following fields are optional. Their primary purpose is to allow
49 youtube-dl to serve as the backend for a video search function, such
50 as the one in youtube2mp3. They are only used when their respective
51 forced printing functions are called:
53 thumbnail: Full URL to a video thumbnail image.
54 description: One-line video description.
56 Subclasses of this one should re-define the _real_initialize() and
57 _real_extract() methods and define a _VALID_URL regexp.
58 Probably, they should also be added to the list of extractors.
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Delegated so the downloader can also be (re)attached after construction.
        self.set_downloader(downloader)
69 def suitable(self, url):
70 """Receives a URL and returns True if suitable for this IE."""
71 return re.match(self._VALID_URL, url) is not None
74 """Initializes an instance (authentication, etc)."""
76 self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # Template method: subclasses provide the actual logic in _real_extract().
        return self._real_extract(url)
84 def set_downloader(self, downloader):
85 """Sets the downloader for this IE."""
86 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Default implementation: nothing to initialize.
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Default implementation: extracts nothing.
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Accepts watch pages, youtu.be short links and /embed//e//v/ URLs
    (see _VALID_URL); playlist-style URLs are explicitly rejected.
    """
102 (?:https?://)? # http(s):// (optional)
103 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
104 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
105 (?:.*?\#/)? # handle anchor (#/) redirect urls
106 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
107 (?: # the various things that can precede the ID:
108 (?:(?:v|embed|e)/) # v/ or embed/ or e/
109 |(?: # or the v= param in all its forms
110 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
111 (?:\?|\#!?) # the params delimiter ? or # or #!
112 (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
115 )? # optional -> youtube.com/xxxx is OK
116 )? # all until now is optional -> you can pass the naked ID
117 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
118 (?(1).+)? # if we found the ID, everything can follow
    # Requesting this URL forces the site UI into English so the
    # scraping regexps below keep matching.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the original target URL out of an age-gate/redirect URL.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Machine name used to look up credentials in ~/.netrc.
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
128 _video_extensions = {
134 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
140 _video_dimensions = {
158 def suitable(self, url):
159 """Receives a URL and returns True if suitable for this IE."""
160 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
162 def report_lang(self):
163 """Report attempt to set language."""
164 self._downloader.to_screen(u'[youtube] Setting language')
166 def report_login(self):
167 """Report attempt to log in."""
168 self._downloader.to_screen(u'[youtube] Logging in')
170 def report_age_confirmation(self):
171 """Report attempt to confirm age."""
172 self._downloader.to_screen(u'[youtube] Confirming age')
174 def report_video_webpage_download(self, video_id):
175 """Report attempt to download video webpage."""
176 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
178 def report_video_info_webpage_download(self, video_id):
179 """Report attempt to download video info webpage."""
180 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
182 def report_video_subtitles_download(self, video_id):
183 """Report attempt to download video info webpage."""
184 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
186 def report_information_extraction(self, video_id):
187 """Report attempt to extract video information."""
188 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
190 def report_unavailable_format(self, video_id, format):
191 """Report extracted video URL."""
192 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
194 def report_rtmp_download(self):
195 """Indicate the download will use the RTMP protocol."""
196 self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's closed-caption timedtext XML into SRT text.

        xml_string: raw timedtext XML document.
        Builds one numbered SRT cue per <text> element.
        """
        # Scrape <text start=... dur=...>caption</text> entries with a regex.
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'
            end = start + float(dur)
            # Render both timestamps in SRT's HH:MM:SS,mmm layout.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            # Append one cue: index line, time-range line, caption, blank line.
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
    def _print_formats(self, formats):
        """Print each available format code with its extension and dimensions."""
        print 'Available formats:'
        # x is a format code; unknown extensions default to 'flv' and
        # unknown dimensions to '???'.
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
    def _real_initialize(self):
        """Prepare the session: set language, log in, and confirm age."""
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # Fall back to the 'youtube' machine entry in ~/.netrc.
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # .netrc problems are reported but not fatal.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language: fetch the English-UI URL so later regexps match.
        request = urllib2.Request(self._LANG_URL)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            # Language selection is best-effort only.
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        # Log in: POST the form fields expected by the signup page.
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age by POSTing the verification form.
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            # Unlike the steps above, failure here is reported as an ERROR.
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
    def _real_extract(self, url):
        """Extract metadata dict(s) — one per selected format — for a YouTube URL."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Un-escape the backslash-escaped player URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' contexts, some of which work
        # for videos that the others reject.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                # A 'token' field means this response is usable.
                if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the watch page, normalized to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            # Try each known date layout until one parses.
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions (optional, best-effort)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                # Map language code -> track name for every caption track.
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                # Language choice: user setting, then English, then whatever exists.
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = srt_lang_list.keys()[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # Subtitle problems are reported but never abort the extraction.
                self._downloader.trouble(trouble[0])

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            # itag -> signed download URL.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per (format, URL) pair.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # group(1) is the video id, group(2) the URL slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter disclaimer page fetched during initialization.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # Endpoint POSTed to in order to get past the family filter.
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
496 def __init__(self, downloader=None):
497 InfoExtractor.__init__(self, downloader)
499 def report_disclaimer(self):
500 """Report disclaimer retrieval."""
501 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
503 def report_age_confirmation(self):
504 """Report attempt to confirm age."""
505 self._downloader.to_screen(u'[metacafe] Confirming age')
507 def report_download_webpage(self, video_id):
508 """Report webpage download."""
509 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
511 def report_extraction(self, video_id):
512 """Report information extraction."""
513 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
    def _real_initialize(self):
        """Fetch the disclaimer page and POST past the family filter."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age by submitting the filter form.
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
    def _real_extract(self, url):
        """Extract the metadata dict for a Metacafe video page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate YouTube-hosted videos back to the downloader
            # (the YouTube extractor will pick the URL up).
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback path: pull the media URL out of the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            # The URL is JSON-escaped: un-escape the forward slashes.
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Case-insensitive; matches /video/ pages on any dailymotion TLD.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
623 def __init__(self, downloader=None):
624 InfoExtractor.__init__(self, downloader)
626 def report_download_webpage(self, video_id):
627 """Report webpage download."""
628 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
630 def report_extraction(self, video_id):
631 """Report information extraction."""
632 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the metadata dict for a Dailymotion video page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Drop the trailing slug/query, keeping only the bare video id.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # The cookie disables the family filter so adult videos resolve too.
        request.add_header('Cookie', 'family_filter=off')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = urllib.unquote(mobj.group(1))

        # Probe quality keys from best to worst and keep the first present.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        # JSON-escaped URL: un-escape the forward slashes.
        video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        video_uploader = u'NA'
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            video_uploader = mobj.group(1)

        video_upload_date = u'NA'
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Reassemble the scraped DD-MM-YYYY date as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # Accepts the national video.google TLD variants; group(1) is the docid.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'
717 def __init__(self, downloader=None):
718 InfoExtractor.__init__(self, downloader)
720 def report_download_webpage(self, video_id):
721 """Report webpage download."""
722 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
724 def report_extraction(self, video_id):
725 """Report information extraction."""
726 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the metadata dict for a Google Video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
            video_extension = 'flv'
            # Fall back to the hex-escaped videoUrl parameter (FLV stream).
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = urllib.unquote(mobj.group(1))
            # Un-escape the \x3d ('=') and \x26 ('&') sequences.
            mediaURL = mediaURL.replace('\\x3d', '\x3d')
            mediaURL = mediaURL.replace('\\x26', '\x26')

        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # The thumbnail only appears on the search results page,
            # so issue a site-restricted search for this docid.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Only '.flv' media referenced by a 'current=' parameter are supported;
    # group(1) is the media file name.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'
812 def __init__(self, downloader=None):
813 InfoExtractor.__init__(self, downloader)
815 def report_download_webpage(self, video_id):
816 """Report webpage download."""
817 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
819 def report_extraction(self, video_id):
820 """Report information extraction."""
821 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the metadata dict for a Photobucket FLV page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        # <title> carries both the video title (group 1) and uploader (group 2).
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'
882 def __init__(self, downloader=None):
883 InfoExtractor.__init__(self, downloader)
885 def report_download_webpage(self, video_id):
886 """Report webpage download."""
887 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
889 def report_extraction(self, video_id):
890 """Report information extraction."""
891 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
    def _real_extract(self, url, new_video=True):
        """Extract the metadata dict for a Yahoo! Video URL.

        new_video: False on the single recursive call made after rewriting
        a non-/watch/ URL into its canonical /watch/ form.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            # Recurse exactly once on the canonical /watch/ URL.
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): the regex above has two groups — group(1) captures the
        # literal 'people'/'profile' path segment and group(2) the uploader
        # name; using group(1) here looks like a bug. Verify before changing.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            # NOTE(review): video_thumbnail was already decoded above, so this
            # .decode('utf-8') on a unicode value looks suspect — verify.
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            # NOTE(review): duplicate 'thumbnail' key — this entry silently
            # overrides the decoded one above. Likely a defect; verify.
            'thumbnail': video_thumbnail,
1017 class VimeoIE(InfoExtractor):
1018 """Information extractor for vimeo.com."""
1020 # _VALID_URL matches Vimeo URLs
1021 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1024 def __init__(self, downloader=None):
1025 InfoExtractor.__init__(self, downloader)
1027 def report_download_webpage(self, video_id):
1028 """Report webpage download."""
1029 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1031 def report_extraction(self, video_id):
1032 """Report information extraction."""
1033 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1035 def _real_extract(self, url, new_video=True):
1036 # Extract ID from URL
1037 mobj = re.match(self._VALID_URL, url)
1039 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1042 video_id = mobj.group(1)
1044 # Retrieve video webpage to extract further information
1045 request = urllib2.Request(url, None, std_headers)
1047 self.report_download_webpage(video_id)
1048 webpage = urllib2.urlopen(request).read()
1049 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1050 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1053 # Now we begin extracting as much information as we can from what we
1054 # retrieved. First we extract the information common to all extractors,
1055 # and latter we extract those that are Vimeo specific.
1056 self.report_extraction(video_id)
1058 # Extract the config JSON
1059 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1061 config = json.loads(config)
1063 self._downloader.trouble(u'ERROR: unable to extract info section')
1067 video_title = config["video"]["title"]
1070 video_uploader = config["video"]["owner"]["name"]
1072 # Extract video thumbnail
1073 video_thumbnail = config["video"]["thumbnail"]
1075 # Extract video description
1076 video_description = get_element_by_id("description", webpage.decode('utf8'))
1077 if video_description: video_description = clean_html(video_description)
1078 else: video_description = ''
1080 # Extract upload date
1081 video_upload_date = u'NA'
1082 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1083 if mobj is not None:
1084 video_upload_date = mobj.group(1)
1086 # Vimeo specific: extract request signature and timestamp
1087 sig = config['request']['signature']
1088 timestamp = config['request']['timestamp']
1090 # Vimeo specific: extract video codec and quality information
1091 # TODO bind to format param
1092 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1093 for codec in codecs:
1094 if codec[0] in config["video"]["files"]:
1095 video_codec = codec[0]
1096 video_extension = codec[1]
1097 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1098 else: quality = 'sd'
1101 self._downloader.trouble(u'ERROR: no known codec found')
1104 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1105 %(video_id, sig, timestamp, quality, video_codec.upper())
1110 'uploader': video_uploader,
1111 'upload_date': video_upload_date,
1112 'title': video_title,
1113 'ext': video_extension,
1114 'thumbnail': video_thumbnail,
1115 'description': video_description,
1120 class GenericIE(InfoExtractor):
1121 """Generic last-resort information extractor."""
1124 IE_NAME = u'generic'
1126 def __init__(self, downloader=None):
1127 InfoExtractor.__init__(self, downloader)
1129 def report_download_webpage(self, video_id):
1130 """Report webpage download."""
1131 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1132 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1134 def report_extraction(self, video_id):
1135 """Report information extraction."""
1136 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1138 def report_following_redirect(self, new_url):
1139 """Report information extraction."""
1140 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1142 def _test_redirect(self, url):
1143 """Check if it is a redirect, like url shorteners, in case restart chain."""
1144 class HeadRequest(urllib2.Request):
1145 def get_method(self):
1148 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1150 Subclass the HTTPRedirectHandler to make it use our
1151 HeadRequest also on the redirected URL
1153 def redirect_request(self, req, fp, code, msg, headers, newurl):
1154 if code in (301, 302, 303, 307):
1155 newurl = newurl.replace(' ', '%20')
1156 newheaders = dict((k,v) for k,v in req.headers.items()
1157 if k.lower() not in ("content-length", "content-type"))
1158 return HeadRequest(newurl,
1160 origin_req_host=req.get_origin_req_host(),
1163 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1165 class HTTPMethodFallback(urllib2.BaseHandler):
1167 Fallback to GET if HEAD is not allowed (405 HTTP error)
1169 def http_error_405(self, req, fp, code, msg, headers):
1173 newheaders = dict((k,v) for k,v in req.headers.items()
1174 if k.lower() not in ("content-length", "content-type"))
1175 return self.parent.open(urllib2.Request(req.get_full_url(),
1177 origin_req_host=req.get_origin_req_host(),
1181 opener = urllib2.OpenerDirector()
1182 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1183 HTTPMethodFallback, HEADRedirectHandler,
1184 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1185 opener.add_handler(handler())
1187 response = opener.open(HeadRequest(url))
1188 new_url = response.geturl()
1190 if url == new_url: return False
1192 self.report_following_redirect(new_url)
1193 self._downloader.download([new_url])
1196 def _real_extract(self, url):
1197 if self._test_redirect(url): return
1199 video_id = url.split('/')[-1]
1200 request = urllib2.Request(url)
1202 self.report_download_webpage(video_id)
1203 webpage = urllib2.urlopen(request).read()
1204 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1205 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1207 except ValueError, err:
1208 # since this is the last-resort InfoExtractor, if
1209 # this error is thrown, it'll be thrown here
1210 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1213 self.report_extraction(video_id)
1214 # Start with something easy: JW Player in SWFObject
1215 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1217 # Broaden the search a little bit
1218 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1220 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1223 # It's possible that one of the regexes
1224 # matched, but returned an empty group:
1225 if mobj.group(1) is None:
1226 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1229 video_url = urllib.unquote(mobj.group(1))
1230 video_id = os.path.basename(video_url)
1232 # here's a fun little line of code for you:
1233 video_extension = os.path.splitext(video_id)[1][1:]
1234 video_id = os.path.splitext(video_id)[0]
1236 # it's tempting to parse this further, but you would
1237 # have to take into account all the variations like
1238 # Video Title - Site Name
1239 # Site Name | Video Title
1240 # Video Title - Tagline | Site Name
1241 # and so on and so forth; it's just not practical
1242 mobj = re.search(r'<title>(.*)</title>', webpage)
1244 self._downloader.trouble(u'ERROR: unable to extract title')
1246 video_title = mobj.group(1).decode('utf-8')
1248 # video uploader is domain name
1249 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1251 self._downloader.trouble(u'ERROR: unable to extract title')
1253 video_uploader = mobj.group(1).decode('utf-8')
1256 'id': video_id.decode('utf-8'),
1257 'url': video_url.decode('utf-8'),
1258 'uploader': video_uploader,
1259 'upload_date': u'NA',
1260 'title': video_title,
1261 'ext': video_extension.decode('utf-8'),
1267 class YoutubeSearchIE(InfoExtractor):
1268 """Information Extractor for YouTube search queries."""
1269 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1270 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1271 _max_youtube_results = 1000
1272 IE_NAME = u'youtube:search'
1274 def __init__(self, downloader=None):
1275 InfoExtractor.__init__(self, downloader)
1277 def report_download_page(self, query, pagenum):
1278 """Report attempt to download search page with given number."""
1279 query = query.decode(preferredencoding())
1280 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1282 def _real_extract(self, query):
1283 mobj = re.match(self._VALID_URL, query)
1285 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1288 prefix, query = query.split(':')
1290 query = query.encode('utf-8')
1292 self._download_n_results(query, 1)
1294 elif prefix == 'all':
1295 self._download_n_results(query, self._max_youtube_results)
1301 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1303 elif n > self._max_youtube_results:
1304 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1305 n = self._max_youtube_results
1306 self._download_n_results(query, n)
1308 except ValueError: # parsing prefix as integer fails
1309 self._download_n_results(query, 1)
1312 def _download_n_results(self, query, n):
1313 """Downloads a specified number of results for a query"""
1319 while (50 * pagenum) < limit:
1320 self.report_download_page(query, pagenum+1)
1321 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1322 request = urllib2.Request(result_url)
1324 data = urllib2.urlopen(request).read()
1325 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1326 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1328 api_response = json.loads(data)['data']
1330 new_ids = list(video['id'] for video in api_response['items'])
1331 video_ids += new_ids
1333 limit = min(n, api_response['totalItems'])
1336 if len(video_ids) > n:
1337 video_ids = video_ids[:n]
1338 for id in video_ids:
1339 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1343 class GoogleSearchIE(InfoExtractor):
1344 """Information Extractor for Google Video search queries."""
1345 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1346 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1347 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1348 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1349 _max_google_results = 1000
1350 IE_NAME = u'video.google:search'
1352 def __init__(self, downloader=None):
1353 InfoExtractor.__init__(self, downloader)
1355 def report_download_page(self, query, pagenum):
1356 """Report attempt to download playlist page with given number."""
1357 query = query.decode(preferredencoding())
1358 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1360 def _real_extract(self, query):
1361 mobj = re.match(self._VALID_URL, query)
1363 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1366 prefix, query = query.split(':')
1368 query = query.encode('utf-8')
1370 self._download_n_results(query, 1)
1372 elif prefix == 'all':
1373 self._download_n_results(query, self._max_google_results)
1379 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1381 elif n > self._max_google_results:
1382 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1383 n = self._max_google_results
1384 self._download_n_results(query, n)
1386 except ValueError: # parsing prefix as integer fails
1387 self._download_n_results(query, 1)
1390 def _download_n_results(self, query, n):
1391 """Downloads a specified number of results for a query"""
1397 self.report_download_page(query, pagenum)
1398 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1399 request = urllib2.Request(result_url)
1401 page = urllib2.urlopen(request).read()
1402 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1403 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1406 # Extract video identifiers
1407 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1408 video_id = mobj.group(1)
1409 if video_id not in video_ids:
1410 video_ids.append(video_id)
1411 if len(video_ids) == n:
1412 # Specified n videos reached
1413 for id in video_ids:
1414 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1417 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1418 for id in video_ids:
1419 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1422 pagenum = pagenum + 1
1425 class YahooSearchIE(InfoExtractor):
1426 """Information Extractor for Yahoo! Video search queries."""
1427 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1428 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1429 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1430 _MORE_PAGES_INDICATOR = r'\s*Next'
1431 _max_yahoo_results = 1000
1432 IE_NAME = u'video.yahoo:search'
1434 def __init__(self, downloader=None):
1435 InfoExtractor.__init__(self, downloader)
1437 def report_download_page(self, query, pagenum):
1438 """Report attempt to download playlist page with given number."""
1439 query = query.decode(preferredencoding())
1440 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1442 def _real_extract(self, query):
1443 mobj = re.match(self._VALID_URL, query)
1445 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1448 prefix, query = query.split(':')
1450 query = query.encode('utf-8')
1452 self._download_n_results(query, 1)
1454 elif prefix == 'all':
1455 self._download_n_results(query, self._max_yahoo_results)
1461 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1463 elif n > self._max_yahoo_results:
1464 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1465 n = self._max_yahoo_results
1466 self._download_n_results(query, n)
1468 except ValueError: # parsing prefix as integer fails
1469 self._download_n_results(query, 1)
1472 def _download_n_results(self, query, n):
1473 """Downloads a specified number of results for a query"""
1476 already_seen = set()
1480 self.report_download_page(query, pagenum)
1481 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1482 request = urllib2.Request(result_url)
1484 page = urllib2.urlopen(request).read()
1485 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1486 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1489 # Extract video identifiers
1490 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1491 video_id = mobj.group(1)
1492 if video_id not in already_seen:
1493 video_ids.append(video_id)
1494 already_seen.add(video_id)
1495 if len(video_ids) == n:
1496 # Specified n videos reached
1497 for id in video_ids:
1498 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1501 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1502 for id in video_ids:
1503 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1506 pagenum = pagenum + 1
1509 class YoutubePlaylistIE(InfoExtractor):
1510 """Information Extractor for YouTube playlists."""
1512 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1513 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1514 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1515 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1516 IE_NAME = u'youtube:playlist'
1518 def __init__(self, downloader=None):
1519 InfoExtractor.__init__(self, downloader)
1521 def report_download_page(self, playlist_id, pagenum):
1522 """Report attempt to download playlist page with given number."""
1523 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1525 def _real_extract(self, url):
1526 # Extract playlist id
1527 mobj = re.match(self._VALID_URL, url)
1529 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1533 if mobj.group(3) is not None:
1534 self._downloader.download([mobj.group(3)])
1537 # Download playlist pages
1538 # prefix is 'p' as default for playlists but there are other types that need extra care
1539 playlist_prefix = mobj.group(1)
1540 if playlist_prefix == 'a':
1541 playlist_access = 'artist'
1543 playlist_prefix = 'p'
1544 playlist_access = 'view_play_list'
1545 playlist_id = mobj.group(2)
1550 self.report_download_page(playlist_id, pagenum)
1551 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1552 request = urllib2.Request(url)
1554 page = urllib2.urlopen(request).read()
1555 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1556 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1559 # Extract video identifiers
1561 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1562 if mobj.group(1) not in ids_in_page:
1563 ids_in_page.append(mobj.group(1))
1564 video_ids.extend(ids_in_page)
1566 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1568 pagenum = pagenum + 1
1570 playliststart = self._downloader.params.get('playliststart', 1) - 1
1571 playlistend = self._downloader.params.get('playlistend', -1)
1572 if playlistend == -1:
1573 video_ids = video_ids[playliststart:]
1575 video_ids = video_ids[playliststart:playlistend]
1577 for id in video_ids:
1578 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1582 class YoutubeChannelIE(InfoExtractor):
1583 """Information Extractor for YouTube channels."""
1585 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1586 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1587 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1588 IE_NAME = u'youtube:channel'
1590 def report_download_page(self, channel_id, pagenum):
1591 """Report attempt to download channel page with given number."""
1592 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1594 def _real_extract(self, url):
1595 # Extract channel id
1596 mobj = re.match(self._VALID_URL, url)
1598 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1601 # Download channel pages
1602 channel_id = mobj.group(1)
1607 self.report_download_page(channel_id, pagenum)
1608 url = self._TEMPLATE_URL % (channel_id, pagenum)
1609 request = urllib2.Request(url)
1611 page = urllib2.urlopen(request).read()
1612 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1613 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1616 # Extract video identifiers
1618 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1619 if mobj.group(1) not in ids_in_page:
1620 ids_in_page.append(mobj.group(1))
1621 video_ids.extend(ids_in_page)
1623 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1625 pagenum = pagenum + 1
1627 for id in video_ids:
1628 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1632 class YoutubeUserIE(InfoExtractor):
1633 """Information Extractor for YouTube users."""
1635 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1636 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1637 _GDATA_PAGE_SIZE = 50
1638 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1639 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1640 IE_NAME = u'youtube:user'
1642 def __init__(self, downloader=None):
1643 InfoExtractor.__init__(self, downloader)
1645 def report_download_page(self, username, start_index):
1646 """Report attempt to download user page."""
1647 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1648 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1650 def _real_extract(self, url):
1652 mobj = re.match(self._VALID_URL, url)
1654 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1657 username = mobj.group(1)
1659 # Download video ids using YouTube Data API. Result size per
1660 # query is limited (currently to 50 videos) so we need to query
1661 # page by page until there are no video ids - it means we got
1668 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1669 self.report_download_page(username, start_index)
1671 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1674 page = urllib2.urlopen(request).read()
1675 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1676 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1679 # Extract video identifiers
1682 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1683 if mobj.group(1) not in ids_in_page:
1684 ids_in_page.append(mobj.group(1))
1686 video_ids.extend(ids_in_page)
1688 # A little optimization - if current page is not
1689 # "full", ie. does not contain PAGE_SIZE video ids then
1690 # we can assume that this page is the last one - there
1691 # are no more ids on further pages - no need to query
1694 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1699 all_ids_count = len(video_ids)
1700 playliststart = self._downloader.params.get('playliststart', 1) - 1
1701 playlistend = self._downloader.params.get('playlistend', -1)
1703 if playlistend == -1:
1704 video_ids = video_ids[playliststart:]
1706 video_ids = video_ids[playliststart:playlistend]
1708 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1709 (username, all_ids_count, len(video_ids)))
1711 for video_id in video_ids:
1712 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1715 class BlipTVUserIE(InfoExtractor):
1716 """Information Extractor for blip.tv users."""
1718 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1720 IE_NAME = u'blip.tv:user'
1722 def __init__(self, downloader=None):
1723 InfoExtractor.__init__(self, downloader)
1725 def report_download_page(self, username, pagenum):
1726 """Report attempt to download user page."""
1727 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1728 (self.IE_NAME, username, pagenum))
1730 def _real_extract(self, url):
1732 mobj = re.match(self._VALID_URL, url)
1734 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1737 username = mobj.group(1)
1739 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1741 request = urllib2.Request(url)
1744 page = urllib2.urlopen(request).read().decode('utf-8')
1745 mobj = re.search(r'data-users-id="([^"]+)"', page)
1746 page_base = page_base % mobj.group(1)
1747 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1748 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1752 # Download video ids using BlipTV Ajax calls. Result size per
1753 # query is limited (currently to 12 videos) so we need to query
1754 # page by page until there are no video ids - it means we got
1761 self.report_download_page(username, pagenum)
1763 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1766 page = urllib2.urlopen(request).read().decode('utf-8')
1767 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1768 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1771 # Extract video identifiers
1774 for mobj in re.finditer(r'href="/([^"]+)"', page):
1775 if mobj.group(1) not in ids_in_page:
1776 ids_in_page.append(unescapeHTML(mobj.group(1)))
1778 video_ids.extend(ids_in_page)
1780 # A little optimization - if current page is not
1781 # "full", ie. does not contain PAGE_SIZE video ids then
1782 # we can assume that this page is the last one - there
1783 # are no more ids on further pages - no need to query
1786 if len(ids_in_page) < self._PAGE_SIZE:
1791 all_ids_count = len(video_ids)
1792 playliststart = self._downloader.params.get('playliststart', 1) - 1
1793 playlistend = self._downloader.params.get('playlistend', -1)
1795 if playlistend == -1:
1796 video_ids = video_ids[playliststart:]
1798 video_ids = video_ids[playliststart:playlistend]
1800 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1801 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1803 for video_id in video_ids:
1804 self._downloader.download([u'http://blip.tv/'+video_id])
1807 class DepositFilesIE(InfoExtractor):
1808 """Information extractor for depositfiles.com"""
1810 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1811 IE_NAME = u'DepositFiles'
1813 def __init__(self, downloader=None):
1814 InfoExtractor.__init__(self, downloader)
1816 def report_download_webpage(self, file_id):
1817 """Report webpage download."""
1818 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1820 def report_extraction(self, file_id):
1821 """Report information extraction."""
1822 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1824 def _real_extract(self, url):
1825 file_id = url.split('/')[-1]
1826 # Rebuild url in english locale
1827 url = 'http://depositfiles.com/en/files/' + file_id
1829 # Retrieve file webpage with 'Free download' button pressed
1830 free_download_indication = { 'gateway_result' : '1' }
1831 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1833 self.report_download_webpage(file_id)
1834 webpage = urllib2.urlopen(request).read()
1835 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1836 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1839 # Search for the real file URL
1840 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1841 if (mobj is None) or (mobj.group(1) is None):
1842 # Try to figure out reason of the error.
1843 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1844 if (mobj is not None) and (mobj.group(1) is not None):
1845 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1846 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1848 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1851 file_url = mobj.group(1)
1852 file_extension = os.path.splitext(file_url)[1][1:]
1854 # Search for file title
1855 mobj = re.search(r'<b title="(.*?)">', webpage)
1857 self._downloader.trouble(u'ERROR: unable to extract title')
1859 file_title = mobj.group(1).decode('utf-8')
1862 'id': file_id.decode('utf-8'),
1863 'url': file_url.decode('utf-8'),
1865 'upload_date': u'NA',
1866 'title': file_title,
1867 'ext': file_extension.decode('utf-8'),
1873 class FacebookIE(InfoExtractor):
1874 """Information Extractor for Facebook"""
1876 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1877 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1878 _NETRC_MACHINE = 'facebook'
1879 _available_formats = ['video', 'highqual', 'lowqual']
1880 _video_extensions = {
1885 IE_NAME = u'facebook'
1887 def __init__(self, downloader=None):
1888 InfoExtractor.__init__(self, downloader)
1890 def _reporter(self, message):
1891 """Add header and report message."""
1892 self._downloader.to_screen(u'[facebook] %s' % message)
1894 def report_login(self):
1895 """Report attempt to log in."""
1896 self._reporter(u'Logging in')
1898 def report_video_webpage_download(self, video_id):
1899 """Report attempt to download video webpage."""
1900 self._reporter(u'%s: Downloading video webpage' % video_id)
1902 def report_information_extraction(self, video_id):
1903 """Report attempt to extract video information."""
1904 self._reporter(u'%s: Extracting video information' % video_id)
1906 def _parse_page(self, video_webpage):
1907 """Extract video information from page"""
1909 data = {'title': r'\("video_title", "(.*?)"\)',
1910 'description': r'<div class="datawrap">(.*?)</div>',
1911 'owner': r'\("video_owner_name", "(.*?)"\)',
1912 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1915 for piece in data.keys():
1916 mobj = re.search(data[piece], video_webpage)
1917 if mobj is not None:
1918 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1922 for fmt in self._available_formats:
1923 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1924 if mobj is not None:
1925 # URL is in a Javascript segment inside an escaped Unicode format within
1926 # the generally utf-8 page
1927 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1928 video_info['video_urls'] = video_urls
1932 def _real_initialize(self):
1933 if self._downloader is None:
1938 downloader_params = self._downloader.params
1940 # Attempt to use provided username and password or .netrc data
1941 if downloader_params.get('username', None) is not None:
1942 useremail = downloader_params['username']
1943 password = downloader_params['password']
1944 elif downloader_params.get('usenetrc', False):
1946 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1947 if info is not None:
1951 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1952 except (IOError, netrc.NetrcParseError), err:
1953 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1956 if useremail is None:
1965 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1968 login_results = urllib2.urlopen(request).read()
1969 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1970 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1972 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1973 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
    def _real_extract(self, url):
        """Extract metadata and per-format download URLs for a Facebook video.

        NOTE(review): several original lines are elided from this excerpt;
        the gaps are marked inline.
        """
        mobj = re.match(self._VALID_URL, url)
        # [excerpt: "if mobj is None:" guard elided]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # [excerpt: "return" elided]
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        # [excerpt: "try:" elided]
        page = urllib2.urlopen(request)
        video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            # [excerpt: "return" elided]

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # Uploader nickname is mandatory.
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            # [excerpt: "return" elided]
        video_uploader = video_info['owner']

        # Title is mandatory.
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            # [excerpt: "return" elided]
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # Thumbnail is optional: missing thumbnail is only a warning.
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        # [excerpt: "else:" elided]
        video_thumbnail = video_info['thumbnail']

        # Upload date: parse an RFC-2822 style date into YYYYMMDD.
        # [excerpt: "upload_date = u'NA'" default elided]
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                # [excerpt: "try:"/"except" around strftime elided]
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        # Description is optional with a human-readable fallback.
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # _available_formats is ordered best-first; a format_limit caps quality.
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            # [excerpt: "else:" elided]
            format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                # [excerpt: "return" elided]
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # [excerpt: "else:" (specific format requested) elided]
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
                # [excerpt: "return" elided]
            video_url_list = [(req_format, url_map[req_format])] # Specific format

        # [excerpt: "results = []" elided]
        for format_param, video_real_url in video_url_list:
            # Extension per format, defaulting to mp4.
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # [excerpt: "results.append({" elided]
            # NOTE(review): "X and u'NA' or Y" below is the fragile pre-ternary
            # idiom; it only works because u'NA' is truthy.
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    # Any http(s) blip.tv path; group 1 captures the path component.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pulls the filename extension off the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'
2086 def report_extraction(self, file_id):
2087 """Report information extraction."""
2088 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2090 def report_direct_download(self, title):
2091 """Report information extraction."""
2092 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
    def _real_extract(self, url):
        """Handle both direct media links and regular blip.tv video pages.

        Regular pages are resolved through blip.tv's JSON API
        (skin=json).  NOTE(review): several original lines are elided
        from this excerpt; the gaps are marked inline.
        """
        mobj = re.match(self._VALID_URL, url)
        # [excerpt: "if mobj is None:" guard elided]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # [excerpt: "return" and the choice of separator char (cchar = '?' or '&') elided]
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))
        # [excerpt: "info = None" initialisation and "try:" elided]
        urlh = urllib2.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # [excerpt: construction of the direct-download info dict elided]
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
            # [excerpt: "return" elided]
        if info is None: # Regular URL
            # [excerpt: "try:" elided]
            json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
                # [excerpt: "return" elided]

            # [excerpt: "try:" elided]
            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']
            # [excerpt: "else:" (data = json_data) elided]

            # NOTE(review): '%H' (24-hour) combined with '%p' (AM/PM) is
            # contradictory — '%p' has no effect on the parsed hour;
            # '%I' was probably intended.  Verify against real datestamps.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # [excerpt: "if umobj is None:" guard elided]
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # [excerpt: "info = {" opener and 'url'/'ext' entries elided]
                'id': data['item_id'],
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl']
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                # [excerpt: "return" elided]

        # NOTE(review): mutates the module-global std_headers as a side
        # effect — presumably blip.tv requires an iTunes user agent for the
        # actual media download; confirm before touching.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        # [excerpt: "return [info]" elided]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    # Group 1 is the numeric video id, group 2 the title slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        # Plain delegation; this extractor keeps no state of its own.
        InfoExtractor.__init__(self, downloader)
2176 def report_download_webpage(self, video_id):
2177 """Report webpage download."""
2178 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2180 def report_extraction(self, video_id):
2181 """Report information extraction."""
2182 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2184 def _real_extract(self,url):
2185 mobj = re.match(self._VALID_URL, url)
2187 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2190 video_id = mobj.group(1)
2193 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2195 self.report_download_webpage(video_id)
2196 webpage = urllib2.urlopen(request).read()
2197 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2198 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2201 self.report_extraction(video_id)
2202 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2205 self._downloader.trouble(u'ERROR: unable to extract media URL')
2207 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2209 mobj = re.search('<title>([^<]+)</title>', webpage)
2211 self._downloader.trouble(u'ERROR: unable to extract title')
2214 video_title = mobj.group(1)
2220 'upload_date': u'NA',
2221 'title': video_title,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts a ":tds"/":colbert"-style shortname or a full
    # full-episodes URL on thedailyshow.com / colbertnation.com.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        # Progress message: metadata extraction started.
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        # Progress message: fetching the mediaGen player configuration.
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        # Progress message: fetching the MRSS episode index.
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        # Progress message: resolving the SWF player URL.
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        """Resolve a show/episode URL to its downloadable renditions.

        NOTE(review): this excerpt is elided in places; gaps are marked.
        """
        mobj = re.match(self._VALID_URL, url)
        # [excerpt: "if mobj is None:" guard elided]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Shortnames redirect to the show's full-episodes landing page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            # [excerpt: "else:" elided]
            url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No episode in the URL means "download the newest episode".
        dlNewest = not mobj.group('episode')
        # [excerpt: "if dlNewest:" / "else:" structure elided]
        epTitle = mobj.group('showname')
        epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        # [excerpt: "try:" elided]
        htmlHandle = urllib2.urlopen(req)
        html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
        # [excerpt: "return" and "if dlNewest:" re-resolution block elided]
        # Landing pages redirect to the newest episode; re-match the final URL.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url)
        # [excerpt: "if mobj is None:" guard elided]
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        # NOTE(review): the dots in "media.mtvnservices.com" are unescaped,
        # so they match any character — harmless here but worth tightening.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        # [excerpt: "try:" elided]
        urlHandle = urllib2.urlopen(playerUrl_raw)
        playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        # [excerpt: "try:" elided]
        indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))

        # One <item> per video segment in the MRSS index.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            # [excerpt: "try:" elided]
            configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # [excerpt: "turls = []" initialisation elided]
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                # [excerpt: "turls.append(finfo)" and the empty-turls check elided]
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            effTitle = showId + u'-' + epTitle
            # [excerpt: "info = {" opener and several fields elided]
                'upload_date': officialDate,
                'description': officialTitle,
                'player_url': playerUrl

            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        # Progress message: metadata extraction started.
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        # Progress message: fetching the player configuration.
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Resolve an escapistmagazine.com video page to its media URL.

        NOTE(review): this excerpt is elided in places; gaps are marked.
        """
        mobj = re.match(self._VALID_URL, url)
        # [excerpt: "if mobj is None:" guard elided]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        # [excerpt: "try:" elided]
        webPage = urllib2.urlopen(url)
        webPageBytes = webPage.read()
        # Decode using the charset declared in Content-Type, utf-8 otherwise.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
            # [excerpt: "return" elided]

        # NOTE(review): each .group(1) below assumes its <meta> tag exists;
        # a missing tag makes re.search return None and raises
        # AttributeError — consider guarding.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config file location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = urllib2.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        # [excerpt: "try:" elided]
        configJSON = urllib2.urlopen(configUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
            # [excerpt: "return" elided]

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        # [excerpt: "try:" elided]
        config = json.loads(configJSON)
        except (ValueError,), err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
            # [excerpt: "return" elided]

        playlist = config['playlist']
        # Second playlist entry holds the actual video (index 1, not 0).
        videoUrl = playlist[1]['url']

        # [excerpt: "info = {" opener and several fields elided]
            'uploader': showName,
            'upload_date': None,
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        # [excerpt: "return [info]" elided]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a collegehumor.com video page via its moogaloop XML feed.

        NOTE(review): this excerpt is elided in places; gaps are marked.
        """
        mobj = re.match(self._VALID_URL, url)
        # [excerpt: "if mobj is None:" guard elided]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        self.report_webpage(video_id)
        request = urllib2.Request(url)
        # [excerpt: "try:" elided]
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            # [excerpt: "return" elided]

        # The page embeds a second, internal id used by the XML metadata feed.
        m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
        # [excerpt: "if m is None:" guard elided]
        self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
        internal_video_id = m.group('internalvideoid')

        # [excerpt: "info = {" opener with 'id' field elided]
            'internal_id': internal_video_id,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
        # [excerpt: "try:" elided]
        metaXml = urllib2.urlopen(xmlUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
            # [excerpt: "return" elided]

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # [excerpt: "try:" elided — findall(...)[0] raises IndexError on missing nodes]
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['url'] = videoNode.findall('./file')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        info['ext'] = info['url'].rpartition('.')[2]
        info['format'] = info['ext']
        # [excerpt: "except IndexError:" elided]
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
        # [excerpt: "return" / "return [info]" elided]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape flv URL, title, and thumbnail from an xvideos.com page.

        NOTE(review): this excerpt is elided in places; gaps are marked.
        """
        mobj = re.match(self._VALID_URL, url)
        # [excerpt: "if mobj is None:" guard elided]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
        # [excerpt: "try:" elided]
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            # [excerpt: "return" elided]

        self.report_extraction(video_id)

        # Extract video URL (URL-encoded inside a flv_url= query parameter).
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        # [excerpt: "if mobj is None:" guard elided]
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))

        # Extract title from the <title> tag, stripping the site suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        # [excerpt: "if mobj is None:" guard elided]
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        # [excerpt: "if mobj is None:" guard elided]
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(0).decode('utf-8')

        # [excerpt: "info = {" opener and several fields elided]
            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
        # [excerpt: "return [info]" elided]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    # Group 1 is the uploader slug, group 2 the track slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        # Plain delegation; this extractor keeps no state of its own.
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape uid + stream token from a track page and build the media URL.

        NOTE(review): this excerpt is elided in places; gaps are marked.
        """
        mobj = re.match(self._VALID_URL, url)
        # [excerpt: "if mobj is None:" guard elided]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + u'-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
        # [excerpt: "try:" elided]
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            # [excerpt: "return" elided]

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
        # [excerpt: "if mobj:" guard elided]
        video_id = mobj.group(1)
        stream_token = mobj.group(2)

        # extract unsimplified title
        mobj = re.search('"title":"(.*?)",', webpage)
        # [excerpt: "if mobj is not None:" / "else:" structure elided]
        title = mobj.group(1).decode('utf-8')
        title = simple_title

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        # description: best-effort with a readable fallback
        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
        # [excerpt: "if mobj:" guard elided]
        description = mobj.group(1)

        # upload date: parse the human-readable "pretty date" into YYYYMMDD
        # [excerpt: "upload_date = None" default and "try:" elided]
        mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
        # [excerpt: "if mobj:" guard elided]
        upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
        except Exception, e:
            # Date parse failures are non-fatal; just report them.
            self._downloader.to_stderr(str(e))

        # for soundcloud, a request to a cross domain is required for cookies
        # NOTE(review): this Request object is built but never opened in this
        # excerpt — verify whether the urlopen call was elided or is missing.
        request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

        # [excerpt: "return [{" opener and several fields elided]
            'id': video_id.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': upload_date,
            'description': description.decode('utf-8')
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    # [excerpt: IE_NAME assignment elided]

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Build an rtmpe:// URL from the base64-encoded jsclassref on the page.

        NOTE(review): this excerpt is elided in places; gaps are marked.
        """
        mobj = re.match(self._VALID_URL, url)
        # [excerpt: "if mobj is None:" guard elided]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        self.report_webpage(url)

        request = urllib2.Request(url)
        # [excerpt: "try:" elided]
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            # [excerpt: "return" elided]

        self.report_extraction(url)

        # Extract video URL: jsclassref is base64; the decoded bytes are then
        # URL-unquoted and appended to the RTMP application path.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        # [excerpt: "if mobj is None:" guard elided]
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        # [excerpt: "if mobj is None:" guard elided]
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        # Derive id and extension from the media filename.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # [excerpt: "info = {" opener and several fields elided]
            'upload_date': None,
            'title': video_title,
            'format': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
        # [excerpt: "return [info]" elided]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    # Group 1 is the uploader slug, group 2 the cloudcast slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        # Plain delegation; this extractor keeps no state of its own.
        InfoExtractor.__init__(self, downloader)
2743 def report_download_json(self, file_id):
2744 """Report JSON download."""
2745 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2747 def report_extraction(self, file_id):
2748 """Report information extraction."""
2749 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # [excerpt: "try:" elided]
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            # NOTE(review): if the bitrate keys are strings, max() compares
            # lexicographically ('64' > '320') — verify the JSON key types.
            bitrate = max(bitrate_list) # select highest
        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        # [excerpt: "return url_list" elided]
    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probes each candidate with a live HTTP request; network errors mean
        # "try the next one".
        for url in url_list:
            # [excerpt: "try:" elided]
            urllib2.urlopen(url)
            # [excerpt: "return url" on success elided]
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                # [excerpt: failure handling and final "return None" elided]
    def _print_formats(self, formats):
        # Dump the available format/bitrate/extension table to stdout
        # (used by --list-formats).
        print 'Available formats:'
        for fmt in formats.keys():
            for b in formats[fmt]:
                # [excerpt: "try:" elided]
                ext = formats[fmt][b][0]
                print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
                except TypeError: # we have no bitrate info
                    # Flat list of urls — bitrate column shown as '??'.
                    ext = formats[fmt][0]
                    print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
    def _real_extract(self, url):
        """Resolve a mixcloud.com cloudcast via the site's JSON API.

        NOTE(review): this excerpt is elided in places; gaps are marked.
        """
        mobj = re.match(self._VALID_URL, url)
        # [excerpt: "if mobj is None:" guard elided]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = urllib2.Request(file_url)
        # [excerpt: "try:" elided]
        self.report_download_json(file_url)
        jsonData = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
            # [excerpt: "return" elided]

        # parse JSON payload: player SWF plus a map of audio formats.
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        # [excerpt: intermediate lines elided]

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            # [excerpt: "return" elided]

        # Best-format mode: probe each format until a live URL is found.
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    # [excerpt: "break" elided]
        # [excerpt: "else:" (specific format requested) elided]
        if req_format not in formats.keys():
            self._downloader.trouble(u'ERROR: format is not available')
            # [excerpt: "return" elided]

        url_list = self.get_urls(formats, req_format)
        file_url = self.check_urls(url_list)
        format_param = req_format

        # NOTE(review): if no URL ever checks out, file_url may still be None
        # here and file_url.decode(...) would raise — verify in the original.
        # [excerpt: "return [{" opener elided]
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': u'NA',
        'title': json_data['name'],
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches the site root, a course page, or a specific video page;
    # 'course' and 'video' query parameters decide which case applies.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL kind: single video, course page, or site root.

        Course and root pages recurse through self.extract() over the pages
        they link to.  NOTE(review): this excerpt is elided in places; gaps
        are marked.
        """
        mobj = re.match(self._VALID_URL, url)
        # [excerpt: "if mobj is None:" guard elided]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            # [excerpt: "info = {" opener elided]
                'id': course + '_' + video,

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            # [excerpt: "try:" elided]
            metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
                # [excerpt: "return" elided]
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            # [excerpt: "try:" elided — findall(...)[0] raises IndexError on missing nodes]
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            # [excerpt: "except IndexError:" elided]
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            # [excerpt: "return" elided]
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
            # [excerpt: "return [info]" elided]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            # [excerpt: playlist-style "info = {" initialisation elided]

            self.report_download_webpage(info['id'])
            # [excerpt: "try:" elided]
            coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                # [excerpt: "return" elided]

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            # [excerpt: "if m:" / "else:" structure elided — falls back to the id]
            info['title'] = unescapeHTML(m.group(1))
            info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            # [excerpt: "if m:" guard elided]
            info['description'] = unescapeHTML(m.group(1))

            # Collect each linked VideoPage once, preserving order.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            # [excerpt: comprehension building info['list'] partially elided]
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            # Recurse into every referenced video page.
            # [excerpt: "results = []" elided]
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            # [excerpt: "return results" elided]
        # [excerpt: "else:" (site root) branch opener and "info = {" elided]
            'id': 'Stanford OpenClassroom',

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            # [excerpt: "try:" elided]
            rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                # [excerpt: "return" elided]

            info['title'] = info['id']

            # Collect each linked CoursePage once, preserving order.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            # [excerpt: comprehension building info['list'] partially elided]
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            # Recurse into every referenced course page.
            # [excerpt: "results = []" elided]
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            # [excerpt: "return results" elided]
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    # [excerpt: IE_NAME assignment elided]

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve an mtv.com video page through the mediaGen XML feed.

        NOTE(review): this excerpt is elided in places; gaps are marked.
        """
        mobj = re.match(self._VALID_URL, url)
        # [excerpt: "if mobj is None:" guard elided]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # Scheme is optional in _VALID_URL; normalize for urlopen.
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = urllib2.Request(url)
        # [excerpt: "try:" elided]
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            # [excerpt: "return" elided]

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        # [excerpt: "if mobj is None:" guard elided]
        self._downloader.trouble(u'ERROR: unable to extract song name')
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        # [excerpt: "if mobj is None:" guard elided]
        self._downloader.trouble(u'ERROR: unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        # [excerpt: "if mobj is None:" guard elided]
        # NOTE(review): the message below is missing a word — it should read
        # "unable to extract mtvn_uri" (left untouched here; runtime string).
        self._downloader.trouble(u'ERROR: unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        # [excerpt: "if mobj is None:" guard elided]
        self._downloader.trouble(u'ERROR: unable to extract content id')
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = urllib2.Request(videogen_url)
        # [excerpt: "try:" elided]
        metadataXml = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
            # [excerpt: "return" elided]

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # [excerpt: "try:" elided]
        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        # [excerpt: "except KeyError:" elided]
        self._downloader.trouble('Invalid rendition field.')
        # [excerpt: "return" elided]

        # [excerpt: "info = {" opener and remaining fields / "return [info]" elided]
            'uploader': performer,
            'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    NOTE(review): this extract is missing a number of original lines
    (method headers, list initialisations, guard clauses); reviewer
    notes below mark each apparent gap.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    # NOTE(review): the `def _gen_sid(self):` header appears to be missing
    # from this extract; the following four lines look like its body — a
    # session id built from the current time in ms plus two random parts.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)
        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle this alphabet, using `seed` as the state
        # of a linear-congruential generator (a * 211 + 30031 mod 65536).
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        # NOTE(review): the initialisation of `mixed` (presumably `mixed = []`)
        # is missing from this extract.
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        # NOTE(review): the actual return statement is missing from this extract.

    def _get_file_id(self, fileId, seed):
        # De-obfuscate the file id: each '*'-separated component indexes into
        # the seed-shuffled alphabet from _get_file_ID_mix_string.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        # NOTE(review): the loop header over `ids` (binding `ch`) and the
        # initialisation of `realId` are missing from this extract.
            realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if ... is None:` guards appear to be missing
        # throughout this method; `trouble(...)` lines are their error paths.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # JSON playlist endpoint describing the video's stream files.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = urllib2.Request(info_url, None, std_headers)
        # NOTE(review): the `try:` opening this handler is missing from this extract.
        self.report_download_webpage(video_id)
        jsondata = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        self.report_extraction(video_id)
        config = json.loads(jsondata)

        video_title = config['data'][0]['title']
        # seed drives the file-id de-obfuscation in _get_file_id.
        seed = config['data'][0]['seed']

        format = self._downloader.params.get('format', None)
        supported_format = config['data'][0]['streamfileids'].keys()

        # Map the requested format onto one of Youku's stream ids.
        # NOTE(review): the assignment bodies of these branches are missing
        # from this extract.
        if format is None or format == 'best':
            if 'hd2' in supported_format:
        elif format == 'worst':

        fileid = config['data'][0]['streamfileids'][format]
        seg_number = len(config['data'][0]['segs'][format])

        # Collect the per-segment access keys ('k') for URL construction.
        # NOTE(review): the initialisation of `keys` is missing from this extract.
        for i in xrange(seg_number):
            keys.append(config['data'][0]['segs'][format][i]['k'])

        #youku only could be viewed from mainland china
        self._downloader.trouble(u'ERROR: unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            # Splice the segment index (as two hex digits) into the file id.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            # NOTE(review): the surrounding `info = {...}` literal is only
            # partially present in this extract.
            'id': '%s_part%02d' % (video_id, index),
            'url': download_url,
            'title': video_title,
            files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    # Regexes applied to the raw page: flash video URL, page title, thumbnail.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from an xnxx page.

        NOTE(review): this extract is missing the `if ... is None:` guard
        lines and the `try:` opener; the `trouble(...)` calls are those
        guards' error paths.
        """
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        # Get webpage content
        webpage = urllib2.urlopen(url).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)

        result = re.search(self.VIDEO_URL_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        # flv_url is percent-encoded in the page source.
        video_url = urllib.unquote(result.group(1).decode('utf-8'))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = result.group(1).decode('utf-8')

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = result.group(1).decode('utf-8')

        # NOTE(review): the remainder of this dict ('url', 'ext', ...) and
        # the return statement are missing from this extract.
        info = {'id': video_id,
            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
3237 class GooglePlusIE(InfoExtractor):
3238 """Information extractor for plus.google.com."""
3240 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3241 IE_NAME = u'plus.google'
3243 def __init__(self, downloader=None):
3244 InfoExtractor.__init__(self, downloader)
3246 def report_extract_entry(self, url):
3247 """Report downloading extry"""
3248 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3250 def report_date(self, upload_date):
3251 """Report downloading extry"""
3252 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3254 def report_uploader(self, uploader):
3255 """Report downloading extry"""
3256 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3258 def report_title(self, video_title):
3259 """Report downloading extry"""
3260 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3262 def report_extract_vid_page(self, video_page):
3263 """Report information extraction."""
3264 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3266 def _real_extract(self, url):
3267 # Extract id from URL
3268 mobj = re.match(self._VALID_URL, url)
3270 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3273 post_url = mobj.group(0)
3274 video_id = mobj.group(2)
3276 video_extension = 'flv'
3278 # Step 1, Retrieve post webpage to extract further information
3279 self.report_extract_entry(post_url)
3280 request = urllib2.Request(post_url)
3282 webpage = urllib2.urlopen(request).read()
3283 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3284 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % str(err))
3287 # Extract update date
3289 pattern = 'title="Timestamp">(.*?)</a>'
3290 mobj = re.search(pattern, webpage)
3292 upload_date = mobj.group(1)
3293 # Convert timestring to a format suitable for filename
3294 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3295 upload_date = upload_date.strftime('%Y%m%d')
3296 self.report_date(upload_date)
3300 pattern = r'rel\="author".*?>(.*?)</a>'
3301 mobj = re.search(pattern, webpage)
3303 uploader = mobj.group(1)
3304 self.report_uploader(uploader)
3307 # Get the first line for title
3309 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3310 mobj = re.search(pattern, webpage)
3312 video_title = mobj.group(1)
3313 self.report_title(video_title)
3315 # Step 2, Stimulate clicking the image box to launch video
3316 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3317 mobj = re.search(pattern, webpage)
3319 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3321 video_page = mobj.group(1)
3322 request = urllib2.Request(video_page)
3324 webpage = urllib2.urlopen(request).read()
3325 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3326 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3328 self.report_extract_vid_page(video_page)
3331 # Extract video links on video page
3332 """Extract video links of all sizes"""
3333 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3334 mobj = re.findall(pattern, webpage)
3336 self._downloader.trouble(u'ERROR: unable to extract video links')
3338 # Sort in resolution
3339 links = sorted(mobj)
3341 # Choose the lowest of the sort, i.e. highest resolution
3342 video_url = links[-1]
3343 # Only get the url. The resolution part in the tuple has no use anymore
3344 video_url = video_url[-1]
3345 # Treat escaped \u0026 style hex
3346 video_url = unicode(video_url, "unicode_escape")
3350 'id': video_id.decode('utf-8'),
3352 'uploader': uploader.decode('utf-8'),
3353 'upload_date': upload_date.decode('utf-8'),
3354 'title': video_title.decode('utf-8'),
3355 'ext': video_extension.decode('utf-8'),