2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
18 from urlparse import parse_qs
21 import cStringIO as StringIO
# Abstract base class for all site-specific information extractors (IEs).
# NOTE(review): this chunk is sparsely sampled — several original lines are
# missing between the numbered lines below (e.g. the `def initialize(self):`
# header that should precede the docstring on original line 74).
28 class InfoExtractor(object):
29 """Information Extractor class.
31 Information extractors are the classes that, given a URL, extract
32 information from the video (or videos) the URL refers to. This
33 information includes the real video URL, the video title and simplified
34 title, author and others. The information is stored in a dictionary
35 which is then passed to the FileDownloader. The FileDownloader
36 processes this information possibly downloading the video to the file
37 system, among other possible outcomes. The dictionaries must include
42 uploader: Nickname of the video uploader.
44 ext: Video filename extension.
46 player_url: SWF Player URL (may be None).
48 The following fields are optional. Their primary purpose is to allow
49 youtube-dl to serve as the backend for a video search function, such
50 as the one in youtube2mp3. They are only used when their respective
51 forced printing functions are called:
53 thumbnail: Full URL to a video thumbnail image.
54 description: One-line video description.
56 Subclasses of this one should re-define the _real_initialize() and
57 _real_extract() methods and define a _VALID_URL regexp.
58 Probably, they should also be added to the list of extractors.
# Store the downloader reference; None is permitted and is checked by
# subclasses before use.
64 def __init__(self, downloader=None):
65 """Constructor. Receives an optional downloader."""
67 self.set_downloader(downloader)
# A URL is "suitable" when it matches the subclass's _VALID_URL regexp.
69 def suitable(self, url):
70 """Receives a URL and returns True if suitable for this IE."""
71 return re.match(self._VALID_URL, url) is not None
# NOTE(review): the `def initialize(self):` header (original line 73) is not
# visible in this chunk; the docstring and call below belong to that method.
74 """Initializes an instance (authentication, etc)."""
76 self._real_initialize()
# Public entry point: delegates to the subclass hook _real_extract().
79 def extract(self, url):
80 """Extracts URL information and returns it in list of dicts."""
82 return self._real_extract(url)
84 def set_downloader(self, downloader):
85 """Sets the downloader for this IE."""
86 self._downloader = downloader
# Template-method hooks: subclasses override these two.
88 def _real_initialize(self):
89 """Real initialization process. Redefine in subclasses."""
92 def _real_extract(self, url):
93 """Real extraction process. Redefine in subclasses."""
# Extractor for youtube.com. Handles login, age-gating, language forcing,
# format selection and (optionally) closed-caption download.
# NOTE(review): this chunk is sparsely sampled — original lines are missing
# throughout (e.g. the `_VALID_URL = r'''` opener before the verbose regex,
# `try:` keywords, `return` statements, and the `srt = ''` initialization in
# _closed_captions_xml_to_srt).
97 class YoutubeIE(InfoExtractor):
98 """Information extractor for youtube.com."""
# Verbose (re.VERBOSE) pattern: group 1 is the host/path prefix, group 2 the
# video ID — see `video_id = mobj.group(2)` below.
102 (?:https?://)? # http(s):// (optional)
103 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
104 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
105 (?:.*?\#/)? # handle anchor (#/) redirect urls
106 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
107 (?: # the various things that can precede the ID:
108 (?:(?:v|embed|e)/) # v/ or embed/ or e/
109 |(?: # or the v= param in all its forms
110 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
111 (?:\?|\#!?) # the params delimiter ? or # or #!
112 (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
115 )? # optional -> youtube.com/xxxx is OK
116 )? # all until now is optional -> you can pass the naked ID
117 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
118 (?(1).+)? # if we found the ID, everything can follow
# Forces English/US pages so the date/format scraping regexes below match.
120 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
121 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
122 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
123 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
124 _NETRC_MACHINE = 'youtube'
125 # Listed in order of quality
126 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
127 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> filename extension map (entries elided in this chunk).
128 _video_extensions = {
134 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" display map, used by _print_formats (entries elided).
140 _video_dimensions = {
# Overrides the base class to compile _VALID_URL with re.VERBOSE, since the
# pattern above relies on whitespace and inline comments.
158 def suitable(self, url):
159 """Receives a URL and returns True if suitable for this IE."""
160 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
# --- Progress-reporting helpers (thin wrappers over downloader.to_screen) ---
162 def report_lang(self):
163 """Report attempt to set language."""
164 self._downloader.to_screen(u'[youtube] Setting language')
166 def report_login(self):
167 """Report attempt to log in."""
168 self._downloader.to_screen(u'[youtube] Logging in')
170 def report_age_confirmation(self):
171 """Report attempt to confirm age."""
172 self._downloader.to_screen(u'[youtube] Confirming age')
174 def report_video_webpage_download(self, video_id):
175 """Report attempt to download video webpage."""
176 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
178 def report_video_info_webpage_download(self, video_id):
179 """Report attempt to download video info webpage."""
180 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
182 def report_video_subtitles_download(self, video_id):
183 """Report attempt to download video info webpage."""
184 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
186 def report_information_extraction(self, video_id):
187 """Report attempt to extract video information."""
188 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
190 def report_unavailable_format(self, video_id, format):
191 """Report extracted video URL."""
192 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
194 def report_rtmp_download(self):
195 """Indicate the download will use the RTMP protocol."""
196 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Converts YouTube's timedtext XML into SubRip (.srt) text: one numbered
# cue per <text> element, with "HH:MM:SS,mmm --> HH:MM:SS,mmm" timestamps.
# NOTE(review): the `srt = ''` accumulator initialization and the float()
# conversion of `start` (original lines 199/204) are elided from this view.
198 def _closed_captions_xml_to_srt(self, xml_string):
200 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
201 # TODO parse xml instead of regex
202 for n, (start, dur_tag, dur, caption) in enumerate(texts):
# Captions with no explicit duration default to 4 seconds.
203 if not dur: dur = '4'
205 end = start + float(dur)
206 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
207 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
208 caption = unescapeHTML(caption)
209 caption = unescapeHTML(caption) # double cycle, intentional
210 srt += str(n+1) + '\n'
211 srt += start + ' --> ' + end + '\n'
212 srt += caption + '\n\n'
# Prints one "itag : ext [WxH]" line per format for --list-formats.
215 def _print_formats(self, formats):
216 print 'Available formats:'
218 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
# Initialization: resolve credentials (CLI options, then .netrc), force the
# English-language site, log in if credentials exist, and confirm age.
# All failures here are reported as warnings/errors via the downloader.
220 def _real_initialize(self):
221 if self._downloader is None:
226 downloader_params = self._downloader.params
228 # Attempt to use provided username and password or .netrc data
229 if downloader_params.get('username', None) is not None:
230 username = downloader_params['username']
231 password = downloader_params['password']
232 elif downloader_params.get('usenetrc', False):
234 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
239 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
240 except (IOError, netrc.NetrcParseError), err:
241 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set language (best-effort: failure only warns).
245 request = urllib2.Request(self._LANG_URL)
248 urllib2.urlopen(request).read()
249 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
250 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
253 # No authentication to be performed
# Log in: a loginForm in the response means the credentials were rejected.
259 'current_form': 'loginForm',
261 'action_login': 'Log In',
262 'username': username,
263 'password': password,
265 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
268 login_results = urllib2.urlopen(request).read()
269 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
270 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
272 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
273 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Confirm age (required for age-restricted videos; failure is fatal).
279 'action_confirm': 'Confirm',
281 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
283 self.report_age_confirmation()
284 age_results = urllib2.urlopen(request).read()
285 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
286 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Main extraction: resolve redirects, fetch the watch page, query
# get_video_info, pick formats and build the result dict(s).
289 def _real_extract(self, url):
290 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
291 mobj = re.search(self._NEXT_URL_RE, url)
293 url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
295 # Extract video id from URL
296 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
298 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
300 video_id = mobj.group(2)
# Fetch the watch page (has_verified=1 skips the age interstitial).
303 self.report_video_webpage_download(video_id)
304 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
306 video_webpage = urllib2.urlopen(request).read()
307 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
308 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
311 # Attempt to extract SWF player URL
312 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Un-escape the JSON-style backslash escaping in the SWF URL.
314 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several 'el' parameter values in turn until get_video_info returns a
# response containing a 'token'.
319 self.report_video_info_webpage_download(video_id)
320 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
321 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
322 % (video_id, el_type))
323 request = urllib2.Request(video_info_url)
325 video_info_webpage = urllib2.urlopen(request).read()
326 video_info = parse_qs(video_info_webpage)
327 if 'token' in video_info:
329 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
330 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
# No token in any response: surface YouTube's own 'reason' if present.
332 if 'token' not in video_info:
333 if 'reason' in video_info:
334 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
336 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
339 # Check for "rental" videos
340 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
341 self._downloader.trouble(u'ERROR: "rental" videos not supported')
344 # Start extracting information
345 self.report_information_extraction(video_id)
# uploader (required)
348 if 'author' not in video_info:
349 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
351 video_uploader = urllib.unquote_plus(video_info['author'][0])
# title (required)
354 if 'title' not in video_info:
355 self._downloader.trouble(u'ERROR: unable to extract video title')
357 video_title = urllib.unquote_plus(video_info['title'][0])
358 video_title = video_title.decode('utf-8')
# thumbnail (optional)
361 if 'thumbnail_url' not in video_info:
362 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
364 else: # don't panic if we can't find it
365 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scraped from the watch page and normalized to YYYYMMDD by
# trying several textual date formats.
369 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
371 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
372 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
373 for expression in format_expressions:
375 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# description: scraped from the page, cleaned; empty string when absent.
380 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
381 if video_description: video_description = clean_html(video_description)
382 else: video_description = ''
# closed captions (only when --write-srt was given); any failure raises
# Trouble, which is caught below and reported as a warning.
385 video_subtitles = None
386 if self._downloader.params.get('writesubtitles', False):
388 self.report_video_subtitles_download(video_id)
389 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
391 srt_list = urllib2.urlopen(request).read()
392 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
393 raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
# Build {lang_code: track_name} from the track list.
394 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
395 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
396 if not srt_lang_list:
397 raise Trouble(u'WARNING: video has no closed captions')
# Language preference: explicit option > English > first available.
398 if self._downloader.params.get('subtitleslang', False):
399 srt_lang = self._downloader.params.get('subtitleslang')
400 elif 'en' in srt_lang_list:
403 srt_lang = srt_lang_list.keys()[0]
404 if not srt_lang in srt_lang_list:
405 raise Trouble(u'WARNING: no closed captions found in the specified language')
406 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
408 srt_xml = urllib2.urlopen(request).read()
409 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
410 raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
412 raise Trouble(u'WARNING: unable to download video subtitles')
413 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
414 except Trouble as trouble:
415 self._downloader.trouble(trouble[0])
# duration (optional)
417 if 'length_seconds' not in video_info:
418 self._downloader.trouble(u'WARNING: unable to extract video duration')
421 video_duration = urllib.unquote_plus(video_info['length_seconds'][0])
# token (presence already guaranteed above)
424 video_token = urllib.unquote_plus(video_info['token'][0])
426 # Decide which formats to download
427 req_format = self._downloader.params.get('format', None)
# RTMP streams expose a single 'conn' URL instead of a format map.
429 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
430 self.report_rtmp_download()
431 video_url_list = [(None, video_info['conn'][0])]
432 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
# Parse the comma-separated, URL-encoded stream map into {itag: url}.
433 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
434 url_data = [parse_qs(uds) for uds in url_data_strs]
435 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
436 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
# Honor --max-quality by truncating the quality-ordered format list.
438 format_limit = self._downloader.params.get('format_limit', None)
439 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
440 if format_limit is not None and format_limit in available_formats:
441 format_list = available_formats[available_formats.index(format_limit):]
443 format_list = available_formats
444 existing_formats = [x for x in format_list if x in url_map]
445 if len(existing_formats) == 0:
446 self._downloader.trouble(u'ERROR: no known formats available for video')
448 if self._downloader.params.get('listformats', None):
449 self._print_formats(existing_formats)
451 if req_format is None or req_format == 'best':
452 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
453 elif req_format == 'worst':
454 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
455 elif req_format in ('-1', 'all'):
456 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
458 # Specific formats. We pick the first in a slash-delimeted sequence.
459 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
460 req_formats = req_format.split('/')
461 video_url_list = None
462 for rf in req_formats:
464 video_url_list = [(rf, url_map[rf])]
466 if video_url_list is None:
467 self._downloader.trouble(u'ERROR: requested format not available')
470 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Build one result dict per selected format.
474 for format_param, video_real_url in video_url_list:
476 video_extension = self._video_extensions.get(format_param, 'flv')
479 'id': video_id.decode('utf-8'),
480 'url': video_real_url.decode('utf-8'),
481 'uploader': video_uploader.decode('utf-8'),
482 'upload_date': upload_date,
483 'title': video_title,
484 'ext': video_extension.decode('utf-8'),
485 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
486 'thumbnail': video_thumbnail.decode('utf-8'),
487 'description': video_description,
488 'player_url': player_url,
489 'subtitles': video_subtitles,
490 'duration': video_duration
# Extractor for metacafe.com. Disables the family filter during
# initialization, then scrapes the watch page for the media URL.
# NOTE(review): this chunk is sparsely sampled — original lines (e.g. `try:`
# keywords, `return` statements, the result-dict opener) are elided.
495 class MetacafeIE(InfoExtractor):
496 """Information Extractor for metacafe.com."""
# Groups: (1) video id, (2) simplified title slug.
498 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
499 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
500 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
501 IE_NAME = u'metacafe'
503 def __init__(self, downloader=None):
504 InfoExtractor.__init__(self, downloader)
# --- Progress-reporting helpers ---
506 def report_disclaimer(self):
507 """Report disclaimer retrieval."""
508 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
510 def report_age_confirmation(self):
511 """Report attempt to confirm age."""
512 self._downloader.to_screen(u'[metacafe] Confirming age')
514 def report_download_webpage(self, video_id):
515 """Report webpage download."""
516 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
518 def report_extraction(self, video_id):
519 """Report information extraction."""
520 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetch the disclaimer page, then POST the "over 18" form so subsequent
# requests see unfiltered content. Failures are fatal (downloader.trouble).
522 def _real_initialize(self):
523 # Retrieve disclaimer
524 request = urllib2.Request(self._DISCLAIMER)
526 self.report_disclaimer()
527 disclaimer = urllib2.urlopen(request).read()
528 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
529 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
535 'submit': "Continue - I'm over 18",
537 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
539 self.report_age_confirmation()
540 disclaimer = urllib2.urlopen(request).read()
541 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
542 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
545 def _real_extract(self, url):
546 # Extract id and simplified title from URL
547 mobj = re.match(self._VALID_URL, url)
549 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
552 video_id = mobj.group(1)
554 # Check if video comes from YouTube
# Metacafe mirrors YouTube videos with a "yt-" id prefix: hand those off to
# the YouTube extractor via the downloader.
555 mobj2 = re.match(r'^yt-(.*)$', video_id)
556 if mobj2 is not None:
557 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
560 # Retrieve video webpage to extract further information
561 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
563 self.report_download_webpage(video_id)
564 webpage = urllib2.urlopen(request).read()
565 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
566 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
569 # Extract URL, uploader and title from webpage
570 self.report_extraction(video_id)
# Primary path: a direct mediaURL (optionally signed with gdaKey).
571 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
573 mediaURL = urllib.unquote(mobj.group(1))
574 video_extension = mediaURL[-3:]
576 # Extract gdaKey if available
577 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
581 gdaKey = mobj.group(1)
582 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars blob for mediaData/mediaURL + key.
584 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
586 self._downloader.trouble(u'ERROR: unable to extract media URL')
588 vardict = parse_qs(mobj.group(1))
589 if 'mediaData' not in vardict:
590 self._downloader.trouble(u'ERROR: unable to extract media URL')
592 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
594 self._downloader.trouble(u'ERROR: unable to extract media URL')
596 mediaURL = mobj.group(1).replace('\\/', '/')
597 video_extension = mediaURL[-3:]
598 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
600 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
602 self._downloader.trouble(u'ERROR: unable to extract title')
604 video_title = mobj.group(1).decode('utf-8')
606 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
608 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
610 video_uploader = mobj.group(1)
# Result dict (opener line elided in this chunk).
613 'id': video_id.decode('utf-8'),
614 'url': video_url.decode('utf-8'),
615 'uploader': video_uploader.decode('utf-8'),
616 'upload_date': u'NA',
617 'title': video_title,
618 'ext': video_extension.decode('utf-8'),
# Extractor for Dailymotion. Scrapes flashvars for the best available
# quality URL, plus title/uploader/date from page markup.
# NOTE(review): this chunk is sparsely sampled — `try:`/`return` lines and
# the result-dict opener are elided.
624 class DailymotionIE(InfoExtractor):
625 """Information Extractor for Dailymotion"""
# Group 1: the raw video slug (may carry "_title" and query-string suffixes,
# stripped below).
627 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
628 IE_NAME = u'dailymotion'
630 def __init__(self, downloader=None):
631 InfoExtractor.__init__(self, downloader)
# --- Progress-reporting helpers ---
633 def report_download_webpage(self, video_id):
634 """Report webpage download."""
635 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
637 def report_extraction(self, video_id):
638 """Report information extraction."""
639 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
641 def _real_extract(self, url):
642 # Extract id and simplified title from URL
643 mobj = re.match(self._VALID_URL, url)
645 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Strip "_title-slug" and query-string parts from the captured slug.
648 video_id = mobj.group(1).split('_')[0].split('?')[0]
650 video_extension = 'mp4'
652 # Retrieve video webpage to extract further information
653 request = urllib2.Request(url)
# Bypass the family filter so restricted videos are still reachable.
654 request.add_header('Cookie', 'family_filter=off')
656 self.report_download_webpage(video_id)
657 webpage = urllib2.urlopen(request).read()
658 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
659 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
662 # Extract URL, uploader and title from webpage
663 self.report_extraction(video_id)
664 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
666 self._downloader.trouble(u'ERROR: unable to extract media URL')
668 flashvars = urllib.unquote(mobj.group(1))
# Probe quality keys from highest to lowest; first present key wins.
670 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
673 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
676 self._downloader.trouble(u'ERROR: unable to extract video URL')
679 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
681 self._downloader.trouble(u'ERROR: unable to extract video URL')
684 video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')
686 # TODO: support choosing qualities
688 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
690 self._downloader.trouble(u'ERROR: unable to extract title')
692 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
# Uploader and upload date are best-effort: defaults stand on failure.
694 video_uploader = u'NA'
695 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
697 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
699 video_uploader = mobj.group(1)
701 video_upload_date = u'NA'
702 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
# Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
704 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
# Result dict (opener line elided in this chunk).
707 'id': video_id.decode('utf-8'),
708 'url': video_url.decode('utf-8'),
709 'uploader': video_uploader.decode('utf-8'),
710 'upload_date': video_upload_date,
711 'title': video_title,
712 'ext': video_extension.decode('utf-8'),
# Extractor for video.google.com (Google Video). Prefers the direct
# download_url; falls back to the escaped videoUrl embedded in the page.
# NOTE(review): this chunk is sparsely sampled — `try:`/`return` lines and
# the result-dict opener are elided.
718 class GoogleIE(InfoExtractor):
719 """Information extractor for video.google.com."""
# Group 1: the docid query parameter.
721 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
722 IE_NAME = u'video.google'
724 def __init__(self, downloader=None):
725 InfoExtractor.__init__(self, downloader)
# --- Progress-reporting helpers ---
727 def report_download_webpage(self, video_id):
728 """Report webpage download."""
729 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
731 def report_extraction(self, video_id):
732 """Report information extraction."""
733 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
735 def _real_extract(self, url):
736 # Extract id from URL
737 mobj = re.match(self._VALID_URL, url)
739 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
742 video_id = mobj.group(1)
744 video_extension = 'mp4'
746 # Retrieve video webpage to extract further information
747 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
749 self.report_download_webpage(video_id)
750 webpage = urllib2.urlopen(request).read()
751 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
752 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
755 # Extract URL, uploader, and title from webpage
756 self.report_extraction(video_id)
# Primary path: a direct mp4 download_url; otherwise fall back to the flv
# videoUrl, which is \xNN-escaped in the page source.
757 mobj = re.search(r"download_url:'([^']+)'", webpage)
759 video_extension = 'flv'
760 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
762 self._downloader.trouble(u'ERROR: unable to extract media URL')
764 mediaURL = urllib.unquote(mobj.group(1))
# Undo the literal "\x3d"/"\x26" escapes ('=' and '&').
765 mediaURL = mediaURL.replace('\\x3d', '\x3d')
766 mediaURL = mediaURL.replace('\\x26', '\x26')
770 mobj = re.search(r'<title>(.*)</title>', webpage)
772 self._downloader.trouble(u'ERROR: unable to extract title')
774 video_title = mobj.group(1).decode('utf-8')
776 # Extract video description
777 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
779 self._downloader.trouble(u'ERROR: unable to extract video description')
781 video_description = mobj.group(1).decode('utf-8')
782 if not video_description:
783 video_description = 'No description available.'
785 # Extract video thumbnail
# Only fetched when --get-thumbnail forces it: requires a second request to
# the search results page.
786 if self._downloader.params.get('forcethumbnail', False):
787 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
789 webpage = urllib2.urlopen(request).read()
790 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
791 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
793 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
795 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
797 video_thumbnail = mobj.group(1)
798 else: # we need something to pass to process_info
# Result dict (opener line elided in this chunk).
802 'id': video_id.decode('utf-8'),
803 'url': video_url.decode('utf-8'),
805 'upload_date': u'NA',
806 'title': video_title,
807 'ext': video_extension.decode('utf-8'),
# Extractor for photobucket.com flv videos. Media URL, title and uploader
# are all scraped from the watch page.
# NOTE(review): this chunk is sparsely sampled — `try:`/`return` lines and
# the result-dict opener are elided.
813 class PhotobucketIE(InfoExtractor):
814 """Information extractor for photobucket.com."""
# Group 1: the .flv filename from the 'current' query parameter.
816 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
817 IE_NAME = u'photobucket'
819 def __init__(self, downloader=None):
820 InfoExtractor.__init__(self, downloader)
# --- Progress-reporting helpers ---
822 def report_download_webpage(self, video_id):
823 """Report webpage download."""
824 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
826 def report_extraction(self, video_id):
827 """Report information extraction."""
828 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
830 def _real_extract(self, url):
831 # Extract id from URL
832 mobj = re.match(self._VALID_URL, url)
834 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
837 video_id = mobj.group(1)
839 video_extension = 'flv'
841 # Retrieve video webpage to extract further information
842 request = urllib2.Request(url)
844 self.report_download_webpage(video_id)
845 webpage = urllib2.urlopen(request).read()
846 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
847 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
850 # Extract URL, uploader, and title from webpage
851 self.report_extraction(video_id)
852 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
854 self._downloader.trouble(u'ERROR: unable to extract media URL')
856 mediaURL = urllib.unquote(mobj.group(1))
# Title and uploader come from a single <title> pattern: group 1 = title,
# group 2 = uploader.
860 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
862 self._downloader.trouble(u'ERROR: unable to extract title')
864 video_title = mobj.group(1).decode('utf-8')
866 video_uploader = mobj.group(2).decode('utf-8')
# Result dict (opener line elided in this chunk).
869 'id': video_id.decode('utf-8'),
870 'url': video_url.decode('utf-8'),
871 'uploader': video_uploader,
872 'upload_date': u'NA',
873 'title': video_title,
874 'ext': video_extension.decode('utf-8'),
# Extractor for video.yahoo.com. Non-/watch/ URLs are first rewritten to the
# canonical /watch/ form (via a recursive _real_extract call), then the page
# is scraped and the playlist endpoint queried for the real media URL.
# NOTE(review): this chunk is sparsely sampled — `try:`/`return` lines and
# the result-dict opener are elided.
880 class YahooIE(InfoExtractor):
881 """Information extractor for video.yahoo.com."""
883 # _VALID_URL matches all Yahoo! Video URLs
884 # _VPAGE_URL matches only the extractable '/watch/' URLs
885 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
886 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
887 IE_NAME = u'video.yahoo'
889 def __init__(self, downloader=None):
890 InfoExtractor.__init__(self, downloader)
# --- Progress-reporting helpers ---
892 def report_download_webpage(self, video_id):
893 """Report webpage download."""
894 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
896 def report_extraction(self, video_id):
897 """Report information extraction."""
898 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# new_video=False marks the recursive second pass on the rewritten
# /watch/ URL, preventing infinite rewrite loops.
900 def _real_extract(self, url, new_video=True):
901 # Extract ID from URL
902 mobj = re.match(self._VALID_URL, url)
904 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
907 video_id = mobj.group(2)
908 video_extension = 'flv'
910 # Rewrite valid but non-extractable URLs as
911 # extractable English language /watch/ URLs
912 if re.match(self._VPAGE_URL, url) is None:
913 request = urllib2.Request(url)
915 webpage = urllib2.urlopen(request).read()
916 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
917 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
920 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
922 self._downloader.trouble(u'ERROR: Unable to extract id field')
924 yahoo_id = mobj.group(1)
926 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
928 self._downloader.trouble(u'ERROR: Unable to extract vid field')
930 yahoo_vid = mobj.group(1)
932 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
933 return self._real_extract(url, new_video=False)
935 # Retrieve video webpage to extract further information
936 request = urllib2.Request(url)
938 self.report_download_webpage(video_id)
939 webpage = urllib2.urlopen(request).read()
940 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
941 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
944 # Extract uploader and title from webpage
945 self.report_extraction(video_id)
946 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
948 self._downloader.trouble(u'ERROR: unable to extract video title')
950 video_title = mobj.group(1).decode('utf-8')
# NOTE(review): group(1) here is the '(people|profile)' alternation, not the
# link text in group(2) — presumably the uploader name was intended; verify.
952 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
954 self._downloader.trouble(u'ERROR: unable to extract video uploader')
956 video_uploader = mobj.group(1).decode('utf-8')
958 # Extract video thumbnail
959 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
961 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
963 video_thumbnail = mobj.group(1).decode('utf-8')
965 # Extract video description
966 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
968 self._downloader.trouble(u'ERROR: unable to extract video description')
970 video_description = mobj.group(1).decode('utf-8')
971 if not video_description:
972 video_description = 'No description available.'
974 # Extract video height and width
# Height/width are required query parameters for the playlist request below.
975 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
977 self._downloader.trouble(u'ERROR: unable to extract video height')
979 yv_video_height = mobj.group(1)
981 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
983 self._downloader.trouble(u'ERROR: unable to extract video width')
985 yv_video_width = mobj.group(1)
987 # Retrieve video playlist to extract media URL
988 # I'm not completely sure what all these options are, but we
989 # seem to need most of them, otherwise the server sends a 401.
990 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
991 yv_bitrate = '700' # according to Wikipedia this is hard-coded
992 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
993 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
994 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
996 self.report_download_webpage(video_id)
997 webpage = urllib2.urlopen(request).read()
998 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
999 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1002 # Extract media URL from playlist XML
1003 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1005 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1007 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1008 video_url = unescapeHTML(video_url)
# Result dict (opener line elided in this chunk).
# NOTE(review): 'thumbnail' appears twice (original lines 1017 and 1019);
# the later, un-decoded value wins — the duplicate looks unintended.
1011 'id': video_id.decode('utf-8'),
1013 'uploader': video_uploader,
1014 'upload_date': u'NA',
1015 'title': video_title,
1016 'ext': video_extension.decode('utf-8'),
1017 'thumbnail': video_thumbnail.decode('utf-8'),
1018 'description': video_description,
1019 'thumbnail': video_thumbnail,
1024 class VimeoIE(InfoExtractor):
1025 """Information extractor for vimeo.com."""
# group(1) of _VALID_URL is the numeric video id; the pattern accepts an
# optional scheme, "www."/"player." hosts, and groups/<name>/ or video(s)/ paths.
1027 # _VALID_URL matches Vimeo URLs
1028 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1031 def __init__(self, downloader=None):
1032 InfoExtractor.__init__(self, downloader)
1034 def report_download_webpage(self, video_id):
1035 """Report webpage download."""
1036 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1038 def report_extraction(self, video_id):
1039 """Report information extraction."""
1040 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1042 def _real_extract(self, url, new_video=True):
1043 # Extract ID from URL
1044 mobj = re.match(self._VALID_URL, url)
1046 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1049 video_id = mobj.group(1)
1051 # Retrieve video webpage to extract further information
1052 request = urllib2.Request(url, None, std_headers)
1054 self.report_download_webpage(video_id)
1055 webpage = urllib2.urlopen(request).read()
1056 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1057 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1060 # Now we begin extracting as much information as we can from what we
1061 # retrieved. First we extract the information common to all extractors,
1062 # and latter we extract those that are Vimeo specific.
1063 self.report_extraction(video_id)
1065 # Extract the config JSON
# The page embeds a JS object " = {config:...,assets:..."; slicing between
# those two markers yields the raw JSON config blob parsed below.
1066 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1068 config = json.loads(config)
1070 self._downloader.trouble(u'ERROR: unable to extract info section')
# Title, uploader and thumbnail all come straight from the config JSON.
1074 video_title = config["video"]["title"]
1077 video_uploader = config["video"]["owner"]["name"]
1079 # Extract video thumbnail
1080 video_thumbnail = config["video"]["thumbnail"]
1082 # Extract video description
# The description lives in the HTML (element id="description"), not in the
# config JSON; clean_html strips its markup.
1083 video_description = get_element_by_id("description", webpage.decode('utf8'))
1084 if video_description: video_description = clean_html(video_description)
1085 else: video_description = ''
1087 # Extract upload date
# Upload date is optional: u'NA' is kept when the clip-date span is absent.
1088 video_upload_date = u'NA'
1089 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1090 if mobj is not None:
1091 video_upload_date = mobj.group(1)
1093 # Vimeo specific: extract request signature and timestamp
1094 sig = config['request']['signature']
1095 timestamp = config['request']['timestamp']
1097 # Vimeo specific: extract video codec and quality information
1098 # TODO bind to format param
# Codecs are tried in preference order; the first one present in
# config["video"]["files"] wins, and an 'hd' entry within it selects HD.
1099 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1100 for codec in codecs:
1101 if codec[0] in config["video"]["files"]:
1102 video_codec = codec[0]
1103 video_extension = codec[1]
1104 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1105 else: quality = 'sd'
1108 self._downloader.trouble(u'ERROR: no known codec found')
# Final media URL: the play_redirect endpoint keyed by id, signature,
# timestamp, quality and the upper-cased codec name.
1111 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1112 %(video_id, sig, timestamp, quality, video_codec.upper())
1117 'uploader': video_uploader,
1118 'upload_date': video_upload_date,
1119 'title': video_title,
1120 'ext': video_extension,
1121 'thumbnail': video_thumbnail,
1122 'description': video_description,
1127 class GenericIE(InfoExtractor):
1128 """Generic last-resort information extractor."""
1131 IE_NAME = u'generic'
1133 def __init__(self, downloader=None):
1134 InfoExtractor.__init__(self, downloader)
1136 def report_download_webpage(self, video_id):
1137 """Report webpage download."""
1138 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1139 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1141 def report_extraction(self, video_id):
1142 """Report information extraction."""
1143 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1145 def report_following_redirect(self, new_url):
1146 """Report information extraction."""
1147 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1149 def _test_redirect(self, url):
1150 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Probe the URL with an HTTP HEAD (no body transferred) and follow
# redirects manually, so the final URL can be re-dispatched through the
# whole extractor chain instead of being scraped generically here.
1151 class HeadRequest(urllib2.Request):
1152 def get_method(self):
1155 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1157 Subclass the HTTPRedirectHandler to make it use our
1158 HeadRequest also on the redirected URL
1160 def redirect_request(self, req, fp, code, msg, headers, newurl):
1161 if code in (301, 302, 303, 307):
1162 newurl = newurl.replace(' ', '%20')
# Drop entity headers that no longer apply to the redirected HEAD request.
1163 newheaders = dict((k,v) for k,v in req.headers.items()
1164 if k.lower() not in ("content-length", "content-type"))
1165 return HeadRequest(newurl,
1167 origin_req_host=req.get_origin_req_host(),
# Any other redirect code is surfaced as an HTTPError.
1170 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1172 class HTTPMethodFallback(urllib2.BaseHandler):
1174 Fallback to GET if HEAD is not allowed (405 HTTP error)
1176 def http_error_405(self, req, fp, code, msg, headers):
1180 newheaders = dict((k,v) for k,v in req.headers.items()
1181 if k.lower() not in ("content-length", "content-type"))
1182 return self.parent.open(urllib2.Request(req.get_full_url(),
1184 origin_req_host=req.get_origin_req_host(),
# Build a bare opener containing only the handlers needed for the probe.
1188 opener = urllib2.OpenerDirector()
1189 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1190 HTTPMethodFallback, HEADRedirectHandler,
1191 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1192 opener.add_handler(handler())
1194 response = opener.open(HeadRequest(url))
1195 new_url = response.geturl()
# Getting the same URL back means no redirect happened; otherwise restart
# the download chain on the resolved URL.
1197 if url == new_url: return False
1199 self.report_following_redirect(new_url)
1200 self._downloader.download([new_url])
1203 def _real_extract(self, url):
1204 if self._test_redirect(url): return
1206 video_id = url.split('/')[-1]
1207 request = urllib2.Request(url)
1209 self.report_download_webpage(video_id)
1210 webpage = urllib2.urlopen(request).read()
1211 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1212 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1214 except ValueError, err:
1215 # since this is the last-resort InfoExtractor, if
1216 # this error is thrown, it'll be thrown here
1217 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1220 self.report_extraction(video_id)
1221 # Start with something easy: JW Player in SWFObject
1222 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1224 # Broaden the search a little bit
1225 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1227 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1230 # It's possible that one of the regexes
1231 # matched, but returned an empty group:
1232 if mobj.group(1) is None:
1233 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1236 video_url = urllib.unquote(mobj.group(1))
1237 video_id = os.path.basename(video_url)
1239 # here's a fun little line of code for you:
# Extension is taken first, then stripped from the id — both splitext
# calls operate on the same original basename, so order matters.
1240 video_extension = os.path.splitext(video_id)[1][1:]
1241 video_id = os.path.splitext(video_id)[0]
1243 # it's tempting to parse this further, but you would
1244 # have to take into account all the variations like
1245 # Video Title - Site Name
1246 # Site Name | Video Title
1247 # Video Title - Tagline | Site Name
1248 # and so on and so forth; it's just not practical
1249 mobj = re.search(r'<title>(.*)</title>', webpage)
1251 self._downloader.trouble(u'ERROR: unable to extract title')
1253 video_title = mobj.group(1).decode('utf-8')
1255 # video uploader is domain name
1256 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1258 self._downloader.trouble(u'ERROR: unable to extract title')
1260 video_uploader = mobj.group(1).decode('utf-8')
1263 'id': video_id.decode('utf-8'),
1264 'url': video_url.decode('utf-8'),
1265 'uploader': video_uploader,
1266 'upload_date': u'NA',
1267 'title': video_title,
1268 'ext': video_extension.decode('utf-8'),
1274 class YoutubeSearchIE(InfoExtractor):
1275 """Information Extractor for YouTube search queries."""
# Accepts "ytsearch:<q>" (1 result), "ytsearchN:<q>" (N results) and
# "ytsearchall:<q>" (capped at _max_youtube_results).
1276 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1277 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1278 _max_youtube_results = 1000
1279 IE_NAME = u'youtube:search'
1281 def __init__(self, downloader=None):
1282 InfoExtractor.__init__(self, downloader)
1284 def report_download_page(self, query, pagenum):
1285 """Report attempt to download search page with given number."""
1286 query = query.decode(preferredencoding())
1287 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1289 def _real_extract(self, query):
1290 mobj = re.match(self._VALID_URL, query)
1292 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split the "ytsearchN" prefix from the query text at the first colon.
1295 prefix, query = query.split(':')
1297 query = query.encode('utf-8')
1299 self._download_n_results(query, 1)
1301 elif prefix == 'all':
1302 self._download_n_results(query, self._max_youtube_results)
1308 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1310 elif n > self._max_youtube_results:
1311 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1312 n = self._max_youtube_results
1313 self._download_n_results(query, n)
1315 except ValueError: # parsing prefix as integer fails
1316 self._download_n_results(query, 1)
1319 def _download_n_results(self, query, n):
1320 """Downloads a specified number of results for a query"""
# GData API pages hold 50 entries (max-results=50); keep fetching until
# the requested count — or the API's own total — is reached.
1326 while (50 * pagenum) < limit:
1327 self.report_download_page(query, pagenum+1)
1328 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1329 request = urllib2.Request(result_url)
1331 data = urllib2.urlopen(request).read()
1332 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1333 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1335 api_response = json.loads(data)['data']
1337 new_ids = list(video['id'] for video in api_response['items'])
1338 video_ids += new_ids
# Clamp to what the API reports actually exists for this query.
1340 limit = min(n, api_response['totalItems'])
1343 if len(video_ids) > n:
1344 video_ids = video_ids[:n]
1345 for id in video_ids:
1346 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1350 class GoogleSearchIE(InfoExtractor):
1351 """Information Extractor for Google Video search queries."""
# "gvsearch[:N|all]:<q>" scheme, mirroring YoutubeSearchIE's prefix handling.
1352 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1353 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
# Scraped from the HTML result pages rather than an API.
1354 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1355 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1356 _max_google_results = 1000
1357 IE_NAME = u'video.google:search'
1359 def __init__(self, downloader=None):
1360 InfoExtractor.__init__(self, downloader)
1362 def report_download_page(self, query, pagenum):
1363 """Report attempt to download playlist page with given number."""
1364 query = query.decode(preferredencoding())
1365 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1367 def _real_extract(self, query):
1368 mobj = re.match(self._VALID_URL, query)
1370 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1373 prefix, query = query.split(':')
1375 query = query.encode('utf-8')
1377 self._download_n_results(query, 1)
1379 elif prefix == 'all':
1380 self._download_n_results(query, self._max_google_results)
1386 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1388 elif n > self._max_google_results:
1389 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1390 n = self._max_google_results
1391 self._download_n_results(query, n)
1393 except ValueError: # parsing prefix as integer fails
1394 self._download_n_results(query, 1)
1397 def _download_n_results(self, query, n):
1398 """Downloads a specified number of results for a query"""
# Result pages are fetched 10 at a time (start=pagenum*10).
1404 self.report_download_page(query, pagenum)
1405 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1406 request = urllib2.Request(result_url)
1408 page = urllib2.urlopen(request).read()
1409 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1410 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1413 # Extract video identifiers
# Deduplicate via linear membership test on the accumulated id list.
1414 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1415 video_id = mobj.group(1)
1416 if video_id not in video_ids:
1417 video_ids.append(video_id)
1418 if len(video_ids) == n:
1419 # Specified n videos reached
1420 for id in video_ids:
1421 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No "next" link means this was the last result page: flush what we have.
1424 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1425 for id in video_ids:
1426 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1429 pagenum = pagenum + 1
1432 class YahooSearchIE(InfoExtractor):
1433 """Information Extractor for Yahoo! Video search queries."""
# "yvsearch[:N|all]:<q>" scheme, structured like GoogleSearchIE.
1434 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1435 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1436 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1437 _MORE_PAGES_INDICATOR = r'\s*Next'
1438 _max_yahoo_results = 1000
1439 IE_NAME = u'video.yahoo:search'
1441 def __init__(self, downloader=None):
1442 InfoExtractor.__init__(self, downloader)
1444 def report_download_page(self, query, pagenum):
1445 """Report attempt to download playlist page with given number."""
1446 query = query.decode(preferredencoding())
1447 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1449 def _real_extract(self, query):
1450 mobj = re.match(self._VALID_URL, query)
1452 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1455 prefix, query = query.split(':')
1457 query = query.encode('utf-8')
1459 self._download_n_results(query, 1)
1461 elif prefix == 'all':
1462 self._download_n_results(query, self._max_yahoo_results)
1468 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1470 elif n > self._max_yahoo_results:
1471 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1472 n = self._max_yahoo_results
1473 self._download_n_results(query, n)
1475 except ValueError: # parsing prefix as integer fails
1476 self._download_n_results(query, 1)
1479 def _download_n_results(self, query, n):
1480 """Downloads a specified number of results for a query"""
# Unlike GoogleSearchIE, duplicates are tracked in an O(1) set while the
# ordered list preserves first-seen order for downloading.
1483 already_seen = set()
1487 self.report_download_page(query, pagenum)
1488 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1489 request = urllib2.Request(result_url)
1491 page = urllib2.urlopen(request).read()
1492 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1493 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1496 # Extract video identifiers
1497 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1498 video_id = mobj.group(1)
1499 if video_id not in already_seen:
1500 video_ids.append(video_id)
1501 already_seen.add(video_id)
1502 if len(video_ids) == n:
1503 # Specified n videos reached
1504 for id in video_ids:
1505 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "Next" link: last result page reached, flush the collected ids.
1508 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1509 for id in video_ids:
1510 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1513 pagenum = pagenum + 1
1516 class YoutubePlaylistIE(InfoExtractor):
1517 """Information Extractor for YouTube playlists."""
# _VALID_URL groups: (1) the URL parameter name ('p', 'a' or 'list') used to
# pick the access point, (2) the playlist/channel id, (3) an optional direct
# video id appended after the playlist path.
1519 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1520 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1521 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1522 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1523 IE_NAME = u'youtube:playlist'
1525 def __init__(self, downloader=None):
1526 InfoExtractor.__init__(self, downloader)
1528 def report_download_page(self, playlist_id, pagenum):
1529 """Report attempt to download playlist page with given number."""
1530 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1532 def _real_extract(self, url):
1533 # Extract playlist id
1534 mobj = re.match(self._VALID_URL, url)
1536 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# A trailing video id means a single video inside the playlist was linked:
# hand it straight back to the downloader instead of walking the playlist.
1540 if mobj.group(3) is not None:
1541 self._downloader.download([mobj.group(3)])
1544 # Download playlist pages
1545 # prefix is 'p' as default for playlists but there are other types that need extra care
1546 playlist_prefix = mobj.group(1)
1547 if playlist_prefix == 'a':
1548 playlist_access = 'artist'
1550 playlist_prefix = 'p'
1551 playlist_access = 'view_play_list'
1552 playlist_id = mobj.group(2)
1557 self.report_download_page(playlist_id, pagenum)
1558 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1559 request = urllib2.Request(url)
1561 page = urllib2.urlopen(request).read()
1562 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1563 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1566 # Extract video identifiers
1568 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1569 if mobj.group(1) not in ids_in_page:
1570 ids_in_page.append(mobj.group(1))
1571 video_ids.extend(ids_in_page)
# Stop paging when the "next" pager control disappears.
1573 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1575 pagenum = pagenum + 1
# Honor --playlist-start/--playlist-end; playliststart is 1-based in params,
# hence the -1, and playlistend == -1 means "to the end".
1577 playliststart = self._downloader.params.get('playliststart', 1) - 1
1578 playlistend = self._downloader.params.get('playlistend', -1)
1579 if playlistend == -1:
1580 video_ids = video_ids[playliststart:]
1582 video_ids = video_ids[playliststart:playlistend]
1584 for id in video_ids:
1585 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1589 class YoutubeChannelIE(InfoExtractor):
1590 """Information Extractor for YouTube channels."""
1592 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
# Oldest-first list view, one HTML page at a time.
1593 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1594 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1595 IE_NAME = u'youtube:channel'
1597 def report_download_page(self, channel_id, pagenum):
1598 """Report attempt to download channel page with given number."""
1599 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1601 def _real_extract(self, url):
1602 # Extract channel id
1603 mobj = re.match(self._VALID_URL, url)
1605 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1608 # Download channel pages
1609 channel_id = mobj.group(1)
1614 self.report_download_page(channel_id, pagenum)
1615 url = self._TEMPLATE_URL % (channel_id, pagenum)
1616 request = urllib2.Request(url)
1618 page = urllib2.urlopen(request).read()
1619 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1620 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1623 # Extract video identifiers
# Watch links are scraped from the raw HTML; per-page duplicates are
# skipped via the ids_in_page membership test.
1625 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1626 if mobj.group(1) not in ids_in_page:
1627 ids_in_page.append(mobj.group(1))
1628 video_ids.extend(ids_in_page)
# Stop paging once the "Next" button is gone.
1630 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1632 pagenum = pagenum + 1
1634 for id in video_ids:
1635 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1639 class YoutubeUserIE(InfoExtractor):
1640 """Information Extractor for YouTube users."""
# Accepts youtube.com/user/<name> URLs or the "ytuser:<name>" shorthand.
1642 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1643 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1644 _GDATA_PAGE_SIZE = 50
1645 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1646 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1647 IE_NAME = u'youtube:user'
1649 def __init__(self, downloader=None):
1650 InfoExtractor.__init__(self, downloader)
1652 def report_download_page(self, username, start_index):
1653 """Report attempt to download user page."""
1654 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1655 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1657 def _real_extract(self, url):
1659 mobj = re.match(self._VALID_URL, url)
1661 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1664 username = mobj.group(1)
1666 # Download video ids using YouTube Data API. Result size per
1667 # query is limited (currently to 50 videos) so we need to query
1668 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
1675 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1676 self.report_download_page(username, start_index)
1678 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1681 page = urllib2.urlopen(request).read()
1682 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1683 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1686 # Extract video identifiers
1689 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1690 if mobj.group(1) not in ids_in_page:
1691 ids_in_page.append(mobj.group(1))
1693 video_ids.extend(ids_in_page)
1695 # A little optimization - if current page is not
1696 # "full", ie. does not contain PAGE_SIZE video ids then
1697 # we can assume that this page is the last one - there
1698 # are no more ids on further pages - no need to query
1701 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1706 all_ids_count = len(video_ids)
# Same --playlist-start/--playlist-end slicing as YoutubePlaylistIE.
1707 playliststart = self._downloader.params.get('playliststart', 1) - 1
1708 playlistend = self._downloader.params.get('playlistend', -1)
1710 if playlistend == -1:
1711 video_ids = video_ids[playliststart:]
1713 video_ids = video_ids[playliststart:playlistend]
1715 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1716 (username, all_ids_count, len(video_ids)))
1718 for video_id in video_ids:
1719 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1722 class BlipTVUserIE(InfoExtractor):
1723 """Information Extractor for blip.tv users."""
# Accepts blip.tv/<user> URLs or the "bliptvuser:<user>" shorthand.
1725 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1727 IE_NAME = u'blip.tv:user'
1729 def __init__(self, downloader=None):
1730 InfoExtractor.__init__(self, downloader)
1732 def report_download_page(self, username, pagenum):
1733 """Report attempt to download user page."""
1734 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1735 (self.IE_NAME, username, pagenum))
1737 def _real_extract(self, url):
1739 mobj = re.match(self._VALID_URL, url)
1741 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1744 username = mobj.group(1)
# The mobile episode-list endpoint needs the numeric users_id, which is
# scraped from the user's public page (data-users-id attribute) first.
1746 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1748 request = urllib2.Request(url)
1751 page = urllib2.urlopen(request).read().decode('utf-8')
1752 mobj = re.search(r'data-users-id="([^"]+)"', page)
1753 page_base = page_base % mobj.group(1)
1754 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1755 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1759 # Download video ids using BlipTV Ajax calls. Result size per
1760 # query is limited (currently to 12 videos) so we need to query
1761 # page by page until there are no video ids - it means we got
1768 self.report_download_page(username, pagenum)
1770 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1773 page = urllib2.urlopen(request).read().decode('utf-8')
1774 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1775 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1778 # Extract video identifiers
# Ids here are relative URL paths, HTML-unescaped before storage.
1781 for mobj in re.finditer(r'href="/([^"]+)"', page):
1782 if mobj.group(1) not in ids_in_page:
1783 ids_in_page.append(unescapeHTML(mobj.group(1)))
1785 video_ids.extend(ids_in_page)
1787 # A little optimization - if current page is not
1788 # "full", ie. does not contain PAGE_SIZE video ids then
1789 # we can assume that this page is the last one - there
1790 # are no more ids on further pages - no need to query
# NOTE(review): _PAGE_SIZE is referenced but not defined in the visible
# lines — presumably a class attribute (12, per the comment above); confirm.
1793 if len(ids_in_page) < self._PAGE_SIZE:
1798 all_ids_count = len(video_ids)
1799 playliststart = self._downloader.params.get('playliststart', 1) - 1
1800 playlistend = self._downloader.params.get('playlistend', -1)
1802 if playlistend == -1:
1803 video_ids = video_ids[playliststart:]
1805 video_ids = video_ids[playliststart:playlistend]
1807 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1808 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1810 for video_id in video_ids:
1811 self._downloader.download([u'http://blip.tv/'+video_id])
1814 class DepositFilesIE(InfoExtractor):
1815 """Information extractor for depositfiles.com"""
1817 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1818 IE_NAME = u'DepositFiles'
1820 def __init__(self, downloader=None):
1821 InfoExtractor.__init__(self, downloader)
1823 def report_download_webpage(self, file_id):
1824 """Report webpage download."""
1825 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1827 def report_extraction(self, file_id):
1828 """Report information extraction."""
1829 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1831 def _real_extract(self, url):
1832 file_id = url.split('/')[-1]
1833 # Rebuild url in english locale
# The (?#locale) comment in _VALID_URL marks the 2-char locale segment this
# replaces with the fixed English path.
1834 url = 'http://depositfiles.com/en/files/' + file_id
1836 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the "Free download" button.
1837 free_download_indication = { 'gateway_result' : '1' }
1838 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1840 self.report_download_webpage(file_id)
1841 webpage = urllib2.urlopen(request).read()
1842 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1843 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1846 # Search for the real file URL
1847 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1848 if (mobj is None) or (mobj.group(1) is None):
1849 # Try to figure out reason of the error.
# The site explains refusals in an <strong>Attention...</strong> banner;
# surface that text to the user when present.
1850 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1851 if (mobj is not None) and (mobj.group(1) is not None):
1852 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1853 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1855 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1858 file_url = mobj.group(1)
1859 file_extension = os.path.splitext(file_url)[1][1:]
1861 # Search for file title
1862 mobj = re.search(r'<b title="(.*?)">', webpage)
1864 self._downloader.trouble(u'ERROR: unable to extract title')
1866 file_title = mobj.group(1).decode('utf-8')
1869 'id': file_id.decode('utf-8'),
1870 'url': file_url.decode('utf-8'),
1872 'upload_date': u'NA',
1873 'title': file_title,
1874 'ext': file_extension.decode('utf-8'),
1880 class FacebookIE(InfoExtractor):
1881 """Information Extractor for Facebook"""
1883 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1884 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1885 _NETRC_MACHINE = 'facebook'
1886 _available_formats = ['video', 'highqual', 'lowqual']
1887 _video_extensions = {
1892 IE_NAME = u'facebook'
1894 def __init__(self, downloader=None):
1895 InfoExtractor.__init__(self, downloader)
1897 def _reporter(self, message):
1898 """Add header and report message."""
1899 self._downloader.to_screen(u'[facebook] %s' % message)
1901 def report_login(self):
1902 """Report attempt to log in."""
1903 self._reporter(u'Logging in')
1905 def report_video_webpage_download(self, video_id):
1906 """Report attempt to download video webpage."""
1907 self._reporter(u'%s: Downloading video webpage' % video_id)
1909 def report_information_extraction(self, video_id):
1910 """Report attempt to extract video information."""
1911 self._reporter(u'%s: Extracting video information' % video_id)
1913 def _parse_page(self, video_webpage):
1914 """Extract video information from page"""
1916 data = {'title': r'\("video_title", "(.*?)"\)',
1917 'description': r'<div class="datawrap">(.*?)</div>',
1918 'owner': r'\("video_owner_name", "(.*?)"\)',
1919 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1922 for piece in data.keys():
1923 mobj = re.search(data[piece], video_webpage)
1924 if mobj is not None:
1925 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1929 for fmt in self._available_formats:
1930 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1931 if mobj is not None:
1932 # URL is in a Javascript segment inside an escaped Unicode format within
1933 # the generally utf-8 page
1934 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1935 video_info['video_urls'] = video_urls
# Log in to Facebook before extraction, using --username/--password or .netrc.
# NOTE(review): this listing carries a stale line-number gutter, and gaps in
# the numbering show elided source lines (early returns, `try:` headers, the
# login form construction). Comments below describe only the visible logic.
1939 def _real_initialize(self):
1940 if self._downloader is None:
# (elided: early return — no downloader means no params to read)
1945 downloader_params = self._downloader.params
1947 # Attempt to use provided username and password or .netrc data
1948 if downloader_params.get('username', None) is not None:
1949 useremail = downloader_params['username']
1950 password = downloader_params['password']
1951 elif downloader_params.get('usenetrc', False):
# (elided: `try:` header wrapping the netrc lookup below)
1953 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1954 if info is not None:
# (elided: unpack the email/password pair from the netrc entry)
1958 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1959 except (IOError, netrc.NetrcParseError), err:
# Credential problems only warn on stderr; extraction proceeds unauthenticated.
1960 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1963 if useremail is None:
# (elided: return when there is nothing to log in with; login_form built next)
1972 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
# (elided: `try:` around the login POST)
1975 login_results = urllib2.urlopen(request).read()
# A login <form> still present in the response body means authentication failed.
1976 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1977 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1979 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1980 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Download a Facebook video page, scrape metadata via _parse_page(), choose
# the format(s) to download, and build the info dict(s) for the downloader.
# NOTE(review): gaps in the stale line-number gutter show elided lines
# (`return`s after trouble(), `try:` headers, `else:` branches, the info-dict
# open/close and the results list). Comments describe only the visible logic.
1983 def _real_extract(self, url):
1984 mobj = re.match(self._VALID_URL, url)
# (elided: `if mobj is None:` guard around the error below)
1986 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1988 video_id = mobj.group('ID')
# Download the video page over HTTPS.
1991 self.report_video_webpage_download(video_id)
1992 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
# (elided: `try:` header for the network fetch)
1994 page = urllib2.urlopen(request)
1995 video_webpage = page.read()
1996 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1997 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2000 # Start extracting information
2001 self.report_information_extraction(video_id)
2003 # Extract information
2004 video_info = self._parse_page(video_webpage)
# Uploader and title are mandatory; their absence aborts the extraction.
2007 if 'owner' not in video_info:
2008 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2010 video_uploader = video_info['owner']
2013 if 'title' not in video_info:
2014 self._downloader.trouble(u'ERROR: unable to extract video title')
2016 video_title = video_info['title']
2017 video_title = video_title.decode('utf-8')
# Thumbnail is optional: warn and fall back to an empty string.
2020 if 'thumbnail' not in video_info:
2021 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2022 video_thumbnail = ''
2024 video_thumbnail = video_info['thumbnail']
# Upload date: parse an RFC-2822-style date into YYYYMMDD when present.
2028 if 'upload_date' in video_info:
2029 upload_time = video_info['upload_date']
2030 timetuple = email.utils.parsedate_tz(upload_time)
2031 if timetuple is not None:
# (elided: `try:` guarding strftime on the parsed tuple)
2033 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2038 video_description = video_info.get('description', 'No description available.')
2040 url_map = video_info['video_urls']
2041 if len(url_map.keys()) > 0:
2042 # Decide which formats to download
2043 req_format = self._downloader.params.get('format', None)
2044 format_limit = self._downloader.params.get('format_limit', None)
# --format-limit truncates the preference list at the requested cap.
2046 if format_limit is not None and format_limit in self._available_formats:
2047 format_list = self._available_formats[self._available_formats.index(format_limit):]
2049 format_list = self._available_formats
2050 existing_formats = [x for x in format_list if x in url_map]
2051 if len(existing_formats) == 0:
2052 self._downloader.trouble(u'ERROR: no known formats available for video')
# Format selection mirrors YoutubeIE: best (default), 'worst', '-1' (all),
# or one specific format name.
2054 if req_format is None:
2055 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2056 elif req_format == 'worst':
2057 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2058 elif req_format == '-1':
2059 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
# (elided: `else:` branch for a specific requested format)
2062 if req_format not in url_map:
2063 self._downloader.trouble(u'ERROR: requested format not available')
2065 video_url_list = [(req_format, url_map[req_format])] # Specific format
# One info dict per selected format; 'mp4' is the fallback extension.
2068 for format_param, video_real_url in video_url_list:
2070 video_extension = self._video_extensions.get(format_param, 'mp4')
# (elided: `info = {` opening of the per-format dict below)
2073 'id': video_id.decode('utf-8'),
2074 'url': video_real_url.decode('utf-8'),
2075 'uploader': video_uploader.decode('utf-8'),
2076 'upload_date': upload_date,
2077 'title': video_title,
2078 'ext': video_extension.decode('utf-8'),
2079 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2080 'thumbnail': video_thumbnail.decode('utf-8'),
2081 'description': video_description.decode('utf-8'),
# (elided: dict close, results.append(info), and the final return)
# Extractor for blip.tv. Prefers the site's JSON API (skin=json); falls back
# to direct download when the URL already serves a video/* Content-Type.
# NOTE(review): stale line-number gutter; gaps show elided lines (guards,
# `try:` headers, the direct-download info dict, dict open/close, returns).
2086 class BlipTVIE(InfoExtractor):
2087 """Information extractor for blip.tv"""
2089 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# _URL_EXT captures the filename extension off the end of a media URL.
2090 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2091 IE_NAME = u'blip.tv'
2093 def report_extraction(self, file_id):
2094 """Report information extraction."""
2095 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2097 def report_direct_download(self, title):
2098 """Report information extraction."""
2099 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2101 def _real_extract(self, url):
2102 mobj = re.match(self._VALID_URL, url)
# (elided: `if mobj is None:` guard and return)
2104 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# (elided: choose '?' or '&' as cchar depending on whether url has a query)
2111 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2112 request = urllib2.Request(json_url.encode('utf-8'))
2113 self.report_extraction(mobj.group(1))
# (elided: info = None and `try:` header for the fetch below)
2116 urlh = urllib2.urlopen(request)
2117 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2118 basename = url.split('/')[-1]
2119 title,ext = os.path.splitext(basename)
2120 title = title.decode('UTF-8')
2121 ext = ext.replace('.', '')
2122 self.report_direct_download(title)
# (elided: build the direct-download info dict from url/title/ext)
2130 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2131 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2133 if info is None: # Regular URL
# (elided: `try:` around reading the JSON body)
2135 json_code = urlh.read()
2136 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2137 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
# (elided: `try:` wrapping the JSON parse; paired with the except at 2166)
2141 json_data = json.loads(json_code)
# Some API responses wrap the payload in a 'Post' envelope.
2142 if 'Post' in json_data:
2143 data = json_data['Post']
# Convert blip.tv's '%m-%d-%y %H:%M%p' datestamp into YYYYMMDD.
2147 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2148 video_url = data['media']['url']
2149 umobj = re.match(self._URL_EXT, video_url)
# (elided: `if umobj is None:` guard for the raise below)
2151 raise ValueError('Can not determine filename extension')
2152 ext = umobj.group(1)
# (elided: `info = {` opening of the metadata dict)
2155 'id': data['item_id'],
2157 'uploader': data['display_name'],
2158 'upload_date': upload_date,
2159 'title': data['title'],
2161 'format': data['media']['mimeType'],
2162 'thumbnail': data['thumbnailUrl'],
2163 'description': data['description'],
2164 'player_url': data['embedUrl']
2166 except (ValueError,KeyError), err:
2167 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# blip.tv's CDN serves some content only to an iTunes user agent.
2170 std_headers['User-Agent'] = 'iTunes/10.6.1'
2174 class MyVideoIE(InfoExtractor):
2175 """Information Extractor for myvideo.de."""
2177 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2178 IE_NAME = u'myvideo'
2180 def __init__(self, downloader=None):
2181 InfoExtractor.__init__(self, downloader)
2183 def report_download_webpage(self, video_id):
2184 """Report webpage download."""
2185 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2187 def report_extraction(self, video_id):
2188 """Report information extraction."""
2189 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2191 def _real_extract(self,url):
2192 mobj = re.match(self._VALID_URL, url)
2194 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2197 video_id = mobj.group(1)
2200 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2202 self.report_download_webpage(video_id)
2203 webpage = urllib2.urlopen(request).read()
2204 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2205 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2208 self.report_extraction(video_id)
2209 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2212 self._downloader.trouble(u'ERROR: unable to extract media URL')
2214 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2216 mobj = re.search('<title>([^<]+)</title>', webpage)
2218 self._downloader.trouble(u'ERROR: unable to extract title')
2221 video_title = mobj.group(1)
2227 'upload_date': u'NA',
2228 'title': video_title,
# Extractor for The Daily Show / The Colbert Report full episodes, including
# the ':tds' / ':colbert' shortcut pseudo-URLs that redirect to the newest
# episode. NOTE(review): stale line-number gutter; gaps show elided lines
# (guards, `try:` headers, `else:` branches, turls/results initializers and
# the per-item info dict). Comments describe only the visible logic.
2234 class ComedyCentralIE(InfoExtractor):
2235 """Information extractor for The Daily Show and Colbert Report """
2237 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2238 IE_NAME = u'comedycentral'
2240 def report_extraction(self, episode_id):
2241 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2243 def report_config_download(self, episode_id):
2244 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2246 def report_index_download(self, episode_id):
2247 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2249 def report_player_url(self, episode_id):
2250 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2252 def _real_extract(self, url):
2253 mobj = re.match(self._VALID_URL, url)
# (elided: `if mobj is None:` guard and return)
2255 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortcut forms (':tds' etc.) are rewritten to the show's full-episodes page.
2258 if mobj.group('shortname'):
2259 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2260 url = u'http://www.thedailyshow.com/full-episodes/'
# (elided: `else:` branch for the Colbert shortnames)
2262 url = u'http://www.colbertnation.com/full-episodes/'
2263 mobj = re.match(self._VALID_URL, url)
2264 assert mobj is not None
# No episode part means "download the newest episode" (dlNewest).
2266 dlNewest = not mobj.group('episode')
# (elided: `if dlNewest:` / `else:` around the two epTitle assignments)
2268 epTitle = mobj.group('showname')
2270 epTitle = mobj.group('episode')
2272 req = urllib2.Request(url)
2273 self.report_extraction(epTitle)
# (elided: `try:` for the page fetch)
2275 htmlHandle = urllib2.urlopen(req)
2276 html = htmlHandle.read()
2277 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2278 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# Following the newest-episode redirect must land on a specific episode URL.
2281 url = htmlHandle.geturl()
2282 mobj = re.match(self._VALID_URL, url)
# (elided: `if mobj is None:` guard)
2284 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2286 if mobj.group('episode') == '':
2287 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2289 epTitle = mobj.group('episode')
# The Flash player URL is embedded either as a <param> or a JS `var url`.
2291 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2292 if len(mMovieParams) == 0:
2293 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
# Resolve the raw player URL through its redirects to the final player URL.
2296 playerUrl_raw = mMovieParams[0][0]
2297 self.report_player_url(epTitle)
# (elided: `try:` for the resolution fetch)
2299 urlHandle = urllib2.urlopen(playerUrl_raw)
2300 playerUrl = urlHandle.geturl()
2301 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2302 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Fetch the MRSS episode index for the captured media URI.
2305 uri = mMovieParams[0][1]
2306 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2307 self.report_index_download(epTitle)
# (elided: `try:` for the index download)
2309 indexXml = urllib2.urlopen(indexUrl).read()
2310 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2311 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# One episode is split into several acts: iterate every <item> in the index.
2316 idoc = xml.etree.ElementTree.fromstring(indexXml)
2317 itemEls = idoc.findall('.//item')
2318 for itemEl in itemEls:
2319 mediaId = itemEl.findall('./guid')[0].text
2320 shortMediaId = mediaId.split(':')[-1]
2321 showId = mediaId.split(':')[-2].replace('.com', '')
2322 officialTitle = itemEl.findall('./title')[0].text
2323 officialDate = itemEl.findall('./pubDate')[0].text
# Per-act mediaGen config XML lists the downloadable renditions.
2325 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2326 urllib.urlencode({'uri': mediaId}))
2327 configReq = urllib2.Request(configUrl)
2328 self.report_config_download(epTitle)
# (elided: `try:` for the config download)
2330 configXml = urllib2.urlopen(configReq).read()
2331 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2332 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2335 cdoc = xml.etree.ElementTree.fromstring(configXml)
# (elided: `turls = []` initializer before the loop)
2337 for rendition in cdoc.findall('.//rendition'):
2338 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
# (elided: turls.append(finfo); `if len(turls) == 0:` guard for the error)
2342 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2345 # For now, just pick the highest bitrate
2346 format,video_url = turls[-1]
2348 effTitle = showId + u'-' + epTitle
# (elided: `info = {` opening with id/url/uploader/title/ext keys)
2353 'upload_date': officialDate,
2358 'description': officialTitle,
2359 'player_url': playerUrl
# (elided: dict close; results list returned after the loop)
2362 results.append(info)
# Extractor for escapistmagazine.com videos: scrape OpenGraph meta tags for
# description/thumbnail/player, then pull the media URL out of the player's
# JS config (mis-quoted JSON). NOTE(review): stale line-number gutter; gaps
# show elided lines (guards, `try:` headers, the final info dict assembly).
2367 class EscapistIE(InfoExtractor):
2368 """Information extractor for The Escapist """
2370 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2371 IE_NAME = u'escapist'
2373 def report_extraction(self, showName):
2374 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2376 def report_config_download(self, showName):
2377 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2379 def _real_extract(self, url):
2380 mobj = re.match(self._VALID_URL, url)
# (elided: `if mobj is None:` guard and return)
2382 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2384 showName = mobj.group('showname')
2385 videoId = mobj.group('episode')
2387 self.report_extraction(showName)
# (elided: `try:` for the page fetch)
2389 webPage = urllib2.urlopen(url)
2390 webPageBytes = webPage.read()
# Honor the charset declared in the Content-Type header, default utf-8.
2391 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2392 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2393 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2394 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Scrape metadata out of <meta> tags (OpenGraph og:image / og:video).
2397 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2398 description = unescapeHTML(descMatch.group(1))
2399 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2400 imgUrl = unescapeHTML(imgMatch.group(1))
2401 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2402 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL carries the config location in its `config=` query argument.
2403 configUrlMatch = re.search('config=(.*)$', playerUrl)
2404 configUrl = urllib2.unquote(configUrlMatch.group(1))
2406 self.report_config_download(showName)
# (elided: `try:` for the config download)
2408 configJSON = urllib2.urlopen(configUrl).read()
2409 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2410 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2413 # Technically, it's JavaScript, not JSON
2414 configJSON = configJSON.replace("'", '"')
# (elided: `try:` paired with the ValueError handler below)
2417 config = json.loads(configJSON)
2418 except (ValueError,), err:
2419 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# The second playlist entry is the actual video.
2422 playlist = config['playlist']
2423 videoUrl = playlist[1]['url']
# (elided: `info = {` opening with id/url/ext/title keys)
2428 'uploader': showName,
2429 'upload_date': None,
2433 'thumbnail': imgUrl,
2434 'description': description,
2435 'player_url': playerUrl,
# (elided: dict close and return)
# Extractor for collegehumor.com: map the public video id to an internal id
# found in the page, then read metadata from the moogaloop XML endpoint.
# NOTE(review): stale line-number gutter; gaps show elided lines (guards,
# `try:` headers, info-dict fields, the trailing return).
2441 class CollegeHumorIE(InfoExtractor):
2442 """Information extractor for collegehumor.com"""
2444 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2445 IE_NAME = u'collegehumor'
2447 def report_webpage(self, video_id):
2448 """Report information extraction."""
2449 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2451 def report_extraction(self, video_id):
2452 """Report information extraction."""
2453 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2455 def _real_extract(self, url):
2456 mobj = re.match(self._VALID_URL, url)
# (elided: `if mobj is None:` guard and return)
2458 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2460 video_id = mobj.group('videoid')
2462 self.report_webpage(video_id)
2463 request = urllib2.Request(url)
# (elided: `try:` for the page fetch)
2465 webpage = urllib2.urlopen(request).read()
2466 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2467 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds the internal id as id="video:NNN".
2470 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
# (elided: `if m is None:` guard)
2472 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2474 internal_video_id = m.group('internalvideoid')
# (elided: `info = {` opening with 'id': video_id)
2478 'internal_id': internal_video_id,
2481 self.report_extraction(video_id)
2482 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
# (elided: `try:` for the metadata XML download)
2484 metaXml = urllib2.urlopen(xmlUrl).read()
2485 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2486 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# Parse title/description/url/thumbnail out of the moogaloop XML.
2489 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# (elided: `try:` paired with the IndexError handler below)
2491 videoNode = mdoc.findall('./video')[0]
2492 info['description'] = videoNode.findall('./description')[0].text
2493 info['title'] = videoNode.findall('./caption')[0].text
2494 info['url'] = videoNode.findall('./file')[0].text
2495 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension is taken from the media URL's suffix.
2496 info['ext'] = info['url'].rpartition('.')[2]
2497 info['format'] = info['ext']
# (elided: `except IndexError:` header for the error below, and the return)
2499 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# Extractor for xvideos.com: scrape flv URL, title and thumbnail straight out
# of the watch page. NOTE(review): stale line-number gutter; gaps show elided
# lines (guards, `try:` headers, info-dict open/close and return).
2505 class XVideosIE(InfoExtractor):
2506 """Information extractor for xvideos.com"""
2508 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2509 IE_NAME = u'xvideos'
2511 def report_webpage(self, video_id):
2512 """Report information extraction."""
2513 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2515 def report_extraction(self, video_id):
2516 """Report information extraction."""
2517 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2519 def _real_extract(self, url):
2520 mobj = re.match(self._VALID_URL, url)
# (elided: `if mobj is None:` guard and return)
2522 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2524 video_id = mobj.group(1).decode('utf-8')
2526 self.report_webpage(video_id)
2528 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
# (elided: `try:` for the page fetch)
2530 webpage = urllib2.urlopen(request).read()
2531 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2532 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2535 self.report_extraction(video_id)
# The percent-encoded media URL sits in the page's `flv_url=` parameter.
2539 mobj = re.search(r'flv_url=(.+?)&', webpage)
# (elided: `if mobj is None:` guard)
2541 self._downloader.trouble(u'ERROR: unable to extract video url')
2543 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title comes from the <title> tag, minus the trailing site suffix.
2547 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
# (elided: `if mobj is None:` guard)
2549 self._downloader.trouble(u'ERROR: unable to extract video title')
2551 video_title = mobj.group(1).decode('utf-8')
2554 # Extract video thumbnail
2555 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
# (elided: `if mobj is None:` guard)
2557 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2559 video_thumbnail = mobj.group(0).decode('utf-8')
# (elided: `info = {` opening with id/url/uploader keys)
2565 'upload_date': None,
2566 'title': video_title,
2569 'thumbnail': video_thumbnail,
2570 'description': None,
# (elided: dict close and `return [info]`)
# Extractor for soundcloud.com tracks: scrape uid + stream token from the
# page and compose the media URL from them. NOTE(review): stale line-number
# gutter; gaps show elided lines (guards, `try:` headers, the info dict).
2577 class SoundcloudIE(InfoExtractor):
2578 """Information extractor for soundcloud.com
2579 To access the media, the uid of the song and a stream token
2580 must be extracted from the page source and the script must make
2581 a request to media.soundcloud.com/crossdomain.xml. Then
2582 the media can be grabbed by requesting from an url composed
2583 of the stream token and uid
# (elided: closing quotes of the class docstring)
2586 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2587 IE_NAME = u'soundcloud'
2589 def __init__(self, downloader=None):
2590 InfoExtractor.__init__(self, downloader)
2592 def report_webpage(self, video_id):
2593 """Report information extraction."""
2594 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2596 def report_extraction(self, video_id):
2597 """Report information extraction."""
2598 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2600 def _real_extract(self, url):
2601 mobj = re.match(self._VALID_URL, url)
# (elided: `if mobj is None:` guard and return)
2603 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2606 # extract uploader (which is in the url)
2607 uploader = mobj.group(1).decode('utf-8')
2608 # extract simple title (uploader + slug of song title)
2609 slug_title = mobj.group(2).decode('utf-8')
2610 simple_title = uploader + u'-' + slug_title
2612 self.report_webpage('%s/%s' % (uploader, slug_title))
2614 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
# (elided: `try:` for the page fetch)
2616 webpage = urllib2.urlopen(request).read()
2617 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2618 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2621 self.report_extraction('%s/%s' % (uploader, slug_title))
2623 # extract uid and stream token that soundcloud hands out for access
2624 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
# (elided: `if mobj:` guard around the two assignments)
2626 video_id = mobj.group(1)
2627 stream_token = mobj.group(2)
2629 # extract unsimplified title
2630 mobj = re.search('"title":"(.*?)",', webpage)
# (elided: `if mobj:` / `else:` around the two title assignments)
2632 title = mobj.group(1).decode('utf-8')
2634 title = simple_title
2636 # construct media url (with uid/token)
2637 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2638 mediaURL = mediaURL % (video_id, stream_token)
# Description is optional; fall back to a fixed placeholder.
2641 description = u'No description available'
2642 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
# (elided: `if mobj:` guard)
2644 description = mobj.group(1)
# Upload date: parse the human-readable date into YYYYMMDD, warn on failure.
2648 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
# (elided: `if mobj:` and `try:` around the strptime below)
2651 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2652 except Exception, e:
2653 self._downloader.to_stderr(str(e))
2655 # for soundcloud, a request to a cross domain is required for cookies
2656 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
# (elided: `return [{` opening of the info dict with 'url'/'title'/'ext')
2659 'id': video_id.decode('utf-8'),
2661 'uploader': uploader.decode('utf-8'),
2662 'upload_date': upload_date,
2667 'description': description.decode('utf-8')
# (elided: dict/list close)
# Extractor for infoq.com talks: the rtmpe media path is base64-encoded in a
# `jsclassref` attribute. NOTE(review): stale line-number gutter; gaps show
# elided lines (guards, `try:` headers, info-dict open/close and return).
2671 class InfoQIE(InfoExtractor):
2672 """Information extractor for infoq.com"""
2674 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
# (elided: IE_NAME assignment)
2677 def report_webpage(self, video_id):
2678 """Report information extraction."""
2679 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2681 def report_extraction(self, video_id):
2682 """Report information extraction."""
2683 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2685 def _real_extract(self, url):
2686 mobj = re.match(self._VALID_URL, url)
# (elided: `if mobj is None:` guard and return)
2688 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2691 self.report_webpage(url)
2693 request = urllib2.Request(url)
# (elided: `try:` for the page fetch)
2695 webpage = urllib2.urlopen(request).read()
2696 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2697 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2700 self.report_extraction(url)
# The media path is base64-encoded inside the jsclassref attribute.
2704 mobj = re.search(r"jsclassref='([^']*)'", webpage)
# (elided: `if mobj is None:` guard)
2706 self._downloader.trouble(u'ERROR: unable to extract video url')
2708 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
# Title lives in an inline `contentTitle` JS assignment.
2712 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
# (elided: `if mobj is None:` guard)
2714 self._downloader.trouble(u'ERROR: unable to extract video title')
2716 video_title = mobj.group(1).decode('utf-8')
2718 # Extract description
2719 video_description = u'No description available.'
2720 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2721 if mobj is not None:
2722 video_description = mobj.group(1).decode('utf-8')
# The video id and extension are recovered from the media path's filename.
2724 video_filename = video_url.split('/')[-1]
2725 video_id, extension = video_filename.split('.')
# (elided: `info = {` opening with id/url/uploader keys)
2731 'upload_date': None,
2732 'title': video_title,
2734 'format': extension, # Extension is always(?) mp4, but seems to be flv
2736 'description': video_description,
# (elided: dict close and return)
# Extractor for mixcloud.com: resolves the cloudcast through the site's JSON
# API, picks a format/bitrate, and probes candidate URLs until one answers.
# NOTE(review): stale line-number gutter; gaps show elided lines (`try:`
# headers, `return`s, guards). Comments describe only the visible logic.
2742 class MixcloudIE(InfoExtractor):
2743 """Information extractor for www.mixcloud.com"""
2744 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2745 IE_NAME = u'mixcloud'
2747 def __init__(self, downloader=None):
2748 InfoExtractor.__init__(self, downloader)
2750 def report_download_json(self, file_id):
2751 """Report JSON download."""
2752 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2754 def report_extraction(self, file_id):
2755 """Report information extraction."""
2756 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2758 def get_urls(self, jsonData, fmt, bitrate='best'):
2759 """Get urls from 'audio_formats' section in json"""
# (elided: `try:` paired with the TypeError handler below)
2762 bitrate_list = jsonData[fmt]
2763 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2764 bitrate = max(bitrate_list) # select highest
2766 url_list = jsonData[fmt][bitrate]
# Some entries carry a flat URL list with no per-bitrate nesting.
2767 except TypeError: # we have no bitrate info.
2768 url_list = jsonData[fmt]
# (elided: `return url_list`)
2771 def check_urls(self, url_list):
2772 """Returns 1st active url from list"""
2773 for url in url_list:
# (elided: `try:` around the probe; on success the url is returned)
2775 urllib2.urlopen(url)
# (elided: `return url` on success)
2777 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# (elided: url = None / continue, and the final fall-through return)
2782 def _print_formats(self, formats):
2783 print 'Available formats:'
2784 for fmt in formats.keys():
2785 for b in formats[fmt]:
# (elided: `try:` paired with the TypeError handler below)
2787 ext = formats[fmt][b][0]
2788 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2789 except TypeError: # we have no bitrate info
2790 ext = formats[fmt][0]
2791 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
# (elided: `break` after the bitrate-less branch)
2794 def _real_extract(self, url):
2795 mobj = re.match(self._VALID_URL, url)
# (elided: `if mobj is None:` guard and return)
2797 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2799 # extract uploader & filename from url
2800 uploader = mobj.group(1).decode('utf-8')
2801 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2803 # construct API request
2804 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2805 # retrieve .json file with links to files
2806 request = urllib2.Request(file_url)
# (elided: `try:` for the JSON download)
2808 self.report_download_json(file_url)
2809 jsonData = urllib2.urlopen(request).read()
2810 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2811 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
# parse JSON
2815 json_data = json.loads(jsonData)
2816 player_url = json_data['player_swf_url']
2817 formats = dict(json_data['audio_formats'])
2819 req_format = self._downloader.params.get('format', None)
# (elided: bitrate param read)
2822 if self._downloader.params.get('listformats', None):
2823 self._print_formats(formats)
# (elided: return after listing formats)
2826 if req_format is None or req_format == 'best':
2827 for format_param in formats.keys():
2828 url_list = self.get_urls(formats, format_param)
# check urls
2830 file_url = self.check_urls(url_list)
2831 if file_url is not None:
# (elided: break on first live url; `else:` branch for a specific format)
2834 if req_format not in formats.keys():
2835 self._downloader.trouble(u'ERROR: format is not available')
# (elided: return)
2838 url_list = self.get_urls(formats, req_format)
2839 file_url = self.check_urls(url_list)
2840 format_param = req_format
# (elided: `return [{` opening of the info list)
2843 'id': file_id.decode('utf-8'),
2844 'url': file_url.decode('utf-8'),
2845 'uploader': uploader.decode('utf-8'),
2846 'upload_date': u'NA',
2847 'title': json_data['name'],
2848 'ext': file_url.split('.')[-1].decode('utf-8'),
2849 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2850 'thumbnail': json_data['thumbnail_url'],
2851 'description': json_data['description'],
2852 'player_url': player_url.decode('utf-8'),
# (elided: dict/list close)
# Extractor for Stanford Open Classroom. Handles three URL shapes: a single
# video (course+video), a course page (course only, expands to its videos),
# and the root page (expands to all courses) — the latter two return lists of
# 'reference' entries that are recursively re-extracted. NOTE(review): stale
# line-number gutter; gaps show elided lines (guards, `try:` headers,
# info initializers, returns).
2855 class StanfordOpenClassroomIE(InfoExtractor):
2856 """Information extractor for Stanford's Open ClassRoom"""
2858 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2859 IE_NAME = u'stanfordoc'
2861 def report_download_webpage(self, objid):
2862 """Report information extraction."""
2863 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2865 def report_extraction(self, video_id):
2866 """Report information extraction."""
2867 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2869 def _real_extract(self, url):
2870 mobj = re.match(self._VALID_URL, url)
# (elided: `if mobj is None:` guard and return)
2872 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Case 1: a specific video — fetch its per-video XML descriptor.
2875 if mobj.group('course') and mobj.group('video'): # A specific video
2876 course = mobj.group('course')
2877 video = mobj.group('video')
# (elided: `info = {` opening of the video info dict)
2879 'id': course + '_' + video,
2882 self.report_extraction(info['id'])
2883 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2884 xmlUrl = baseUrl + video + '.xml'
# (elided: `try:` for the XML download)
2886 metaXml = urllib2.urlopen(xmlUrl).read()
2887 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2888 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2890 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# (elided: `try:` paired with the IndexError handler below)
2892 info['title'] = mdoc.findall('./title')[0].text
2893 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
# (elided: `except IndexError:` header and return)
2895 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2897 info['ext'] = info['url'].rpartition('.')[2]
2898 info['format'] = info['ext']
# (elided: `return [info]`)
# Case 2: a course page — collect its VideoPage links as references.
2900 elif mobj.group('course'): # A course page
2901 course = mobj.group('course')
# (elided: playlist-type info initializer for the course)
2907 self.report_download_webpage(info['id'])
# (elided: `try:` for the course page fetch)
2909 coursepage = urllib2.urlopen(url).read()
2910 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2911 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2914 m = re.search('<h1>([^<]+)</h1>', coursepage)
# (elided: `if m:` / `else:` around the two title assignments)
2916 info['title'] = unescapeHTML(m.group(1))
2918 info['title'] = info['id']
2920 m = re.search('<description>([^<]+)</description>', coursepage)
# (elided: `if m:` guard)
2922 info['description'] = unescapeHTML(m.group(1))
# Build reference entries for every linked VideoPage, de-duplicated in order.
2924 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
# (elided: `info['list'] = [` comprehension opening)
2927 'type': 'reference',
2928 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# (elided: comprehension close; `results = []`)
2932 for entry in info['list']:
2933 assert entry['type'] == 'reference'
# Recurse through the generic extract() entry point for each video page.
2934 results += self.extract(entry['url'])
# Case 3: the root page — collect every CoursePage link as a reference.
# (elided: `else:` header and `info = {` opening)
2939 'id': 'Stanford OpenClassroom',
2943 self.report_download_webpage(info['id'])
2944 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
# (elided: `try:` for the root page fetch)
2946 rootpage = urllib2.urlopen(rootURL).read()
2947 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2948 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2951 info['title'] = info['id']
2953 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
# (elided: `info['list'] = [` comprehension opening)
2956 'type': 'reference',
2957 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
# (elided: comprehension close; `results = []`)
2962 for entry in info['list']:
2963 assert entry['type'] == 'reference'
2964 results += self.extract(entry['url'])
# (elided: `return results`)
# Extractor for MTV.com: read song/performer/uri/content-id meta tags from
# the page, then fetch the mediaGen XML and pick the last (highest-quality)
# rendition. NOTE(review): stale line-number gutter; gaps show elided lines
# (guards, `try:` headers, info-dict fields and the return).
2967 class MTVIE(InfoExtractor):
2968 """Information extractor for MTV.com"""
2970 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
# (elided: IE_NAME assignment)
2973 def report_webpage(self, video_id):
2974 """Report information extraction."""
2975 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2977 def report_extraction(self, video_id):
2978 """Report information extraction."""
2979 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2981 def _real_extract(self, url):
2982 mobj = re.match(self._VALID_URL, url)
# (elided: `if mobj is None:` guard and return)
2984 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Scheme-less URLs are normalized so urllib2 can fetch them.
2986 if not mobj.group('proto'):
2987 url = 'http://' + url
2988 video_id = mobj.group('videoid')
2989 self.report_webpage(video_id)
2991 request = urllib2.Request(url)
# (elided: `try:` for the page fetch)
2993 webpage = urllib2.urlopen(request).read()
2994 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2995 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# Song name and performer live in mtv_vt / mtv_an meta tags (latin-1 pages).
2998 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
# (elided: `if mobj is None:` guard)
3000 self._downloader.trouble(u'ERROR: unable to extract song name')
3002 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3003 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
# (elided: `if mobj is None:` guard)
3005 self._downloader.trouble(u'ERROR: unable to extract performer')
3007 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3008 video_title = performer + ' - ' + song_name
3010 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# (elided: `if mobj is None:` guard)
3012 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3014 mtvn_uri = mobj.group(1)
3016 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
# (elided: `if mobj is None:` guard)
3018 self._downloader.trouble(u'ERROR: unable to extract content id')
3020 content_id = mobj.group(1)
# mediaGen XML enumerates the downloadable renditions for this video.
3022 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3023 self.report_extraction(video_id)
3024 request = urllib2.Request(videogen_url)
# (elided: `try:` for the metadata download)
3026 metadataXml = urllib2.urlopen(request).read()
3027 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3028 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3031 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3032 renditions = mdoc.findall('.//rendition')
3034 # For now, always pick the highest quality.
3035 rendition = renditions[-1]
# (elided: `try:` paired with the trouble() call below)
# Format string: "<ext>-<width>x<height>_<bitrate>".
3038 _,_,ext = rendition.attrib['type'].partition('/')
3039 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3040 video_url = rendition.find('./src').text
# (elided: `except KeyError:` header for the error below)
3042 self._downloader.trouble('Invalid rendition field.')
# (elided: `info = {` opening with id/url keys)
3048 'uploader': performer,
3049 'title': video_title,
# (elided: ext/format fields, dict close and return)
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    # Captures the alphanumeric video id from a youku watch URL.
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    # NOTE(review): the `def _gen_sid(self):` header appears to be elided
    # from this excerpt; the following four lines look like its body.
    # Builds a pseudo-unique session id: epoch milliseconds followed by two
    # random integers.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)
        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffles the character alphabet using a linear
        # congruential generator driven by the server-provided `seed`.
        # NOTE(review): a `mixed = []` initialiser and the final
        # `return mixed` appear to be elided from this excerpt.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            # NOTE(review): under Python 2 integer division `seed / 65536`
            # truncates to 0 whenever seed < 65536 -- float division may have
            # been intended; confirm against upstream behaviour.
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        # Maps each '*'-separated index in `fileId` through the shuffled
        # alphabet derived from `seed` to recover the real file id.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        # NOTE(review): a `realId = []` initialiser and the `for ch in ids:`
        # loop header appear to be elided from this excerpt.
            realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        # NOTE(review): guard lines (`if mobj is None:` / `return`), `try:`
        # statements, branch bodies and list/dict initialisers appear to be
        # elided throughout this method -- confirm against the full source.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # JSON playlist metadata endpoint for this video id.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = urllib2.Request(info_url, None, std_headers)
        self.report_download_webpage(video_id)
        jsondata = urllib2.urlopen(request).read()
        # NOTE(review): the `try:` belonging to this handler is elided above.
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        self.report_extraction(video_id)
        config = json.loads(jsondata)

        video_title = config['data'][0]['title']
        seed = config['data'][0]['seed']

        # Requested format; None falls back to the best available stream.
        format = self._downloader.params.get('format', None)
        supported_format = config['data'][0]['streamfileids'].keys()

        if format is None or format == 'best':
            if 'hd2' in supported_format:
        elif format == 'worst':

        fileid = config['data'][0]['streamfileids'][format]
        seg_number = len(config['data'][0]['segs'][format])
        # Collect the per-segment access keys.
        for i in xrange(seg_number):
            keys.append(config['data'][0]['segs'][format][i]['k'])
        #youku only could be viewed from mainland china
        self._downloader.trouble(u'ERROR: unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
            # NOTE(review): the surrounding `info = {` literal is elided.
            'id': '%s_part%02d' % (video_id, index),
            'url': download_url,
            'title': video_title,
            files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    # Captures the numeric video id and the trailing slug from a video URL.
    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    # NOTE(review): an `IE_NAME` assignment appears to be elided from this
    # excerpt even though the report_* methods read self.IE_NAME.

    # Patterns scraped from the downloaded video page.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # Fetch the page and regex out the flv URL, title and thumbnail.
        # NOTE(review): guard lines (`if mobj is None:` / `return`) and the
        # `try:` for the download appear to be elided from this excerpt.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        # Get webpage content
        webpage = urllib2.urlopen(url).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)

        result = re.search(self.VIDEO_URL_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        # The flv_url value is percent-encoded in the page.
        video_url = urllib.unquote(result.group(1).decode('utf-8'))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = result.group(1).decode('utf-8')

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = result.group(1).decode('utf-8')

        # NOTE(review): several entries of this info dict (url, uploader,
        # ext, ...) and the trailing return appear to be elided.
        info = {'id': video_id,
            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    # Captures the poster's numeric id (group 1) and the post id (group 2).
    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
    IE_NAME = u'plus.google'
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    super(GooglePlusIE, self).__init__(downloader)
def report_extract_entry(self, url):
    """Announce that the post page at *url* is being downloaded."""
    decoded_url = url.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % decoded_url)
def report_date(self, upload_date):
    """Announce the upload date found for the entry."""
    message = u'[plus.google] Entry date: %s' % upload_date
    self._downloader.to_screen(message)
def report_uploader(self, uploader):
    """Announce the uploader name found for the entry."""
    who = uploader.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Uploader: %s' % who)
def report_title(self, video_title):
    """Announce the title found for the entry."""
    title_text = video_title.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Title: %s' % title_text)
def report_extract_vid_page(self, video_page):
    """Announce that the linked video page is being processed."""
    page_url = video_page.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % page_url)
3273 def _real_extract(self, url):
3274 # Extract id from URL
3275 mobj = re.match(self._VALID_URL, url)
3277 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3280 post_url = mobj.group(0)
3281 video_id = mobj.group(2)
3283 video_extension = 'flv'
3285 # Step 1, Retrieve post webpage to extract further information
3286 self.report_extract_entry(post_url)
3287 request = urllib2.Request(post_url)
3289 webpage = urllib2.urlopen(request).read()
3290 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3291 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % str(err))
3294 # Extract update date
3296 pattern = 'title="Timestamp">(.*?)</a>'
3297 mobj = re.search(pattern, webpage)
3299 upload_date = mobj.group(1)
3300 # Convert timestring to a format suitable for filename
3301 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3302 upload_date = upload_date.strftime('%Y%m%d')
3303 self.report_date(upload_date)
3307 pattern = r'rel\="author".*?>(.*?)</a>'
3308 mobj = re.search(pattern, webpage)
3310 uploader = mobj.group(1)
3311 self.report_uploader(uploader)
3314 # Get the first line for title
3316 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3317 mobj = re.search(pattern, webpage)
3319 video_title = mobj.group(1)
3320 self.report_title(video_title)
3322 # Step 2, Stimulate clicking the image box to launch video
3323 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3324 mobj = re.search(pattern, webpage)
3326 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3328 video_page = mobj.group(1)
3329 request = urllib2.Request(video_page)
3331 webpage = urllib2.urlopen(request).read()
3332 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3333 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3335 self.report_extract_vid_page(video_page)
3338 # Extract video links on video page
3339 """Extract video links of all sizes"""
3340 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3341 mobj = re.findall(pattern, webpage)
3343 self._downloader.trouble(u'ERROR: unable to extract video links')
3345 # Sort in resolution
3346 links = sorted(mobj)
3348 # Choose the lowest of the sort, i.e. highest resolution
3349 video_url = links[-1]
3350 # Only get the url. The resolution part in the tuple has no use anymore
3351 video_url = video_url[-1]
3352 # Treat escaped \u0026 style hex
3353 video_url = unicode(video_url, "unicode_escape")
3357 'id': video_id.decode('utf-8'),
3359 'uploader': uploader.decode('utf-8'),
3360 'upload_date': upload_date.decode('utf-8'),
3361 'title': video_title.decode('utf-8'),
3362 'ext': video_extension.decode('utf-8'),