2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
18 from urlparse import parse_qs
21 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    uploader:       Nickname of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the `def initialize(self):` header appears to be elided
    # from this listing; the docstring and call below read like its body —
    # verify against the full file.
    """Initializes an instance (authentication, etc)."""
    self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): an `self.initialize()` call appears elided here.
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the opening `_VALID_URL = r'''(?x)` line of this verbose
    # regex appears to be elided from this listing; the pattern body below is
    # written for re.VERBOSE (suitable() and _real_extract() both pass it).
    (?:https?://)?                                       # http(s):// (optional)
    (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
       tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
    (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
    (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
    (?:                                                  # the various things that can precede the ID:
        (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
        |(?:                                             # or the v= param in all its forms
            (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
            (?:\?|\#!?)                                  # the params delimiter ? or # or #!
            (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
        )?                                               # optional -> youtube.com/xxxx is OK
    )?                                                   # all until now is optional -> you can pass the naked ID
    ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
    (?(1).+)?                                            # if we found the ID, everything can follow

    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container/extension map.
    # NOTE(review): most entries and the closing brace appear elided here.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> dimension strings; entries and closing brace appear elided here.
    _video_dimensions = {
162 def suitable(self, url):
163 """Receives a URL and returns True if suitable for this IE."""
164 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
    # --- progress-reporting helpers (all write through the FileDownloader) ---

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        # (docstring fixed: was a copy-paste of the info-webpage docstring)
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available for this video."""
        # (docstring fixed: previously said "Report extracted video URL.")
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _closed_captions_xml_to_srt(self, xml_string):
        # Convert YouTube's closed-caption XML into SubRip (SRT) text.
        # NOTE(review): this listing is elided — no `srt` accumulator
        # initialization, no float() conversion of `start`, and no return
        # statement are visible; verify against the full file before editing.
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when no dur= attribute
            end = start + float(dur)
            # render start/end as SRT HH:MM:SS,mmm timestamps
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
    def _print_formats(self, formats):
        # Print each available format code with its extension and dimensions.
        print('Available formats:')
        # NOTE(review): the `for x in formats:` loop header appears elided
        # from this listing; `x` below is its loop variable.
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        # Set YouTube's UI language to English, then authenticate with
        # --username/--password or .netrc credentials, then confirm age.
        # NOTE(review): this listing is elided — `try:`/`return`/`else:`
        # lines and dict openers are missing; visible code kept as-is.
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Set language (hl=en) so later page scraping sees English markup
        request = urllib2.Request(self._LANG_URL)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
            login_results = urllib2.urlopen(request).read()
            # the login form re-appearing in the response means login failed
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # Confirm age
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        # Full extraction pipeline for one YouTube URL: resolve next_url
        # redirects, fetch the watch page and get_video_info, pull metadata
        # (uploader/title/thumbnail/date/description/subtitles/duration),
        # pick the requested format(s), and build the result dict(s).
        # NOTE(review): this listing is elided — many if/try/return/break
        # lines are missing; visible statements kept as-is.

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Get video webpage (has_verified=1 skips the age interstitial)
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # un-escape the JS-escaped URL (\\/ -> /)
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several &el= variants until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date, scraped from the watch page and normalised to YYYYMMDD
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions (optional, --write-srt)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    # NOTE(review): an `srt_lang = 'en'` / `else:` pair
                    # appears elided before this fallback line.
                    srt_lang = srt_lang_list.keys()[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                self._downloader.trouble(trouble[0])

        # duration
        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
        video_duration = urllib.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per chosen format
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
                                            self._video_dimensions.get(format_param, '???'))

                'id':           video_id.decode('utf-8'),
                'url':          video_real_url.decode('utf-8'),
                'uploader':     video_uploader.decode('utf-8'),
                'upload_date':  upload_date,
                'title':        video_title,
                'ext':          video_extension.decode('utf-8'),
                'format':       video_format,
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # family-filter disclaimer page fetched once in _real_initialize()
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)
513 def report_disclaimer(self):
514 """Report disclaimer retrieval."""
515 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
517 def report_age_confirmation(self):
518 """Report attempt to confirm age."""
519 self._downloader.to_screen(u'[metacafe] Confirming age')
521 def report_download_webpage(self, video_id):
522 """Report webpage download."""
523 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
525 def report_extraction(self, video_id):
526 """Report information extraction."""
527 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
    def _real_initialize(self):
        # Fetch the family-filter disclaimer page, then POST the filter form
        # to disable it for this session.
        # NOTE(review): this listing is elided — `try:`/`return` lines and
        # the disclaimer_form dict opener are missing; visible code kept as-is.

        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        # Extract metadata and the media URL for one metacafe.com video page;
        # yt-prefixed IDs are delegated to the YouTube extractor.
        # NOTE(review): this listing is elided — if/else/try/return lines
        # are missing; visible statements kept as-is.

        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # fallback: media URL embedded in the flashvars parameter
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader.decode('utf-8'),
            'upload_date':  u'NA',
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Accepts any 2-3 letter country TLD before /video/<id>.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)
638 def report_download_webpage(self, video_id):
639 """Report webpage download."""
640 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
642 def report_extraction(self, video_id):
643 """Report information extraction."""
644 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        # Extract metadata and the best-quality media URL from a Dailymotion
        # video page.
        # NOTE(review): this listing is elided — if/try/return/break lines
        # are missing; visible statements kept as-is.

        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        request.add_header('Cookie', 'family_filter=off')  # bypass the family filter
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = urllib.unquote(mobj.group(1))

        # pick the best available quality key, in descending order
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        video_uploader = u'NA'
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # looking for the official user account
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = u'NA'
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # DD-MM-YYYY on the page -> YYYYMMDD
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader.decode('utf-8'),
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # Accepts the many country TLDs Google Video used.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)
735 def report_download_webpage(self, video_id):
736 """Report webpage download."""
737 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
739 def report_extraction(self, video_id):
740 """Report information extraction."""
741 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        # Extract metadata and the media URL from a video.google.com page.
        # NOTE(review): this listing is elided — if/try/return lines are
        # missing; visible statements kept as-is.

        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
            # fall back to the flv stream when no direct download_url exists
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # un-escape the JS hex escapes (\x3d '=', \x26 '&')
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info

            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'upload_date':  u'NA',
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Matches URLs carrying a ?current=<name>.flv query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)
828 def report_download_webpage(self, video_id):
829 """Report webpage download."""
830 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
832 def report_extraction(self, video_id):
833 """Report information extraction."""
834 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        # Extract metadata and the flv URL from a photobucket.com page.
        # NOTE(review): this listing is elided — if/try/return lines are
        # missing; visible statements kept as-is.

        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader,
            'upload_date':  u'NA',
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)
896 def report_download_webpage(self, video_id):
897 """Report webpage download."""
898 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
900 def report_extraction(self, video_id):
901 """Report information extraction."""
902 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
    def _real_extract(self, url, new_video=True):
        # Extract metadata and the media URL from video.yahoo.com; valid but
        # non-extractable URLs are first rewritten to the canonical /watch/
        # form and re-extracted (guarded by new_video to avoid recursion).
        # NOTE(review): this listing is elided — if/try/return lines are
        # missing; visible statements kept as-is.

        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

            'id':           video_id.decode('utf-8'),
            'uploader':     video_uploader,
            'upload_date':  u'NA',
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
1026 class VimeoIE(InfoExtractor):
1027 """Information extractor for vimeo.com."""
# NOTE(review): this excerpt elides several lines (the try:/if mobj is None:/return
# guards around network and regex calls); comments below describe only visible code.
1029 # _VALID_URL matches Vimeo URLs
# Captures the numeric video id in group(1); accepts www./player. hosts and
# optional /groups/<name>/ or /album/<name>/ and /video(s)/ path segments.
1030 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
1033 def __init__(self, downloader=None):
1034 InfoExtractor.__init__(self, downloader)
1036 def report_download_webpage(self, video_id):
1037 """Report webpage download."""
1038 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1040 def report_extraction(self, video_id):
1041 """Report information extraction."""
1042 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1044 def _real_extract(self, url, new_video=True):
1045 # Extract ID from URL
1046 mobj = re.match(self._VALID_URL, url)
1048 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1051 video_id = mobj.group(1)
1053 # Retrieve video webpage to extract further information
1054 request = urllib2.Request(url, None, std_headers)
1056 self.report_download_webpage(video_id)
1057 webpage = urllib2.urlopen(request).read()
1058 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1059 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1062 # Now we begin extracting as much information as we can from what we
1063 # retrieved. First we extract the information common to all extractors,
1064 # and latter we extract those that are Vimeo specific.
1065 self.report_extraction(video_id)
1067 # Extract the config JSON
# The page embeds a JS object; slice the text between ' = {config:' and
# ',assets:' and parse it as JSON. Brittle against page-layout changes.
1068 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1070 config = json.loads(config)
1072 self._downloader.trouble(u'ERROR: unable to extract info section')
1076 video_title = config["video"]["title"]
1079 video_uploader = config["video"]["owner"]["name"]
1081 # Extract video thumbnail
1082 video_thumbnail = config["video"]["thumbnail"]
1084 # Extract video description
1085 video_description = get_element_by_id("description", webpage.decode('utf8'))
1086 if video_description: video_description = clean_html(video_description)
1087 else: video_description = ''
1089 # Extract upload date
# Upload date is optional: defaults to u'NA' when the clip-date span is absent.
1090 video_upload_date = u'NA'
1091 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1092 if mobj is not None:
1093 video_upload_date = mobj.group(1)
1095 # Vimeo specific: extract request signature and timestamp
1096 sig = config['request']['signature']
1097 timestamp = config['request']['timestamp']
1099 # Vimeo specific: extract video codec and quality information
1100 # First consider quality, then codecs, then take everything
1101 # TODO bind to format param
# Codec preference order: h264/mp4, then vp8/flv, then vp6/flv; within each
# codec prefer 'hd', then 'sd', else the first listed quality.
1102 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1103 files = { 'hd': [], 'sd': [], 'other': []}
1104 for codec_name, codec_extension in codecs:
1105 if codec_name in config["video"]["files"]:
1106 if 'hd' in config["video"]["files"][codec_name]:
1107 files['hd'].append((codec_name, codec_extension, 'hd'))
1108 elif 'sd' in config["video"]["files"][codec_name]:
1109 files['sd'].append((codec_name, codec_extension, 'sd'))
1111 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the best available (codec, extension, quality) triple in quality order.
1113 for quality in ('hd', 'sd', 'other'):
1114 if len(files[quality]) > 0:
1115 video_quality = files[quality][0][2]
1116 video_codec = files[quality][0][0]
1117 video_extension = files[quality][0][1]
1118 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1121 self._downloader.trouble(u'ERROR: no known codec found')
# Build the play_redirect URL from the id, signature, timestamp and chosen format.
1124 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1125 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1130 'uploader': video_uploader,
1131 'upload_date': video_upload_date,
1132 'title': video_title,
1133 'ext': video_extension,
1134 'thumbnail': video_thumbnail,
1135 'description': video_description,
1139 class GenericIE(InfoExtractor):
1140 """Generic last-resort information extractor."""
# NOTE(review): this excerpt elides several lines (try:/if mobj is None:/return
# guards and some method bodies); comments below describe only visible code.
1143 IE_NAME = u'generic'
1145 def __init__(self, downloader=None):
1146 InfoExtractor.__init__(self, downloader)
1148 def report_download_webpage(self, video_id):
1149 """Report webpage download."""
# Warn loudly: reaching this extractor means no site-specific IE matched.
1150 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1151 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1153 def report_extraction(self, video_id):
1154 """Report information extraction."""
1155 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1157 def report_following_redirect(self, new_url):
# NOTE(review): docstring copied from report_extraction; should read
# "Report redirect following" — left unchanged here.
1158 """Report information extraction."""
1159 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1161 def _test_redirect(self, url):
1162 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Issue a HEAD request (cheap, no body) to discover the final URL.
1163 class HeadRequest(urllib2.Request):
1164 def get_method(self):
# Re-issue the HEAD (not GET) on each redirect hop, preserving headers
# except the body-related ones.
1167 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1169 Subclass the HTTPRedirectHandler to make it use our
1170 HeadRequest also on the redirected URL
1172 def redirect_request(self, req, fp, code, msg, headers, newurl):
1173 if code in (301, 302, 303, 307):
1174 newurl = newurl.replace(' ', '%20')
1175 newheaders = dict((k,v) for k,v in req.headers.items()
1176 if k.lower() not in ("content-length", "content-type"))
1177 return HeadRequest(newurl,
1179 origin_req_host=req.get_origin_req_host(),
1182 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
# Some servers reject HEAD with 405; retry the same URL as a normal GET.
1184 class HTTPMethodFallback(urllib2.BaseHandler):
1186 Fallback to GET if HEAD is not allowed (405 HTTP error)
1188 def http_error_405(self, req, fp, code, msg, headers):
1192 newheaders = dict((k,v) for k,v in req.headers.items()
1193 if k.lower() not in ("content-length", "content-type"))
1194 return self.parent.open(urllib2.Request(req.get_full_url(),
1196 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with only the handlers needed for the HEAD probe.
1200 opener = urllib2.OpenerDirector()
1201 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1202 HTTPMethodFallback, HEADRedirectHandler,
1203 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1204 opener.add_handler(handler())
1206 response = opener.open(HeadRequest(url))
1207 new_url = response.geturl()
# No redirect happened: let the caller continue with the original URL.
1209 if url == new_url: return False
# Redirect found: restart the whole extractor chain on the resolved URL.
1211 self.report_following_redirect(new_url)
1212 self._downloader.download([new_url])
1215 def _real_extract(self, url):
1216 if self._test_redirect(url): return
1218 video_id = url.split('/')[-1]
1219 request = urllib2.Request(url)
1221 self.report_download_webpage(video_id)
1222 webpage = urllib2.urlopen(request).read()
1223 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1224 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1226 except ValueError, err:
1227 # since this is the last-resort InfoExtractor, if
1228 # this error is thrown, it'll be thrown here
1229 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1232 self.report_extraction(video_id)
1233 # Start with something easy: JW Player in SWFObject
1234 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1236 # Broaden the search a little bit
1237 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1239 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1242 # It's possible that one of the regexes
1243 # matched, but returned an empty group:
1244 if mobj.group(1) is None:
1245 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1248 video_url = urllib.unquote(mobj.group(1))
1249 video_id = os.path.basename(video_url)
# Split the basename into (id, extension); extension without the leading dot.
1251 # here's a fun little line of code for you:
1252 video_extension = os.path.splitext(video_id)[1][1:]
1253 video_id = os.path.splitext(video_id)[0]
1255 # it's tempting to parse this further, but you would
1256 # have to take into account all the variations like
1257 # Video Title - Site Name
1258 # Site Name | Video Title
1259 # Video Title - Tagline | Site Name
1260 # and so on and so forth; it's just not practical
1261 mobj = re.search(r'<title>(.*)</title>', webpage)
1263 self._downloader.trouble(u'ERROR: unable to extract title')
1265 video_title = mobj.group(1).decode('utf-8')
1267 # video uploader is domain name
1268 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1270 self._downloader.trouble(u'ERROR: unable to extract title')
1272 video_uploader = mobj.group(1).decode('utf-8')
1275 'id': video_id.decode('utf-8'),
1276 'url': video_url.decode('utf-8'),
1277 'uploader': video_uploader,
1278 'upload_date': u'NA',
1279 'title': video_title,
1280 'ext': video_extension.decode('utf-8'),
1284 class YoutubeSearchIE(InfoExtractor):
1285 """Information Extractor for YouTube search queries."""
# NOTE(review): this excerpt elides several lines (if mobj is None:/try:/return
# guards, loop initialisation); comments below describe only visible code.
# Query syntax: "ytsearch:<terms>" (first result), "ytsearchN:<terms>" (N results),
# "ytsearchall:<terms>" (up to _max_youtube_results).
1286 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1287 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1288 _max_youtube_results = 1000
1289 IE_NAME = u'youtube:search'
1291 def __init__(self, downloader=None):
1292 InfoExtractor.__init__(self, downloader)
1294 def report_download_page(self, query, pagenum):
1295 """Report attempt to download search page with given number."""
1296 query = query.decode(preferredencoding())
1297 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1299 def _real_extract(self, query):
1300 mobj = re.match(self._VALID_URL, query)
1302 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split "ytsearchN" prefix from the search terms and dispatch on it.
1305 prefix, query = query.split(':')
1307 query = query.encode('utf-8')
1309 self._download_n_results(query, 1)
1311 elif prefix == 'all':
1312 self._download_n_results(query, self._max_youtube_results)
1318 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
# Clamp oversized requests to the API's hard maximum, with a warning.
1320 elif n > self._max_youtube_results:
1321 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1322 n = self._max_youtube_results
1323 self._download_n_results(query, n)
1325 except ValueError: # parsing prefix as integer fails
1326 self._download_n_results(query, 1)
1329 def _download_n_results(self, query, n):
1330 """Downloads a specified number of results for a query"""
# Page through the GData API 50 results at a time until n (or the total
# result count) is reached.
1336 while (50 * pagenum) < limit:
1337 self.report_download_page(query, pagenum+1)
1338 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1339 request = urllib2.Request(result_url)
1341 data = urllib2.urlopen(request).read()
1342 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1343 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1345 api_response = json.loads(data)['data']
1347 new_ids = list(video['id'] for video in api_response['items'])
1348 video_ids += new_ids
# Tighten the limit once the API reports how many results actually exist.
1350 limit = min(n, api_response['totalItems'])
1353 if len(video_ids) > n:
1354 video_ids = video_ids[:n]
# Hand each collected id back to the downloader as a watch URL.
1355 for id in video_ids:
1356 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1360 class GoogleSearchIE(InfoExtractor):
1361 """Information Extractor for Google Video search queries."""
# NOTE(review): this excerpt elides several lines (if mobj is None:/try:/return
# guards, loop initialisation); comments below describe only visible code.
# Query syntax mirrors ytsearch: "gvsearch[:N|all]:<terms>".
1362 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1363 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1364 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1365 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1366 _max_google_results = 1000
1367 IE_NAME = u'video.google:search'
1369 def __init__(self, downloader=None):
1370 InfoExtractor.__init__(self, downloader)
1372 def report_download_page(self, query, pagenum):
1373 """Report attempt to download playlist page with given number."""
1374 query = query.decode(preferredencoding())
1375 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1377 def _real_extract(self, query):
1378 mobj = re.match(self._VALID_URL, query)
1380 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1383 prefix, query = query.split(':')
1385 query = query.encode('utf-8')
1387 self._download_n_results(query, 1)
1389 elif prefix == 'all':
1390 self._download_n_results(query, self._max_google_results)
1396 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1398 elif n > self._max_google_results:
1399 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1400 n = self._max_google_results
1401 self._download_n_results(query, n)
1403 except ValueError: # parsing prefix as integer fails
1404 self._download_n_results(query, 1)
1407 def _download_n_results(self, query, n):
1408 """Downloads a specified number of results for a query"""
# Scrape HTML result pages (10 results per page) until n ids are collected
# or the "next page" marker disappears.
1414 self.report_download_page(query, pagenum)
1415 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1416 request = urllib2.Request(result_url)
1418 page = urllib2.urlopen(request).read()
1419 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1420 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1423 # Extract video identifiers
1424 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1425 video_id = mobj.group(1)
1426 if video_id not in video_ids:
1427 video_ids.append(video_id)
1428 if len(video_ids) == n:
1429 # Specified n videos reached
1430 for id in video_ids:
1431 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No further pages: dispatch whatever was collected so far.
1434 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1435 for id in video_ids:
1436 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1439 pagenum = pagenum + 1
1442 class YahooSearchIE(InfoExtractor):
1443 """Information Extractor for Yahoo! Video search queries."""
# NOTE(review): this excerpt elides several lines (if mobj is None:/try:/return
# guards, loop initialisation); comments below describe only visible code.
# Query syntax mirrors ytsearch: "yvsearch[:N|all]:<terms>".
1444 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1445 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1446 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1447 _MORE_PAGES_INDICATOR = r'\s*Next'
1448 _max_yahoo_results = 1000
1449 IE_NAME = u'video.yahoo:search'
1451 def __init__(self, downloader=None):
1452 InfoExtractor.__init__(self, downloader)
1454 def report_download_page(self, query, pagenum):
1455 """Report attempt to download playlist page with given number."""
1456 query = query.decode(preferredencoding())
1457 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1459 def _real_extract(self, query):
1460 mobj = re.match(self._VALID_URL, query)
1462 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1465 prefix, query = query.split(':')
1467 query = query.encode('utf-8')
1469 self._download_n_results(query, 1)
1471 elif prefix == 'all':
1472 self._download_n_results(query, self._max_yahoo_results)
1478 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1480 elif n > self._max_yahoo_results:
1481 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1482 n = self._max_yahoo_results
1483 self._download_n_results(query, n)
1485 except ValueError: # parsing prefix as integer fails
1486 self._download_n_results(query, 1)
1489 def _download_n_results(self, query, n):
1490 """Downloads a specified number of results for a query"""
# Unlike GoogleSearchIE, dedupe is tracked in a separate set rather than
# by scanning the result list.
1493 already_seen = set()
1497 self.report_download_page(query, pagenum)
1498 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1499 request = urllib2.Request(result_url)
1501 page = urllib2.urlopen(request).read()
1502 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1503 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1506 # Extract video identifiers
1507 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1508 video_id = mobj.group(1)
1509 if video_id not in already_seen:
1510 video_ids.append(video_id)
1511 already_seen.add(video_id)
1512 if len(video_ids) == n:
1513 # Specified n videos reached
1514 for id in video_ids:
1515 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No further pages: dispatch whatever was collected so far.
1518 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1519 for id in video_ids:
1520 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1523 pagenum = pagenum + 1
1526 class YoutubePlaylistIE(InfoExtractor):
1527 """Information Extractor for YouTube playlists."""
# NOTE(review): this excerpt elides several lines (if mobj is None:/try:/return
# guards, else branch, loop initialisation); comments describe only visible code.
# group(1) = playlist type marker (p/a/list), group(2) = playlist id,
# group(3) = optional single-video id embedded in the playlist URL.
1529 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1530 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1531 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1532 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1533 IE_NAME = u'youtube:playlist'
1535 def __init__(self, downloader=None):
1536 InfoExtractor.__init__(self, downloader)
1538 def report_download_page(self, playlist_id, pagenum):
1539 """Report attempt to download playlist page with given number."""
1540 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1542 def _real_extract(self, url):
1543 # Extract playlist id
1544 mobj = re.match(self._VALID_URL, url)
1546 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# URL points at a single video inside the playlist: download just that one.
1550 if mobj.group(3) is not None:
1551 self._downloader.download([mobj.group(3)])
1554 # Download playlist pages
1555 # prefix is 'p' as default for playlists but there are other types that need extra care
1556 playlist_prefix = mobj.group(1)
1557 if playlist_prefix == 'a':
1558 playlist_access = 'artist'
1560 playlist_prefix = 'p'
1561 playlist_access = 'view_play_list'
1562 playlist_id = mobj.group(2)
1567 self.report_download_page(playlist_id, pagenum)
1568 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1569 request = urllib2.Request(url)
1571 page = urllib2.urlopen(request).read()
1572 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1573 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1576 # Extract video identifiers
1578 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1579 if mobj.group(1) not in ids_in_page:
1580 ids_in_page.append(mobj.group(1))
1581 video_ids.extend(ids_in_page)
# Stop paging once the "next" pager control no longer appears.
1583 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1585 pagenum = pagenum + 1
# Apply --playlist-start / --playlist-end slicing (1-based options, -1 = open end).
1587 playliststart = self._downloader.params.get('playliststart', 1) - 1
1588 playlistend = self._downloader.params.get('playlistend', -1)
1589 if playlistend == -1:
1590 video_ids = video_ids[playliststart:]
1592 video_ids = video_ids[playliststart:playlistend]
1594 for id in video_ids:
1595 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1599 class YoutubeChannelIE(InfoExtractor):
1600 """Information Extractor for YouTube channels."""
# NOTE(review): this excerpt elides several lines (if mobj is None:/try:/return
# guards, loop initialisation); comments below describe only visible code.
1602 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1603 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1604 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1605 IE_NAME = u'youtube:channel'
1607 def report_download_page(self, channel_id, pagenum):
1608 """Report attempt to download channel page with given number."""
1609 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1611 def _real_extract(self, url):
1612 # Extract channel id
1613 mobj = re.match(self._VALID_URL, url)
1615 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1618 # Download channel pages
1619 channel_id = mobj.group(1)
1624 self.report_download_page(channel_id, pagenum)
1625 url = self._TEMPLATE_URL % (channel_id, pagenum)
1626 request = urllib2.Request(url)
1628 page = urllib2.urlopen(request).read()
1629 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1630 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1633 # Extract video identifiers
# Scrape watch links directly from the channel's /videos listing page.
1635 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1636 if mobj.group(1) not in ids_in_page:
1637 ids_in_page.append(mobj.group(1))
1638 video_ids.extend(ids_in_page)
# Stop paging once the "Next" button no longer appears.
1640 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1642 pagenum = pagenum + 1
1644 for id in video_ids:
1645 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1649 class YoutubeUserIE(InfoExtractor):
1650 """Information Extractor for YouTube users."""
# NOTE(review): this excerpt elides several lines (if mobj is None:/try:/return
# guards, loop initialisation and break); comments describe only visible code.
# Accepts youtube.com/user/<name> URLs or the "ytuser:<name>" shorthand.
1652 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1653 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1654 _GDATA_PAGE_SIZE = 50
1655 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1656 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1657 IE_NAME = u'youtube:user'
1659 def __init__(self, downloader=None):
1660 InfoExtractor.__init__(self, downloader)
1662 def report_download_page(self, username, start_index):
1663 """Report attempt to download user page."""
1664 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1665 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1667 def _real_extract(self, url):
1669 mobj = re.match(self._VALID_URL, url)
1671 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1674 username = mobj.group(1)
1676 # Download video ids using YouTube Data API. Result size per
1677 # query is limited (currently to 50 videos) so we need to query
1678 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1 below.
1685 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1686 self.report_download_page(username, start_index)
1688 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1691 page = urllib2.urlopen(request).read()
1692 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1693 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1696 # Extract video identifiers
1699 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1700 if mobj.group(1) not in ids_in_page:
1701 ids_in_page.append(mobj.group(1))
1703 video_ids.extend(ids_in_page)
1705 # A little optimization - if current page is not
1706 # "full", ie. does not contain PAGE_SIZE video ids then
1707 # we can assume that this page is the last one - there
1708 # are no more ids on further pages - no need to query
1711 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply --playlist-start / --playlist-end slicing (1-based options, -1 = open end).
1716 all_ids_count = len(video_ids)
1717 playliststart = self._downloader.params.get('playliststart', 1) - 1
1718 playlistend = self._downloader.params.get('playlistend', -1)
1720 if playlistend == -1:
1721 video_ids = video_ids[playliststart:]
1723 video_ids = video_ids[playliststart:playlistend]
1725 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1726 (username, all_ids_count, len(video_ids)))
1728 for video_id in video_ids:
1729 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1732 class BlipTVUserIE(InfoExtractor):
1733 """Information Extractor for blip.tv users."""
# NOTE(review): this excerpt elides several lines (if mobj is None:/try:/return
# guards, loop initialisation and break, _PAGE_SIZE definition); comments
# describe only visible code.
1735 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1737 IE_NAME = u'blip.tv:user'
1739 def __init__(self, downloader=None):
1740 InfoExtractor.__init__(self, downloader)
1742 def report_download_page(self, username, pagenum):
1743 """Report attempt to download user page."""
1744 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1745 (self.IE_NAME, username, pagenum))
1747 def _real_extract(self, url):
1749 mobj = re.match(self._VALID_URL, url)
1751 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1754 username = mobj.group(1)
1756 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
# Fetch the user page once just to scrape the numeric users_id needed by the
# mobile episode-list endpoint above.
1758 request = urllib2.Request(url)
1761 page = urllib2.urlopen(request).read().decode('utf-8')
1762 mobj = re.search(r'data-users-id="([^"]+)"', page)
1763 page_base = page_base % mobj.group(1)
1764 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1765 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1769 # Download video ids using BlipTV Ajax calls. Result size per
1770 # query is limited (currently to 12 videos) so we need to query
1771 # page by page until there are no video ids - it means we got
1778 self.report_download_page(username, pagenum)
1780 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1783 page = urllib2.urlopen(request).read().decode('utf-8')
1784 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): uses str(err) here while sibling extractors use compat_str(err);
# inconsistent but left unchanged.
1785 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1788 # Extract video identifiers
1791 for mobj in re.finditer(r'href="/([^"]+)"', page):
1792 if mobj.group(1) not in ids_in_page:
1793 ids_in_page.append(unescapeHTML(mobj.group(1)))
1795 video_ids.extend(ids_in_page)
1797 # A little optimization - if current page is not
1798 # "full", ie. does not contain PAGE_SIZE video ids then
1799 # we can assume that this page is the last one - there
1800 # are no more ids on further pages - no need to query
1803 if len(ids_in_page) < self._PAGE_SIZE:
# Apply --playlist-start / --playlist-end slicing (1-based options, -1 = open end).
1808 all_ids_count = len(video_ids)
1809 playliststart = self._downloader.params.get('playliststart', 1) - 1
1810 playlistend = self._downloader.params.get('playlistend', -1)
1812 if playlistend == -1:
1813 video_ids = video_ids[playliststart:]
1815 video_ids = video_ids[playliststart:playlistend]
1817 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1818 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1820 for video_id in video_ids:
1821 self._downloader.download([u'http://blip.tv/'+video_id])
1824 class DepositFilesIE(InfoExtractor):
1825 """Information extractor for depositfiles.com"""
# NOTE(review): this excerpt elides several lines (try:/if mobj is None:/return
# guards and part of the result dict); comments below describe only visible code.
1827 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1828 IE_NAME = u'DepositFiles'
1830 def __init__(self, downloader=None):
1831 InfoExtractor.__init__(self, downloader)
1833 def report_download_webpage(self, file_id):
1834 """Report webpage download."""
1835 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1837 def report_extraction(self, file_id):
1838 """Report information extraction."""
1839 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1841 def _real_extract(self, url):
1842 file_id = url.split('/')[-1]
1843 # Rebuild url in english locale
# Locale prefixes in the path (e.g. /de/) are dropped so error messages and
# markup scraped below are always the English variants.
1844 url = 'http://depositfiles.com/en/files/' + file_id
1846 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the "Free download" button.
1847 free_download_indication = { 'gateway_result' : '1' }
1848 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1850 self.report_download_webpage(file_id)
1851 webpage = urllib2.urlopen(request).read()
1852 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1853 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1856 # Search for the real file URL
1857 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1858 if (mobj is None) or (mobj.group(1) is None):
1859 # Try to figure out reason of the error.
# Surface the site's own restriction notice (e.g. download limit) when present.
1860 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1861 if (mobj is not None) and (mobj.group(1) is not None):
1862 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1863 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1865 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1868 file_url = mobj.group(1)
1869 file_extension = os.path.splitext(file_url)[1][1:]
1871 # Search for file title
1872 mobj = re.search(r'<b title="(.*?)">', webpage)
1874 self._downloader.trouble(u'ERROR: unable to extract title')
1876 file_title = mobj.group(1).decode('utf-8')
1879 'id': file_id.decode('utf-8'),
1880 'url': file_url.decode('utf-8'),
1882 'upload_date': u'NA',
1883 'title': file_title,
1884 'ext': file_extension.decode('utf-8'),
1888 class FacebookIE(InfoExtractor):
1889 """Information Extractor for Facebook"""
1891 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1892 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1893 _NETRC_MACHINE = 'facebook'
1894 _available_formats = ['video', 'highqual', 'lowqual']
1895 _video_extensions = {
1900 IE_NAME = u'facebook'
1902 def __init__(self, downloader=None):
1903 InfoExtractor.__init__(self, downloader)
1905 def _reporter(self, message):
1906 """Add header and report message."""
1907 self._downloader.to_screen(u'[facebook] %s' % message)
1909 def report_login(self):
1910 """Report attempt to log in."""
1911 self._reporter(u'Logging in')
1913 def report_video_webpage_download(self, video_id):
1914 """Report attempt to download video webpage."""
1915 self._reporter(u'%s: Downloading video webpage' % video_id)
1917 def report_information_extraction(self, video_id):
1918 """Report attempt to extract video information."""
1919 self._reporter(u'%s: Extracting video information' % video_id)
1921 def _parse_page(self, video_webpage):
1922 """Extract video information from page"""
1924 data = {'title': r'\("video_title", "(.*?)"\)',
1925 'description': r'<div class="datawrap">(.*?)</div>',
1926 'owner': r'\("video_owner_name", "(.*?)"\)',
1927 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1930 for piece in data.keys():
1931 mobj = re.search(data[piece], video_webpage)
1932 if mobj is not None:
1933 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1937 for fmt in self._available_formats:
1938 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1939 if mobj is not None:
1940 # URL is in a Javascript segment inside an escaped Unicode format within
1941 # the generally utf-8 page
1942 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1943 video_info['video_urls'] = video_urls
1947 def _real_initialize(self):
1948 if self._downloader is None:
1953 downloader_params = self._downloader.params
1955 # Attempt to use provided username and password or .netrc data
1956 if downloader_params.get('username', None) is not None:
1957 useremail = downloader_params['username']
1958 password = downloader_params['password']
1959 elif downloader_params.get('usenetrc', False):
1961 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1962 if info is not None:
1966 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1967 except (IOError, netrc.NetrcParseError), err:
1968 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
1971 if useremail is None:
1980 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1983 login_results = urllib2.urlopen(request).read()
1984 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1985 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1987 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1988 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
1991 def _real_extract(self, url):
1992 mobj = re.match(self._VALID_URL, url)
1994 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1996 video_id = mobj.group('ID')
1999 self.report_video_webpage_download(video_id)
2000 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2002 page = urllib2.urlopen(request)
2003 video_webpage = page.read()
2004 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2005 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2008 # Start extracting information
2009 self.report_information_extraction(video_id)
2011 # Extract information
2012 video_info = self._parse_page(video_webpage)
2015 if 'owner' not in video_info:
2016 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2018 video_uploader = video_info['owner']
2021 if 'title' not in video_info:
2022 self._downloader.trouble(u'ERROR: unable to extract video title')
2024 video_title = video_info['title']
2025 video_title = video_title.decode('utf-8')
2028 if 'thumbnail' not in video_info:
2029 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2030 video_thumbnail = ''
2032 video_thumbnail = video_info['thumbnail']
2036 if 'upload_date' in video_info:
2037 upload_time = video_info['upload_date']
2038 timetuple = email.utils.parsedate_tz(upload_time)
2039 if timetuple is not None:
2041 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2046 video_description = video_info.get('description', 'No description available.')
2048 url_map = video_info['video_urls']
2049 if len(url_map.keys()) > 0:
2050 # Decide which formats to download
2051 req_format = self._downloader.params.get('format', None)
2052 format_limit = self._downloader.params.get('format_limit', None)
2054 if format_limit is not None and format_limit in self._available_formats:
2055 format_list = self._available_formats[self._available_formats.index(format_limit):]
2057 format_list = self._available_formats
2058 existing_formats = [x for x in format_list if x in url_map]
2059 if len(existing_formats) == 0:
2060 self._downloader.trouble(u'ERROR: no known formats available for video')
2062 if req_format is None:
2063 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2064 elif req_format == 'worst':
2065 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2066 elif req_format == '-1':
2067 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2070 if req_format not in url_map:
2071 self._downloader.trouble(u'ERROR: requested format not available')
2073 video_url_list = [(req_format, url_map[req_format])] # Specific format
2076 for format_param, video_real_url in video_url_list:
2078 video_extension = self._video_extensions.get(format_param, 'mp4')
2081 'id': video_id.decode('utf-8'),
2082 'url': video_real_url.decode('utf-8'),
2083 'uploader': video_uploader.decode('utf-8'),
2084 'upload_date': upload_date,
2085 'title': video_title,
2086 'ext': video_extension.decode('utf-8'),
2087 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2088 'thumbnail': video_thumbnail.decode('utf-8'),
2089 'description': video_description.decode('utf-8'),
2093 class BlipTVIE(InfoExtractor):
2094 """Information extractor for blip.tv"""
2096 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2097 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2098 IE_NAME = u'blip.tv'
2100 def report_extraction(self, file_id):
2101 """Report information extraction."""
2102 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2104 def report_direct_download(self, title):
2105 """Report information extraction."""
2106 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2108 def _real_extract(self, url):
2109 mobj = re.match(self._VALID_URL, url)
2111 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2118 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2119 request = urllib2.Request(json_url.encode('utf-8'))
2120 self.report_extraction(mobj.group(1))
2123 urlh = urllib2.urlopen(request)
2124 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2125 basename = url.split('/')[-1]
2126 title,ext = os.path.splitext(basename)
2127 title = title.decode('UTF-8')
2128 ext = ext.replace('.', '')
2129 self.report_direct_download(title)
2134 'upload_date': u'NA',
2139 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2140 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2142 if info is None: # Regular URL
2144 json_code = urlh.read()
2145 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2146 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2150 json_data = json.loads(json_code)
2151 if 'Post' in json_data:
2152 data = json_data['Post']
2156 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2157 video_url = data['media']['url']
2158 umobj = re.match(self._URL_EXT, video_url)
2160 raise ValueError('Can not determine filename extension')
2161 ext = umobj.group(1)
2164 'id': data['item_id'],
2166 'uploader': data['display_name'],
2167 'upload_date': upload_date,
2168 'title': data['title'],
2170 'format': data['media']['mimeType'],
2171 'thumbnail': data['thumbnailUrl'],
2172 'description': data['description'],
2173 'player_url': data['embedUrl']
2175 except (ValueError,KeyError), err:
2176 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2179 std_headers['User-Agent'] = 'iTunes/10.6.1'
2183 class MyVideoIE(InfoExtractor):
2184 """Information Extractor for myvideo.de."""
2186 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2187 IE_NAME = u'myvideo'
2189 def __init__(self, downloader=None):
2190 InfoExtractor.__init__(self, downloader)
2192 def report_download_webpage(self, video_id):
2193 """Report webpage download."""
2194 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2196 def report_extraction(self, video_id):
2197 """Report information extraction."""
2198 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2200 def _real_extract(self,url):
2201 mobj = re.match(self._VALID_URL, url)
2203 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2206 video_id = mobj.group(1)
2209 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2211 self.report_download_webpage(video_id)
2212 webpage = urllib2.urlopen(request).read()
2213 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2214 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
2217 self.report_extraction(video_id)
2218 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2221 self._downloader.trouble(u'ERROR: unable to extract media URL')
2223 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2225 mobj = re.search('<title>([^<]+)</title>', webpage)
2227 self._downloader.trouble(u'ERROR: unable to extract title')
2230 video_title = mobj.group(1)
2236 'upload_date': u'NA',
2237 'title': video_title,
2241 class ComedyCentralIE(InfoExtractor):
2242 """Information extractor for The Daily Show and Colbert Report """
2244 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2245 IE_NAME = u'comedycentral'
2247 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2249 _video_extensions = {
2257 _video_dimensions = {
2266 def report_extraction(self, episode_id):
2267 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2269 def report_config_download(self, episode_id):
2270 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2272 def report_index_download(self, episode_id):
2273 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2275 def report_player_url(self, episode_id):
2276 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2279 def _print_formats(self, formats):
2280 print('Available formats:')
2282 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2285 def _real_extract(self, url):
2286 mobj = re.match(self._VALID_URL, url)
2288 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2291 if mobj.group('shortname'):
2292 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2293 url = u'http://www.thedailyshow.com/full-episodes/'
2295 url = u'http://www.colbertnation.com/full-episodes/'
2296 mobj = re.match(self._VALID_URL, url)
2297 assert mobj is not None
2299 dlNewest = not mobj.group('episode')
2301 epTitle = mobj.group('showname')
2303 epTitle = mobj.group('episode')
2305 req = urllib2.Request(url)
2306 self.report_extraction(epTitle)
2308 htmlHandle = urllib2.urlopen(req)
2309 html = htmlHandle.read()
2310 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2311 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2314 url = htmlHandle.geturl()
2315 mobj = re.match(self._VALID_URL, url)
2317 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2319 if mobj.group('episode') == '':
2320 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2322 epTitle = mobj.group('episode')
2324 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2326 if len(mMovieParams) == 0:
2327 # The Colbert Report embeds the information in a without
2328 # a URL prefix; so extract the alternate reference
2329 # and then add the URL prefix manually.
2331 altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
2332 if len(altMovieParams) == 0:
2333 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2336 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2338 playerUrl_raw = mMovieParams[0][0]
2339 self.report_player_url(epTitle)
2341 urlHandle = urllib2.urlopen(playerUrl_raw)
2342 playerUrl = urlHandle.geturl()
2343 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2344 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2347 uri = mMovieParams[0][1]
2348 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2349 self.report_index_download(epTitle)
2351 indexXml = urllib2.urlopen(indexUrl).read()
2352 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2353 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2358 idoc = xml.etree.ElementTree.fromstring(indexXml)
2359 itemEls = idoc.findall('.//item')
2360 for itemEl in itemEls:
2361 mediaId = itemEl.findall('./guid')[0].text
2362 shortMediaId = mediaId.split(':')[-1]
2363 showId = mediaId.split(':')[-2].replace('.com', '')
2364 officialTitle = itemEl.findall('./title')[0].text
2365 officialDate = itemEl.findall('./pubDate')[0].text
2367 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2368 urllib.urlencode({'uri': mediaId}))
2369 configReq = urllib2.Request(configUrl)
2370 self.report_config_download(epTitle)
2372 configXml = urllib2.urlopen(configReq).read()
2373 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2374 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2377 cdoc = xml.etree.ElementTree.fromstring(configXml)
2379 for rendition in cdoc.findall('.//rendition'):
2380 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2384 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2387 if self._downloader.params.get('listformats', None):
2388 self._print_formats([i[0] for i in turls])
2391 # For now, just pick the highest bitrate
2392 format,video_url = turls[-1]
2394 # Get the format arg from the arg stream
2395 req_format = self._downloader.params.get('format', None)
2397 # Select format if we can find one
2400 format, video_url = f, v
2403 # Patch to download from alternative CDN, which does not
2404 # break on current RTMPDump builds
2405 broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2406 better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2408 if video_url.startswith(broken_cdn):
2409 video_url = video_url.replace(broken_cdn, better_cdn)
2411 effTitle = showId + u'-' + epTitle
2416 'upload_date': officialDate,
2421 'description': officialTitle,
2422 'player_url': None #playerUrl
2425 results.append(info)
2430 class EscapistIE(InfoExtractor):
2431 """Information extractor for The Escapist """
2433 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2434 IE_NAME = u'escapist'
2436 def report_extraction(self, showName):
2437 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2439 def report_config_download(self, showName):
2440 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2442 def _real_extract(self, url):
2443 mobj = re.match(self._VALID_URL, url)
2445 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2447 showName = mobj.group('showname')
2448 videoId = mobj.group('episode')
2450 self.report_extraction(showName)
2452 webPage = urllib2.urlopen(url)
2453 webPageBytes = webPage.read()
2454 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2455 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2456 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2457 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2460 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2461 description = unescapeHTML(descMatch.group(1))
2462 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2463 imgUrl = unescapeHTML(imgMatch.group(1))
2464 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2465 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2466 configUrlMatch = re.search('config=(.*)$', playerUrl)
2467 configUrl = urllib2.unquote(configUrlMatch.group(1))
2469 self.report_config_download(showName)
2471 configJSON = urllib2.urlopen(configUrl).read()
2472 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2473 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2476 # Technically, it's JavaScript, not JSON
2477 configJSON = configJSON.replace("'", '"')
2480 config = json.loads(configJSON)
2481 except (ValueError,), err:
2482 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2485 playlist = config['playlist']
2486 videoUrl = playlist[1]['url']
2491 'uploader': showName,
2492 'upload_date': u'NA',
2495 'thumbnail': imgUrl,
2496 'description': description,
2497 'player_url': playerUrl,
2503 class CollegeHumorIE(InfoExtractor):
2504 """Information extractor for collegehumor.com"""
2506 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2507 IE_NAME = u'collegehumor'
2509 def report_webpage(self, video_id):
2510 """Report information extraction."""
2511 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2513 def report_extraction(self, video_id):
2514 """Report information extraction."""
2515 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2517 def _real_extract(self, url):
2518 mobj = re.match(self._VALID_URL, url)
2520 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2522 video_id = mobj.group('videoid')
2524 self.report_webpage(video_id)
2525 request = urllib2.Request(url)
2527 webpage = urllib2.urlopen(request).read()
2528 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2529 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2532 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2534 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2536 internal_video_id = m.group('internalvideoid')
2540 'internal_id': internal_video_id,
2542 'upload_date': u'NA',
2545 self.report_extraction(video_id)
2546 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2548 metaXml = urllib2.urlopen(xmlUrl).read()
2549 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2550 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2553 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2555 videoNode = mdoc.findall('./video')[0]
2556 info['description'] = videoNode.findall('./description')[0].text
2557 info['title'] = videoNode.findall('./caption')[0].text
2558 info['url'] = videoNode.findall('./file')[0].text
2559 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2560 info['ext'] = info['url'].rpartition('.')[2]
2562 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2568 class XVideosIE(InfoExtractor):
2569 """Information extractor for xvideos.com"""
2571 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2572 IE_NAME = u'xvideos'
2574 def report_webpage(self, video_id):
2575 """Report information extraction."""
2576 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2578 def report_extraction(self, video_id):
2579 """Report information extraction."""
2580 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2582 def _real_extract(self, url):
2583 mobj = re.match(self._VALID_URL, url)
2585 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2587 video_id = mobj.group(1).decode('utf-8')
2589 self.report_webpage(video_id)
2591 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2593 webpage = urllib2.urlopen(request).read()
2594 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2595 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2598 self.report_extraction(video_id)
2602 mobj = re.search(r'flv_url=(.+?)&', webpage)
2604 self._downloader.trouble(u'ERROR: unable to extract video url')
2606 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2610 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2612 self._downloader.trouble(u'ERROR: unable to extract video title')
2614 video_title = mobj.group(1).decode('utf-8')
2617 # Extract video thumbnail
2618 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2620 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2622 video_thumbnail = mobj.group(0).decode('utf-8')
2628 'upload_date': u'NA',
2629 'title': video_title,
2631 'thumbnail': video_thumbnail,
2632 'description': None,
2638 class SoundcloudIE(InfoExtractor):
2639 """Information extractor for soundcloud.com
2640 To access the media, the uid of the song and a stream token
2641 must be extracted from the page source and the script must make
2642 a request to media.soundcloud.com/crossdomain.xml. Then
2643 the media can be grabbed by requesting from an url composed
2644 of the stream token and uid
2647 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2648 IE_NAME = u'soundcloud'
2650 def __init__(self, downloader=None):
2651 InfoExtractor.__init__(self, downloader)
2653 def report_webpage(self, video_id):
2654 """Report information extraction."""
2655 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2657 def report_extraction(self, video_id):
2658 """Report information extraction."""
2659 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2661 def _real_extract(self, url):
2662 mobj = re.match(self._VALID_URL, url)
2664 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2667 # extract uploader (which is in the url)
2668 uploader = mobj.group(1).decode('utf-8')
2669 # extract simple title (uploader + slug of song title)
2670 slug_title = mobj.group(2).decode('utf-8')
2671 simple_title = uploader + u'-' + slug_title
2673 self.report_webpage('%s/%s' % (uploader, slug_title))
2675 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2677 webpage = urllib2.urlopen(request).read()
2678 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2679 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2682 self.report_extraction('%s/%s' % (uploader, slug_title))
2684 # extract uid and stream token that soundcloud hands out for access
2685 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2687 video_id = mobj.group(1)
2688 stream_token = mobj.group(2)
2690 # extract unsimplified title
2691 mobj = re.search('"title":"(.*?)",', webpage)
2693 title = mobj.group(1).decode('utf-8')
2695 title = simple_title
2697 # construct media url (with uid/token)
2698 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2699 mediaURL = mediaURL % (video_id, stream_token)
2702 description = u'No description available'
2703 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2705 description = mobj.group(1)
2709 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2712 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2713 except Exception, e:
2714 self._downloader.to_stderr(compat_str(e))
2716 # for soundcloud, a request to a cross domain is required for cookies
2717 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2720 'id': video_id.decode('utf-8'),
2722 'uploader': uploader.decode('utf-8'),
2723 'upload_date': upload_date,
2726 'description': description.decode('utf-8')
2730 class InfoQIE(InfoExtractor):
2731 """Information extractor for infoq.com"""
2733 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2736 def report_webpage(self, video_id):
2737 """Report information extraction."""
2738 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2740 def report_extraction(self, video_id):
2741 """Report information extraction."""
2742 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2744 def _real_extract(self, url):
2745 mobj = re.match(self._VALID_URL, url)
2747 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2750 self.report_webpage(url)
2752 request = urllib2.Request(url)
2754 webpage = urllib2.urlopen(request).read()
2755 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2756 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2759 self.report_extraction(url)
2763 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2765 self._downloader.trouble(u'ERROR: unable to extract video url')
2767 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2771 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2773 self._downloader.trouble(u'ERROR: unable to extract video title')
2775 video_title = mobj.group(1).decode('utf-8')
2777 # Extract description
2778 video_description = u'No description available.'
2779 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2780 if mobj is not None:
2781 video_description = mobj.group(1).decode('utf-8')
2783 video_filename = video_url.split('/')[-1]
2784 video_id, extension = video_filename.split('.')
2790 'upload_date': u'NA',
2791 'title': video_title,
2792 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2794 'description': video_description,
2799 class MixcloudIE(InfoExtractor):
2800 """Information extractor for www.mixcloud.com"""
2801 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2802 IE_NAME = u'mixcloud'
2804 def __init__(self, downloader=None):
2805 InfoExtractor.__init__(self, downloader)
2807 def report_download_json(self, file_id):
2808 """Report JSON download."""
2809 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2811 def report_extraction(self, file_id):
2812 """Report information extraction."""
2813 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2815 def get_urls(self, jsonData, fmt, bitrate='best'):
2816 """Get urls from 'audio_formats' section in json"""
2819 bitrate_list = jsonData[fmt]
2820 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2821 bitrate = max(bitrate_list) # select highest
2823 url_list = jsonData[fmt][bitrate]
2824 except TypeError: # we have no bitrate info.
2825 url_list = jsonData[fmt]
2828 def check_urls(self, url_list):
2829 """Returns 1st active url from list"""
2830 for url in url_list:
2832 urllib2.urlopen(url)
2834 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2839 def _print_formats(self, formats):
2840 print('Available formats:')
2841 for fmt in formats.keys():
2842 for b in formats[fmt]:
2844 ext = formats[fmt][b][0]
2845 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2846 except TypeError: # we have no bitrate info
2847 ext = formats[fmt][0]
2848 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2851 def _real_extract(self, url):
2852 mobj = re.match(self._VALID_URL, url)
2854 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2856 # extract uploader & filename from url
2857 uploader = mobj.group(1).decode('utf-8')
2858 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2860 # construct API request
2861 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2862 # retrieve .json file with links to files
2863 request = urllib2.Request(file_url)
2865 self.report_download_json(file_url)
2866 jsonData = urllib2.urlopen(request).read()
2867 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2868 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2872 json_data = json.loads(jsonData)
2873 player_url = json_data['player_swf_url']
2874 formats = dict(json_data['audio_formats'])
2876 req_format = self._downloader.params.get('format', None)
2879 if self._downloader.params.get('listformats', None):
2880 self._print_formats(formats)
2883 if req_format is None or req_format == 'best':
2884 for format_param in formats.keys():
2885 url_list = self.get_urls(formats, format_param)
2887 file_url = self.check_urls(url_list)
2888 if file_url is not None:
2891 if req_format not in formats.keys():
2892 self._downloader.trouble(u'ERROR: format is not available')
2895 url_list = self.get_urls(formats, req_format)
2896 file_url = self.check_urls(url_list)
2897 format_param = req_format
2900 'id': file_id.decode('utf-8'),
2901 'url': file_url.decode('utf-8'),
2902 'uploader': uploader.decode('utf-8'),
2903 'upload_date': u'NA',
2904 'title': json_data['name'],
2905 'ext': file_url.split('.')[-1].decode('utf-8'),
2906 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2907 'thumbnail': json_data['thumbnail_url'],
2908 'description': json_data['description'],
2909 'player_url': player_url.decode('utf-8'),
2912 class StanfordOpenClassroomIE(InfoExtractor):
2913 """Information extractor for Stanford's Open ClassRoom"""
2915 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2916 IE_NAME = u'stanfordoc'
2918 def report_download_webpage(self, objid):
2919 """Report information extraction."""
2920 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2922 def report_extraction(self, video_id):
2923 """Report information extraction."""
2924 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2926 def _real_extract(self, url):
2927 mobj = re.match(self._VALID_URL, url)
2929 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2932 if mobj.group('course') and mobj.group('video'): # A specific video
2933 course = mobj.group('course')
2934 video = mobj.group('video')
2936 'id': course + '_' + video,
2938 'upload_date': u'NA',
2941 self.report_extraction(info['id'])
2942 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2943 xmlUrl = baseUrl + video + '.xml'
2945 metaXml = urllib2.urlopen(xmlUrl).read()
2946 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2947 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2949 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2951 info['title'] = mdoc.findall('./title')[0].text
2952 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2954 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2956 info['ext'] = info['url'].rpartition('.')[2]
2958 elif mobj.group('course'): # A course page
2959 course = mobj.group('course')
2964 'upload_date': u'NA',
2967 self.report_download_webpage(info['id'])
2969 coursepage = urllib2.urlopen(url).read()
2970 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2971 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2974 m = re.search('<h1>([^<]+)</h1>', coursepage)
2976 info['title'] = unescapeHTML(m.group(1))
2978 info['title'] = info['id']
2980 m = re.search('<description>([^<]+)</description>', coursepage)
2982 info['description'] = unescapeHTML(m.group(1))
2984 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2987 'type': 'reference',
2988 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2992 for entry in info['list']:
2993 assert entry['type'] == 'reference'
2994 results += self.extract(entry['url'])
2999 'id': 'Stanford OpenClassroom',
3002 'upload_date': u'NA',
3005 self.report_download_webpage(info['id'])
3006 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3008 rootpage = urllib2.urlopen(rootURL).read()
3009 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3010 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3013 info['title'] = info['id']
3015 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3018 'type': 'reference',
3019 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3024 for entry in info['list']:
3025 assert entry['type'] == 'reference'
3026 results += self.extract(entry['url'])
3029 class MTVIE(InfoExtractor):
3030 """Information extractor for MTV.com"""
# NOTE(review): this listing appears to have lines elided (the `IE_NAME`
# constant, the `if mobj is None:` / `return` guards before each
# `trouble(...)` call, and the `try:` lines before each `urlopen`) --
# confirm against the complete source file before changing anything here.
# Matches an mtv.com video page URL; scheme is optional (captured as
# 'proto' so it can be defaulted to http:// below).
3032 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
# Progress-reporting helper: announce the webpage download for video_id.
3035 def report_webpage(self, video_id):
3036 """Report information extraction."""
3037 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
# Progress-reporting helper: announce metadata extraction for video_id.
3039 def report_extraction(self, video_id):
3040 """Report information extraction."""
3041 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Main extraction entry point: scrape the MTV page's <meta> tags, then
# fetch the mediaGen XML playlist to obtain the actual video URL.
3043 def _real_extract(self, url):
3044 mobj = re.match(self._VALID_URL, url)
3046 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Default to http:// when the user omitted the scheme (the 'proto'
# group above is optional).
3048 if not mobj.group('proto'):
3049 url = 'http://' + url
3050 video_id = mobj.group('videoid')
3051 self.report_webpage(video_id)
3053 request = urllib2.Request(url)
# Python 2 exception syntax (`except E, err:`) throughout this file.
3055 webpage = urllib2.urlopen(request).read()
3056 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3057 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# Song name comes from the "mtv_vt" meta tag; page is declared
# ISO-8859-1, hence the explicit decode before HTML-unescaping.
3060 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3062 self._downloader.trouble(u'ERROR: unable to extract song name')
3064 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
# Performer (artist) comes from the "mtv_an" meta tag.
3065 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3067 self._downloader.trouble(u'ERROR: unable to extract performer')
3069 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3070 video_title = performer + ' - ' + song_name
# "mtvn_uri" identifies the clip inside MTV's player backend.
# NOTE(review): the message below is missing a word -- it should read
# u'ERROR: unable to extract mtvn_uri'.
3072 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3074 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3076 mtvn_uri = mobj.group(1)
# The numeric playlist/content id is embedded in inline JavaScript.
3078 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3080 self._downloader.trouble(u'ERROR: unable to extract content id')
3082 content_id = mobj.group(1)
# mediaGen.jhtml returns an XML document listing the available
# renditions (format/size/bitrate variants) for this clip.
3084 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3085 self.report_extraction(video_id)
3086 request = urllib2.Request(videogen_url)
3088 metadataXml = urllib2.urlopen(request).read()
3089 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3090 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3093 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3094 renditions = mdoc.findall('.//rendition')
3096 # For now, always pick the highest quality.
# Assumes renditions are listed lowest-to-highest quality, so the last
# one is best -- TODO confirm against the mediaGen XML ordering.
3097 rendition = renditions[-1]
# e.g. type="video/mp4" -> ext "mp4"; format string combines extension,
# resolution and bitrate for --get-format output.
3100 _,_,ext = rendition.attrib['type'].partition('/')
3101 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3102 video_url = rendition.find('./src').text
# NOTE(review): unlike the other error messages here, this one is a
# plain (non-unicode) literal without the u'ERROR: ' prefix.
3104 self._downloader.trouble('Invalid rendition field.')
# Result dictionary (remaining fields elided in this listing).
3110 'uploader': performer,
3111 'upload_date': u'NA',
3112 'title': video_title,
# Information extractor for v.youku.com (Chinese video host). Videos are
# served in multiple segments; each segment URL must be derived from an
# obfuscated file id that is decoded with a seeded pseudo-random shuffle.
3120 class YoukuIE(InfoExtractor):
# NOTE(review): this listing appears to have lines elided (missing method
# header around line 3136 for what is evidently `def _gen_sid(self):`,
# missing list initializers, guards and `return` statements) -- confirm
# against the complete source file before changing anything here.
3122 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3125 def __init__(self, downloader=None):
3126 InfoExtractor.__init__(self, downloader)
# Progress-reporting helper: announce the webpage download.
3128 def report_download_webpage(self, file_id):
3129 """Report webpage download."""
3130 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
# Progress-reporting helper: announce information extraction.
3132 def report_extraction(self, file_id):
3133 """Report information extraction."""
3134 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
# Body of a session-id generator (its `def _gen_sid(self):` header is
# elided from this listing): millisecond timestamp plus two random
# numbers, concatenated as a decimal string.
3137 nowTime = int(time.time() * 1000)
3138 random1 = random.randint(1000,1998)
3139 random2 = random.randint(1000,9999)
3141 return "%d%d%d" %(nowTime,random1,random2)
# Builds the character table used to de-obfuscate file ids: a
# deterministic, seed-driven shuffle of the alphabet below (linear
# congruential step: seed = (seed*211 + 30031) % 65536).
3143 def _get_file_ID_mix_string(self, seed):
3145 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3147 for i in range(len(source)):
3148 seed = (seed * 211 + 30031 ) % 65536
# Scale the 16-bit seed into an index over the *remaining* source
# characters; each picked character is removed so the result is a
# permutation of the alphabet.
3149 index = math.floor(seed / 65536 * len(source) )
3150 mixed.append(source[int(index)])
3151 source.remove(source[int(index)])
3152 #return ''.join(mixed)
# Decode the obfuscated '*'-separated file id: each numeric token is an
# index into the shuffled character table from the method above.
3155 def _get_file_id(self, fileId, seed):
3156 mixed = self._get_file_ID_mix_string(seed)
3157 ids = fileId.split('*')
3161 realId.append(mixed[int(ch)])
3162 return ''.join(realId)
# Main extraction entry point: fetch the JSON playlist for the video id,
# pick a format, then emit one info dict per flv segment.
3164 def _real_extract(self, url):
3165 mobj = re.match(self._VALID_URL, url)
3167 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3169 video_id = mobj.group('ID')
3171 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3173 request = urllib2.Request(info_url, None, std_headers)
3175 self.report_download_webpage(video_id)
3176 jsondata = urllib2.urlopen(request).read()
# NOTE(review): Py3-style `as err` here, while the rest of the file uses
# the Python 2 comma form -- inconsistent, though both parse on Py2.6+.
3177 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
3178 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3181 self.report_extraction(video_id)
3183 config = json.loads(jsondata)
3185 video_title = config['data'][0]['title']
# 'seed' drives the file-id de-obfuscation shuffle above.
3186 seed = config['data'][0]['seed']
# Map the user's --format request onto Youku's stream names; the
# available keys of 'streamfileids' enumerate the supported formats.
3188 format = self._downloader.params.get('format', None)
3189 supported_format = config['data'][0]['streamfileids'].keys()
3191 if format is None or format == 'best':
3192 if 'hd2' in supported_format:
3197 elif format == 'worst':
3205 fileid = config['data'][0]['streamfileids'][format]
3206 seg_number = len(config['data'][0]['segs'][format])
# Collect the per-segment access keys ('k') needed for each segment URL.
3209 for i in xrange(seg_number):
3210 keys.append(config['data'][0]['segs'][format][i]['k'])
3213 #youku only could be viewed from mainland china
3215 self._downloader.trouble(u'ERROR: unable to extract info section')
3219 sid = self._gen_sid()
3220 fileid = self._get_file_id(fileid, seed)
3222 #column 8,9 of fileid represent the segment number
3223 #fileid[7:9] should be changed
# Build one download URL per segment: splice the segment index (as two
# uppercase hex digits) into positions 8-9 of the decoded file id.
3224 for index, key in enumerate(keys):
3226 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3227 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
# One info dict per segment, suffixed _part00, _part01, ...
3230 'id': '%s_part%02d' % (video_id, index),
3231 'url': download_url,
3233 'upload_date': u'NA',
3234 'title': video_title,
3237 files_info.append(info)
3242 class XNXXIE(InfoExtractor):
3243 """Information extractor for xnxx.com"""
# NOTE(review): this listing appears to have lines elided (the
# `if mobj is None:` / `return` guards and `try:` statements before the
# visible error paths) -- confirm against the complete source file.
3245 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
# Scraping patterns: the flv URL and thumbnail are URL-encoded values in
# the page's flash parameters; the title comes from the <title> tag.
3247 VIDEO_URL_RE = r'flv_url=(.*?)&'
3248 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3249 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
# Progress-reporting helper: announce the webpage download.
3251 def report_webpage(self, video_id):
3252 """Report information extraction"""
3253 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
# Progress-reporting helper: announce information extraction.
3255 def report_extraction(self, video_id):
3256 """Report information extraction"""
3257 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Main extraction entry point: download the page and pull url/title/
# thumbnail out with the three regexes above.
3259 def _real_extract(self, url):
3260 mobj = re.match(self._VALID_URL, url)
3262 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Python 2: group(1) is a byte string; decode to unicode for the result.
3264 video_id = mobj.group(1).decode('utf-8')
3266 self.report_webpage(video_id)
3268 # Get webpage content
3270 webpage = urllib2.urlopen(url).read()
3271 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3272 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
# flv_url is percent-encoded inside the flash params; unquote it.
3275 result = re.search(self.VIDEO_URL_RE, webpage)
3277 self._downloader.trouble(u'ERROR: unable to extract video url')
3279 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3281 result = re.search(self.VIDEO_TITLE_RE, webpage)
3283 self._downloader.trouble(u'ERROR: unable to extract video title')
3285 video_title = result.group(1).decode('utf-8')
3287 result = re.search(self.VIDEO_THUMB_RE, webpage)
3289 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3291 video_thumbnail = result.group(1).decode('utf-8')
# Result dictionary (remaining fields elided in this listing).
3297 'upload_date': u'NA',
3298 'title': video_title,
3300 'thumbnail': video_thumbnail,
3301 'description': None,
3305 class GooglePlusIE(InfoExtractor):
3306 """Information extractor for plus.google.com."""
3308 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3309 IE_NAME = u'plus.google'
3311 def __init__(self, downloader=None):
3312 InfoExtractor.__init__(self, downloader)
3314 def report_extract_entry(self, url):
3315 """Report downloading extry"""
3316 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3318 def report_date(self, upload_date):
3319 """Report downloading extry"""
3320 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3322 def report_uploader(self, uploader):
3323 """Report downloading extry"""
3324 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3326 def report_title(self, video_title):
3327 """Report downloading extry"""
3328 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3330 def report_extract_vid_page(self, video_page):
3331 """Report information extraction."""
3332 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3334 def _real_extract(self, url):
3335 # Extract id from URL
3336 mobj = re.match(self._VALID_URL, url)
3338 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3341 post_url = mobj.group(0)
3342 video_id = mobj.group(2)
3344 video_extension = 'flv'
3346 # Step 1, Retrieve post webpage to extract further information
3347 self.report_extract_entry(post_url)
3348 request = urllib2.Request(post_url)
3350 webpage = urllib2.urlopen(request).read()
3351 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3352 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3355 # Extract update date
3357 pattern = 'title="Timestamp">(.*?)</a>'
3358 mobj = re.search(pattern, webpage)
3360 upload_date = mobj.group(1)
3361 # Convert timestring to a format suitable for filename
3362 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3363 upload_date = upload_date.strftime('%Y%m%d')
3364 self.report_date(upload_date)
3368 pattern = r'rel\="author".*?>(.*?)</a>'
3369 mobj = re.search(pattern, webpage)
3371 uploader = mobj.group(1)
3372 self.report_uploader(uploader)
3375 # Get the first line for title
3377 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3378 mobj = re.search(pattern, webpage)
3380 video_title = mobj.group(1)
3381 self.report_title(video_title)
3383 # Step 2, Stimulate clicking the image box to launch video
3384 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3385 mobj = re.search(pattern, webpage)
3387 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3389 video_page = mobj.group(1)
3390 request = urllib2.Request(video_page)
3392 webpage = urllib2.urlopen(request).read()
3393 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3394 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3396 self.report_extract_vid_page(video_page)
3399 # Extract video links on video page
3400 """Extract video links of all sizes"""
3401 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3402 mobj = re.findall(pattern, webpage)
3404 self._downloader.trouble(u'ERROR: unable to extract video links')
3406 # Sort in resolution
3407 links = sorted(mobj)
3409 # Choose the lowest of the sort, i.e. highest resolution
3410 video_url = links[-1]
3411 # Only get the url. The resolution part in the tuple has no use anymore
3412 video_url = video_url[-1]
3413 # Treat escaped \u0026 style hex
3414 video_url = unicode(video_url, "unicode_escape")
3418 'id': video_id.decode('utf-8'),
3420 'uploader': uploader.decode('utf-8'),
3421 'upload_date': upload_date.decode('utf-8'),
3422 'title': video_title.decode('utf-8'),
3423 'ext': video_extension.decode('utf-8'),