2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
18 from urlparse import parse_qs
21 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    uploader:       Nickname of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Stored via set_downloader() so the downloader can also be
        # (re)assigned after construction.
        # NOTE(review): original L68 is elided from this view -- presumably
        # an initialization-state flag; confirm against the full file.
        self.set_downloader(downloader)
71 def suitable(self, url):
72 """Receives a URL and returns True if suitable for this IE."""
73 return re.match(self._VALID_URL, url) is not None
76 """Initializes an instance (authentication, etc)."""
78 self._real_initialize()
81 def extract(self, url):
82 """Extracts URL information and returns it in list of dicts."""
84 return self._real_extract(url)
86 def set_downloader(self, downloader):
87 """Sets the downloader for this IE."""
88 self._downloader = downloader
90 def _real_initialize(self):
91 """Real initialization process. Redefine in subclasses."""
94 def _real_extract(self, url):
95 """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the `_VALID_URL = r'''...` opener (original L101-103)
    # and the closing quotes are elided from this view.  The lines below
    # are the body of that verbose pattern (matched with re.VERBOSE in
    # suitable()); the inline `#` text is part of the pattern itself.
        (?:https?://)?                                       # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
        (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
        (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
        (?:                                                  # the various things that can precede the ID:
            (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
            |(?:                                             # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                # (elided in this view: the `v=` token, original L115-116)
            )?                                               # optional -> youtube.com/xxxx is OK
        )?                                                   # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
        (?(1).+)?                                            # if we found the ID, everything can follow

    # URL that forces the site UI to English.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the next_url= parameter of redirect (e.g. age-gate) URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same itags reordered so that free/open formats are preferred.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Maps itag -> container extension.
    _video_extensions = {
        # (elided in this view: entries at original L131-135)
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        # (elided in this view: remaining entries and the closing brace,
        # original L137-141)
    # Maps itag -> dimension string; all entries and the closing brace are
    # elided from this view (original L143-159).
    _video_dimensions = {
160 def suitable(self, url):
161 """Receives a URL and returns True if suitable for this IE."""
162 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        # Docstring fixed: it previously said "video info webpage"
        # (copy-paste from the method above).
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        # Docstring fixed: it previously said "Report extracted video URL"
        # (copy-paste), which did not match the message below.
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube timedtext XML into SRT subtitle text.

        NOTE(review): the initialisation of the `srt` accumulator
        (original L201), the float conversion of `start` (original L206)
        and the final `return srt` (original L215) are elided from this
        view of the file.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none is given
            # `start` is presumably converted to float on the elided
            # original L206 -- confirm against the full file.
            end = start + float(dur)
            # SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            # SRT block: running index, time range, caption, blank separator.
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print('Available formats:')
        # NOTE(review): the loop header (`for x in formats:`, original
        # L219) is elided from this view; `x` below is the itag listed.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Force English UI, optionally log in, and confirm age.

        NOTE(review): several structural lines (try: headers, early
        returns, parts of the form dicts) are elided from this view of
        the file; the gaps are flagged inline.
        """
        if self._downloader is None:
            # (elided: return)

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # (elided: try: and the branch reading login/password on success)
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                # No .netrc entry for "youtube": treated as a parse error.
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Set language: hitting _LANG_URL pins the site to English so the
        # later page-scraping regexes match.  Failure is only a warning.
        request = urllib2.Request(self._LANG_URL)
        # (elided: self.report_lang() and try:)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        # (elided: `if username is None: return` and the login_form opener)
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        # (elided: self.report_login() and try:)
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, the credentials were bad.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # Confirm age by POSTing the verify_age form.
        # (elided: the age_form dict opener)
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        # (elided: try:)
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            # Age confirmation failure is fatal (ERROR, not WARNING).
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        """Extract info for a YouTube URL; builds a list of info dicts.

        NOTE(review): many structural lines (if-guards, try: headers,
        return/break statements, dict openers) are elided from this view
        of the file; the gaps are flagged inline.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        # (elided: `if mobj is not None:` guard)
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Download the watch page; has_verified=1 bypasses the age gate.
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        # (elided: try:)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # (elided: `if mobj is not None:`; else-branch setting player_url)
            # Unescape the JS-escaped URL (\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several `el` values until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            # (elided: try:)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    # (elided: break)
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            # (elided: else:)
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail (missing thumbnail is only a warning)
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:	# don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the watch page, normalised to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        # (elided: `if mobj is not None:` guard)
            # Collapse /,- separators to single spaces before parsing.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                # (elided: try:/except around the parse attempt)
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            # (elided: try: -- Trouble raised below is caught at the end)
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                # (elided: try:)
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                # Map lang_code -> track name.
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    # (elided: `srt_lang = 'en'` and the else: header)
                    # Python 2: dict.keys() returns a list, so [0] is valid.
                    srt_lang = srt_lang_list.keys()[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                # (elided: try:)
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                # (elided: `if not srt_xml:` guard)
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # Subtitle problems are reported but do not abort extraction.
                self._downloader.trouble(trouble[0])

        # duration (missing duration is only a warning)
        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
        # (elided: fallback assignment and else: header)
            video_duration = urllib.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            # NOTE(review): the filter above checks 'itag' and 'url' but not
            # 'sig' -- a stream entry without a signature raises KeyError here.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Only consider formats at or below the requested quality cap.
                format_list = available_formats[available_formats.index(format_limit):]
            # (elided: else:)
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # (elided: else:)
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    # (elided: `if rf in url_map:` guard and break)
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
        # (elided: else:)
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Human-readable "itag - WxH" label (RTMP has no itag, so fall
            # back to the extension).
            video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
                    self._video_dimensions.get(format_param, '???'))

            # (elided: the `results.append({` opener and the closing
            # `})` / `return results`)
                'id':           video_id.decode('utf-8'),
                'url':          video_real_url.decode('utf-8'),
                'uploader':     video_uploader.decode('utf-8'),
                'upload_date':  upload_date,
                'title':        video_title,
                'ext':          video_extension.decode('utf-8'),
                'format':       video_format,
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Page whose retrieval, plus the _FILTER_POST below, opts out of the
    # family filter (see _real_initialize).
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter opt-out.

        NOTE(review): try: headers and the disclaimer_form dict opener
        (original L536-539) are elided from this view of the file.
        """
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        # (elided: try:)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age: POST the filter form.
        # (elided: the disclaimer_form dict opener)
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        # (elided: try:)
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        """Extract video info from a metacafe.com watch URL.

        NOTE(review): if-guards, try: headers and return statements are
        elided from this view of the file; gaps are flagged inline.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # yt-prefixed ids are delegated to the YouTube extractor via
            # the downloader.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        # (elided: try:)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # (elided: `if mobj is not None:` guard)
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            # (elided: `if mobj is None:` branch, original L583-585)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # (elided: else:) -- fall back to the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            # (elided: `if mobj is None:` guard and return)
                self._downloader.trouble(u'ERROR: unable to extract media URL')

            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            # (elided: `if mobj is None:` guard and return)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            # Unescape JSON-escaped slashes; group(2) is the access key.
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # (elided: the `return [{` opener and closing `}]`)
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader.decode('utf-8'),
            'upload_date':  u'NA',
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract video info from a Dailymotion video URL.

        NOTE(review): if-guards, try: headers and return statements are
        elided from this view of the file; gaps are flagged inline.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Strip any "_title" suffix and query string from the id segment.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-restricted pages are served.
        request.add_header('Cookie', 'family_filter=off')
        # (elided: try:)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = urllib.unquote(mobj.group(1))

        # Pick the best available quality key, highest quality first.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            # (elided: `if key in flashvars:` guard and max_quality = key)
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
        # (elided: break / else:)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        video_uploader = u'NA'
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # (elided: `if mobj is None:` header)
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            # (elided: else:)
                video_uploader = mobj_official.group(1)
        # (elided: else:)
            video_uploader = mobj.group(1)

        video_upload_date = u'NA'
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # (elided: `if mobj is not None:` guard)
            # Page shows DD-MM-YYYY; reorder groups to YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # (elided: the `return [{` opener and closing `}]`)
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader.decode('utf-8'),
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract video info from a video.google.com videoplay URL.

        NOTE(review): if-guards, try: headers and return statements are
        elided from this view of the file; gaps are flagged inline.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        # (elided: try:)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # (elided: `if mobj is None:` header -- flash fallback follows)
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
            # (elided: `if mobj is None:` guard and return)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = urllib.unquote(mobj.group(1))
            # Decode the \xNN escapes embedded in the page's JS.
            mediaURL = mediaURL.replace('\\x3d', '\x3d')
            mediaURL = mediaURL.replace('\\x26', '\x26')
        # (elided: the branch assigning video_url, original L771-775)

        mobj = re.search(r'<title>(.*)</title>', webpage)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # Search result page carries the thumbnail for this docid.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            # (elided: try:)
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            # (elided: `if mobj is None:` guard and return)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else:	# we need something to pass to process_info
            # (elided: the empty-thumbnail fallback assignment)

        # (elided: the `return [{` opener and closing `}]`; the 'uploader'
        # entry at original L810 is also elided from this view)
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'upload_date':  u'NA',
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract video info from a photobucket.com flv URL.

        NOTE(review): if-guards, try: headers and return statements are
        elided from this view of the file; gaps are flagged inline.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # (elided: try:)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # (elided: the video_url assignment, original L861-862)

        # Title and uploader come from the same <title> match: group(1) is
        # the title, group(2) the uploader.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # (elided: the `return [{` opener and closing `}]`)
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader,
            'upload_date':  u'NA',
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
    def _real_extract(self, url, new_video=True):
        """Extract video info from a video.yahoo.com URL.

        Non-/watch/ URLs are first resolved to a canonical /watch/ URL
        and re-extracted once with new_video=False.  NOTE(review):
        if-guards, try: headers and return statements are elided from
        this view of the file; gaps are flagged inline.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
            # (elided: try:)
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            # (elided: `if mobj is None:` guard and return)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            # (elided: `if mobj is None:` guard and return)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse once on the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # (elided: try:)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) of this regex is the (people|profile)
        # alternation, while the uploader name is captured by group(2) --
        # this looks like an off-by-one; confirm before changing.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        # (elided: try:)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        # (elided: `if mobj is None:` guard and return)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # (elided: the `return [{` opener and closing `}]`; the 'url' entry
        # at original L1014 is also elided from this view)
            'id':           video_id.decode('utf-8'),
            'uploader':     video_uploader,
            'upload_date':  u'NA',
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
            # NOTE(review): duplicate 'thumbnail' key -- the entry below
            # overwrites this one; also video_thumbnail was already decoded
            # above, so this extra .decode('utf-8') is suspect. One of the
            # two entries should be removed.
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
            'thumbnail':    video_thumbnail,
1025 class VimeoIE(InfoExtractor):
1026 """Information extractor for vimeo.com."""
1028 # _VALID_URL matches Vimeo URLs
1029 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
1032 def __init__(self, downloader=None):
1033 InfoExtractor.__init__(self, downloader)
1035 def report_download_webpage(self, video_id):
1036 """Report webpage download."""
1037 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1039 def report_extraction(self, video_id):
1040 """Report information extraction."""
1041 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1043 def _real_extract(self, url, new_video=True):
1044 # Extract ID from URL
1045 mobj = re.match(self._VALID_URL, url)
1047 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1050 video_id = mobj.group(1)
1052 # Retrieve video webpage to extract further information
1053 request = urllib2.Request(url, None, std_headers)
1055 self.report_download_webpage(video_id)
1056 webpage = urllib2.urlopen(request).read()
1057 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1058 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1061 # Now we begin extracting as much information as we can from what we
1062 # retrieved. First we extract the information common to all extractors,
1063 # and latter we extract those that are Vimeo specific.
1064 self.report_extraction(video_id)
1066 # Extract the config JSON
1067 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1069 config = json.loads(config)
1071 self._downloader.trouble(u'ERROR: unable to extract info section')
1075 video_title = config["video"]["title"]
1078 video_uploader = config["video"]["owner"]["name"]
1080 # Extract video thumbnail
1081 video_thumbnail = config["video"]["thumbnail"]
1083 # Extract video description
1084 video_description = get_element_by_id("description", webpage.decode('utf8'))
1085 if video_description: video_description = clean_html(video_description)
1086 else: video_description = ''
1088 # Extract upload date
1089 video_upload_date = u'NA'
1090 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1091 if mobj is not None:
1092 video_upload_date = mobj.group(1)
1094 # Vimeo specific: extract request signature and timestamp
1095 sig = config['request']['signature']
1096 timestamp = config['request']['timestamp']
1098 # Vimeo specific: extract video codec and quality information
1099 # First consider quality, then codecs, then take everything
1100 # TODO bind to format param
1101 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1102 files = { 'hd': [], 'sd': [], 'other': []}
1103 for codec_name, codec_extension in codecs:
1104 if codec_name in config["video"]["files"]:
1105 if 'hd' in config["video"]["files"][codec_name]:
1106 files['hd'].append((codec_name, codec_extension, 'hd'))
1107 elif 'sd' in config["video"]["files"][codec_name]:
1108 files['sd'].append((codec_name, codec_extension, 'sd'))
1110 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1112 for quality in ('hd', 'sd', 'other'):
1113 if len(files[quality]) > 0:
1114 video_quality = files[quality][0][2]
1115 video_codec = files[quality][0][0]
1116 video_extension = files[quality][0][1]
1117 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1120 self._downloader.trouble(u'ERROR: no known codec found')
1123 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1124 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1129 'uploader': video_uploader,
1130 'upload_date': video_upload_date,
1131 'title': video_title,
1132 'ext': video_extension,
1133 'thumbnail': video_thumbnail,
1134 'description': video_description,
1138 class GenericIE(InfoExtractor):
1139 """Generic last-resort information extractor."""
1142 IE_NAME = u'generic'
1144 def __init__(self, downloader=None):
1145 InfoExtractor.__init__(self, downloader)
1147 def report_download_webpage(self, video_id):
1148 """Report webpage download."""
1149 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1150 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1152 def report_extraction(self, video_id):
1153 """Report information extraction."""
1154 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1156 def report_following_redirect(self, new_url):
1157 """Report information extraction."""
1158 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1160 def _test_redirect(self, url):
1161 """Check if it is a redirect, like url shorteners, in case restart chain."""
1162 class HeadRequest(urllib2.Request):
1163 def get_method(self):
1166 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1168 Subclass the HTTPRedirectHandler to make it use our
1169 HeadRequest also on the redirected URL
1171 def redirect_request(self, req, fp, code, msg, headers, newurl):
1172 if code in (301, 302, 303, 307):
1173 newurl = newurl.replace(' ', '%20')
1174 newheaders = dict((k,v) for k,v in req.headers.items()
1175 if k.lower() not in ("content-length", "content-type"))
1176 return HeadRequest(newurl,
1178 origin_req_host=req.get_origin_req_host(),
1181 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1183 class HTTPMethodFallback(urllib2.BaseHandler):
1185 Fallback to GET if HEAD is not allowed (405 HTTP error)
1187 def http_error_405(self, req, fp, code, msg, headers):
1191 newheaders = dict((k,v) for k,v in req.headers.items()
1192 if k.lower() not in ("content-length", "content-type"))
1193 return self.parent.open(urllib2.Request(req.get_full_url(),
1195 origin_req_host=req.get_origin_req_host(),
1199 opener = urllib2.OpenerDirector()
1200 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1201 HTTPMethodFallback, HEADRedirectHandler,
1202 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1203 opener.add_handler(handler())
1205 response = opener.open(HeadRequest(url))
1206 new_url = response.geturl()
1208 if url == new_url: return False
1210 self.report_following_redirect(new_url)
1211 self._downloader.download([new_url])
1214 def _real_extract(self, url):
1215 if self._test_redirect(url): return
1217 video_id = url.split('/')[-1]
1218 request = urllib2.Request(url)
1220 self.report_download_webpage(video_id)
1221 webpage = urllib2.urlopen(request).read()
1222 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1223 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1225 except ValueError, err:
1226 # since this is the last-resort InfoExtractor, if
1227 # this error is thrown, it'll be thrown here
1228 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1231 self.report_extraction(video_id)
1232 # Start with something easy: JW Player in SWFObject
1233 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1235 # Broaden the search a little bit
1236 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1238 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1241 # It's possible that one of the regexes
1242 # matched, but returned an empty group:
1243 if mobj.group(1) is None:
1244 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1247 video_url = urllib.unquote(mobj.group(1))
1248 video_id = os.path.basename(video_url)
1250 # here's a fun little line of code for you:
1251 video_extension = os.path.splitext(video_id)[1][1:]
1252 video_id = os.path.splitext(video_id)[0]
1254 # it's tempting to parse this further, but you would
1255 # have to take into account all the variations like
1256 # Video Title - Site Name
1257 # Site Name | Video Title
1258 # Video Title - Tagline | Site Name
1259 # and so on and so forth; it's just not practical
1260 mobj = re.search(r'<title>(.*)</title>', webpage)
1262 self._downloader.trouble(u'ERROR: unable to extract title')
1264 video_title = mobj.group(1).decode('utf-8')
1266 # video uploader is domain name
1267 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1269 self._downloader.trouble(u'ERROR: unable to extract title')
1271 video_uploader = mobj.group(1).decode('utf-8')
1274 'id': video_id.decode('utf-8'),
1275 'url': video_url.decode('utf-8'),
1276 'uploader': video_uploader,
1277 'upload_date': u'NA',
1278 'title': video_title,
1279 'ext': video_extension.decode('utf-8'),
1283 class YoutubeSearchIE(InfoExtractor):
1284 """Information Extractor for YouTube search queries."""
1285 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1286 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1287 _max_youtube_results = 1000
1288 IE_NAME = u'youtube:search'
1290 def __init__(self, downloader=None):
1291 InfoExtractor.__init__(self, downloader)
1293 def report_download_page(self, query, pagenum):
1294 """Report attempt to download search page with given number."""
1295 query = query.decode(preferredencoding())
1296 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1298 def _real_extract(self, query):
1299 mobj = re.match(self._VALID_URL, query)
1301 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1304 prefix, query = query.split(':')
1306 query = query.encode('utf-8')
1308 self._download_n_results(query, 1)
1310 elif prefix == 'all':
1311 self._download_n_results(query, self._max_youtube_results)
1317 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1319 elif n > self._max_youtube_results:
1320 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1321 n = self._max_youtube_results
1322 self._download_n_results(query, n)
1324 except ValueError: # parsing prefix as integer fails
1325 self._download_n_results(query, 1)
1328 def _download_n_results(self, query, n):
1329 """Downloads a specified number of results for a query"""
1335 while (50 * pagenum) < limit:
1336 self.report_download_page(query, pagenum+1)
1337 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1338 request = urllib2.Request(result_url)
1340 data = urllib2.urlopen(request).read()
1341 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1342 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1344 api_response = json.loads(data)['data']
1346 new_ids = list(video['id'] for video in api_response['items'])
1347 video_ids += new_ids
1349 limit = min(n, api_response['totalItems'])
1352 if len(video_ids) > n:
1353 video_ids = video_ids[:n]
1354 for id in video_ids:
1355 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1359 class GoogleSearchIE(InfoExtractor):
1360 """Information Extractor for Google Video search queries."""
1361 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1362 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1363 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1364 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1365 _max_google_results = 1000
1366 IE_NAME = u'video.google:search'
1368 def __init__(self, downloader=None):
1369 InfoExtractor.__init__(self, downloader)
1371 def report_download_page(self, query, pagenum):
1372 """Report attempt to download playlist page with given number."""
1373 query = query.decode(preferredencoding())
1374 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1376 def _real_extract(self, query):
1377 mobj = re.match(self._VALID_URL, query)
1379 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1382 prefix, query = query.split(':')
1384 query = query.encode('utf-8')
1386 self._download_n_results(query, 1)
1388 elif prefix == 'all':
1389 self._download_n_results(query, self._max_google_results)
1395 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1397 elif n > self._max_google_results:
1398 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1399 n = self._max_google_results
1400 self._download_n_results(query, n)
1402 except ValueError: # parsing prefix as integer fails
1403 self._download_n_results(query, 1)
1406 def _download_n_results(self, query, n):
1407 """Downloads a specified number of results for a query"""
1413 self.report_download_page(query, pagenum)
1414 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1415 request = urllib2.Request(result_url)
1417 page = urllib2.urlopen(request).read()
1418 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1419 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1422 # Extract video identifiers
1423 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1424 video_id = mobj.group(1)
1425 if video_id not in video_ids:
1426 video_ids.append(video_id)
1427 if len(video_ids) == n:
1428 # Specified n videos reached
1429 for id in video_ids:
1430 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1433 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1434 for id in video_ids:
1435 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1438 pagenum = pagenum + 1
1441 class YahooSearchIE(InfoExtractor):
1442 """Information Extractor for Yahoo! Video search queries."""
1443 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1444 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1445 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1446 _MORE_PAGES_INDICATOR = r'\s*Next'
1447 _max_yahoo_results = 1000
1448 IE_NAME = u'video.yahoo:search'
1450 def __init__(self, downloader=None):
1451 InfoExtractor.__init__(self, downloader)
1453 def report_download_page(self, query, pagenum):
1454 """Report attempt to download playlist page with given number."""
1455 query = query.decode(preferredencoding())
1456 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1458 def _real_extract(self, query):
1459 mobj = re.match(self._VALID_URL, query)
1461 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1464 prefix, query = query.split(':')
1466 query = query.encode('utf-8')
1468 self._download_n_results(query, 1)
1470 elif prefix == 'all':
1471 self._download_n_results(query, self._max_yahoo_results)
1477 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1479 elif n > self._max_yahoo_results:
1480 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1481 n = self._max_yahoo_results
1482 self._download_n_results(query, n)
1484 except ValueError: # parsing prefix as integer fails
1485 self._download_n_results(query, 1)
1488 def _download_n_results(self, query, n):
1489 """Downloads a specified number of results for a query"""
1492 already_seen = set()
1496 self.report_download_page(query, pagenum)
1497 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1498 request = urllib2.Request(result_url)
1500 page = urllib2.urlopen(request).read()
1501 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1502 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1505 # Extract video identifiers
1506 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1507 video_id = mobj.group(1)
1508 if video_id not in already_seen:
1509 video_ids.append(video_id)
1510 already_seen.add(video_id)
1511 if len(video_ids) == n:
1512 # Specified n videos reached
1513 for id in video_ids:
1514 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1517 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1518 for id in video_ids:
1519 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1522 pagenum = pagenum + 1
1525 class YoutubePlaylistIE(InfoExtractor):
1526 """Information Extractor for YouTube playlists."""
1528 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1529 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1530 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1531 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1532 IE_NAME = u'youtube:playlist'
1534 def __init__(self, downloader=None):
1535 InfoExtractor.__init__(self, downloader)
1537 def report_download_page(self, playlist_id, pagenum):
1538 """Report attempt to download playlist page with given number."""
1539 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1541 def _real_extract(self, url):
1542 # Extract playlist id
1543 mobj = re.match(self._VALID_URL, url)
1545 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1549 if mobj.group(3) is not None:
1550 self._downloader.download([mobj.group(3)])
1553 # Download playlist pages
1554 # prefix is 'p' as default for playlists but there are other types that need extra care
1555 playlist_prefix = mobj.group(1)
1556 if playlist_prefix == 'a':
1557 playlist_access = 'artist'
1559 playlist_prefix = 'p'
1560 playlist_access = 'view_play_list'
1561 playlist_id = mobj.group(2)
1566 self.report_download_page(playlist_id, pagenum)
1567 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1568 request = urllib2.Request(url)
1570 page = urllib2.urlopen(request).read()
1571 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1572 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1575 # Extract video identifiers
1577 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1578 if mobj.group(1) not in ids_in_page:
1579 ids_in_page.append(mobj.group(1))
1580 video_ids.extend(ids_in_page)
1582 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1584 pagenum = pagenum + 1
1586 playliststart = self._downloader.params.get('playliststart', 1) - 1
1587 playlistend = self._downloader.params.get('playlistend', -1)
1588 if playlistend == -1:
1589 video_ids = video_ids[playliststart:]
1591 video_ids = video_ids[playliststart:playlistend]
1593 for id in video_ids:
1594 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1598 class YoutubeChannelIE(InfoExtractor):
1599 """Information Extractor for YouTube channels."""
1601 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1602 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1603 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1604 IE_NAME = u'youtube:channel'
1606 def report_download_page(self, channel_id, pagenum):
1607 """Report attempt to download channel page with given number."""
1608 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1610 def _real_extract(self, url):
1611 # Extract channel id
1612 mobj = re.match(self._VALID_URL, url)
1614 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1617 # Download channel pages
1618 channel_id = mobj.group(1)
1623 self.report_download_page(channel_id, pagenum)
1624 url = self._TEMPLATE_URL % (channel_id, pagenum)
1625 request = urllib2.Request(url)
1627 page = urllib2.urlopen(request).read()
1628 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1629 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1632 # Extract video identifiers
1634 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1635 if mobj.group(1) not in ids_in_page:
1636 ids_in_page.append(mobj.group(1))
1637 video_ids.extend(ids_in_page)
1639 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1641 pagenum = pagenum + 1
1643 for id in video_ids:
1644 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1648 class YoutubeUserIE(InfoExtractor):
1649 """Information Extractor for YouTube users."""
1651 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1652 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1653 _GDATA_PAGE_SIZE = 50
1654 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1655 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1656 IE_NAME = u'youtube:user'
1658 def __init__(self, downloader=None):
1659 InfoExtractor.__init__(self, downloader)
1661 def report_download_page(self, username, start_index):
1662 """Report attempt to download user page."""
1663 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1664 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1666 def _real_extract(self, url):
1668 mobj = re.match(self._VALID_URL, url)
1670 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1673 username = mobj.group(1)
1675 # Download video ids using YouTube Data API. Result size per
1676 # query is limited (currently to 50 videos) so we need to query
1677 # page by page until there are no video ids - it means we got
1684 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1685 self.report_download_page(username, start_index)
1687 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1690 page = urllib2.urlopen(request).read()
1691 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1692 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1695 # Extract video identifiers
1698 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1699 if mobj.group(1) not in ids_in_page:
1700 ids_in_page.append(mobj.group(1))
1702 video_ids.extend(ids_in_page)
1704 # A little optimization - if current page is not
1705 # "full", ie. does not contain PAGE_SIZE video ids then
1706 # we can assume that this page is the last one - there
1707 # are no more ids on further pages - no need to query
1710 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1715 all_ids_count = len(video_ids)
1716 playliststart = self._downloader.params.get('playliststart', 1) - 1
1717 playlistend = self._downloader.params.get('playlistend', -1)
1719 if playlistend == -1:
1720 video_ids = video_ids[playliststart:]
1722 video_ids = video_ids[playliststart:playlistend]
1724 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1725 (username, all_ids_count, len(video_ids)))
1727 for video_id in video_ids:
1728 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1731 class BlipTVUserIE(InfoExtractor):
1732 """Information Extractor for blip.tv users."""
1734 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1736 IE_NAME = u'blip.tv:user'
1738 def __init__(self, downloader=None):
1739 InfoExtractor.__init__(self, downloader)
1741 def report_download_page(self, username, pagenum):
1742 """Report attempt to download user page."""
1743 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1744 (self.IE_NAME, username, pagenum))
1746 def _real_extract(self, url):
1748 mobj = re.match(self._VALID_URL, url)
1750 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1753 username = mobj.group(1)
1755 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1757 request = urllib2.Request(url)
1760 page = urllib2.urlopen(request).read().decode('utf-8')
1761 mobj = re.search(r'data-users-id="([^"]+)"', page)
1762 page_base = page_base % mobj.group(1)
1763 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1764 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1768 # Download video ids using BlipTV Ajax calls. Result size per
1769 # query is limited (currently to 12 videos) so we need to query
1770 # page by page until there are no video ids - it means we got
1777 self.report_download_page(username, pagenum)
1779 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1782 page = urllib2.urlopen(request).read().decode('utf-8')
1783 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1784 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1787 # Extract video identifiers
1790 for mobj in re.finditer(r'href="/([^"]+)"', page):
1791 if mobj.group(1) not in ids_in_page:
1792 ids_in_page.append(unescapeHTML(mobj.group(1)))
1794 video_ids.extend(ids_in_page)
1796 # A little optimization - if current page is not
1797 # "full", ie. does not contain PAGE_SIZE video ids then
1798 # we can assume that this page is the last one - there
1799 # are no more ids on further pages - no need to query
1802 if len(ids_in_page) < self._PAGE_SIZE:
1807 all_ids_count = len(video_ids)
1808 playliststart = self._downloader.params.get('playliststart', 1) - 1
1809 playlistend = self._downloader.params.get('playlistend', -1)
1811 if playlistend == -1:
1812 video_ids = video_ids[playliststart:]
1814 video_ids = video_ids[playliststart:playlistend]
1816 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1817 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1819 for video_id in video_ids:
1820 self._downloader.download([u'http://blip.tv/'+video_id])
1823 class DepositFilesIE(InfoExtractor):
1824 """Information extractor for depositfiles.com"""
1826 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1827 IE_NAME = u'DepositFiles'
1829 def __init__(self, downloader=None):
1830 InfoExtractor.__init__(self, downloader)
1832 def report_download_webpage(self, file_id):
1833 """Report webpage download."""
1834 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1836 def report_extraction(self, file_id):
1837 """Report information extraction."""
1838 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1840 def _real_extract(self, url):
1841 file_id = url.split('/')[-1]
1842 # Rebuild url in english locale
1843 url = 'http://depositfiles.com/en/files/' + file_id
1845 # Retrieve file webpage with 'Free download' button pressed
1846 free_download_indication = { 'gateway_result' : '1' }
1847 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1849 self.report_download_webpage(file_id)
1850 webpage = urllib2.urlopen(request).read()
1851 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1852 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1855 # Search for the real file URL
1856 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1857 if (mobj is None) or (mobj.group(1) is None):
1858 # Try to figure out reason of the error.
1859 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1860 if (mobj is not None) and (mobj.group(1) is not None):
1861 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1862 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1864 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1867 file_url = mobj.group(1)
1868 file_extension = os.path.splitext(file_url)[1][1:]
1870 # Search for file title
1871 mobj = re.search(r'<b title="(.*?)">', webpage)
1873 self._downloader.trouble(u'ERROR: unable to extract title')
1875 file_title = mobj.group(1).decode('utf-8')
1878 'id': file_id.decode('utf-8'),
1879 'url': file_url.decode('utf-8'),
1881 'upload_date': u'NA',
1882 'title': file_title,
1883 'ext': file_extension.decode('utf-8'),
1887 class FacebookIE(InfoExtractor):
1888 """Information Extractor for Facebook"""
1890 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1891 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1892 _NETRC_MACHINE = 'facebook'
1893 _available_formats = ['video', 'highqual', 'lowqual']
1894 _video_extensions = {
1899 IE_NAME = u'facebook'
1901 def __init__(self, downloader=None):
1902 InfoExtractor.__init__(self, downloader)
1904 def _reporter(self, message):
1905 """Add header and report message."""
1906 self._downloader.to_screen(u'[facebook] %s' % message)
1908 def report_login(self):
1909 """Report attempt to log in."""
1910 self._reporter(u'Logging in')
1912 def report_video_webpage_download(self, video_id):
1913 """Report attempt to download video webpage."""
1914 self._reporter(u'%s: Downloading video webpage' % video_id)
1916 def report_information_extraction(self, video_id):
1917 """Report attempt to extract video information."""
1918 self._reporter(u'%s: Extracting video information' % video_id)
1920 def _parse_page(self, video_webpage):
1921 """Extract video information from page"""
1923 data = {'title': r'\("video_title", "(.*?)"\)',
1924 'description': r'<div class="datawrap">(.*?)</div>',
1925 'owner': r'\("video_owner_name", "(.*?)"\)',
1926 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1929 for piece in data.keys():
1930 mobj = re.search(data[piece], video_webpage)
1931 if mobj is not None:
1932 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1936 for fmt in self._available_formats:
1937 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1938 if mobj is not None:
1939 # URL is in a Javascript segment inside an escaped Unicode format within
1940 # the generally utf-8 page
1941 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1942 video_info['video_urls'] = video_urls
1946 def _real_initialize(self):
1947 if self._downloader is None:
1952 downloader_params = self._downloader.params
1954 # Attempt to use provided username and password or .netrc data
1955 if downloader_params.get('username', None) is not None:
1956 useremail = downloader_params['username']
1957 password = downloader_params['password']
1958 elif downloader_params.get('usenetrc', False):
1960 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1961 if info is not None:
1965 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1966 except (IOError, netrc.NetrcParseError), err:
1967 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
1970 if useremail is None:
1979 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1982 login_results = urllib2.urlopen(request).read()
1983 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1984 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1986 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1987 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
1990 def _real_extract(self, url):
1991 mobj = re.match(self._VALID_URL, url)
1993 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1995 video_id = mobj.group('ID')
1998 self.report_video_webpage_download(video_id)
1999 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2001 page = urllib2.urlopen(request)
2002 video_webpage = page.read()
2003 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2004 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2007 # Start extracting information
2008 self.report_information_extraction(video_id)
2010 # Extract information
2011 video_info = self._parse_page(video_webpage)
2014 if 'owner' not in video_info:
2015 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2017 video_uploader = video_info['owner']
2020 if 'title' not in video_info:
2021 self._downloader.trouble(u'ERROR: unable to extract video title')
2023 video_title = video_info['title']
2024 video_title = video_title.decode('utf-8')
2027 if 'thumbnail' not in video_info:
2028 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2029 video_thumbnail = ''
2031 video_thumbnail = video_info['thumbnail']
2035 if 'upload_date' in video_info:
2036 upload_time = video_info['upload_date']
2037 timetuple = email.utils.parsedate_tz(upload_time)
2038 if timetuple is not None:
2040 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2045 video_description = video_info.get('description', 'No description available.')
2047 url_map = video_info['video_urls']
2048 if len(url_map.keys()) > 0:
2049 # Decide which formats to download
2050 req_format = self._downloader.params.get('format', None)
2051 format_limit = self._downloader.params.get('format_limit', None)
2053 if format_limit is not None and format_limit in self._available_formats:
2054 format_list = self._available_formats[self._available_formats.index(format_limit):]
2056 format_list = self._available_formats
2057 existing_formats = [x for x in format_list if x in url_map]
2058 if len(existing_formats) == 0:
2059 self._downloader.trouble(u'ERROR: no known formats available for video')
2061 if req_format is None:
2062 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2063 elif req_format == 'worst':
2064 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2065 elif req_format == '-1':
2066 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2069 if req_format not in url_map:
2070 self._downloader.trouble(u'ERROR: requested format not available')
2072 video_url_list = [(req_format, url_map[req_format])] # Specific format
2075 for format_param, video_real_url in video_url_list:
2077 video_extension = self._video_extensions.get(format_param, 'mp4')
2080 'id': video_id.decode('utf-8'),
2081 'url': video_real_url.decode('utf-8'),
2082 'uploader': video_uploader.decode('utf-8'),
2083 'upload_date': upload_date,
2084 'title': video_title,
2085 'ext': video_extension.decode('utf-8'),
2086 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2087 'thumbnail': video_thumbnail.decode('utf-8'),
2088 'description': video_description.decode('utf-8'),
2092 class BlipTVIE(InfoExtractor):
2093 """Information extractor for blip.tv"""
# NOTE(review): elided listing (non-contiguous line numbers) — comments only added, code unchanged.
2095 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to derive the file extension from the media URL's last path component.
2096 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2097 IE_NAME = u'blip.tv'
2099 def report_extraction(self, file_id):
2100 """Report information extraction."""
2101 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2103 def report_direct_download(self, title):
2104 """Report information extraction."""
2105 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2107 def _real_extract(self, url):
2108 mobj = re.match(self._VALID_URL, url)
2110 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv's own API for JSON metadata by appending skin=json to the page URL.
2117 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2118 request = urllib2.Request(json_url.encode('utf-8'))
2119 self.report_extraction(mobj.group(1))
2122 urlh = urllib2.urlopen(request)
# A video/* Content-Type means the URL already points at the media file itself.
2123 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2124 basename = url.split('/')[-1]
2125 title,ext = os.path.splitext(basename)
2126 title = title.decode('UTF-8')
2127 ext = ext.replace('.', '')
2128 self.report_direct_download(title)
2136 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2137 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2139 if info is None: # Regular URL
2141 json_code = urlh.read()
2142 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2143 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2147 json_data = json.loads(json_code)
2148 if 'Post' in json_data:
2149 data = json_data['Post']
# API timestamps look like "05-31-11 03:24PM"; normalize to YYYYMMDD.
2153 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2154 video_url = data['media']['url']
2155 umobj = re.match(self._URL_EXT, video_url)
2157 raise ValueError('Can not determine filename extension')
2158 ext = umobj.group(1)
2161 'id': data['item_id'],
2163 'uploader': data['display_name'],
2164 'upload_date': upload_date,
2165 'title': data['title'],
2167 'format': data['media']['mimeType'],
2168 'thumbnail': data['thumbnailUrl'],
2169 'description': data['description'],
2170 'player_url': data['embedUrl']
2172 except (ValueError,KeyError), err:
2173 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# blip.tv serves some formats only to an iTunes-like User-Agent.
2176 std_headers['User-Agent'] = 'iTunes/10.6.1'
2180 class MyVideoIE(InfoExtractor):
2181 """Information Extractor for myvideo.de."""
2183 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2184 IE_NAME = u'myvideo'
2186 def __init__(self, downloader=None):
2187 InfoExtractor.__init__(self, downloader)
2189 def report_download_webpage(self, video_id):
2190 """Report webpage download."""
2191 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2193 def report_extraction(self, video_id):
2194 """Report information extraction."""
2195 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2197 def _real_extract(self,url):
2198 mobj = re.match(self._VALID_URL, url)
2200 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2203 video_id = mobj.group(1)
2206 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2208 self.report_download_webpage(video_id)
2209 webpage = urllib2.urlopen(request).read()
2210 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2211 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
2214 self.report_extraction(video_id)
2215 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2218 self._downloader.trouble(u'ERROR: unable to extract media URL')
2220 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2222 mobj = re.search('<title>([^<]+)</title>', webpage)
2224 self._downloader.trouble(u'ERROR: unable to extract title')
2227 video_title = mobj.group(1)
2233 'upload_date': u'NA',
2234 'title': video_title,
2238 class ComedyCentralIE(InfoExtractor):
2239 """Information extractor for The Daily Show and Colbert Report """
# NOTE(review): elided listing (non-contiguous line numbers) — comments only added, code unchanged.
# Accepts shortcut URLs (":tds", ":colbert", ...) as well as full-episode page URLs.
2241 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2242 IE_NAME = u'comedycentral'
# Bitrates (kbps), worst-first; _real_extract picks turls[-1] as "highest bitrate".
2244 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2246 _video_extensions = {
2254 _video_dimensions = {
2263 def report_extraction(self, episode_id):
2264 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2266 def report_config_download(self, episode_id):
2267 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2269 def report_index_download(self, episode_id):
2270 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2272 def report_player_url(self, episode_id):
2273 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2276 def _print_formats(self, formats):
2277 print('Available formats:')
2279 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2282 def _real_extract(self, url):
2283 mobj = re.match(self._VALID_URL, url)
2285 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Expand ":tds"/":colbert"-style shortcuts to the real full-episodes URL, then re-match.
2288 if mobj.group('shortname'):
2289 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2290 url = u'http://www.thedailyshow.com/full-episodes/'
2292 url = u'http://www.colbertnation.com/full-episodes/'
2293 mobj = re.match(self._VALID_URL, url)
2294 assert mobj is not None
# No specific episode given -> the site will redirect to the newest one.
2296 dlNewest = not mobj.group('episode')
2298 epTitle = mobj.group('showname')
2300 epTitle = mobj.group('episode')
2302 req = urllib2.Request(url)
2303 self.report_extraction(epTitle)
2305 htmlHandle = urllib2.urlopen(req)
2306 html = htmlHandle.read()
2307 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2308 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# Follow the redirect that resolves "newest episode" into a concrete episode URL.
2311 url = htmlHandle.geturl()
2312 mobj = re.match(self._VALID_URL, url)
2314 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2316 if mobj.group('episode') == '':
2317 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2319 epTitle = mobj.group('episode')
# Find the Flash player URL(s) carrying the mgid-style media reference.
2321 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2323 if len(mMovieParams) == 0:
2324 # The Colbert Report embeds the media reference without a URL
2325 # prefix; so extract the alternate "data-mgid" reference
2326 # and then add the URL prefix manually.
2328 altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
2329 if len(altMovieParams) == 0:
2330 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2333 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2335 playerUrl_raw = mMovieParams[0][0]
2336 self.report_player_url(epTitle)
2338 urlHandle = urllib2.urlopen(playerUrl_raw)
2339 playerUrl = urlHandle.geturl()
2340 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2341 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Fetch the MRSS show index, which lists one <item> per media segment.
2344 uri = mMovieParams[0][1]
2345 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2346 self.report_index_download(epTitle)
2348 indexXml = urllib2.urlopen(indexUrl).read()
2349 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2350 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2355 idoc = xml.etree.ElementTree.fromstring(indexXml)
2356 itemEls = idoc.findall('.//item')
2357 for itemEl in itemEls:
2358 mediaId = itemEl.findall('./guid')[0].text
2359 shortMediaId = mediaId.split(':')[-1]
2360 showId = mediaId.split(':')[-2].replace('.com', '')
2361 officialTitle = itemEl.findall('./title')[0].text
2362 officialDate = itemEl.findall('./pubDate')[0].text
# Per-segment config XML lists the available renditions (bitrate -> src URL).
2364 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2365 urllib.urlencode({'uri': mediaId}))
2366 configReq = urllib2.Request(configUrl)
2367 self.report_config_download(epTitle)
2369 configXml = urllib2.urlopen(configReq).read()
2370 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2371 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2374 cdoc = xml.etree.ElementTree.fromstring(configXml)
2376 for rendition in cdoc.findall('.//rendition'):
2377 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2381 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2384 if self._downloader.params.get('listformats', None):
2385 self._print_formats([i[0] for i in turls])
2388 # For now, just pick the highest bitrate
2389 format,video_url = turls[-1]
2391 # Get the format arg from the arg stream
2392 req_format = self._downloader.params.get('format', None)
2394 # Select format if we can find one
2397 format, video_url = f, v
2400 # Patch to download from alternative CDN, which does not
2401 # break on current RTMPDump builds
2402 broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2403 better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2405 if video_url.startswith(broken_cdn):
2406 video_url = video_url.replace(broken_cdn, better_cdn)
2408 effTitle = showId + u'-' + epTitle
2413 'upload_date': officialDate,
2418 'description': officialTitle,
2419 'player_url': None #playerUrl
2422 results.append(info)
2427 class EscapistIE(InfoExtractor):
2428 """Information extractor for The Escapist """
# NOTE(review): elided listing (non-contiguous line numbers) — comments only added, code unchanged.
2430 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2431 IE_NAME = u'escapist'
2433 def report_extraction(self, showName):
2434 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2436 def report_config_download(self, showName):
2437 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2439 def _real_extract(self, url):
2440 mobj = re.match(self._VALID_URL, url)
2442 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2444 showName = mobj.group('showname')
2445 videoId = mobj.group('episode')
2447 self.report_extraction(showName)
2449 webPage = urllib2.urlopen(url)
2450 webPageBytes = webPage.read()
# Decode the page using the charset from the Content-Type header, defaulting to utf-8.
2451 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2452 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2453 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2454 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Metadata lives in <meta> tags; the player URL carries a "config=..." query parameter.
2457 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2458 description = unescapeHTML(descMatch.group(1))
2459 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2460 imgUrl = unescapeHTML(imgMatch.group(1))
2461 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2462 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2463 configUrlMatch = re.search('config=(.*)$', playerUrl)
2464 configUrl = urllib2.unquote(configUrlMatch.group(1))
2466 self.report_config_download(showName)
2468 configJSON = urllib2.urlopen(configUrl).read()
2469 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2470 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2473 # Technically, it's JavaScript, not JSON
# Single-to-double quote swap makes the JS object literal parseable as JSON.
2474 configJSON = configJSON.replace("'", '"')
2477 config = json.loads(configJSON)
2478 except (ValueError,), err:
2479 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# presumably playlist[1] is the actual video entry (index 0 may be an intro) — TODO confirm.
2482 playlist = config['playlist']
2483 videoUrl = playlist[1]['url']
2488 'uploader': showName,
2489 'upload_date': None,
2492 'thumbnail': imgUrl,
2493 'description': description,
2494 'player_url': playerUrl,
2500 class CollegeHumorIE(InfoExtractor):
2501 """Information extractor for collegehumor.com"""
# NOTE(review): elided listing (non-contiguous line numbers) — comments only added, code unchanged.
2503 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2504 IE_NAME = u'collegehumor'
2506 def report_webpage(self, video_id):
2507 """Report information extraction."""
2508 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2510 def report_extraction(self, video_id):
2511 """Report information extraction."""
2512 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2514 def _real_extract(self, url):
2515 mobj = re.match(self._VALID_URL, url)
2517 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2519 video_id = mobj.group('videoid')
2521 self.report_webpage(video_id)
2522 request = urllib2.Request(url)
2524 webpage = urllib2.urlopen(request).read()
2525 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2526 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# The page embeds a second, internal video id used by the moogaloop metadata endpoint.
2529 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2531 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2533 internal_video_id = m.group('internalvideoid')
2537 'internal_id': internal_video_id,
2540 self.report_extraction(video_id)
2541 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2543 metaXml = urllib2.urlopen(xmlUrl).read()
2544 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2545 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# Pull title/description/url/thumbnail from the metadata XML; any IndexError
# here (missing element) is reported as invalid metadata.
2548 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2550 videoNode = mdoc.findall('./video')[0]
2551 info['description'] = videoNode.findall('./description')[0].text
2552 info['title'] = videoNode.findall('./caption')[0].text
2553 info['url'] = videoNode.findall('./file')[0].text
2554 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension = everything after the final '.' of the media URL.
2555 info['ext'] = info['url'].rpartition('.')[2]
2557 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2563 class XVideosIE(InfoExtractor):
2564 """Information extractor for xvideos.com"""
# NOTE(review): elided listing (non-contiguous line numbers) — comments only added, code unchanged.
2566 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2567 IE_NAME = u'xvideos'
2569 def report_webpage(self, video_id):
2570 """Report information extraction."""
2571 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2573 def report_extraction(self, video_id):
2574 """Report information extraction."""
2575 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2577 def _real_extract(self, url):
2578 mobj = re.match(self._VALID_URL, url)
2580 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2582 video_id = mobj.group(1).decode('utf-8')
2584 self.report_webpage(video_id)
2586 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2588 webpage = urllib2.urlopen(request).read()
2589 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2590 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2593 self.report_extraction(video_id)
# The FLV URL is URL-encoded inside a "flv_url=...&" query fragment in the page.
2597 mobj = re.search(r'flv_url=(.+?)&', webpage)
2599 self._downloader.trouble(u'ERROR: unable to extract video url')
2601 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title is the <title> text with the trailing " - XVID..." suffix stripped by the regex.
2605 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2607 self._downloader.trouble(u'ERROR: unable to extract video title')
2609 video_title = mobj.group(1).decode('utf-8')
2612 # Extract video thumbnail
# NOTE(review): the dots in "img.*?" and "xvideos.com" are unescaped regex
# metacharacters; works in practice but matches more loosely than intended.
2613 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2615 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail.
2617 video_thumbnail = mobj.group(0).decode('utf-8')
2623 'upload_date': None,
2624 'title': video_title,
2626 'thumbnail': video_thumbnail,
2627 'description': None,
2633 class SoundcloudIE(InfoExtractor):
2634 """Information extractor for soundcloud.com
2635 To access the media, the uid of the song and a stream token
2636 must be extracted from the page source and the script must make
2637 a request to media.soundcloud.com/crossdomain.xml. Then
2638 the media can be grabbed by requesting from an url composed
2639 of the stream token and uid
# NOTE(review): elided listing (non-contiguous line numbers) — comments only added, code unchanged.
2642 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2643 IE_NAME = u'soundcloud'
2645 def __init__(self, downloader=None):
2646 InfoExtractor.__init__(self, downloader)
2648 def report_webpage(self, video_id):
2649 """Report information extraction."""
2650 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2652 def report_extraction(self, video_id):
2653 """Report information extraction."""
2654 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2656 def _real_extract(self, url):
2657 mobj = re.match(self._VALID_URL, url)
2659 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2662 # extract uploader (which is in the url)
2663 uploader = mobj.group(1).decode('utf-8')
2664 # extract simple title (uploader + slug of song title)
2665 slug_title = mobj.group(2).decode('utf-8')
2666 simple_title = uploader + u'-' + slug_title
2668 self.report_webpage('%s/%s' % (uploader, slug_title))
2670 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2672 webpage = urllib2.urlopen(request).read()
2673 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2674 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2677 self.report_extraction('%s/%s' % (uploader, slug_title))
2679 # extract uid and stream token that soundcloud hands out for access
2680 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2682 video_id = mobj.group(1)
2683 stream_token = mobj.group(2)
2685 # extract unsimplified title
2686 mobj = re.search('"title":"(.*?)",', webpage)
2688 title = mobj.group(1).decode('utf-8')
# Fall back to the uploader-slug title when the real title isn't found.
2690 title = simple_title
2692 # construct media url (with uid/token)
2693 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2694 mediaURL = mediaURL % (video_id, stream_token)
2697 description = u'No description available'
2698 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2700 description = mobj.group(1)
# Parse the human-readable "on <Month> <d>, <Y> <H:M>" date into YYYYMMDD;
# failures are only logged, never fatal.
2704 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2707 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2708 except Exception, e:
2709 self._downloader.to_stderr(compat_str(e))
2711 # for soundcloud, a request to a cross domain is required for cookies
2712 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2715 'id': video_id.decode('utf-8'),
2717 'uploader': uploader.decode('utf-8'),
2718 'upload_date': upload_date,
2721 'description': description.decode('utf-8')
2725 class InfoQIE(InfoExtractor):
2726 """Information extractor for infoq.com"""
# NOTE(review): elided listing (non-contiguous line numbers) — comments only added, code unchanged.
2728 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2731 def report_webpage(self, video_id):
2732 """Report information extraction."""
2733 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2735 def report_extraction(self, video_id):
2736 """Report information extraction."""
2737 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2739 def _real_extract(self, url):
2740 mobj = re.match(self._VALID_URL, url)
2742 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2745 self.report_webpage(url)
2747 request = urllib2.Request(url)
2749 webpage = urllib2.urlopen(request).read()
2750 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2751 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2754 self.report_extraction(url)
# The media path is base64-encoded in a jsclassref attribute; decode and
# prepend the RTMPE server prefix.
2758 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2760 self._downloader.trouble(u'ERROR: unable to extract video url')
2762 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2766 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2768 self._downloader.trouble(u'ERROR: unable to extract video title')
2770 video_title = mobj.group(1).decode('utf-8')
2772 # Extract description
2773 video_description = u'No description available.'
2774 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2775 if mobj is not None:
2776 video_description = mobj.group(1).decode('utf-8')
# Derive the id and extension from the media URL's final path component ("name.ext").
2778 video_filename = video_url.split('/')[-1]
2779 video_id, extension = video_filename.split('.')
2785 'upload_date': None,
2786 'title': video_title,
2787 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2789 'description': video_description,
2794 class MixcloudIE(InfoExtractor):
2795 """Information extractor for www.mixcloud.com"""
# NOTE(review): elided listing (non-contiguous line numbers) — comments only added, code unchanged.
2796 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2797 IE_NAME = u'mixcloud'
2799 def __init__(self, downloader=None):
2800 InfoExtractor.__init__(self, downloader)
2802 def report_download_json(self, file_id):
2803 """Report JSON download."""
2804 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2806 def report_extraction(self, file_id):
2807 """Report information extraction."""
2808 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2810 def get_urls(self, jsonData, fmt, bitrate='best'):
2811 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either a {bitrate: [urls]} mapping or a plain url list;
# the TypeError branch handles the latter (no bitrate info).
2814 bitrate_list = jsonData[fmt]
2815 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2816 bitrate = max(bitrate_list) # select highest
2818 url_list = jsonData[fmt][bitrate]
2819 except TypeError: # we have no bitrate info.
2820 url_list = jsonData[fmt]
2823 def check_urls(self, url_list):
2824 """Returns 1st active url from list"""
# Probe each candidate URL; network errors mean "try the next one".
2825 for url in url_list:
2827 urllib2.urlopen(url)
2829 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2834 def _print_formats(self, formats):
2835 print('Available formats:')
2836 for fmt in formats.keys():
2837 for b in formats[fmt]:
2839 ext = formats[fmt][b][0]
2840 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2841 except TypeError: # we have no bitrate info
2842 ext = formats[fmt][0]
2843 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2846 def _real_extract(self, url):
2847 mobj = re.match(self._VALID_URL, url)
2849 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2851 # extract uploader & filename from url
2852 uploader = mobj.group(1).decode('utf-8')
2853 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2855 # construct API request
# Reuses the last two path components of the page URL as the API cloudcast path.
2856 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2857 # retrieve .json file with links to files
2858 request = urllib2.Request(file_url)
2860 self.report_download_json(file_url)
2861 jsonData = urllib2.urlopen(request).read()
2862 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2863 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2867 json_data = json.loads(jsonData)
2868 player_url = json_data['player_swf_url']
2869 formats = dict(json_data['audio_formats'])
2871 req_format = self._downloader.params.get('format', None)
2874 if self._downloader.params.get('listformats', None):
2875 self._print_formats(formats)
# 'best' (or no request): take the first format whose URLs respond.
2878 if req_format is None or req_format == 'best':
2879 for format_param in formats.keys():
2880 url_list = self.get_urls(formats, format_param)
2882 file_url = self.check_urls(url_list)
2883 if file_url is not None:
2886 if req_format not in formats.keys():
2887 self._downloader.trouble(u'ERROR: format is not available')
2890 url_list = self.get_urls(formats, req_format)
2891 file_url = self.check_urls(url_list)
2892 format_param = req_format
2895 'id': file_id.decode('utf-8'),
2896 'url': file_url.decode('utf-8'),
2897 'uploader': uploader.decode('utf-8'),
2898 'upload_date': u'NA',
2899 'title': json_data['name'],
2900 'ext': file_url.split('.')[-1].decode('utf-8'),
2901 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2902 'thumbnail': json_data['thumbnail_url'],
2903 'description': json_data['description'],
2904 'player_url': player_url.decode('utf-8'),
2907 class StanfordOpenClassroomIE(InfoExtractor):
2908 """Information extractor for Stanford's Open ClassRoom"""
# NOTE(review): elided listing (non-contiguous line numbers) — comments only added, code unchanged.
# Three URL shapes: a specific video (course+video), a course page (course only),
# or the site root; the latter two expand recursively via self.extract().
2910 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2911 IE_NAME = u'stanfordoc'
2913 def report_download_webpage(self, objid):
2914 """Report information extraction."""
2915 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2917 def report_extraction(self, video_id):
2918 """Report information extraction."""
2919 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2921 def _real_extract(self, url):
2922 mobj = re.match(self._VALID_URL, url)
2924 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2927 if mobj.group('course') and mobj.group('video'): # A specific video
2928 course = mobj.group('course')
2929 video = mobj.group('video')
2931 'id': course + '_' + video,
2934 self.report_extraction(info['id'])
2935 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2936 xmlUrl = baseUrl + video + '.xml'
2938 metaXml = urllib2.urlopen(xmlUrl).read()
2939 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2940 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2942 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2944 info['title'] = mdoc.findall('./title')[0].text
2945 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2947 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2949 info['ext'] = info['url'].rpartition('.')[2]
2951 elif mobj.group('course'): # A course page
2952 course = mobj.group('course')
2958 self.report_download_webpage(info['id'])
2960 coursepage = urllib2.urlopen(url).read()
2961 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2962 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2965 m = re.search('<h1>([^<]+)</h1>', coursepage)
2967 info['title'] = unescapeHTML(m.group(1))
2969 info['title'] = info['id']
2971 m = re.search('<description>([^<]+)</description>', coursepage)
2973 info['description'] = unescapeHTML(m.group(1))
# Collect every VideoPage link (deduplicated, order-preserving) as references
# to be resolved by recursive self.extract() calls below.
2975 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2978 'type': 'reference',
2979 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2983 for entry in info['list']:
2984 assert entry['type'] == 'reference'
2985 results += self.extract(entry['url'])
2990 'id': 'Stanford OpenClassroom',
2994 self.report_download_webpage(info['id'])
2995 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2997 rootpage = urllib2.urlopen(rootURL).read()
2998 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2999 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3002 info['title'] = info['id']
# Same expansion as the course branch, one level up: every CoursePage link
# becomes a reference extracted recursively.
3004 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3007 'type': 'reference',
3008 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3013 for entry in info['list']:
3014 assert entry['type'] == 'reference'
3015 results += self.extract(entry['url'])
3018 class MTVIE(InfoExtractor):
3019 """Information extractor for MTV.com"""
# NOTE(review): this listing is elided -- `if mobj is None:` guards, `try:`
# headers and `return` statements are missing between the visible lines.
# Comments below describe only what the visible statements do.
3021 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3024 def report_webpage(self, video_id):
3025 """Report webpage download."""
3026 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3028 def report_extraction(self, video_id):
3029 """Report information extraction."""
3030 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3032 def _real_extract(self, url):
3033 mobj = re.match(self._VALID_URL, url)
# URL did not match _VALID_URL (the guard line for this call is elided).
3035 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# The scheme group in _VALID_URL is optional; default to plain http.
3037 if not mobj.group('proto'):
3038 url = 'http://' + url
3039 video_id = mobj.group('videoid')
3040 self.report_webpage(video_id)
3042 request = urllib2.Request(url)
3044 webpage = urllib2.urlopen(request).read()
3045 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3046 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# Song name and performer come from <meta> tags; the page bytes are decoded
# as ISO-8859-1 before HTML-unescaping.
3049 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3051 self._downloader.trouble(u'ERROR: unable to extract song name')
3053 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3054 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3056 self._downloader.trouble(u'ERROR: unable to extract performer')
3058 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3059 video_title = performer + ' - ' + song_name
3061 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# FIXME(review): message reads 'unable to mtvn_uri' -- the verb 'extract'
# appears to be missing from this error string.
3063 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3065 mtvn_uri = mobj.group(1)
3067 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3069 self._downloader.trouble(u'ERROR: unable to extract content id')
3071 content_id = mobj.group(1)
# The mediaGen endpoint returns an XML document listing the available
# renditions (encodings) for this video.
3073 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3074 self.report_extraction(video_id)
3075 request = urllib2.Request(videogen_url)
3077 metadataXml = urllib2.urlopen(request).read()
3078 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3079 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3082 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3083 renditions = mdoc.findall('.//rendition')
3085 # For now, always pick the highest quality.
3086 rendition = renditions[-1]
# Build a format label like 'mp4-640x480_800' from the MIME subtype plus
# the rendition's width/height/bitrate attributes.
3089 _,_,ext = rendition.attrib['type'].partition('/')
3090 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3091 video_url = rendition.find('./src').text
# Raised when the rendition lacks an expected attribute/child (the `try:`
# header this except-style call pairs with is elided from the listing).
3093 self._downloader.trouble('Invalid rendition field.')
# Remainder of the returned info dict is elided; only these keys are visible.
3099 'uploader': performer,
3100 'title': video_title,
3108 class YoukuIE(InfoExtractor):
"""Information extractor for v.youku.com (elided listing -- several lines,
including some `def` headers and guards, are missing from this excerpt)."""
3110 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3113 def __init__(self, downloader=None):
3114 InfoExtractor.__init__(self, downloader)
3116 def report_download_webpage(self, file_id):
3117 """Report webpage download."""
3118 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
3120 def report_extraction(self, file_id):
3121 """Report information extraction."""
3122 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
# Body of _gen_sid (its `def` line is elided): builds a session id string
# from the millisecond timestamp plus two random numbers.
3125 nowTime = int(time.time() * 1000)
3126 random1 = random.randint(1000,1998)
3127 random2 = random.randint(1000,9999)
3129 return "%d%d%d" %(nowTime,random1,random2)
3131 def _get_file_ID_mix_string(self, seed):
# Deterministically shuffles this character set with a linear-congruential
# generator driven by `seed`, so the same seed always yields the same table.
3133 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3135 for i in range(len(source)):
3136 seed = (seed * 211 + 30031 ) % 65536
3137 index = math.floor(seed / 65536 * len(source) )
3138 mixed.append(source[int(index)])
3139 source.remove(source[int(index)])
3140 #return ''.join(mixed)
3143 def _get_file_id(self, fileId, seed):
# Decode the real file id: each '*'-separated token in fileId is an index
# into the seed-shuffled character table built above.
3144 mixed = self._get_file_ID_mix_string(seed)
3145 ids = fileId.split('*')
3149 realId.append(mixed[int(ch)])
3150 return ''.join(realId)
3152 def _real_extract(self, url):
3153 mobj = re.match(self._VALID_URL, url)
# URL did not match _VALID_URL (guard line elided above this call).
3155 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3157 video_id = mobj.group('ID')
# getPlayList returns JSON metadata (title, seed, stream file ids, segments).
3159 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3161 request = urllib2.Request(info_url, None, std_headers)
3163 self.report_download_webpage(video_id)
3164 jsondata = urllib2.urlopen(request).read()
3165 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
3166 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3169 self.report_extraction(video_id)
3171 config = json.loads(jsondata)
3173 video_title = config['data'][0]['title']
3174 seed = config['data'][0]['seed']
# Map the user's requested --format onto the formats Youku actually offers;
# the branches choosing concrete format names are elided from this listing.
3176 format = self._downloader.params.get('format', None)
3177 supported_format = config['data'][0]['streamfileids'].keys()
3179 if format is None or format == 'best':
3180 if 'hd2' in supported_format:
3185 elif format == 'worst':
3193 fileid = config['data'][0]['streamfileids'][format]
3194 seg_number = len(config['data'][0]['segs'][format])
# Collect the per-segment access key 'k' for every segment of this format.
3197 for i in xrange(seg_number):
3198 keys.append(config['data'][0]['segs'][format][i]['k'])
3201 #youku only could be viewed from mainland china
3203 self._downloader.trouble(u'ERROR: unable to extract info section')
3207 sid = self._gen_sid()
3208 fileid = self._get_file_id(fileid, seed)
3210 #column 8,9 of fileid represent the segment number
3211 #fileid[7:9] should be changed
# One downloadable part per segment: splice the hex segment index into the
# decoded fileid and build the getFlvPath URL with the session id and key.
3212 for index, key in enumerate(keys):
3214 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3215 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3218 'id': '%s_part%02d' % (video_id, index),
3219 'url': download_url,
3221 'title': video_title,
3224 files_info.append(info)
3229 class XNXXIE(InfoExtractor):
3230 """Information extractor for xnxx.com"""
# NOTE(review): elided listing -- `if ... is None:` guards, `try:` headers
# and the `return` around the final info dict are missing between lines.
3232 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
# Patterns scraped from the video page: the percent-encoded FLV URL, the
# page <title> (with the site suffix stripped), and the big thumbnail URL.
3234 VIDEO_URL_RE = r'flv_url=(.*?)&'
3235 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3236 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3238 def report_webpage(self, video_id):
3239 """Report webpage download"""
3240 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3242 def report_extraction(self, video_id):
3243 """Report information extraction"""
3244 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3246 def _real_extract(self, url):
3247 mobj = re.match(self._VALID_URL, url)
# URL did not match _VALID_URL (guard line elided above this call).
3249 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3251 video_id = mobj.group(1).decode('utf-8')
3253 self.report_webpage(video_id)
3255 # Get webpage content
3257 webpage = urllib2.urlopen(url).read()
3258 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3259 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3262 result = re.search(self.VIDEO_URL_RE, webpage)
3264 self._downloader.trouble(u'ERROR: unable to extract video url')
# flv_url is percent-encoded in the page, hence the unquote.
3266 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3268 result = re.search(self.VIDEO_TITLE_RE, webpage)
3270 self._downloader.trouble(u'ERROR: unable to extract video title')
3272 video_title = result.group(1).decode('utf-8')
3274 result = re.search(self.VIDEO_THUMB_RE, webpage)
3276 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3278 video_thumbnail = result.group(1).decode('utf-8')
# Visible keys of the returned info dict; the rest of it is elided.
3284 'upload_date': None,
3285 'title': video_title,
3287 'thumbnail': video_thumbnail,
3288 'description': None,
3292 class GooglePlusIE(InfoExtractor):
3293 """Information extractor for plus.google.com."""
3295 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3296 IE_NAME = u'plus.google'
3298 def __init__(self, downloader=None):
3299 InfoExtractor.__init__(self, downloader)
3301 def report_extract_entry(self, url):
3302 """Report downloading extry"""
3303 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3305 def report_date(self, upload_date):
3306 """Report downloading extry"""
3307 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3309 def report_uploader(self, uploader):
3310 """Report downloading extry"""
3311 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3313 def report_title(self, video_title):
3314 """Report downloading extry"""
3315 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3317 def report_extract_vid_page(self, video_page):
3318 """Report information extraction."""
3319 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3321 def _real_extract(self, url):
3322 # Extract id from URL
3323 mobj = re.match(self._VALID_URL, url)
3325 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3328 post_url = mobj.group(0)
3329 video_id = mobj.group(2)
3331 video_extension = 'flv'
3333 # Step 1, Retrieve post webpage to extract further information
3334 self.report_extract_entry(post_url)
3335 request = urllib2.Request(post_url)
3337 webpage = urllib2.urlopen(request).read()
3338 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3339 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3342 # Extract update date
3344 pattern = 'title="Timestamp">(.*?)</a>'
3345 mobj = re.search(pattern, webpage)
3347 upload_date = mobj.group(1)
3348 # Convert timestring to a format suitable for filename
3349 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3350 upload_date = upload_date.strftime('%Y%m%d')
3351 self.report_date(upload_date)
3355 pattern = r'rel\="author".*?>(.*?)</a>'
3356 mobj = re.search(pattern, webpage)
3358 uploader = mobj.group(1)
3359 self.report_uploader(uploader)
3362 # Get the first line for title
3364 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3365 mobj = re.search(pattern, webpage)
3367 video_title = mobj.group(1)
3368 self.report_title(video_title)
3370 # Step 2, Stimulate clicking the image box to launch video
3371 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3372 mobj = re.search(pattern, webpage)
3374 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3376 video_page = mobj.group(1)
3377 request = urllib2.Request(video_page)
3379 webpage = urllib2.urlopen(request).read()
3380 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3381 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3383 self.report_extract_vid_page(video_page)
3386 # Extract video links on video page
3387 """Extract video links of all sizes"""
3388 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3389 mobj = re.findall(pattern, webpage)
3391 self._downloader.trouble(u'ERROR: unable to extract video links')
3393 # Sort in resolution
3394 links = sorted(mobj)
3396 # Choose the lowest of the sort, i.e. highest resolution
3397 video_url = links[-1]
3398 # Only get the url. The resolution part in the tuple has no use anymore
3399 video_url = video_url[-1]
3400 # Treat escaped \u0026 style hex
3401 video_url = unicode(video_url, "unicode_escape")
3405 'id': video_id.decode('utf-8'),
3407 'uploader': uploader.decode('utf-8'),
3408 'upload_date': upload_date.decode('utf-8'),
3409 'title': video_title.decode('utf-8'),
3410 'ext': video_extension.decode('utf-8'),