2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
# NOTE(review): this excerpt preserves the original file's line numbers as a
# prefix on every line; gaps in that numbering show that many lines have been
# elided from this view. Do NOT assume any method body below is complete —
# confirm against the full file before changing behavior.
#
# Base class of every site-specific extractor. Subclasses override
# _real_initialize() / _real_extract() and define _VALID_URL (see docstring).
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information about the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title, author and
32 others. The information is stored in a dictionary which is then
33 passed to the FileDownloader. The FileDownloader processes this
34 information possibly downloading the video to the file system, among
35 other possible outcomes.
37 The dictionaries must include the following fields:
41 title: Video title, unescaped.
42 ext: Video filename extension.
44 The following fields are optional:
46 format: The video format, defaults to ext (used for --get-format)
47 thumbnail: Full URL to a video thumbnail image.
48 description: One-line video description.
49 uploader: Full name of the video uploader.
50 upload_date: Video upload date (YYYYMMDD).
51 uploader_id: Nickname or id of the video uploader.
52 location: Physical location of the video.
53 player_url: SWF Player URL (used for rtmpdump).
54 subtitles: The subtitle file contents.
55 urlhandle: [internal] The urlHandle to be used to download the file,
56 like returned by urllib.request.urlopen
58 The fields should all be Unicode strings.
60 Subclasses of this one should re-define the _real_initialize() and
61 _real_extract() methods and define a _VALID_URL regexp.
62 Probably, they should also be added to the list of extractors.
64 _real_extract() must return a *list* of information dictionaries as
67 Finally, the _WORKING attribute should be set to False for broken IEs
68 in order to warn the users and skip the tests.
# Constructor: stores the (optional) FileDownloader via set_downloader().
75 def __init__(self, downloader=None):
76 """Constructor. Receives an optional downloader."""
78 self.set_downloader(downloader)
# Classmethod in the original (decorator line elided here): URL dispatch test.
81 def suitable(cls, url):
82 """Receives a URL and returns True if suitable for this IE."""
83 return re.match(cls._VALID_URL, url) is not None
# The `def` lines for the next two methods are elided from this view;
# only their docstrings and (partial) bodies remain.
87 """Getter method for _WORKING."""
91 """Initializes an instance (authentication, etc)."""
93 self._real_initialize()
96 def extract(self, url):
97 """Extracts URL information and returns it in list of dicts."""
99 return self._real_extract(url)
101 def set_downloader(self, downloader):
102 """Sets the downloader for this IE."""
103 self._downloader = downloader
# Template methods — subclasses override; base implementations (elided) are no-ops.
105 def _real_initialize(self):
106 """Real initialization process. Redefine in subclasses."""
109 def _real_extract(self, url):
110 """Real extraction process. Redefine in subclasses."""
# Derives IE_NAME by stripping the trailing "IE" from the class name
# (enclosing property/def line elided from this view).
115 return type(self).__name__[:-2]
# Central download chokepoint: all webpage fetches funnel through here.
# Wraps URLError/HTTPException/socket.error into ExtractorError.
117 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
118 """ Returns the response handle """
120 self.report_download_webpage(video_id)
121 elif note is not False:
122 self.to_screen(u'%s: %s' % (video_id, note))
124 return compat_urllib_request.urlopen(url_or_request)
125 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
127 errnote = u'Unable to download webpage'
128 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
# Fetches a page and decodes it using the charset from the Content-Type
# header when present (fallback path elided from this view).
130 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
131 """ Returns a tuple (page content as string, URL handle) """
132 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
133 content_type = urlh.headers.get('Content-Type', '')
134 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
136 encoding = m.group(1)
139 webpage_bytes = urlh.read()
# Debug aid: with --dump-intermediate-pages, emit the page base64-encoded.
140 if self._downloader.params.get('dump_intermediate_pages', False):
142 url = url_or_request.get_full_url()
143 except AttributeError:
145 self.to_screen(u'Dumping request to ' + url)
146 dump = base64.b64encode(webpage_bytes).decode('ascii')
147 self._downloader.to_screen(dump)
# 'replace' so undecodable bytes never abort extraction.
148 content = webpage_bytes.decode(encoding, 'replace')
149 return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
155 def to_screen(self, msg):
156 """Print msg to screen, prefixing it with '[ie_name]'"""
157 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
159 def report_extraction(self, id_or_name):
160 """Report information extraction."""
161 self.to_screen(u'%s: Extracting information' % id_or_name)
163 def report_download_webpage(self, video_id):
164 """Report webpage download."""
165 self.to_screen(u'%s: Downloading webpage' % video_id)
167 def report_age_confirmation(self):
168 """Report attempt to confirm age."""
169 self.to_screen(u'Confirming age')
171 #Methods for following #608
172 #They set the correct value of the '_type' key
173 def video_result(self, video_info):
174 """Returns a video"""
175 video_info['_type'] = 'video'
177 def url_result(self, url, ie=None):
178 """Returns a url that points to a page that should be processed"""
179 #TODO: ie should be the class used for getting the info
# Dict literal continues on elided lines (presumably 'url' and 'ie' keys).
180 video_info = {'_type': 'url',
184 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
185 """Returns a playlist"""
186 video_info = {'_type': 'playlist',
189 video_info['id'] = playlist_id
191 video_info['title'] = playlist_title
# Generic field scraper. `pattern` may be a single regex or a list of
# regexes (the list-iteration branch is partially elided below); failure
# either returns `default`, warns, or raises, depending on `fatal`.
194 def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
196 Perform a regex search on the given string, using a single or a list of
197 patterns returning the first matching group.
198 In case of failure return a default value or raise a WARNING or a
199 ExtractorError, depending on fatal, specifying the field name.
201 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
202 mobj = re.search(pattern, string, flags)
205 mobj = re.search(p, string, flags)
# Colorize the field name in error output on ANSI-capable terminals only.
208 if sys.stderr.isatty() and os.name != 'nt':
209 _name = u'\033[0;34m%s\033[0m' % name
214 # return the first matching group
215 return next(g for g in mobj.groups() if g is not None)
216 elif default is not None:
219 raise ExtractorError(u'Unable to extract %s' % _name)
221 self._downloader.report_warning(u'unable to extract %s; '
222 u'please report this issue on GitHub.' % _name)
225 def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
227 Like _search_regex, but strips HTML tags and unescapes entities.
229 res = self._search_regex(pattern, string, name, default, fatal, flags)
231 return clean_html(res).strip()
# Base class for "ytsearchN:query"-style paged search extractors.
# NOTE(review): line-number gaps show elided lines (e.g. the classmethod
# decorators and several if/try lines) — bodies below are not complete.
235 class SearchInfoExtractor(InfoExtractor):
237 Base class for paged search queries extractors.
238 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
239 Instances should define _SEARCH_KEY and _MAX_RESULTS.
# Builds the _VALID_URL regex from the subclass's _SEARCH_KEY; prefix may
# be empty (1 result), a positive integer, or the literal 'all'.
243 def _make_valid_url(cls):
244 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
247 def suitable(cls, url):
248 return re.match(cls._make_valid_url(), url) is not None
# Parses the prefix and dispatches to _get_n_results with the right count.
250 def _real_extract(self, query):
251 mobj = re.match(self._make_valid_url(), query)
253 raise ExtractorError(u'Invalid search query "%s"' % query)
255 prefix = mobj.group('prefix')
256 query = mobj.group('query')
258 return self._get_n_results(query, 1)
259 elif prefix == 'all':
260 return self._get_n_results(query, self._MAX_RESULTS)
264 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
# Requests beyond the site limit are clamped to _MAX_RESULTS with a warning.
265 elif n > self._MAX_RESULTS:
266 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
267 n = self._MAX_RESULTS
268 return self._get_n_results(query, n)
270 def _get_n_results(self, query, n):
271 """Get a specified number of results for a query"""
# NOTE(review): "sublclasses" typo in the runtime message below — it is part
# of program output, so fix it in a code change, not in documentation.
272 raise NotImplementedError("This method must be implemented by sublclasses")
# YouTube extractor: login / age-gate handling, subtitle download, and
# format selection from url_encoded_fmt_stream_map.
# NOTE(review): line-number gaps show many elided lines throughout this
# class (e.g. the _VALID_URL assignment line itself, most dict bodies, and
# the if/try lines paired with the elif/except lines below). Treat every
# body as a fragment of the full file.
275 class YoutubeIE(InfoExtractor):
276 """Information extractor for youtube.com."""
# Verbose-mode URL regex body (its `_VALID_URL = r'''...` opener is elided).
280 (?:https?://)? # http(s):// (optional)
281 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
282 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
283 (?:.*?\#/)? # handle anchor (#/) redirect urls
284 (?: # the various things that can precede the ID:
285 (?:(?:v|embed|e)/) # v/ or embed/ or e/
286 |(?: # or the v= param in all its forms
287 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
288 (?:\?|\#!?) # the params delimiter ? or # or #!
289 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
292 )? # optional -> youtube.com/xxxx is OK
293 )? # all until now is optional -> you can pass the naked ID
294 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
295 (?(1).+)? # if we found the ID, everything can follow
297 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
298 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
299 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
300 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
301 _NETRC_MACHINE = 'youtube'
302 # Listed in order of quality
303 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
304 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> file-extension map (most entries elided from this view).
305 _video_extensions = {
311 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" display map (entries elided from this view).
317 _video_dimensions = {
# Defers playlist URLs to YoutubePlaylistIE; note _VALID_URL needs re.VERBOSE.
336 def suitable(cls, url):
337 """Receives a URL and returns True if suitable for this IE."""
338 if YoutubePlaylistIE.suitable(url): return False
339 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
341 def report_lang(self):
342 """Report attempt to set language."""
343 self.to_screen(u'Setting language')
345 def report_login(self):
346 """Report attempt to log in."""
347 self.to_screen(u'Logging in')
349 def report_video_webpage_download(self, video_id):
350 """Report attempt to download video webpage."""
351 self.to_screen(u'%s: Downloading video webpage' % video_id)
353 def report_video_info_webpage_download(self, video_id):
354 """Report attempt to download video info webpage."""
355 self.to_screen(u'%s: Downloading video info webpage' % video_id)
# NOTE(review): docstring below is copy-pasted from the method above and is
# inaccurate for this and the next method — a code change should fix them.
357 def report_video_subtitles_download(self, video_id):
358 """Report attempt to download video info webpage."""
359 self.to_screen(u'%s: Checking available subtitles' % video_id)
361 def report_video_subtitles_request(self, video_id, sub_lang, format):
362 """Report attempt to download video info webpage."""
363 self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
365 def report_video_subtitles_available(self, video_id, sub_lang_list):
366 """Report available subtitles."""
367 sub_lang = ",".join(list(sub_lang_list.keys()))
368 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
370 def report_information_extraction(self, video_id):
371 """Report attempt to extract video information."""
372 self.to_screen(u'%s: Extracting video information' % video_id)
374 def report_unavailable_format(self, video_id, format):
375 """Report extracted video URL."""
376 self.to_screen(u'%s: Format %s not available' % (video_id, format))
378 def report_rtmp_download(self):
379 """Indicate the download will use the RTMP protocol."""
380 self.to_screen(u'RTMP download detected')
# Queries the timedtext list endpoint; on error returns an (error, None)
# tuple rather than raising — callers test with isinstance(..., tuple).
382 def _get_available_subtitles(self, video_id):
383 self.report_video_subtitles_download(video_id)
384 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
386 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
387 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
388 return (u'unable to download video subtitles: %s' % compat_str(err), None)
# Map lang_code -> display name from the XML attribute pairs.
389 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
390 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
391 if not sub_lang_list:
392 return (u'video doesn\'t have subtitles', None)
395 def _list_available_subtitles(self, video_id):
396 sub_lang_list = self._get_available_subtitles(video_id)
397 self.report_video_subtitles_available(video_id, sub_lang_list)
# Downloads one subtitle track; returns (error, sub_lang, sub) triple
# with error=None on success (urlencode params body elided).
399 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
402 (error_message, sub_lang, sub)
404 self.report_video_subtitles_request(video_id, sub_lang, format)
405 params = compat_urllib_parse.urlencode({
411 url = 'http://www.youtube.com/api/timedtext?' + params
413 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
417 return (u'Did not fetch video subtitles', None, None)
418 return (None, sub_lang, sub)
# Fallback path: pull the auto-caption ttsurl out of the inlined
# ytplayer.config JSON on the watch page.
420 def _request_automatic_caption(self, video_id, webpage):
421 """We need the webpage for getting the captions url, pass it as an
422 argument to speed up the process."""
423 sub_lang = self._downloader.params.get('subtitleslang') or 'en'
424 sub_format = self._downloader.params.get('subtitlesformat')
425 self.to_screen(u'%s: Looking for automatic captions' % video_id)
426 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
427 err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
429 return [(err_msg, None, None)]
430 player_config = json.loads(mobj.group(1))
432 args = player_config[u'args']
433 caption_url = args[u'ttsurl']
434 timestamp = args[u'timestamp']
435 params = compat_urllib_parse.urlencode({
442 subtitles_url = caption_url + '&' + params
443 sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
444 return [(None, sub_lang, sub)]
446 return [(err_msg, None, None)]
# Picks one language: --sub-lang if given, else 'en', else the first
# available; returns a one-element list of (error, lang, sub) triples.
448 def _extract_subtitle(self, video_id):
450 Return a list with a tuple:
451 [(error_message, sub_lang, sub)]
453 sub_lang_list = self._get_available_subtitles(video_id)
454 sub_format = self._downloader.params.get('subtitlesformat')
455 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
456 return [(sub_lang_list[0], None, None)]
457 if self._downloader.params.get('subtitleslang', False):
458 sub_lang = self._downloader.params.get('subtitleslang')
459 elif 'en' in sub_lang_list:
462 sub_lang = list(sub_lang_list.keys())[0]
463 if not sub_lang in sub_lang_list:
464 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
466 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
# Like _extract_subtitle but fetches every available language (--all-subs).
469 def _extract_all_subtitles(self, video_id):
470 sub_lang_list = self._get_available_subtitles(video_id)
471 sub_format = self._downloader.params.get('subtitlesformat')
472 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
473 return [(sub_lang_list[0], None, None)]
475 for sub_lang in sub_lang_list:
476 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
477 subtitles.append(subtitle)
480 def _print_formats(self, formats):
481 print('Available formats:')
483 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# Initialization: set language cookie, then log in via username/password
# or .netrc, then confirm age. Each step warns (or raises, for age) on
# network failure. Many control-flow lines are elided below.
485 def _real_initialize(self):
486 if self._downloader is None:
491 downloader_params = self._downloader.params
493 # Attempt to use provided username and password or .netrc data
494 if downloader_params.get('username', None) is not None:
495 username = downloader_params['username']
496 password = downloader_params['password']
497 elif downloader_params.get('usenetrc', False):
499 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
504 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
505 except (IOError, netrc.NetrcParseError) as err:
506 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# Set language: best-effort, failure only warns.
510 request = compat_urllib_request.Request(self._LANG_URL)
513 compat_urllib_request.urlopen(request).read()
514 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
515 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
518 # No authentication to be performed
# Log in: scrape GALX/dsh tokens from the login page, then POST the form.
522 request = compat_urllib_request.Request(self._LOGIN_URL)
524 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
525 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
526 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
531 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
533 galx = match.group(1)
535 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
# Login form fields (several entries elided from this view).
541 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
545 u'PersistentCookie': u'yes',
547 u'bgresponse': u'js_disabled',
548 u'checkConnection': u'',
549 u'checkedDomains': u'youtube',
555 u'signIn': u'Sign in',
557 u'service': u'youtube',
561 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
563 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
564 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
565 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
568 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# If the login form is still present in the response, login failed.
569 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
570 self._downloader.report_warning(u'unable to log in: bad username or password')
572 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
573 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Confirm age: unlike the steps above, failure here raises.
579 'action_confirm': 'Confirm',
581 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
583 self.report_age_confirmation()
584 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
585 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
586 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
# Pulls the video ID out of any accepted URL form (group 2 of _VALID_URL).
588 def _extract_id(self, url):
589 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
591 raise ExtractorError(u'Invalid URL: %s' % url)
592 video_id = mobj.group(2)
# Main extraction pipeline: watch page -> get_video_info -> metadata ->
# subtitles -> format selection -> per-format result dicts.
595 def _real_extract(self, url):
596 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
597 mobj = re.search(self._NEXT_URL_RE, url)
599 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
600 video_id = self._extract_id(url)
603 self.report_video_webpage_download(video_id)
604 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
605 request = compat_urllib_request.Request(url)
607 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
608 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
609 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
611 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
613 # Attempt to extract SWF player URL
614 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
616 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try successive el= variants of get_video_info until one yields a token.
621 self.report_video_info_webpage_download(video_id)
622 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
623 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
624 % (video_id, el_type))
625 video_info_webpage = self._download_webpage(video_info_url, video_id,
627 errnote='unable to download video info webpage')
628 video_info = compat_parse_qs(video_info_webpage)
629 if 'token' in video_info:
631 if 'token' not in video_info:
632 if 'reason' in video_info:
633 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
635 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
637 # Check for "rental" videos
638 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
639 raise ExtractorError(u'"rental" videos not supported')
641 # Start extracting information
642 self.report_information_extraction(video_id)
# uploader (required)
645 if 'author' not in video_info:
646 raise ExtractorError(u'Unable to extract uploader name')
647 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
# uploader id (optional, scraped from the watch page)
650 video_uploader_id = None
651 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
653 video_uploader_id = mobj.group(1)
655 self._downloader.report_warning(u'unable to extract uploader nickname')
# title (required)
658 if 'title' not in video_info:
659 raise ExtractorError(u'Unable to extract video title')
660 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
# thumbnail (optional)
663 if 'thumbnail_url' not in video_info:
664 self._downloader.report_warning(u'unable to extract video thumbnail')
666 else: # don't panic if we can't find it
667 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scraped from the page and normalized to YYYYMMDD.
671 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
673 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
674 upload_date = unified_strdate(upload_date)
# description: element on the page, falling back to the meta tag.
677 video_description = get_element_by_id("eow-description", video_webpage)
678 if video_description:
679 video_description = clean_html(video_description)
681 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
683 video_description = unescapeHTML(fd_mobj.group(1))
685 video_description = u''
# subtitles: closed captions first, automatic captions as fallback.
688 video_subtitles = None
690 if self._downloader.params.get('writesubtitles', False):
691 video_subtitles = self._extract_subtitle(video_id)
693 (sub_error, sub_lang, sub) = video_subtitles[0]
695 # We try with the automatic captions
696 video_subtitles = self._request_automatic_caption(video_id, video_webpage)
697 (sub_error_auto, sub_lang, sub) = video_subtitles[0]
701 # We report the original error
702 self._downloader.report_warning(sub_error)
704 if self._downloader.params.get('allsubtitles', False):
705 video_subtitles = self._extract_all_subtitles(video_id)
706 for video_subtitle in video_subtitles:
707 (sub_error, sub_lang, sub) = video_subtitle
709 self._downloader.report_warning(sub_error)
711 if self._downloader.params.get('listsubtitles', False):
712 sub_lang_list = self._list_available_subtitles(video_id)
715 if 'length_seconds' not in video_info:
716 self._downloader.report_warning(u'unable to extract video duration')
719 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
# token
722 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
724 # Decide which formats to download
725 req_format = self._downloader.params.get('format', None)
# RTMP streams carry the URL in 'conn'; HTTP streams come from the
# url_encoded_fmt_stream_map, itag -> signed URL.
727 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
728 self.report_rtmp_download()
729 video_url_list = [(None, video_info['conn'][0])]
730 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
732 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
733 url_data = compat_parse_qs(url_data_str)
734 if 'itag' in url_data and 'url' in url_data:
735 url = url_data['url'][0]
736 if 'sig' in url_data:
737 url += '&signature=' + url_data['sig'][0]
738 if 'ratebypass' not in url:
739 url += '&ratebypass=yes'
740 url_map[url_data['itag'][0]] = url
# Format selection honors --max-quality, --prefer-free-formats, and the
# requested format spec (best/worst/all or a slash-delimited list).
742 format_limit = self._downloader.params.get('format_limit', None)
743 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
744 if format_limit is not None and format_limit in available_formats:
745 format_list = available_formats[available_formats.index(format_limit):]
747 format_list = available_formats
748 existing_formats = [x for x in format_list if x in url_map]
749 if len(existing_formats) == 0:
750 raise ExtractorError(u'no known formats available for video')
751 if self._downloader.params.get('listformats', None):
752 self._print_formats(existing_formats)
754 if req_format is None or req_format == 'best':
755 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
756 elif req_format == 'worst':
757 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
758 elif req_format in ('-1', 'all'):
759 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
761 # Specific formats. We pick the first in a slash-delimeted sequence.
762 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
763 req_formats = req_format.split('/')
764 video_url_list = None
765 for rf in req_formats:
767 video_url_list = [(rf, url_map[rf])]
769 if video_url_list is None:
770 raise ExtractorError(u'requested format not available')
772 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
# Build one result dict per selected format.
775 for format_param, video_real_url in video_url_list:
777 video_extension = self._video_extensions.get(format_param, 'flv')
779 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
780 self._video_dimensions.get(format_param, '???'))
784 'url': video_real_url,
785 'uploader': video_uploader,
786 'uploader_id': video_uploader_id,
787 'upload_date': upload_date,
788 'title': video_title,
789 'ext': video_extension,
790 'format': video_format,
791 'thumbnail': video_thumbnail,
792 'description': video_description,
793 'player_url': player_url,
794 'subtitles': video_subtitles,
795 'duration': video_duration
# Metacafe extractor; also recognizes yt- prefixed IDs and delegates them
# to the YouTube extractor via url_result.
# NOTE(review): line-number gaps show elided lines (the try:/if mobj lines
# pairing with the except/raise lines below) — bodies are fragments.
800 class MetacafeIE(InfoExtractor):
801 """Information Extractor for metacafe.com."""
803 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
804 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
805 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
806 IE_NAME = u'metacafe'
808 def report_disclaimer(self):
809 """Report disclaimer retrieval."""
810 self.to_screen(u'Retrieving disclaimer')
# Fetch the disclaimer page, then POST the family-filter form so adult
# content is reachable; both steps raise ExtractorError on network failure.
812 def _real_initialize(self):
813 # Retrieve disclaimer
814 request = compat_urllib_request.Request(self._DISCLAIMER)
816 self.report_disclaimer()
817 disclaimer = compat_urllib_request.urlopen(request).read()
818 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
819 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
824 'submit': "Continue - I'm over 18",
826 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
828 self.report_age_confirmation()
829 disclaimer = compat_urllib_request.urlopen(request).read()
830 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
831 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
833 def _real_extract(self, url):
834 # Extract id and simplified title from URL
835 mobj = re.match(self._VALID_URL, url)
837 raise ExtractorError(u'Invalid URL: %s' % url)
839 video_id = mobj.group(1)
841 # Check if video comes from YouTube
842 mobj2 = re.match(r'^yt-(.*)$', video_id)
843 if mobj2 is not None:
844 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
846 # Retrieve video webpage to extract further information
847 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
849 # Extract URL, uploader and title from webpage
850 self.report_extraction(video_id)
# Primary path: direct mediaURL (optionally signed with gdaKey);
# fallback path: parse the flashvars blob for mediaData.
851 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
853 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
854 video_extension = mediaURL[-3:]
856 # Extract gdaKey if available
857 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
861 gdaKey = mobj.group(1)
862 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
864 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
866 raise ExtractorError(u'Unable to extract media URL')
867 vardict = compat_parse_qs(mobj.group(1))
868 if 'mediaData' not in vardict:
869 raise ExtractorError(u'Unable to extract media URL')
870 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
872 raise ExtractorError(u'Unable to extract media URL')
873 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
874 video_extension = mediaURL[-3:]
875 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
877 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
879 raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode('utf-8') on these values is a Python-2-only idiom;
# on Python 3 str has no .decode — verify against the project's compat layer.
880 video_title = mobj.group(1).decode('utf-8')
882 mobj = re.search(r'submitter=(.*?);', webpage)
884 raise ExtractorError(u'Unable to extract uploader nickname')
885 video_uploader = mobj.group(1)
888 'id': video_id.decode('utf-8'),
889 'url': video_url.decode('utf-8'),
890 'uploader': video_uploader.decode('utf-8'),
892 'title': video_title,
893 'ext': video_extension.decode('utf-8'),
# Dailymotion extractor: scrapes the flashvars blob and picks the highest
# quality URL available from a fixed preference list.
# NOTE(review): line-number gaps show elided lines (e.g. the `if mobj is
# None:` lines pairing with the raise statements below) — bodies are fragments.
896 class DailymotionIE(InfoExtractor):
897 """Information Extractor for Dailymotion"""
899 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
900 IE_NAME = u'dailymotion'
902 def _real_extract(self, url):
903 # Extract id and simplified title from URL
904 mobj = re.match(self._VALID_URL, url)
906 raise ExtractorError(u'Invalid URL: %s' % url)
# ID is the path segment before any '_title' suffix or query string.
908 video_id = mobj.group(1).split('_')[0].split('?')[0]
910 video_extension = 'mp4'
912 # Retrieve video webpage to extract further information
913 request = compat_urllib_request.Request(url)
# Disable the family filter so age-gated videos are reachable.
914 request.add_header('Cookie', 'family_filter=off')
915 webpage = self._download_webpage(request, video_id)
917 # Extract URL, uploader and title from webpage
918 self.report_extraction(video_id)
919 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
921 raise ExtractorError(u'Unable to extract media URL')
922 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Quality preference, best first; the first key present in flashvars wins.
924 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
927 self.to_screen(u'Using %s' % key)
930 raise ExtractorError(u'Unable to extract video URL')
932 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
934 raise ExtractorError(u'Unable to extract video URL')
936 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
938 # TODO: support choosing qualities
940 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
942 raise ExtractorError(u'Unable to extract title')
943 video_title = unescapeHTML(mobj.group('title'))
945 video_uploader = None
946 video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
947 # Looking for official user
948 r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
949 webpage, 'video uploader')
951 video_upload_date = None
# Page shows DD-MM-YYYY; reorder to the YYYYMMDD convention.
952 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
954 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
959 'uploader': video_uploader,
960 'upload_date': video_upload_date,
961 'title': video_title,
962 'ext': video_extension,
# Photobucket extractor: prefers the JSON blob embedded in the page's
# javascript, falling back to the <link rel="video_src"> tag.
# NOTE(review): line-number gaps show elided lines (e.g. the `if mobj is
# None:` guards pairing with the raise statements) — bodies are fragments.
966 class PhotobucketIE(InfoExtractor):
967 """Information extractor for photobucket.com."""
969 # TODO: the original _VALID_URL was:
970 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
971 # Check if it's necessary to keep the old extracion process
972 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
973 IE_NAME = u'photobucket'
975 def _real_extract(self, url):
976 # Extract id from URL
977 mobj = re.match(self._VALID_URL, url)
979 raise ExtractorError(u'Invalid URL: %s' % url)
981 video_id = mobj.group('id')
# Extension comes straight from the URL match (flv or mp4).
983 video_extension = mobj.group('ext')
985 # Retrieve video webpage to extract further information
986 webpage = self._download_webpage(url, video_id)
988 # Extract URL, uploader, and title from webpage
989 self.report_extraction(video_id)
990 # We try first by looking the javascript code:
991 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
993 info = json.loads(mobj.group('json'))
996 'url': info[u'downloadUrl'],
997 'uploader': info[u'username'],
998 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
999 'title': info[u'title'],
1000 'ext': video_extension,
1001 'thumbnail': info[u'thumbUrl'],
1004 # We try looking in other parts of the webpage
1005 video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
1006 webpage, u'video URL')
1008 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1010 raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode('utf-8') on str is Python-2-only — verify against
# the project's compat layer before running under Python 3.
1011 video_title = mobj.group(1).decode('utf-8')
1012 video_uploader = mobj.group(2).decode('utf-8')
1015 'id': video_id.decode('utf-8'),
1016 'url': video_url.decode('utf-8'),
1017 'uploader': video_uploader,
1018 'upload_date': None,
1019 'title': video_title,
1020 'ext': video_extension.decode('utf-8'),
1024 class YahooIE(InfoExtractor):
# NOTE(review): elided excerpt — error-guard lines between the numbered
# statements are not shown here.
1025 """Information extractor for screen.yahoo.com."""
1026 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
1028 def _real_extract(self, url):
1029 mobj = re.match(self._VALID_URL, url)
1031 raise ExtractorError(u'Invalid URL: %s' % url)
1032 video_id = mobj.group('id')
1033 webpage = self._download_webpage(url, video_id)
# Two extraction strategies: if the page exposes a Media CONTENT_ID we use
# the YQL-based JSON API; otherwise we fall back to the mrss REST endpoint.
1034 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1037 # TODO: Check which url parameters are required
1038 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1039 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
# Verbose multi-line regex over the mrss XML: title, description, publish
# date and large thumbnail.
1040 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1041 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1042 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1043 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1045 self.report_extraction(video_id)
1046 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1048 raise ExtractorError(u'Unable to extract video info')
1049 video_title = m_info.group('title')
1050 video_description = m_info.group('description')
1051 video_thumb = m_info.group('thumb')
1052 video_date = m_info.group('date')
# Normalize MM/DD/YYYY to the YYYYMMDD form required by upload_date.
1053 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1055 # TODO: Find a way to get mp4 videos
1056 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1057 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1058 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1059 video_url = m_rest.group('url')
1060 video_path = m_rest.group('path')
1062 raise ExtractorError(u'Unable to extract video url')
1064 else: # We have to use a different method if another id is defined
1065 long_id = m_id.group('new_id')
# YQL query (URL-encoded) against yahoo.media.video.streams for the long id.
1066 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1067 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
# The endpoint answers with a JSONP callback; strip the wrapper before
# decoding the JSON payload.
1068 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1069 info = json.loads(json_str)
1070 res = info[u'query'][u'results'][u'mediaObj'][0]
# Take the first advertised stream: host becomes the url, path the play_path.
1071 stream = res[u'streams'][0]
1072 video_path = stream[u'path']
1073 video_url = stream[u'host']
1075 video_title = meta[u'title']
1076 video_description = meta[u'description']
1077 video_thumb = meta[u'thumbnail']
1078 video_date = None # I can't find it
1083 'play_path': video_path,
1084 'title':video_title,
1085 'description': video_description,
1086 'thumbnail': video_thumb,
1087 'upload_date': video_date,
1092 class VimeoIE(InfoExtractor):
# NOTE(review): elided excerpt — some guard/return lines between the
# numbered statements are not shown here.
1093 """Information extractor for vimeo.com."""
1095 # _VALID_URL matches Vimeo URLs
1096 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1099 def _verify_video_password(self, url, video_id, webpage):
# Submit the user-supplied --password together with the page's xsrft token
# to the video's /password endpoint.
1100 password = self._downloader.params.get('password', None)
1101 if password is None:
1102 raise ExtractorError(u'This video is protected by a password, use the --password option')
1103 token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
1104 data = compat_urllib_parse.urlencode({'password': password,
1106 # I didn't manage to use the password with https
1107 if url.startswith('https'):
1108 pass_url = url.replace('https','http')
1111 password_request = compat_urllib_request.Request(pass_url+'/password', data)
1112 password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1113 password_request.add_header('Cookie', 'xsrft=%s' % token)
1114 pass_web = self._download_webpage(password_request, video_id,
1115 u'Verifying the password',
1118 def _real_extract(self, url, new_video=True):
1119 # Extract ID from URL
1120 mobj = re.match(self._VALID_URL, url)
1122 raise ExtractorError(u'Invalid URL: %s' % url)
1124 video_id = mobj.group('id')
# Canonicalize: force https, and rewrite direct-link/pro URLs to the plain
# vimeo.com/<id> form.
1125 if not mobj.group('proto'):
1126 url = 'https://' + url
1127 if mobj.group('direct_link') or mobj.group('pro'):
1128 url = 'https://vimeo.com/' + video_id
1130 # Retrieve video webpage to extract further information
1131 request = compat_urllib_request.Request(url, None, std_headers)
1132 webpage = self._download_webpage(request, video_id)
1134 # Now we begin extracting as much information as we can from what we
1135 # retrieved. First we extract the information common to all extractors,
1136 # and latter we extract those that are Vimeo specific.
1137 self.report_extraction(video_id)
1139 # Extract the config JSON
# The player config JSON is sliced out of the page between the
# ' = {config:' and ',assets:' markers before json.loads.
1141 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1142 config = json.loads(config)
# Distinguish failure causes: embed restriction vs. password protection;
# after a successful password check we simply re-run the extraction.
1144 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1145 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1147 if re.search('If so please provide the correct password.', webpage):
1148 self._verify_video_password(url, video_id, webpage)
1149 return self._real_extract(url)
1151 raise ExtractorError(u'Unable to extract info section')
1154 video_title = config["video"]["title"]
1156 # Extract uploader and uploader_id
1157 video_uploader = config["video"]["owner"]["name"]
1158 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1160 # Extract video thumbnail
1161 video_thumbnail = config["video"]["thumbnail"]
1163 # Extract video description
1164 video_description = get_element_by_attribute("itemprop", "description", webpage)
1165 if video_description: video_description = clean_html(video_description)
1166 else: video_description = u''
1168 # Extract upload date
1169 video_upload_date = None
1170 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1171 if mobj is not None:
1172 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1174 # Vimeo specific: extract request signature and timestamp
1175 sig = config['request']['signature']
1176 timestamp = config['request']['timestamp']
1178 # Vimeo specific: extract video codec and quality information
1179 # First consider quality, then codecs, then take everything
1180 # TODO bind to format param
1181 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
# Bucket the available files by quality tier; codec preference order is the
# order of `codecs` above.
1182 files = { 'hd': [], 'sd': [], 'other': []}
1183 for codec_name, codec_extension in codecs:
1184 if codec_name in config["video"]["files"]:
1185 if 'hd' in config["video"]["files"][codec_name]:
1186 files['hd'].append((codec_name, codec_extension, 'hd'))
1187 elif 'sd' in config["video"]["files"][codec_name]:
1188 files['sd'].append((codec_name, codec_extension, 'sd'))
1190 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first file of the best non-empty tier (hd > sd > other).
1192 for quality in ('hd', 'sd', 'other'):
1193 if len(files[quality]) > 0:
1194 video_quality = files[quality][0][2]
1195 video_codec = files[quality][0][0]
1196 video_extension = files[quality][0][1]
1197 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1200 raise ExtractorError(u'No known codec found')
# Build the signed play_redirect URL from the request signature/timestamp.
1202 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1203 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1208 'uploader': video_uploader,
1209 'uploader_id': video_uploader_id,
1210 'upload_date': video_upload_date,
1211 'title': video_title,
1212 'ext': video_extension,
1213 'thumbnail': video_thumbnail,
1214 'description': video_description,
1218 class ArteTvIE(InfoExtractor):
# NOTE(review): elided excerpt — some surrounding lines (try:, returns,
# regex-tuple arguments) are not shown between the numbered statements.
1219 """arte.tv information extractor."""
1221 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1222 _LIVE_URL = r'index-[0-9]+\.html$'
1224 IE_NAME = u'arte.tv'
1226 def fetch_webpage(self, url):
# Raw page download helper; wraps network and URL errors in ExtractorError.
1227 request = compat_urllib_request.Request(url)
1229 self.report_download_webpage(url)
1230 webpage = compat_urllib_request.urlopen(request).read()
1231 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1232 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1233 except ValueError as err:
1234 raise ExtractorError(u'Invalid URL: %s' % url)
1237 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Download `url`, apply `regex`, and copy each requested group into an info
# dict; matchTuples is a list of (group_index, key, error_message).
1238 page = self.fetch_webpage(url)
1239 mobj = re.search(regex, page, regexFlags)
1243 raise ExtractorError(u'Invalid URL: %s' % url)
1245 for (i, key, err) in matchTuples:
1246 if mobj.group(i) is None:
1247 raise ExtractorError(err)
1249 info[key] = mobj.group(i)
1253 def extractLiveStream(self, url):
# Live streams: locate the videothek JS file, then grep the geo-specific
# stream path, SWF player and rtmp url out of it.
1254 video_lang = url.split('/')[-4]
1255 info = self.grep_webpage(
1257 r'src="(.*?/videothek_js.*?\.js)',
1260 (1, 'url', u'Invalid URL: %s' % url)
1263 http_host = url.split('/')[2]
1264 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1265 info = self.grep_webpage(
1267 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1268 '(http://.*?\.swf).*?' +
1272 (1, 'path', u'could not extract video path: %s' % url),
1273 (2, 'player', u'could not extract video player: %s' % url),
1274 (3, 'url', u'could not extract video url: %s' % url)
1277 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1279 def extractPlus7Stream(self, url):
# Plus7 (catch-up) videos: follow two levels of referenced XML documents,
# picking the variant matching the page language, then read id/title/date
# and the hd-quality url from the final <video> element.
1280 video_lang = url.split('/')[-3]
1281 info = self.grep_webpage(
1283 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1286 (1, 'url', u'Invalid URL: %s' % url)
1289 next_url = compat_urllib_parse.unquote(info.get('url'))
1290 info = self.grep_webpage(
1292 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1295 (1, 'url', u'Could not find <video> tag: %s' % url)
1298 next_url = compat_urllib_parse.unquote(info.get('url'))
1300 info = self.grep_webpage(
1302 r'<video id="(.*?)".*?>.*?' +
1303 '<name>(.*?)</name>.*?' +
1304 '<dateVideo>(.*?)</dateVideo>.*?' +
1305 '<url quality="hd">(.*?)</url>',
1308 (1, 'id', u'could not extract video id: %s' % url),
1309 (2, 'title', u'could not extract video title: %s' % url),
1310 (3, 'date', u'could not extract video date: %s' % url),
1311 (4, 'url', u'could not extract video url: %s' % url)
1316 'id': info.get('id'),
1317 'url': compat_urllib_parse.unquote(info.get('url')),
1318 'uploader': u'arte.tv',
1319 'upload_date': unified_strdate(info.get('date')),
1320 'title': info.get('title').decode('utf-8'),
1326 def _real_extract(self, url):
# Dispatch on URL shape: live-index pages go to extractLiveStream,
# everything else to extractPlus7Stream.
1327 video_id = url.split('/')[-1]
1328 self.report_extraction(video_id)
1330 if re.search(self._LIVE_URL, video_id) is not None:
1331 self.extractLiveStream(url)
1334 info = self.extractPlus7Stream(url)
1339 class GenericIE(InfoExtractor):
# NOTE(review): elided excerpt — several guard/return lines between the
# numbered statements are not shown here.
1340 """Generic last-resort information extractor."""
1343 IE_NAME = u'generic'
1345 def report_download_webpage(self, video_id):
"""Report webpage download."""
# Warn (outside of test mode) that we fell through to the generic extractor.
1347 if not self._downloader.params.get('test', False):
1348 self._downloader.report_warning(u'Falling back on generic information extractor.')
1349 super(GenericIE, self).report_download_webpage(video_id)
1351 def report_following_redirect(self, new_url):
1352 """Report information extraction."""
1353 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1355 def _test_redirect(self, url):
1356 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Resolve redirects with HEAD requests so we never download the body of a
# shortener/intermediate page.
1357 class HeadRequest(compat_urllib_request.Request):
1358 def get_method(self):
1361 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1363 Subclass the HTTPRedirectHandler to make it use our
1364 HeadRequest also on the redirected URL
1366 def redirect_request(self, req, fp, code, msg, headers, newurl):
1367 if code in (301, 302, 303, 307):
1368 newurl = newurl.replace(' ', '%20')
# Drop body-describing headers: the redirected request carries no body.
1369 newheaders = dict((k,v) for k,v in req.headers.items()
1370 if k.lower() not in ("content-length", "content-type"))
1371 return HeadRequest(newurl,
1373 origin_req_host=req.get_origin_req_host(),
1376 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1378 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1380 Fallback to GET if HEAD is not allowed (405 HTTP error)
1382 def http_error_405(self, req, fp, code, msg, headers):
1386 newheaders = dict((k,v) for k,v in req.headers.items()
1387 if k.lower() not in ("content-length", "content-type"))
1388 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1390 origin_req_host=req.get_origin_req_host(),
# Build a dedicated opener with the custom handlers above; order of
# add_handler calls mirrors this handler list.
1394 opener = compat_urllib_request.OpenerDirector()
1395 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1396 HTTPMethodFallback, HEADRedirectHandler,
1397 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1398 opener.add_handler(handler())
1400 response = opener.open(HeadRequest(url))
1401 if response is None:
1402 raise ExtractorError(u'Invalid URL protocol')
1403 new_url = response.geturl()
1408 self.report_following_redirect(new_url)
1411 def _real_extract(self, url):
# If the URL redirects somewhere else, delegate to the redirect target.
1412 new_url = self._test_redirect(url)
1413 if new_url: return [self.url_result(new_url)]
1415 video_id = url.split('/')[-1]
1417 webpage = self._download_webpage(url, video_id)
1418 except ValueError as err:
1419 # since this is the last-resort InfoExtractor, if
1420 # this error is thrown, it'll be thrown here
1421 raise ExtractorError(u'Invalid URL: %s' % url)
1423 self.report_extraction(video_id)
# Cascade of heuristics, most specific first; each fallback only runs when
# the previous regex found nothing.
1424 # Start with something easy: JW Player in SWFObject
1425 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1427 # Broaden the search a little bit
1428 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1430 # Broaden the search a little bit: JWPlayer JS loader
1431 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1433 # Try to find twitter cards info
1434 mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1436 # We look for Open Graph info:
1437 # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
1438 m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1439 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1440 if m_video_type is not None:
1441 mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
1443 raise ExtractorError(u'Invalid URL: %s' % url)
1445 # It's possible that one of the regexes
1446 # matched, but returned an empty group:
1447 if mobj.group(1) is None:
1448 raise ExtractorError(u'Invalid URL: %s' % url)
1450 video_url = compat_urllib_parse.unquote(mobj.group(1))
1451 video_id = os.path.basename(video_url)
1453 # here's a fun little line of code for you:
# Derive extension and id from the URL's basename.
1454 video_extension = os.path.splitext(video_id)[1][1:]
1455 video_id = os.path.splitext(video_id)[0]
1457 # it's tempting to parse this further, but you would
1458 # have to take into account all the variations like
1459 # Video Title - Site Name
1460 # Site Name | Video Title
1461 # Video Title - Tagline | Site Name
1462 # and so on and so forth; it's just not practical
1463 video_title = self._html_search_regex(r'<title>(.*)</title>',
1464 webpage, u'video title')
1466 # video uploader is domain name
1467 video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
1468 url, u'video uploader')
1473 'uploader': video_uploader,
1474 'upload_date': None,
1475 'title': video_title,
1476 'ext': video_extension,
1480 class YoutubeSearchIE(SearchInfoExtractor):
# NOTE(review): elided excerpt — loop/initialization lines between the
# numbered statements are not shown here.
1481 """Information Extractor for YouTube search queries."""
# gdata API, 50 results per page, JSON-C output.
1482 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1484 IE_NAME = u'youtube:search'
1485 _SEARCH_KEY = 'ytsearch'
1487 def report_download_page(self, query, pagenum):
1488 """Report attempt to download search page with given number."""
1489 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1491 def _get_n_results(self, query, n):
1492 """Get a specified number of results for a query"""
# Page through the API 50 ids at a time until we have `limit` results.
1498 while (50 * pagenum) < limit:
1499 self.report_download_page(query, pagenum+1)
# start-index is 1-based in the gdata API.
1500 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1501 request = compat_urllib_request.Request(result_url)
1503 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1504 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1505 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1506 api_response = json.loads(data)['data']
1508 if not 'items' in api_response:
1509 raise ExtractorError(u'[youtube] No video results')
1511 new_ids = list(video['id'] for video in api_response['items'])
1512 video_ids += new_ids
# Never request more than the API reports to exist.
1514 limit = min(n, api_response['totalItems'])
1517 if len(video_ids) > n:
1518 video_ids = video_ids[:n]
1519 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1520 return self.playlist_result(videos, query)
1523 class GoogleSearchIE(SearchInfoExtractor):
# NOTE(review): elided excerpt — playlist-dict and loop-exit lines between
# the numbered statements are not shown here.
1524 """Information Extractor for Google Video search queries."""
# Presence of the "next" pagination button marks that more pages exist.
1525 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1527 IE_NAME = u'video.google:search'
1528 _SEARCH_KEY = 'gvsearch'
1530 def _get_n_results(self, query, n):
1531 """Get a specified number of results for a query"""
1534 '_type': 'playlist',
# Scrape the HTML result pages (10 hits per page) until n results are
# collected or no further page is advertised.
1539 for pagenum in itertools.count(1):
1540 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1541 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1542 note='Downloading result page ' + str(pagenum))
1544 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1547 'url': mobj.group(1)
1549 res['entries'].append(e)
1551 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1554 class YahooSearchIE(SearchInfoExtractor):
# NOTE(review): elided excerpt — playlist-dict, break and `m` assignment
# lines between the numbered statements are not shown here.
1555 """Information Extractor for Yahoo! Video search queries."""
1558 IE_NAME = u'screen.yahoo:search'
1559 _SEARCH_KEY = 'yvsearch'
1561 def _get_n_results(self, query, n):
1562 """Get a specified number of results for a query"""
1565 '_type': 'playlist',
# JSON search endpoint, 30 results per page (b= is the result offset).
1569 for pagenum in itertools.count(0):
1570 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1571 webpage = self._download_webpage(result_url, query,
1572 note='Downloading results page '+str(pagenum+1))
1573 info = json.loads(webpage)
1575 results = info[u'results']
1577 for (i, r) in enumerate(results):
# Stop once we have delivered n entries across all pages.
1578 if (pagenum * 30) +i >= n:
1580 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1581 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1582 res['entries'].append(e)
# Also stop when the API reports this page contains the last result.
1583 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1589 class YoutubePlaylistIE(InfoExtractor):
# NOTE(review): elided excerpt — parts of the verbose _VALID_URL pattern and
# several loop/guard lines are not shown between the numbered statements.
1590 """Information Extractor for YouTube playlists."""
1592 _VALID_URL = r"""(?:
1597 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1598 \? (?:.*?&)*? (?:p|a|list)=
1601 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1604 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
# gdata playlist feed, paged via max-results/start-index.
1606 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1608 IE_NAME = u'youtube:playlist'
1611 def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
# _VALID_URL is written in verbose mode, so re.VERBOSE is required here.
1613 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1615 def _real_extract(self, url):
1616 # Extract playlist id
1617 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1619 raise ExtractorError(u'Invalid URL: %s' % url)
1621 # Download playlist videos from API
# The id may come from either alternative of the pattern.
1622 playlist_id = mobj.group(1) or mobj.group(2)
1627 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1628 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1631 response = json.loads(page)
1632 except ValueError as err:
1633 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1635 if 'feed' not in response:
1636 raise ExtractorError(u'Got a malformed response from YouTube API')
1637 playlist_title = response['feed']['title']['$t']
1638 if 'entry' not in response['feed']:
1639 # Number of videos is a multiple of self._MAX_RESULTS
# Collect (position, watch-url) pairs so the playlist order can be restored
# after paging.
1642 for entry in response['feed']['entry']:
1643 index = entry['yt$position']['$t']
1644 if 'media$group' in entry and 'media$player' in entry['media$group']:
1645 videos.append((index, entry['media$group']['media$player']['url']))
# A short page means we reached the final page of the feed.
1647 if len(response['feed']['entry']) < self._MAX_RESULTS:
1651 videos = [v[1] for v in sorted(videos)]
1653 url_results = [self.url_result(url, 'Youtube') for url in videos]
1654 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1657 class YoutubeChannelIE(InfoExtractor):
# NOTE(review): elided excerpt — initialization and loop-control lines
# between the numbered statements are not shown here.
1658 """Information Extractor for YouTube channels."""
1660 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1661 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# This marker in a page means a further page of videos can be loaded.
1662 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1663 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1664 IE_NAME = u'youtube:channel'
1666 def extract_videos_from_page(self, page):
# Scrape unique watch?v= ids from the HTML, preserving first-seen order.
1668 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1669 if mobj.group(1) not in ids_in_page:
1670 ids_in_page.append(mobj.group(1))
1673 def _real_extract(self, url):
1674 # Extract channel id
1675 mobj = re.match(self._VALID_URL, url)
1677 raise ExtractorError(u'Invalid URL: %s' % url)
1679 # Download channel page
1680 channel_id = mobj.group(1)
1684 url = self._TEMPLATE_URL % (channel_id, pagenum)
1685 page = self._download_webpage(url, channel_id,
1686 u'Downloading page #%s' % pagenum)
1688 # Extract video identifiers
1689 ids_in_page = self.extract_videos_from_page(page)
1690 video_ids.extend(ids_in_page)
1692 # Download any subsequent channel pages using the json-based channel_ajax query
1693 if self._MORE_PAGES_INDICATOR in page:
1695 pagenum = pagenum + 1
1697 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1698 page = self._download_webpage(url, channel_id,
1699 u'Downloading page #%s' % pagenum)
# The ajax endpoint returns JSON; the video list lives in content_html and
# the pagination marker in load_more_widget_html.
1701 page = json.loads(page)
1703 ids_in_page = self.extract_videos_from_page(page['content_html'])
1704 video_ids.extend(ids_in_page)
1706 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1709 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1711 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1712 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1713 return [self.playlist_result(url_entries, channel_id)]
1716 class YoutubeUserIE(InfoExtractor):
# NOTE(review): elided excerpt — guard, initialization and loop lines
# between the numbered statements are not shown here.
1717 """Information Extractor for YouTube users."""
1719 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1720 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# The gdata uploads feed caps each query at 50 results.
1721 _GDATA_PAGE_SIZE = 50
1722 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1723 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1724 IE_NAME = u'youtube:user'
1726 def _real_extract(self, url):
1728 mobj = re.match(self._VALID_URL, url)
1730 raise ExtractorError(u'Invalid URL: %s' % url)
1732 username = mobj.group(1)
1734 # Download video ids using YouTube Data API. Result size per
1735 # query is limited (currently to 50 videos) so we need to query
1736 # page by page until there are no video ids - it means we got
# start-index is 1-based in the gdata API.
1743 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1745 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1746 page = self._download_webpage(gdata_url, username,
1747 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1749 # Extract video identifiers
# Deduplicate ids while keeping first-seen order within the page.
1752 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1753 if mobj.group(1) not in ids_in_page:
1754 ids_in_page.append(mobj.group(1))
1756 video_ids.extend(ids_in_page)
1758 # A little optimization - if current page is not
1759 # "full", ie. does not contain PAGE_SIZE video ids then
1760 # we can assume that this page is the last one - there
1761 # are no more ids on further pages - no need to query
1764 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1769 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1770 url_results = [self.url_result(url, 'Youtube') for url in urls]
1771 return [self.playlist_result(url_results, playlist_title = username)]
1774 class BlipTVUserIE(InfoExtractor):
# NOTE(review): elided excerpt — guard, initialization and loop lines
# between the numbered statements are not shown here.
1775 """Information Extractor for blip.tv users."""
1777 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1779 IE_NAME = u'blip.tv:user'
1781 def _real_extract(self, url):
1783 mobj = re.match(self._VALID_URL, url)
1785 raise ExtractorError(u'Invalid URL: %s' % url)
1787 username = mobj.group(1)
1789 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
# The numeric users_id needed by the ajax endpoint is scraped from the
# user's mobile page.
1791 page = self._download_webpage(url, username, u'Downloading user page')
1792 mobj = re.search(r'data-users-id="([^"]+)"', page)
1793 page_base = page_base % mobj.group(1)
1796 # Download video ids using BlipTV Ajax calls. Result size per
1797 # query is limited (currently to 12 videos) so we need to query
1798 # page by page until there are no video ids - it means we got
1805 url = page_base + "&page=" + str(pagenum)
1806 page = self._download_webpage(url, username,
1807 u'Downloading video ids from page %d' % pagenum)
1809 # Extract video identifiers
# Deduplicate hrefs while keeping first-seen order within the page.
1812 for mobj in re.finditer(r'href="/([^"]+)"', page):
1813 if mobj.group(1) not in ids_in_page:
1814 ids_in_page.append(unescapeHTML(mobj.group(1)))
1816 video_ids.extend(ids_in_page)
1818 # A little optimization - if current page is not
1819 # "full", ie. does not contain PAGE_SIZE video ids then
1820 # we can assume that this page is the last one - there
1821 # are no more ids on further pages - no need to query
1824 if len(ids_in_page) < self._PAGE_SIZE:
1829 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1830 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1831 return [self.playlist_result(url_entries, playlist_title = username)]
1834 class DepositFilesIE(InfoExtractor):
# NOTE(review): elided excerpt — try:/return lines between the numbered
# statements are not shown here.
1835 """Information extractor for depositfiles.com"""
1837 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1839 def _real_extract(self, url):
1840 file_id = url.split('/')[-1]
1841 # Rebuild url in english locale
1842 url = 'http://depositfiles.com/en/files/' + file_id
1844 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the "Free download" button.
1845 free_download_indication = { 'gateway_result' : '1' }
1846 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1848 self.report_download_webpage(file_id)
1849 webpage = compat_urllib_request.urlopen(request).read()
1850 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1851 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1853 # Search for the real file URL
1854 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1855 if (mobj is None) or (mobj.group(1) is None):
1856 # Try to figure out reason of the error.
# Surface the site's own restriction notice (whitespace-normalized) as the
# error message when no download form is present.
1857 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1858 if (mobj is not None) and (mobj.group(1) is not None):
1859 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1860 raise ExtractorError(u'%s' % restriction_message)
1862 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1864 file_url = mobj.group(1)
1865 file_extension = os.path.splitext(file_url)[1][1:]
1867 # Search for file title
1868 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# NOTE(review): .decode('utf-8') on str is Python-2-only; confirm intended
# interpreter.
1871 'id': file_id.decode('utf-8'),
1872 'url': file_url.decode('utf-8'),
1874 'upload_date': None,
1875 'title': file_title,
1876 'ext': file_extension.decode('utf-8'),
1880 class FacebookIE(InfoExtractor):
# NOTE(review): elided excerpt — several guard, try:, login-form and return
# lines between the numbered statements are not shown here.
1881 """Information Extractor for Facebook"""
1883 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1884 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1885 _NETRC_MACHINE = 'facebook'
1886 IE_NAME = u'facebook'
1888 def report_login(self):
1889 """Report attempt to log in."""
1890 self.to_screen(u'Logging in')
1892 def _real_initialize(self):
# Optional login step: credentials come from --username/--password or, with
# --netrc, from the 'facebook' machine entry in ~/.netrc. Login failures
# only produce warnings; extraction proceeds unauthenticated.
1893 if self._downloader is None:
1898 downloader_params = self._downloader.params
1900 # Attempt to use provided username and password or .netrc data
1901 if downloader_params.get('username', None) is not None:
1902 useremail = downloader_params['username']
1903 password = downloader_params['password']
1904 elif downloader_params.get('usenetrc', False):
1906 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1907 if info is not None:
1911 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1912 except (IOError, netrc.NetrcParseError) as err:
1913 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1916 if useremail is None:
1925 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1928 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> in the response means authentication did not succeed.
1929 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1930 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1932 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1933 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1936 def _real_extract(self, url):
1937 mobj = re.match(self._VALID_URL, url)
1939 raise ExtractorError(u'Invalid URL: %s' % url)
1940 video_id = mobj.group('ID')
1942 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1943 webpage = self._download_webpage(url, video_id)
# The player parameters are a JSON blob sandwiched between these two exact
# script fragments; re.escape keeps the markers literal.
1945 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1946 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1947 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1949 raise ExtractorError(u'Cannot parse data')
1950 data = dict(json.loads(m.group(1)))
1951 params_raw = compat_urllib_parse.unquote(data['params'])
1952 params = json.loads(params_raw)
1953 video_data = params['video_data'][0]
# Prefer the HD source and fall back to SD.
1954 video_url = video_data.get('hd_src')
1956 video_url = video_data['sd_src']
1958 raise ExtractorError(u'Cannot find video URL')
1959 video_duration = int(video_data['video_duration'])
1960 thumbnail = video_data['thumbnail_src']
1962 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
1967 'title': video_title,
1970 'duration': video_duration,
1971 'thumbnail': thumbnail,
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
1976 class BlipTVIE(InfoExtractor):
1977 """Information extractor for blip.tv"""
1979 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
# Used to pull the filename extension off the final media URL.
1980 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1981 IE_NAME = u'blip.tv'
1983 def report_direct_download(self, title):
1984 """Report information extraction."""
1985 self.to_screen(u'%s: Direct download detected' % title)
1987 def _real_extract(self, url):
1988 mobj = re.match(self._VALID_URL, url)
1990 raise ExtractorError(u'Invalid URL: %s' % url)
# api.swf fragment URLs are rewritten to the /play/ form first.
1992 # See https://github.com/rg3/youtube-dl/issues/857
1993 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1994 if api_mobj is not None:
1995 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1996 urlp = compat_urllib_parse_urlparse(url)
# /play/ URLs redirect; the real file id lives in the redirect's fragment,
# so resolve it and recurse on the canonical /a/a-<id> URL.
1997 if urlp.path.startswith('/play/'):
1998 request = compat_urllib_request.Request(url)
1999 response = compat_urllib_request.urlopen(request)
2000 redirecturl = response.geturl()
2001 rurlp = compat_urllib_parse_urlparse(redirecturl)
2002 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2003 url = 'http://blip.tv/a/a-' + file_id
2004 return self._real_extract(url)
# Ask blip.tv for JSON metadata by spoofing the iTunes user agent.
2011 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2012 request = compat_urllib_request.Request(json_url)
2013 request.add_header('User-Agent', 'iTunes/10.6.1')
2014 self.report_extraction(mobj.group(1))
2017 urlh = compat_urllib_request.urlopen(request)
# A video/* content type means the server handed us the media itself.
2018 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2019 basename = url.split('/')[-1]
2020 title,ext = os.path.splitext(basename)
2021 title = title.decode('UTF-8')
2022 ext = ext.replace('.', '')
2023 self.report_direct_download(title)
2028 'upload_date': None,
2033 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2034 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2035 if info is None: # Regular URL
2037 json_code_bytes = urlh.read()
2038 json_code = json_code_bytes.decode('utf-8')
2039 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2040 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2043 json_data = json.loads(json_code)
# Metadata may be nested under a 'Post' key or be the top-level object.
2044 if 'Post' in json_data:
2045 data = json_data['Post']
# blip.tv datestamps look like '08-31-12 11:00AM'; normalize to YYYYMMDD.
2049 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2050 video_url = data['media']['url']
2051 umobj = re.match(self._URL_EXT, video_url)
2053 raise ValueError('Can not determine filename extension')
2054 ext = umobj.group(1)
2057 'id': data['item_id'],
2059 'uploader': data['display_name'],
2060 'upload_date': upload_date,
2061 'title': data['title'],
2063 'format': data['media']['mimeType'],
2064 'thumbnail': data['thumbnailUrl'],
2065 'description': data['description'],
2066 'player_url': data['embedUrl'],
2067 'user_agent': 'iTunes/10.6.1',
2069 except (ValueError,KeyError) as err:
2070 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
2075 class MyVideoIE(InfoExtractor):
2076 """Information Extractor for myvideo.de."""
2078 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2079 IE_NAME = u'myvideo'
2081 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2082 # Released into the Public Domain by Tristan Fischer on 2013-05-19
2083 # https://github.com/rg3/youtube-dl/pull/842
# RC4 stream cipher (key-scheduling loop visible below; PRGA partially elided).
# Used to decrypt the player XML data returned by myvideo.de.
2084 def __rc4crypt(self,data, key):
2086 box = list(range(256))
2087 for i in list(range(256)):
2088 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2089 box[i], box[x] = box[x], box[i]
2095 y = (y + box[x]) % 256
2096 box[x], box[y] = box[y], box[x]
2097 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# Returns the hex MD5 digest of s as bytes (helper, def line elided above).
2101 return hashlib.md5(s).hexdigest().encode()
2103 def _real_extract(self,url):
2104 mobj = re.match(self._VALID_URL, url)
2106 raise ExtractorError(u'invalid URL: %s' % url)
2108 video_id = mobj.group(1)
# Obfuscated (double-base64) key material used to derive the RC4 key below.
2111 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2112 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2113 b'TnpsbA0KTVRkbU1tSTRNdz09'
2117 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2118 webpage = self._download_webpage(webpage_url, video_id)
# Easy case: a plain <source src='...'> tag yields an FLV URL directly.
2120 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2121 if mobj is not None:
2122 self.report_extraction(video_id)
2123 video_url = mobj.group(1) + '.flv'
2125 video_title = self._html_search_regex('<title>([^<]+)</title>',
2128 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
2134 'upload_date': None,
2135 'title': video_title,
# Hard case: parse the JS flashvars blob into request params for the
# encrypted player XML endpoint.
2140 mobj = re.search('var flashvars={(.+?)}', webpage)
2142 raise ExtractorError(u'Unable to extract video')
2147 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2148 if not a == '_encxml':
2151 encxml = compat_urllib_parse.unquote(b)
2152 if not params.get('domain'):
2153 params['domain'] = 'www.myvideo.de'
2154 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
# The MTV player variant is avoided by falling back to the generic endpoint.
2155 if 'flash_playertype=MTV' in xmldata_url:
2156 self._downloader.report_warning(u'avoiding MTV player')
2158 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2159 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# Response is hex-encoded RC4 ciphertext after the '=' separator.
2163 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2164 enc_data_b = binascii.unhexlify(enc_data)
2166 base64.b64decode(base64.b64decode(GK)) +
2168 str(video_id).encode('utf-8')
2171 dec_data = self.__rc4crypt(enc_data_b, sk)
2174 self.report_extraction(video_id)
# RTMP branch: connectionurl found in the decrypted data.
2177 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2179 video_url = compat_urllib_parse.unquote(mobj.group(1))
2180 if 'myvideo2flash' in video_url:
2181 self._downloader.report_warning(u'forcing RTMPT ...')
2182 video_url = video_url.replace('rtmpe://', 'rtmpt://')
2185 # extract non rtmp videos
2186 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2188 raise ExtractorError(u'unable to extract url')
2189 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2191 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2192 video_file = compat_urllib_parse.unquote(video_file)
# f4m manifests are mapped to their m3u8 HLS equivalent; otherwise build
# the RTMP play path as '<ext>:<path>'.
2194 if not video_file.endswith('f4m'):
2195 ppath, prefix = video_file.split('.')
2196 video_playpath = '%s:%s' % (prefix, ppath)
2197 video_hls_playlist = ''
2200 video_hls_playlist = (
2201 video_filepath + video_file
2202 ).replace('.f4m', '.m3u8')
2204 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2205 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2207 video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
2213 'tc_url': video_url,
2215 'upload_date': None,
2216 'title': video_title,
2218 'play_path': video_playpath,
2219 'video_file': video_file,
2220 'video_hls_playlist': video_hls_playlist,
2221 'player_url': video_swfobj,
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
2225 class ComedyCentralIE(InfoExtractor):
2226 """Information extractor for The Daily Show and Colbert Report """
2228 # urls can be abbreviations like :thedailyshow or :colbert
2229 # urls for episodes like:
2230 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2231 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2232 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex: must be matched with re.VERBOSE (see suitable()).
2233 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2234 |(https?://)?(www\.)?
2235 (?P<showname>thedailyshow|colbertnation)\.com/
2236 (full-episodes/(?P<episode>.*)|
2238 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2239 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Known bitrates in ascending quality order; tables below map them to
# extensions/dimensions (bodies elided in this listing).
2242 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2244 _video_extensions = {
2252 _video_dimensions = {
# Overrides the base class because _VALID_URL requires the VERBOSE flag.
2262 def suitable(cls, url):
2263 """Receives a URL and returns True if suitable for this IE."""
2264 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2266 def _print_formats(self, formats):
2267 print('Available formats:')
2269 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2272 def _real_extract(self, url):
2273 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2275 raise ExtractorError(u'Invalid URL: %s' % url)
# Shortnames like ':tds' expand to the show's full-episodes page.
2277 if mobj.group('shortname'):
2278 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2279 url = u'http://www.thedailyshow.com/full-episodes/'
2281 url = u'http://www.colbertnation.com/full-episodes/'
2282 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2283 assert mobj is not None
2285 if mobj.group('clip'):
2286 if mobj.group('showname') == 'thedailyshow':
2287 epTitle = mobj.group('tdstitle')
2289 epTitle = mobj.group('cntitle')
2292 dlNewest = not mobj.group('episode')
2294 epTitle = mobj.group('showname')
2296 epTitle = mobj.group('episode')
2298 self.report_extraction(epTitle)
# The page may redirect to the latest episode; re-validate the final URL.
2299 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2301 url = htmlHandle.geturl()
2302 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2304 raise ExtractorError(u'Invalid redirected URL: ' + url)
2305 if mobj.group('episode') == '':
2306 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2307 epTitle = mobj.group('episode')
# Locate the mtvnservices media URI embedded in the player markup.
2309 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2311 if len(mMovieParams) == 0:
2312 # The Colbert Report embeds the information in a without
2313 # a URL prefix; so extract the alternate reference
2314 # and then add the URL prefix manually.
2316 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2317 if len(altMovieParams) == 0:
2318 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2320 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# Episodes are split into parts; the MRSS index lists one <item> per part.
2322 uri = mMovieParams[0][1]
2323 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2324 indexXml = self._download_webpage(indexUrl, epTitle,
2325 u'Downloading show index',
2326 u'unable to download episode index')
2330 idoc = xml.etree.ElementTree.fromstring(indexXml)
2331 itemEls = idoc.findall('.//item')
2332 for partNum,itemEl in enumerate(itemEls):
2333 mediaId = itemEl.findall('./guid')[0].text
2334 shortMediaId = mediaId.split(':')[-1]
2335 showId = mediaId.split(':')[-2].replace('.com', '')
2336 officialTitle = itemEl.findall('./title')[0].text
2337 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
# Each part has its own mediaGen config XML listing renditions per bitrate.
2339 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2340 compat_urllib_parse.urlencode({'uri': mediaId}))
2341 configXml = self._download_webpage(configUrl, epTitle,
2342 u'Downloading configuration for %s' % shortMediaId)
2344 cdoc = xml.etree.ElementTree.fromstring(configXml)
2346 for rendition in cdoc.findall('.//rendition'):
2347 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2351 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2354 if self._downloader.params.get('listformats', None):
2355 self._print_formats([i[0] for i in turls])
2358 # For now, just pick the highest bitrate
2359 format,rtmp_video_url = turls[-1]
2361 # Get the format arg from the arg stream
2362 req_format = self._downloader.params.get('format', None)
2364 # Select format if we can find one
2367 format, rtmp_video_url = f, v
# The RTMP URL is rewritten into a plain HTTP URL on the mtvnmobile CDN.
2370 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2372 raise ExtractorError(u'Cannot transform RTMP url')
2373 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2374 video_url = base + m.group('finalid')
2376 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2381 'upload_date': officialDate,
2386 'description': officialTitle,
# One info dict per episode part; the accumulated list is returned.
2388 results.append(info)
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
2393 class EscapistIE(InfoExtractor):
2394 """Information extractor for The Escapist """
2396 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2397 IE_NAME = u'escapist'
2399 def _real_extract(self, url):
2400 mobj = re.match(self._VALID_URL, url)
2402 raise ExtractorError(u'Invalid URL: %s' % url)
2403 showName = mobj.group('showname')
2404 videoId = mobj.group('episode')
2406 self.report_extraction(videoId)
2407 webpage = self._download_webpage(url, videoId)
# Description and thumbnail are optional (fatal=False); player URL and
# title are required for extraction to proceed.
2409 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
2410 webpage, u'description', fatal=False)
2412 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
2413 webpage, u'thumbnail', fatal=False)
2415 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
2416 webpage, u'player url')
# The <meta name="title"> value looks like 'Show : Episode'; keep the tail.
2418 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
2419 webpage, u'player url').split(' : ')[-1]
# The player URL carries a URL-encoded config location in its query string.
2421 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2422 configUrl = compat_urllib_parse.unquote(configUrl)
2424 configJSON = self._download_webpage(configUrl, videoId,
2425 u'Downloading configuration',
2426 u'unable to download configuration')
2428 # Technically, it's JavaScript, not JSON
# Crude normalization: single quotes -> double quotes so json.loads accepts it.
2429 configJSON = configJSON.replace("'", '"')
2432 config = json.loads(configJSON)
2433 except (ValueError,) as err:
2434 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2436 playlist = config['playlist']
# Index 1 of the playlist holds the actual video entry.
2437 videoUrl = playlist[1]['url']
2442 'uploader': showName,
2443 'upload_date': None,
2446 'thumbnail': imgUrl,
2447 'description': videoDesc,
2448 'player_url': playerUrl,
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
2453 class CollegeHumorIE(InfoExtractor):
2454 """Information extractor for collegehumor.com"""
2457 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2458 IE_NAME = u'collegehumor'
2460 def report_manifest(self, video_id):
2461 """Report information extraction."""
2462 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2464 def _real_extract(self, url):
2465 mobj = re.match(self._VALID_URL, url)
2467 raise ExtractorError(u'Invalid URL: %s' % url)
2468 video_id = mobj.group('videoid')
2473 'upload_date': None,
2476 self.report_extraction(video_id)
# Step 1: moogaloop metadata XML gives title/description/thumbnail and the
# f4m manifest URL.
2477 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2479 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2480 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2481 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2483 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2485 videoNode = mdoc.findall('./video')[0]
2486 info['description'] = videoNode.findall('./description')[0].text
2487 info['title'] = videoNode.findall('./caption')[0].text
2488 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2489 manifest_url = videoNode.findall('./file')[0].text
2491 raise ExtractorError(u'Invalid metadata XML file')
# Step 2: fetch the Adobe HDS (f4m) manifest; hdcore param is required.
2493 manifest_url += '?hdcore=2.10.3'
2494 self.report_manifest(video_id)
2496 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2497 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2498 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
# The f4m namespace must be spelled out for ElementTree lookups.
2500 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2502 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2503 node_id = media_node.attrib['url']
2504 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2505 except IndexError as err:
2506 raise ExtractorError(u'Invalid manifest file')
# Step 3: synthesize the segment URL from the manifest's id and media url.
2508 url_pr = compat_urllib_parse_urlparse(manifest_url)
2509 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
2516 class XVideosIE(InfoExtractor):
2517 """Information extractor for xvideos.com"""
2519 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2520 IE_NAME = u'xvideos'
2522 def _real_extract(self, url):
2523 mobj = re.match(self._VALID_URL, url)
2525 raise ExtractorError(u'Invalid URL: %s' % url)
2526 video_id = mobj.group(1)
2528 webpage = self._download_webpage(url, video_id)
2530 self.report_extraction(video_id)
# The flash player receives the media URL percent-encoded in flv_url=...
2533 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2534 webpage, u'video URL'))
# Page titles end with '- XVID...'; capture only the part before it.
2537 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
2540 # Extract video thumbnail
2541 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2542 webpage, u'thumbnail', fatal=False)
2548 'upload_date': None,
2549 'title': video_title,
2551 'thumbnail': video_thumbnail,
2552 'description': None,
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
2558 class SoundcloudIE(InfoExtractor):
2559 """Information extractor for soundcloud.com
2560 To access the media, the uid of the song and a stream token
2561 must be extracted from the page source and the script must make
2562 a request to media.soundcloud.com/crossdomain.xml. Then
2563 the media can be grabbed by requesting from an url composed
2564 of the stream token and uid
# Group 1 = uploader slug, group 2 = track slug.
2567 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2568 IE_NAME = u'soundcloud'
2570 def report_resolve(self, video_id):
2571 """Report information extraction."""
2572 self.to_screen(u'%s: Resolving id' % video_id)
2574 def _real_extract(self, url):
2575 mobj = re.match(self._VALID_URL, url)
2577 raise ExtractorError(u'Invalid URL: %s' % url)
2579 # extract uploader (which is in the url)
2580 uploader = mobj.group(1)
2581 # extract simple title (uploader + slug of song title)
2582 slug_title = mobj.group(2)
2583 simple_title = uploader + u'-' + slug_title
2584 full_title = '%s/%s' % (uploader, slug_title)
2586 self.report_resolve(full_title)
# resolve.json maps the human-readable page URL to the API track object.
# The client_id is a hard-coded public API key.
2588 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2589 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2590 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2592 info = json.loads(info_json)
2593 video_id = info['id']
2594 self.report_extraction(full_title)
# The streams endpoint yields the concrete media URLs per encoding.
2596 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2597 stream_json = self._download_webpage(streams_url, full_title,
2598 u'Downloading stream definitions',
2599 u'unable to download stream definitions')
2601 streams = json.loads(stream_json)
2602 mediaURL = streams['http_mp3_128_url']
2603 upload_date = unified_strdate(info['created_at'])
2608 'uploader': info['user']['username'],
2609 'upload_date': upload_date,
2610 'title': info['title'],
2612 'description': info['description'],
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
# Playlist ("set") variant of SoundcloudIE: resolves the set, then extracts
# every track in it.
2615 class SoundcloudSetIE(InfoExtractor):
2616 """Information extractor for soundcloud.com sets
2617 To access the media, the uid of the song and a stream token
2618 must be extracted from the page source and the script must make
2619 a request to media.soundcloud.com/crossdomain.xml. Then
2620 the media can be grabbed by requesting from an url composed
2621 of the stream token and uid
2624 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2625 IE_NAME = u'soundcloud:set'
2627 def report_resolve(self, video_id):
2628 """Report information extraction."""
2629 self.to_screen(u'%s: Resolving id' % video_id)
2631 def _real_extract(self, url):
2632 mobj = re.match(self._VALID_URL, url)
2634 raise ExtractorError(u'Invalid URL: %s' % url)
2636 # extract uploader (which is in the url)
2637 uploader = mobj.group(1)
2638 # extract simple title (uploader + slug of song title)
2639 slug_title = mobj.group(2)
2640 simple_title = uploader + u'-' + slug_title
2641 full_title = '%s/sets/%s' % (uploader, slug_title)
2643 self.report_resolve(full_title)
# Same resolve.json flow as SoundcloudIE, but for the set URL.
2645 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2646 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2647 info_json = self._download_webpage(resolv_url, full_title)
2650 info = json.loads(info_json)
# API-level errors are reported per entry before bailing out.
2651 if 'errors' in info:
2652 for err in info['errors']:
2653 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2656 self.report_extraction(full_title)
# One stream lookup per track in the set.
2657 for track in info['tracks']:
2658 video_id = track['id']
2660 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2661 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2663 self.report_extraction(video_id)
2664 streams = json.loads(stream_json)
2665 mediaURL = streams['http_mp3_128_url']
2670 'uploader': track['user']['username'],
2671 'upload_date': unified_strdate(track['created_at']),
2672 'title': track['title'],
2674 'description': track['description'],
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
2679 class InfoQIE(InfoExtractor):
2680 """Information extractor for infoq.com"""
2681 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2683 def _real_extract(self, url):
2684 mobj = re.match(self._VALID_URL, url)
2686 raise ExtractorError(u'Invalid URL: %s' % url)
# No separate id in the URL; the URL itself doubles as the video id.
2688 webpage = self._download_webpage(url, video_id=url)
2689 self.report_extraction(url)
# The real media id is base64-encoded in the page's jsclassref JS variable.
2692 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2694 raise ExtractorError(u'Unable to extract video url')
2695 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2696 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2699 video_title = self._search_regex(r'contentTitle = "(.*?)";',
2702 # Extract description
2703 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2704 webpage, u'description', fatal=False)
# Derive id/extension from the media filename at the end of the RTMP URL.
2706 video_filename = video_url.split('/')[-1]
2707 video_id, extension = video_filename.split('.')
2713 'upload_date': None,
2714 'title': video_title,
2715 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2717 'description': video_description,
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
2722 class MixcloudIE(InfoExtractor):
2723 """Information extractor for www.mixcloud.com"""
# Disabled: the site moved to a new API (see comment); tests are skipped.
2725 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2726 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2727 IE_NAME = u'mixcloud'
2729 def report_download_json(self, file_id):
2730 """Report JSON download."""
2731 self.to_screen(u'Downloading json')
2733 def get_urls(self, jsonData, fmt, bitrate='best'):
2734 """Get urls from 'audio_formats' section in json"""
# Formats may or may not be keyed by bitrate; TypeError below handles the
# bitrate-less shape.
2737 bitrate_list = jsonData[fmt]
2738 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2739 bitrate = max(bitrate_list) # select highest
2741 url_list = jsonData[fmt][bitrate]
2742 except TypeError: # we have no bitrate info.
2743 url_list = jsonData[fmt]
2746 def check_urls(self, url_list):
2747 """Returns 1st active url from list"""
# Probes each candidate with a real request; network errors skip to the next.
2748 for url in url_list:
2750 compat_urllib_request.urlopen(url)
2752 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2757 def _print_formats(self, formats):
2758 print('Available formats:')
2759 for fmt in formats.keys():
2760 for b in formats[fmt]:
2762 ext = formats[fmt][b][0]
2763 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2764 except TypeError: # we have no bitrate info
2765 ext = formats[fmt][0]
2766 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2769 def _real_extract(self, url):
2770 mobj = re.match(self._VALID_URL, url)
2772 raise ExtractorError(u'Invalid URL: %s' % url)
2773 # extract uploader & filename from url
2774 uploader = mobj.group(1).decode('utf-8')
2775 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2777 # construct API request
2778 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2779 # retrieve .json file with links to files
2780 request = compat_urllib_request.Request(file_url)
2782 self.report_download_json(file_url)
2783 jsonData = compat_urllib_request.urlopen(request).read()
2784 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2785 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2788 json_data = json.loads(jsonData)
2789 player_url = json_data['player_swf_url']
2790 formats = dict(json_data['audio_formats'])
2792 req_format = self._downloader.params.get('format', None)
2795 if self._downloader.params.get('listformats', None):
2796 self._print_formats(formats)
# 'best' (or no preference): take the first format with a reachable URL.
2799 if req_format is None or req_format == 'best':
2800 for format_param in formats.keys():
2801 url_list = self.get_urls(formats, format_param)
2803 file_url = self.check_urls(url_list)
2804 if file_url is not None:
2807 if req_format not in formats:
2808 raise ExtractorError(u'Format is not available')
2810 url_list = self.get_urls(formats, req_format)
2811 file_url = self.check_urls(url_list)
2812 format_param = req_format
# NOTE(review): .decode() calls imply byte strings (Python 2 era code).
2815 'id': file_id.decode('utf-8'),
2816 'url': file_url.decode('utf-8'),
2817 'uploader': uploader.decode('utf-8'),
2818 'upload_date': None,
2819 'title': json_data['name'],
2820 'ext': file_url.split('.')[-1].decode('utf-8'),
2821 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2822 'thumbnail': json_data['thumbnail_url'],
2823 'description': json_data['description'],
2824 'player_url': player_url.decode('utf-8'),
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
2827 class StanfordOpenClassroomIE(InfoExtractor):
2828 """Information extractor for Stanford's Open ClassRoom"""
# Three URL shapes: a specific video (course+video), a course page (course
# only), and the site root (neither) — handled by the three branches below.
2830 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2831 IE_NAME = u'stanfordoc'
2833 def _real_extract(self, url):
2834 mobj = re.match(self._VALID_URL, url)
2836 raise ExtractorError(u'Invalid URL: %s' % url)
2838 if mobj.group('course') and mobj.group('video'): # A specific video
2839 course = mobj.group('course')
2840 video = mobj.group('video')
2842 'id': course + '_' + video,
2844 'upload_date': None,
2847 self.report_extraction(info['id'])
# Per-video XML sits next to the media files under the course folder.
2848 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2849 xmlUrl = baseUrl + video + '.xml'
2851 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2852 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2853 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2854 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2856 info['title'] = mdoc.findall('./title')[0].text
2857 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2859 raise ExtractorError(u'Invalid metadata XML file')
2860 info['ext'] = info['url'].rpartition('.')[2]
# Course page: collect VideoPage links and recurse via self.extract().
2862 elif mobj.group('course'): # A course page
2863 course = mobj.group('course')
2868 'upload_date': None,
2871 coursepage = self._download_webpage(url, info['id'],
2872 note='Downloading course info page',
2873 errnote='Unable to download course info page')
2875 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2877 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
2878 coursepage, u'description', fatal=False)
2880 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2883 'type': 'reference',
2884 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2888 for entry in info['list']:
2889 assert entry['type'] == 'reference'
2890 results += self.extract(entry['url'])
# Root page: collect CoursePage links and recurse the same way.
2894 'id': 'Stanford OpenClassroom',
2897 'upload_date': None,
2900 self.report_download_webpage(info['id'])
2901 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2903 rootpage = compat_urllib_request.urlopen(rootURL).read()
2904 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2905 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2907 info['title'] = info['id']
2909 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2912 'type': 'reference',
2913 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2918 for entry in info['list']:
2919 assert entry['type'] == 'reference'
2920 results += self.extract(entry['url'])
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
2923 class MTVIE(InfoExtractor):
2924 """Information extractor for MTV.com"""
2926 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2929 def _real_extract(self, url):
2930 mobj = re.match(self._VALID_URL, url)
2932 raise ExtractorError(u'Invalid URL: %s' % url)
# The scheme is optional in _VALID_URL; default to http for the fetch.
2933 if not mobj.group('proto'):
2934 url = 'http://' + url
2935 video_id = mobj.group('videoid')
2937 webpage = self._download_webpage(url, video_id)
# mtv_vt / mtv_an / mtvn_uri meta tags carry song name, title and media URI.
2939 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2940 webpage, u'song name', fatal=False)
2942 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2945 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2946 webpage, u'mtvn_uri', fatal=False)
2948 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2949 webpage, u'content id', fatal=False)
# mediaGen returns an XML document listing renditions for this video.
2951 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2952 self.report_extraction(video_id)
2953 request = compat_urllib_request.Request(videogen_url)
2955 metadataXml = compat_urllib_request.urlopen(request).read()
2956 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2957 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2959 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2960 renditions = mdoc.findall('.//rendition')
2962 # For now, always pick the highest quality.
2963 rendition = renditions[-1]
# Format label is built as '<ext>-<width>x<height>_<bitrate>'.
2966 _,_,ext = rendition.attrib['type'].partition('/')
2967 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2968 video_url = rendition.find('./src').text
2970 raise ExtractorError('Invalid rendition field.')
2975 'uploader': performer,
2976 'upload_date': None,
2977 'title': video_title,
2985 class YoukuIE(InfoExtractor):
# Information extractor for v.youku.com. Youku obfuscates its file ids; the
# helpers below reproduce the player's PRNG-based de-obfuscation.
# NOTE(review): chunk is elided -- several original lines (method headers
# such as the one for the session-id generator, "try:" lines, dict braces)
# are missing between the numbered lines.
2986 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp + two random components (def line elided).
2989 nowTime = int(time.time() * 1000)
2990 random1 = random.randint(1000,1998)
2991 random2 = random.randint(1000,9999)
2993 return "%d%d%d" %(nowTime,random1,random2)
2995 def _get_file_ID_mix_string(self, seed):
# Deterministic shuffle of the alphabet, driven by the server-supplied seed
# via a linear-congruential step (seed*211+30031 mod 65536).
2997 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2999 for i in range(len(source)):
3000 seed = (seed * 211 + 30031 ) % 65536
3001 index = math.floor(seed / 65536 * len(source) )
3002 mixed.append(source[int(index)])
3003 source.remove(source[int(index)])
3004 #return ''.join(mixed)
3007 def _get_file_id(self, fileId, seed):
# Map each '*'-separated index in fileId through the shuffled alphabet.
3008 mixed = self._get_file_ID_mix_string(seed)
3009 ids = fileId.split('*')
3013 realId.append(mixed[int(ch)])
3014 return ''.join(realId)
3016 def _real_extract(self, url):
3017 mobj = re.match(self._VALID_URL, url)
3019 raise ExtractorError(u'Invalid URL: %s' % url)
3020 video_id = mobj.group('ID')
3022 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3024 jsondata = self._download_webpage(info_url, video_id)
3026 self.report_extraction(video_id)
3028 config = json.loads(jsondata)
3030 video_title = config['data'][0]['title']
3031 seed = config['data'][0]['seed']
# Requested format falls back to 'best'; choices limited to what the
# playlist JSON advertises in 'streamfileids'.
3033 format = self._downloader.params.get('format', None)
3034 supported_format = list(config['data'][0]['streamfileids'].keys())
3036 if format is None or format == 'best':
3037 if 'hd2' in supported_format:
3042 elif format == 'worst':
3050 fileid = config['data'][0]['streamfileids'][format]
3051 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3052 except (UnicodeDecodeError, ValueError, KeyError):
3053 raise ExtractorError(u'Unable to extract info section')
3056 sid = self._gen_sid()
3057 fileid = self._get_file_id(fileid, seed)
3059 #column 8,9 of fileid represent the segment number
3060 #fileid[7:9] should be changed
# One download URL per segment; segment number is hex-encoded into the
# file id and the path.
3061 for index, key in enumerate(keys):
3063 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3064 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3067 'id': '%s_part%02d' % (video_id, index),
3068 'url': download_url,
3070 'upload_date': None,
3071 'title': video_title,
3074 files_info.append(info)
3079 class XNXXIE(InfoExtractor):
3080 """Information extractor for xnxx.com"""
# NOTE(review): chunk is elided -- the guard after re.match and the info
# dict opening are on missing original lines.
3082 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: flv URL and thumbnail come from query-string
# style variables embedded in the page; title from the <title> tag.
3084 VIDEO_URL_RE = r'flv_url=(.*?)&'
3085 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3086 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3088 def _real_extract(self, url):
3089 mobj = re.match(self._VALID_URL, url)
3091 raise ExtractorError(u'Invalid URL: %s' % url)
3092 video_id = mobj.group(1)
3094 # Get webpage content
3095 webpage = self._download_webpage(url, video_id)
3097 video_url = self._search_regex(self.VIDEO_URL_RE,
3098 webpage, u'video URL')
# The flv_url value is percent-encoded in the page.
3099 video_url = compat_urllib_parse.unquote(video_url)
3101 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
3104 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3105 webpage, u'thumbnail', fatal=False)
3111 'upload_date': None,
3112 'title': video_title,
3114 'thumbnail': video_thumbnail,
3115 'description': None,
3119 class GooglePlusIE(InfoExtractor):
3120 """Information extractor for plus.google.com."""
# NOTE(review): chunk is elided -- guard lines, "try:" headers and the
# info dict opening are on missing original lines.
3122 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3123 IE_NAME = u'plus.google'
3125 def _real_extract(self, url):
3126 # Extract id from URL
3127 mobj = re.match(self._VALID_URL, url)
3129 raise ExtractorError(u'Invalid URL: %s' % url)
3131 post_url = mobj.group(0)
3132 video_id = mobj.group(1)
3134 video_extension = 'flv'
3136 # Step 1, Retrieve post webpage to extract further information
3137 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3139 self.report_extraction(video_id)
3141 # Extract update date
3142 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
3143 webpage, u'upload date', fatal=False)
3145 # Convert timestring to a format suitable for filename
# Re-format the scraped date as YYYYMMDD (the upload_date convention).
3146 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3147 upload_date = upload_date.strftime('%Y%m%d')
3150 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
3151 webpage, u'uploader', fatal=False)
3154 # Get the first line for title
3155 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3156 webpage, 'title', default=u'NA')
3158 # Step 2, Stimulate clicking the image box to launch video
3159 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3160 webpage, u'video page URL')
3161 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3163 # Extract video links on video page
3164 """Extract video links of all sizes"""
3165 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3166 mobj = re.findall(pattern, webpage)
3168 raise ExtractorError(u'Unable to extract video links')
3170 # Sort in resolution
3171 links = sorted(mobj)
3173 # Choose the lowest of the sort, i.e. highest resolution
3174 video_url = links[-1]
3175 # Only get the url. The resolution part in the tuple has no use anymore
3176 video_url = video_url[-1]
3177 # Treat escaped \u0026 style hex
# Python 2 strings take unicode_escape directly; Python 3 needs a bytes
# round-trip (str has no .decode there, hence the AttributeError branch).
3179 video_url = video_url.decode("unicode_escape")
3180 except AttributeError: # Python 3
3181 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3187 'uploader': uploader,
3188 'upload_date': upload_date,
3189 'title': video_title,
3190 'ext': video_extension,
3193 class NBAIE(InfoExtractor):
# Information extractor for nba.com video pages; the media URL is built
# directly from the page path against Turner's CDN.
# NOTE(review): chunk is elided -- the invalid-URL guard and parts of the
# returned info dict are on missing original lines.
3194 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
3197 def _real_extract(self, url):
3198 mobj = re.match(self._VALID_URL, url)
3200 raise ExtractorError(u'Invalid URL: %s' % url)
3202 video_id = mobj.group(1)
3204 webpage = self._download_webpage(url, video_id)
# CDN URL is derived from the page path -- no scraping needed for the media.
3206 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3208 shortened_video_id = video_id.rpartition('/')[2]
3209 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
3210 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3212 # It isn't there in the HTML it returns to us
3213 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3215 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
3218 'id': shortened_video_id,
3222 # 'uploader_date': uploader_date,
3223 'description': description,
3227 class JustinTVIE(InfoExtractor):
3228 """Information extractor for justin.tv and twitch.tv"""
3229 # TODO: One broadcast may be split into multiple videos. The key
3230 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3231 # starts at 1 and increases. Can we treat all parts as one video?
# NOTE(review): chunk is elided -- pieces of the _VALID_URL alternation,
# guard lines, loop headers and dict braces are on missing original lines.
3233 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3235 (?P<channelid>[^/]+)|
3236 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3237 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3241 _JUSTIN_PAGE_LIMIT = 100
3242 IE_NAME = u'justin.tv'
3244 def report_download_page(self, channel, offset):
3245 """Report attempt to download a single page of videos."""
3246 self.to_screen(u'%s: Downloading video information from %d to %d' %
3247 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3249 # Return count of items, list of *valid* items
3250 def _parse_page(self, url, video_id):
3251 webpage = self._download_webpage(url, video_id,
3252 u'Downloading video info JSON',
3253 u'unable to download video info JSON')
3255 response = json.loads(webpage)
# The API returns a list on success and an error object otherwise.
3256 if type(response) != list:
3257 error_text = response.get('error', 'unknown error')
3258 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3260 for clip in response:
3261 video_url = clip['video_file_url']
3263 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-like; strip the dashes from its date prefix -> YYYYMMDD.
3264 video_date = re.sub('-', '', clip['start_time'][:10])
3265 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3266 video_id = clip['id']
3267 video_title = clip.get('title', video_id)
3271 'title': video_title,
3272 'uploader': clip.get('channel_name', video_uploader_id),
3273 'uploader_id': video_uploader_id,
3274 'upload_date': video_date,
3275 'ext': video_extension,
3277 return (len(response), info)
3279 def _real_extract(self, url):
3280 mobj = re.match(self._VALID_URL, url)
3282 raise ExtractorError(u'invalid URL: %s' % url)
3284 api_base = 'http://api.justin.tv'
# Three URL flavors: whole channel archive, chapter (/c/), or single
# broadcast (/b/).
3286 if mobj.group('channelid'):
3288 video_id = mobj.group('channelid')
3289 api = api_base + '/channel/archives/%s.json' % video_id
3290 elif mobj.group('chapterid'):
3291 chapter_id = mobj.group('chapterid')
3293 webpage = self._download_webpage(url, chapter_id)
3294 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3296 raise ExtractorError(u'Cannot find archive of a chapter')
3297 archive_id = m.group(1)
3299 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3300 chapter_info_xml = self._download_webpage(api, chapter_id,
3301 note=u'Downloading chapter information',
3302 errnote=u'Chapter information download failed')
3303 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
# Locate the <archive> element matching the chapter's archive id.
3304 for a in doc.findall('.//archive'):
3305 if archive_id == a.find('./id').text:
3308 raise ExtractorError(u'Could not find chapter in chapter information')
3310 video_url = a.find('./video_file_url').text
3311 video_ext = video_url.rpartition('.')[2] or u'flv'
3313 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3314 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3315 note='Downloading chapter metadata',
3316 errnote='Download of chapter metadata failed')
3317 chapter_info = json.loads(chapter_info_json)
3319 bracket_start = int(doc.find('.//bracket_start').text)
3320 bracket_end = int(doc.find('.//bracket_end').text)
3322 # TODO determine start (and probably fix up file)
3323 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3324 #video_url += u'?start=' + TODO:start_timestamp
3325 # bracket_start is 13290, but we want 51670615
3326 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3327 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3330 'id': u'c' + chapter_id,
3333 'title': chapter_info['title'],
3334 'thumbnail': chapter_info['preview'],
3335 'description': chapter_info['description'],
3336 'uploader': chapter_info['channel']['display_name'],
3337 'uploader_id': chapter_info['channel']['name'],
3341 video_id = mobj.group('videoid')
3342 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3344 self.report_extraction(video_id)
# Page through the API _JUSTIN_PAGE_LIMIT items at a time; a short page
# (count != limit) means we reached the end.
3348 limit = self._JUSTIN_PAGE_LIMIT
3351 self.report_download_page(video_id, offset)
3352 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3353 page_count, page_info = self._parse_page(page_url, video_id)
3354 info.extend(page_info)
3355 if not paged or page_count != limit:
3360 class FunnyOrDieIE(InfoExtractor):
# Information extractor for funnyordie.com video pages.
# NOTE(review): chunk is elided -- the invalid-URL guard and the return
# statement/info dict are on missing original lines.
3361 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3363 def _real_extract(self, url):
3364 mobj = re.match(self._VALID_URL, url)
3366 raise ExtractorError(u'invalid URL: %s' % url)
3368 video_id = mobj.group('id')
3369 webpage = self._download_webpage(url, video_id)
# Media URL sits in the second <source> of the <video> element.
3371 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3372 webpage, u'video URL', flags=re.DOTALL)
# Title: prefer the player heading, fall back to <title>.
3374 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3375 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3377 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3378 webpage, u'description', fatal=False, flags=re.DOTALL)
3385 'description': video_description,
3389 class SteamIE(InfoExtractor):
# Information extractor for store.steampowered.com video/app pages; yields
# a playlist of all movies found on a game's page.
# NOTE(review): chunk is elided -- parts of the verbose _VALID_URL, the
# videos list initialization and the per-video dict braces are on missing
# original lines.
3390 _VALID_URL = r"""http://store\.steampowered\.com/
3392 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3394 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3396 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
# Age-gate bypass: submit a fixed 1970 birth date via the agecheck URL.
3397 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
3400 def suitable(cls, url):
3401 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is written with (?x)-style whitespace and
# needs re.VERBOSE at match time.
3402 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3404 def _real_extract(self, url):
3405 m = re.match(self._VALID_URL, url, re.VERBOSE)
3406 gameID = m.group('gameID')
3408 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
3409 webpage = self._download_webpage(videourl, gameID)
3411 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
3412 videourl = self._AGECHECK_TEMPLATE % gameID
3413 self.report_age_confirmation()
3414 webpage = self._download_webpage(videourl, gameID)
3416 self.report_extraction(gameID)
3417 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
3418 webpage, 'game title')
# Three parallel scans over the page: movie JS config, titles, thumbnails;
# zipped together below assuming they appear in the same order.
3420 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3421 mweb = re.finditer(urlRE, webpage)
3422 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3423 titles = re.finditer(namesRE, webpage)
3424 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3425 thumbs = re.finditer(thumbsRE, webpage)
3427 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3428 video_id = vid.group('videoID')
3429 title = vtitle.group('videoName')
3430 video_url = vid.group('videoURL')
3431 video_thumb = thumb.group('thumbnail')
3433 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3438 'title': unescapeHTML(title),
3439 'thumbnail': video_thumb
3442 return [self.playlist_result(videos, gameID, game_title)]
3444 class UstreamIE(InfoExtractor):
# Information extractor for ustream.tv recorded videos; the media URL is
# built directly from the recording id against the CDN.
# NOTE(review): chunk is elided -- the info dict opening and return are on
# missing original lines.
3445 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3446 IE_NAME = u'ustream'
3448 def _real_extract(self, url):
3449 m = re.match(self._VALID_URL, url)
3450 video_id = m.group('videoID')
3452 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3453 webpage = self._download_webpage(url, video_id)
3455 self.report_extraction(video_id)
3457 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
3460 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3461 webpage, u'uploader', fatal=False, flags=re.DOTALL)
3463 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3464 webpage, u'thumbnail', fatal=False)
3470 'title': video_title,
3471 'uploader': uploader,
3472 'thumbnail': thumbnail,
3476 class WorldStarHipHopIE(InfoExtractor):
# Information extractor for worldstarhiphop.com / worldstarcandy.com.
# NOTE(review): chunk is elided -- the ext selection branches and the info
# dict opening/return are on missing original lines.
3477 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3478 IE_NAME = u'WorldStarHipHop'
3480 def _real_extract(self, url):
3481 m = re.match(self._VALID_URL, url)
3482 video_id = m.group('id')
3484 webpage_src = self._download_webpage(url, video_id)
# Media URL is passed to the flash player via so.addVariable("file", ...).
3486 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3487 webpage_src, u'video URL')
3489 if 'mp4' in video_url:
3494 video_title = self._html_search_regex(r"<title>(.*)</title>",
3495 webpage_src, u'title')
3497 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3498 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
3499 webpage_src, u'thumbnail', fatal=False)
3502 _title = r"""candytitles.*>(.*)</span>"""
3503 mobj = re.search(_title, webpage_src)
3504 if mobj is not None:
3505 video_title = mobj.group(1)
3510 'title' : video_title,
3511 'thumbnail' : thumbnail,
3516 class RBMARadioIE(InfoExtractor):
# Information extractor for rbmaradio.com shows; metadata comes from a JSON
# blob assigned to window.gon in the page.
# NOTE(review): chunk is elided -- the "try:" around json.loads and the
# info dict opening are on missing original lines.
3517 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3519 def _real_extract(self, url):
3520 m = re.match(self._VALID_URL, url)
3521 video_id = m.group('videoID')
3523 webpage = self._download_webpage(url, video_id)
3525 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
3526 webpage, u'json data', flags=re.MULTILINE)
3529 data = json.loads(json_data)
3530 except ValueError as e:
3531 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force the 256kbps variant; extension is taken from the URL path.
3533 video_url = data['akamai_url'] + '&cbr=256'
3534 url_parts = compat_urllib_parse_urlparse(video_url)
3535 video_ext = url_parts.path.rpartition('.')[2]
3540 'title': data['title'],
3541 'description': data.get('teaser_text'),
3542 'location': data.get('country_of_origin'),
3543 'uploader': data.get('host', {}).get('name'),
3544 'uploader_id': data.get('host', {}).get('slug'),
3545 'thumbnail': data.get('image', {}).get('large_url_2x'),
3546 'duration': data.get('duration'),
3551 class YouPornIE(InfoExtractor):
3552 """Information extractor for youporn.com."""
# NOTE(review): chunk is elided -- "try:" headers, loop headers, dict
# braces and several return statements are on missing original lines.
3553 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3555 def _print_formats(self, formats):
3556 """Print all available formats"""
3557 print(u'Available formats:')
3558 print(u'ext\t\tformat')
3559 print(u'---------------------------------')
3560 for format in formats:
3561 print(u'%s\t\t%s' % (format['ext'], format['format']))
3563 def _specific(self, req_format, formats):
# Linear scan for the entry whose 'format' equals the requested one
# (loop header is on an elided line).
3565 if(x["format"]==req_format):
3569 def _real_extract(self, url):
3570 mobj = re.match(self._VALID_URL, url)
3572 raise ExtractorError(u'Invalid URL: %s' % url)
3573 video_id = mobj.group('videoid')
# Age gate is bypassed with a cookie before fetching the page.
3575 req = compat_urllib_request.Request(url)
3576 req.add_header('Cookie', 'age_verified=1')
3577 webpage = self._download_webpage(req, video_id)
3579 # Get JSON parameters
3580 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
3582 params = json.loads(json_params)
3584 raise ExtractorError(u'Invalid JSON')
3586 self.report_extraction(video_id)
3588 video_title = params['title']
3589 upload_date = unified_strdate(params['release_date_f'])
3590 video_description = params['description']
3591 video_uploader = params['submitted_by']
3592 thumbnail = params['thumbnails'][0]['image']
3594 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
3596 # Get all of the formats available
3597 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3598 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3599 webpage, u'download list').strip()
3601 # Get all of the links from the page
3602 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3603 links = re.findall(LINK_RE, download_list_html)
3604 if(len(links) == 0):
3605 raise ExtractorError(u'ERROR: no known formats available for video')
3607 self.to_screen(u'Links found: %d' % len(links))
3612 # A link looks like this:
3613 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3614 # A path looks like this:
3615 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3616 video_url = unescapeHTML( link )
3617 path = compat_urllib_parse_urlparse( video_url ).path
3618 extension = os.path.splitext( path )[1][1:]
# Fifth path component encodes resolution and bitrate, e.g. "480p_370k".
3619 format = path.split('/')[4].split('_')[:2]
3622 format = "-".join( format )
3623 # title = u'%s-%s-%s' % (video_title, size, bitrate)
3628 'uploader': video_uploader,
3629 'upload_date': upload_date,
3630 'title': video_title,
3633 'thumbnail': thumbnail,
3634 'description': video_description
3637 if self._downloader.params.get('listformats', None):
3638 self._print_formats(formats)
3641 req_format = self._downloader.params.get('format', None)
3642 self.to_screen(u'Format: %s' % req_format)
# Format selection: best/worst/all/specific (returns elided in this view).
3644 if req_format is None or req_format == 'best':
3646 elif req_format == 'worst':
3647 return [formats[-1]]
3648 elif req_format in ('-1', 'all'):
3651 format = self._specific( req_format, formats )
3653 raise ExtractorError(u'Requested format not available')
3658 class PornotubeIE(InfoExtractor):
3659 """Information extractor for pornotube.com."""
# NOTE(review): chunk is elided -- the invalid-URL guard and parts of the
# info dict are on missing original lines.
3660 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3662 def _real_extract(self, url):
3663 mobj = re.match(self._VALID_URL, url)
3665 raise ExtractorError(u'Invalid URL: %s' % url)
3667 video_id = mobj.group('videoid')
# Title is taken from the URL path rather than scraped from the page.
3668 video_title = mobj.group('title')
3670 # Get webpage content
3671 webpage = self._download_webpage(url, video_id)
3674 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3675 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3676 video_url = compat_urllib_parse.unquote(video_url)
3678 #Get the uploaded date
3679 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3680 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
3681 if upload_date: upload_date = unified_strdate(upload_date)
3683 info = {'id': video_id,
3686 'upload_date': upload_date,
3687 'title': video_title,
3693 class YouJizzIE(InfoExtractor):
3694 """Information extractor for youjizz.com."""
# NOTE(review): chunk is elided -- the guards after re.match/re.search and
# parts of the info dict are on missing original lines.
3695 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3697 def _real_extract(self, url):
3698 mobj = re.match(self._VALID_URL, url)
3700 raise ExtractorError(u'Invalid URL: %s' % url)
3702 video_id = mobj.group('videoid')
3704 # Get webpage content
3705 webpage = self._download_webpage(url, video_id)
3707 # Get the video title
3708 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
3709 webpage, u'title').strip()
3711 # Get the embed page
# The real media URL lives on a separate embed page linked from the
# watch page; video_id is rebound to the embed page's numeric id.
3712 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3714 raise ExtractorError(u'ERROR: unable to extract embed page')
3716 embed_page_url = result.group(0).strip()
3717 video_id = result.group('videoid')
3719 webpage = self._download_webpage(embed_page_url, video_id)
3722 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3723 webpage, u'video URL')
3725 info = {'id': video_id,
3727 'title': video_title,
3730 'player_url': embed_page_url}
3734 class EightTracksIE(InfoExtractor):
# Information extractor for 8tracks.com mixes; walks the play/next API one
# track at a time until at_last_track is reported.
# NOTE(review): chunk is elided -- the invalid-URL guard, mix_id binding,
# dict braces and loop-exit break are on missing original lines.
3736 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3738 def _real_extract(self, url):
3739 mobj = re.match(self._VALID_URL, url)
3741 raise ExtractorError(u'Invalid URL: %s' % url)
3742 playlist_id = mobj.group('id')
3744 webpage = self._download_webpage(url, playlist_id)
3746 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3747 data = json.loads(json_like)
# Random session token required by the play API.
3749 session = str(random.randint(0, 1000000000))
3751 track_count = data['tracks_count']
3752 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3753 next_url = first_url
3755 for i in itertools.count():
3756 api_json = self._download_webpage(next_url, playlist_id,
3757 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3758 errnote=u'Failed to download song information')
3759 api_data = json.loads(api_json)
3760 track_data = api_data[u'set']['track']
3762 'id': track_data['id'],
3763 'url': track_data['track_file_stream_url'],
3764 'title': track_data['performer'] + u' - ' + track_data['name'],
3765 'raw_title': track_data['name'],
3766 'uploader_id': data['user']['login'],
3770 if api_data['set']['at_last_track']:
3772 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3775 class KeekIE(InfoExtractor):
# Information extractor for keek.com; media and thumbnail URLs are built
# directly from the video id against the CDN.
# NOTE(review): chunk is elided -- the info dict opening/return are on
# missing original lines.
3776 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3779 def _real_extract(self, url):
3780 m = re.match(self._VALID_URL, url)
3781 video_id = m.group('videoID')
3783 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3784 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3785 webpage = self._download_webpage(url, video_id)
3787 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3790 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3791 webpage, u'uploader', fatal=False)
3797 'title': video_title,
3798 'thumbnail': thumbnail,
3799 'uploader': uploader
3803 class TEDIE(InfoExtractor):
# Information extractor for ted.com talks and playlists.
# NOTE(review): chunk is elided -- parts of the verbose _VALID_URL, the
# playlist/talk branch structure and the info dict braces are on missing
# original lines.
3804 _VALID_URL=r'''http://www\.ted\.com/
3806 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3808 ((?P<type_talk>talks)) # We have a simple talk
3810 (/lang/(.*?))? # The url may contain the language
3811 /(?P<name>\w+) # Here goes the name and then ".html"
3815 def suitable(cls, url):
3816 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL needs re.VERBOSE at match time.
3817 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3819 def _real_extract(self, url):
3820 m=re.match(self._VALID_URL, url, re.VERBOSE)
3821 if m.group('type_talk'):
3822 return [self._talk_info(url)]
3824 playlist_id=m.group('playlist_id')
3825 name=m.group('name')
3826 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3827 return [self._playlist_videos_info(url,name,playlist_id)]
3829 def _playlist_videos_info(self,url,name,playlist_id=0):
3830 '''Returns the videos of the playlist'''
# Two parallel regex scans (talk markup + talk titles/urls), zipped below.
3832 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3833 ([.\s]*?)data-playlist_item_id="(\d+)"
3834 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3836 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3837 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3838 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3839 m_names=re.finditer(video_name_RE,webpage)
3841 playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
3842 webpage, 'playlist title')
3844 playlist_entries = []
3845 for m_video, m_name in zip(m_videos,m_names):
3846 video_id=m_video.group('video_id')
3847 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
# Each talk is delegated back to this extractor via a url_result entry.
3848 playlist_entries.append(self.url_result(talk_url, 'TED'))
3849 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3851 def _talk_info(self, url, video_id=0):
3852 """Return the video for the talk in the url"""
3853 m = re.match(self._VALID_URL, url,re.VERBOSE)
3854 video_name = m.group('name')
3855 webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
3856 self.report_extraction(video_name)
3857 # If the url includes the language we get the title translated
3858 title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
3860 json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
3861 webpage, 'json data')
3862 info = json.loads(json_data)
3863 desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
3864 webpage, 'description', flags = re.DOTALL)
3866 thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
3867 webpage, 'thumbnail')
# Last htmlStreams entry is used as the media URL (highest listed stream).
3870 'url': info['htmlStreams'][-1]['file'],
3873 'thumbnail': thumbnail,
3874 'description': desc,
3878 class MySpassIE(InfoExtractor):
# Information extractor for myspass.de; all metadata comes from a
# server-side XML metadata endpoint keyed by the numeric video id.
# NOTE(review): chunk is elided -- the trailing-slash retry condition,
# else-branches and the info dict opening are on missing original lines.
3879 _VALID_URL = r'http://www.myspass.de/.*'
3881 def _real_extract(self, url):
3882 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3884 # video id is the last path element of the URL
3885 # usually there is a trailing slash, so also try the second but last
3886 url_path = compat_urllib_parse_urlparse(url).path
3887 url_parent_path, video_id = os.path.split(url_path)
3889 _, video_id = os.path.split(url_parent_path)
3892 metadata_url = META_DATA_URL_TEMPLATE % video_id
3893 metadata_text = self._download_webpage(metadata_url, video_id)
3894 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3896 # extract values from metadata
3897 url_flv_el = metadata.find('url_flv')
3898 if url_flv_el is None:
3899 raise ExtractorError(u'Unable to extract download url')
3900 video_url = url_flv_el.text
3901 extension = os.path.splitext(video_url)[1][1:]
3902 title_el = metadata.find('title')
3903 if title_el is None:
3904 raise ExtractorError(u'Unable to extract title')
3905 title = title_el.text
3906 format_id_el = metadata.find('format_id')
3907 if format_id_el is None:
3910 format = format_id_el.text
# description and thumbnail are optional in the metadata XML.
3911 description_el = metadata.find('description')
3912 if description_el is not None:
3913 description = description_el.text
3916 imagePreview_el = metadata.find('imagePreview')
3917 if imagePreview_el is not None:
3918 thumbnail = imagePreview_el.text
3927 'thumbnail': thumbnail,
3928 'description': description
3932 class SpiegelIE(InfoExtractor):
# Information extractor for spiegel.de videos; stream info comes from a
# per-video XML document on video2.spiegel.de.
# NOTE(review): chunk is elided -- the info dict opening/return are on
# missing original lines.
3933 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3935 def _real_extract(self, url):
3936 m = re.match(self._VALID_URL, url)
3937 video_id = m.group('videoID')
3939 webpage = self._download_webpage(url, video_id)
3941 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
3944 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3945 xml_code = self._download_webpage(xml_url, video_id,
3946 note=u'Downloading XML', errnote=u'Failed to download XML')
3948 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last child of the XML root is used as the selected type/quality.
3949 last_type = idoc[-1]
3950 filename = last_type.findall('./filename')[0].text
3951 duration = float(last_type.findall('./duration')[0].text)
3953 video_url = 'http://video2.spiegel.de/flash/' + filename
3954 video_ext = filename.rpartition('.')[2]
3959 'title': video_title,
3960 'duration': duration,
3964 class LiveLeakIE(InfoExtractor):
# Information extractor for liveleak.com view pages.
# NOTE(review): chunk is elided -- the invalid-URL guard and the info dict
# opening/return are on missing original lines.
3966 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3967 IE_NAME = u'liveleak'
3969 def _real_extract(self, url):
3970 mobj = re.match(self._VALID_URL, url)
3972 raise ExtractorError(u'Invalid URL: %s' % url)
3974 video_id = mobj.group('video_id')
3976 webpage = self._download_webpage(url, video_id)
# Media URL appears in the player config as file: "...".
3978 video_url = self._search_regex(r'file: "(.*?)",',
3979 webpage, u'video URL')
# Site prefix "LiveLeak.com -" is stripped from the og:title value.
3981 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3982 webpage, u'title').replace('LiveLeak.com -', '').strip()
3984 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3985 webpage, u'description', fatal=False)
3987 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
3988 webpage, u'uploader', fatal=False)
3994 'title': video_title,
3995 'description': video_description,
3996 'uploader': video_uploader
4001 class ARDIE(InfoExtractor):
# Information extractor for ARD Mediathek / daserste.de; streams are read
# from mediaCollection.addMediaStream(...) calls in the page.
# NOTE(review): chunk is elided -- branch headers around the documentId
# fallback, the empty-streams check and the final return are on missing
# original lines.
4002 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4003 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
4004 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4006 def _real_extract(self, url):
4007 # determine video id from url
4008 m = re.match(self._VALID_URL, url)
# Prefer an explicit documentId query parameter over the path segment.
4010 numid = re.search(r'documentId=([0-9]+)', url)
4012 video_id = numid.group(1)
4014 video_id = m.group('video_id')
4016 # determine title and media streams from webpage
4017 html = self._download_webpage(url, video_id)
4018 title = re.search(self._TITLE, html).group('title')
4019 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams + "fsk" marker in the page means an age-restricted broadcast.
4021 assert '"fsk"' in html
4022 raise ExtractorError(u'This video is only available after 8:00 pm')
4024 # choose default media type and highest quality for now
4025 stream = max([s for s in streams if int(s["media_type"]) == 0],
4026 key=lambda s: int(s["quality"]))
4028 # there's two possibilities: RTMP stream or HTTP download
4029 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4030 if stream['rtmp_url']:
4031 self.to_screen(u'RTMP download detected')
4032 assert stream['video_url'].startswith('mp4:')
4033 info["url"] = stream["rtmp_url"]
4034 info["play_path"] = stream['video_url']
4036 assert stream["video_url"].endswith('.mp4')
4037 info["url"] = stream["video_url"]
class ZDFIE(InfoExtractor):
    """Information Extractor for the ZDF Mediathek."""
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    # FIX: the dot before "mp4" was unescaped and therefore matched any character.
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*\.mp4)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        # FIX: initialize stream_ so the "no stream found" branch below
        # raises ExtractorError instead of NameError.
        stream_ = None
        for s in streams: #find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
                stream_ = s
                break
        for s in streams: #find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
                stream_ = s
                break
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # Prefer an mms:// URL; fall back to rtsp:// if none is present.
        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
            if mobj is None:
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        # FIX: corrected message typo ("extention").
        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            raise ExtractorError(u'Cannot extract extension')
        ext = mobj.group('ext')

        return [{'id': video_id,
                 'url': mms_url,
                 'title': title,
                 'ext': ext,
                 }]
class TumblrIE(InfoExtractor):
    """Information Extractor for video posts hosted on tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        blog_name = match.group('blog_name')
        video_id = match.group('id')

        # Normalize to the canonical post URL before fetching.
        url = 'http://%s.tumblr.com/post/%s/' % (blog_name, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is embedded inside escaped JS ('\x22' is '"').
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog_name, video_id)
        m_video = re.search(re_video, webpage)
        if m_video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = m_video.group('video_url')
        ext = m_video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext,
                 }]
class BandcampIE(InfoExtractor):
    """Information Extractor for freely downloadable Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # FIX: local was named `id`, shadowing the builtin; renamed to track_id.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title' : info[u'title'],
                      'ext' : 'mp3',
                      'url' : final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist'],
                      }
        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('id')
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   'mp4',
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')

        # The MRSS feed carries both the title and the direct mp4 link.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   'mp4',
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical page for this id.
        webpage = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        video_uploader_id = match.group('uploader_id')

        webpage = self._download_webpage(
            'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id, video_id)

        # The per-video "secret" is required by both XML endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        m_stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if m_stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = m_stream.group(1) + unescapeHTML(m_stream.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = match.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id (needed for the data feed) only appears in the markup.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')

        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        m_media = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if m_media is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(m_media.group('server')) == 0:
            # No server given: the 'file' field already holds a quoted URL.
            video_url = compat_urllib_parse.unquote(m_media.group('file'))
        else:
            video_url = m_media.group('server')+'/key='+m_media.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        m_date = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if m_date:
            video_upload_date = m_date.group('upload_date_Y')+m_date.group('upload_date_m')+m_date.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         video_extension,
            'title':       video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail':   video_thumbnail,
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = match.group(1)

        # A timestamped request is required; the returned Set-Cookie header
        # is replayed on the serve request further down.
        data = { 'ax': 1, 'ts': time.time() }
        complete_url = url + "?" + compat_urllib_parse.urlencode(data)
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':     track_id,
            'url':    final_url,
            'ext':    'mp3',
            'title':  title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # The play page immediately bounces through a JS redirect.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # The media URL has to be requested from a form endpoint.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response looks like "url=<media>&thumb=<image>" -> keep the values.
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       'flv',
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com videos, reviews and full episodes."""
    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # Full episodes embed the mgid in a different attribute than clips.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''
        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        if not m_urls:
            # FIX: this used to call the misspelled name `ExtractError`
            # (with a message typo "extrat"), which crashed with NameError
            # instead of reporting the real problem.
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url': video_url,
                'id': video_id,
                'title': video_title,
                # Videos are actually flv not mp4
                'ext': 'flv',
                'thumbnail': video_thumb,
                'description': video_description,
                }
class StatigramIE(InfoExtractor):
    """Information Extractor for Instagram videos viewed through statigr.am."""
    _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(
            r'<meta property="og:video:secure_url" content="(.+?)">',
            webpage, u'video URL')
        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)" />',
            webpage, u'thumbnail URL', fatal=False)
        html_title = self._html_search_regex(
            r'<title>(.+?)</title>',
            webpage, u'title')
        # Page titles look like "<caption> | Statigram"; drop the suffix.
        title = html_title.rpartition(u' | Statigram')[0]
        uploader = self._html_search_regex(
            r'@(.+) \(Videos\)', title, u'uploader name', fatal=False)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     title,
            'thumbnail': thumbnail_url,
            'uploader' : uploader
        }]
# Registry of every supported extractor; FileDownloader walks this list and
# hands the URL to the first IE whose suitable() matches, so ordering is
# significant (more specific extractors must come before generic ones).
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up and return the extractor class named ``ie_name + 'IE'``."""
    class_name = ie_name + 'IE'
    return globals()[class_name]