2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information about the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title, author and
32 others. The information is stored in a dictionary which is then
33 passed to the FileDownloader. The FileDownloader processes this
34 information possibly downloading the video to the file system, among
35 other possible outcomes.
37 The dictionaries must include the following fields:
41 title: Video title, unescaped.
42 ext: Video filename extension.
44 The following fields are optional:
46 format: The video format, defaults to ext (used for --get-format)
47 thumbnail: Full URL to a video thumbnail image.
48 description: One-line video description.
49 uploader: Full name of the video uploader.
50 upload_date: Video upload date (YYYYMMDD).
51 uploader_id: Nickname or id of the video uploader.
52 location: Physical location of the video.
53 player_url: SWF Player URL (used for rtmpdump).
54 subtitles: The subtitle file contents.
55 urlhandle: [internal] The urlHandle to be used to download the file,
56 like returned by urllib.request.urlopen
58 The fields should all be Unicode strings.
60 Subclasses of this one should re-define the _real_initialize() and
61 _real_extract() methods and define a _VALID_URL regexp.
62 Probably, they should also be added to the list of extractors.
64 _real_extract() must return a *list* of information dictionaries as
67 Finally, the _WORKING attribute should be set to False for broken IEs
68 in order to warn the users and skip the tests.
75 def __init__(self, downloader=None):
76 """Constructor. Receives an optional downloader."""
78 self.set_downloader(downloader)
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    match = re.match(cls._VALID_URL, url)
    return match is not None
87 """Getter method for _WORKING."""
91 """Initializes an instance (authentication, etc)."""
93 self._real_initialize()
96 def extract(self, url):
97 """Extracts URL information and returns it in list of dicts."""
99 return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE.

    downloader: the FileDownloader instance (or None) that will receive
    the extracted info dicts; it is stored as-is on self._downloader.
    """
    self._downloader = downloader
105 def _real_initialize(self):
106 """Real initialization process. Redefine in subclasses."""
109 def _real_extract(self, url):
110 """Real extraction process. Redefine in subclasses."""
115 return type(self).__name__[:-2]
117 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
118 """ Returns the response handle """
120 self.report_download_webpage(video_id)
121 elif note is not False:
122 self.to_screen(u'%s: %s' % (video_id, note))
124 return compat_urllib_request.urlopen(url_or_request)
125 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
127 errnote = u'Unable to download webpage'
128 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
130 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
131 """ Returns a tuple (page content as string, URL handle) """
132 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
133 content_type = urlh.headers.get('Content-Type', '')
134 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
136 encoding = m.group(1)
139 webpage_bytes = urlh.read()
140 if self._downloader.params.get('dump_intermediate_pages', False):
142 url = url_or_request.get_full_url()
143 except AttributeError:
145 self.to_screen(u'Dumping request to ' + url)
146 dump = base64.b64encode(webpage_bytes).decode('ascii')
147 self._downloader.to_screen(dump)
148 content = webpage_bytes.decode(encoding, 'replace')
149 return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    line = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(line)
def report_extraction(self, id_or_name):
    """Report information extraction."""
    message = u'%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Report webpage download."""
    message = u'%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Report attempt to confirm age.

    Logged just before an age-verification form is submitted.
    """
    self.to_screen(u'Confirming age')
171 #Methods for following #608
172 #They set the correct value of the '_type' key
173 def video_result(self, video_info):
174 """Returns a video"""
175 video_info['_type'] = 'video'
177 def url_result(self, url, ie=None):
178 """Returns a url that points to a page that should be processed"""
179 #TODO: ie should be the class used for getting the info
180 video_info = {'_type': 'url',
184 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
185 """Returns a playlist"""
186 video_info = {'_type': 'playlist',
189 video_info['id'] = playlist_id
191 video_info['title'] = playlist_title
194 def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
196 Perform a regex search on the given string, using a single or a list of
197 patterns returning the first matching group.
198 In case of failure return a default value or raise a WARNING or a
199 ExtractorError, depending on fatal, specifying the field name.
201 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
202 mobj = re.search(pattern, string, flags)
205 mobj = re.search(p, string, flags)
208 if sys.stderr.isatty() and os.name != 'nt':
209 _name = u'\033[0;34m%s\033[0m' % name
214 # return the first matching group
215 return next(g for g in mobj.groups() if g is not None)
216 elif default is not None:
219 raise ExtractorError(u'Unable to extract %s' % _name)
221 self._downloader.report_warning(u'unable to extract %s; '
222 u'please report this issue on GitHub.' % _name)
225 def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
227 Like _search_regex, but strips HTML tags and unescapes entities.
229 res = self._search_regex(pattern, string, name, default, fatal, flags)
231 return clean_html(res).strip()
235 class SearchInfoExtractor(InfoExtractor):
237 Base class for paged search queries extractors.
238 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
239 Instances should define _SEARCH_KEY and _MAX_RESULTS.
243 def _make_valid_url(cls):
244 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
def suitable(cls, url):
    """Return True if url matches this extractor's search-query syntax."""
    pattern = cls._make_valid_url()
    return re.match(pattern, url) is not None
250 def _real_extract(self, query):
251 mobj = re.match(self._make_valid_url(), query)
253 raise ExtractorError(u'Invalid search query "%s"' % query)
255 prefix = mobj.group('prefix')
256 query = mobj.group('query')
258 return self._get_n_results(query, 1)
259 elif prefix == 'all':
260 return self._get_n_results(query, self._MAX_RESULTS)
264 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
265 elif n > self._MAX_RESULTS:
266 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
267 n = self._MAX_RESULTS
268 return self._get_n_results(query, n)
270 def _get_n_results(self, query, n):
271 """Get a specified number of results for a query"""
272 raise NotImplementedError("This method must be implemented by sublclasses")
275 class YoutubeIE(InfoExtractor):
276 """Information extractor for youtube.com."""
280 (?:https?://)? # http(s):// (optional)
281 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
282 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
283 (?:.*?\#/)? # handle anchor (#/) redirect urls
284 (?: # the various things that can precede the ID:
285 (?:(?:v|embed|e)/) # v/ or embed/ or e/
286 |(?: # or the v= param in all its forms
287 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
288 (?:\?|\#!?) # the params delimiter ? or # or #!
289 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
292 )? # optional -> youtube.com/xxxx is OK
293 )? # all until now is optional -> you can pass the naked ID
294 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
295 (?(1).+)? # if we found the ID, everything can follow
297 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
298 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
299 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
300 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
301 _NETRC_MACHINE = 'youtube'
302 # Listed in order of quality
303 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
304 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
305 _video_extensions = {
311 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
317 _video_dimensions = {
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Playlist URLs must be claimed by YoutubePlaylistIE, not this IE.
    if YoutubePlaylistIE.suitable(url):
        return False
    return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def report_lang(self):
    """Report attempt to set language.

    Logged before requesting _LANG_URL (which forces hl=en pages).
    """
    self.to_screen(u'Setting language')
def report_login(self):
    """Report attempt to log in.

    Logged before the login form is posted to _LOGIN_URL.
    """
    self.to_screen(u'Logging in')
def report_video_webpage_download(self, video_id):
    """Report attempt to download video webpage."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_video_subtitles_download(self, video_id):
    """Report attempt to check which subtitles are available."""
    # NOTE: previous docstring wrongly said "download video info webpage";
    # this method logs the subtitle-availability check.
    self.to_screen(u'%s: Checking available subtitles' % video_id)
def report_video_subtitles_request(self, video_id, sub_lang, format):
    """Report attempt to download subtitles for a given language/format."""
    # NOTE: previous docstring wrongly said "download video info webpage".
    self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
def report_video_subtitles_available(self, video_id, sub_lang_list):
    """Report available subtitles.

    sub_lang_list: dict mapping subtitle language code -> language name.
    """
    # str.join accepts any iterable; wrapping .keys() in list() was redundant.
    sub_lang = ",".join(sub_lang_list.keys())
    self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available for this video."""
    # NOTE: previous docstring wrongly said "Report extracted video URL".
    self.to_screen(u'%s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol.

    This only logs the detection; the actual transfer is performed by the
    downloader.
    """
    self.to_screen(u'RTMP download detected')
382 def _get_available_subtitles(self, video_id):
383 self.report_video_subtitles_download(video_id)
384 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
386 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
387 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
388 return (u'unable to download video subtitles: %s' % compat_str(err), None)
389 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
390 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
391 if not sub_lang_list:
392 return (u'video doesn\'t have subtitles', None)
def _list_available_subtitles(self, video_id):
    """Fetch and print the subtitle languages available for video_id."""
    sub_lang_list = self._get_available_subtitles(video_id)
    # NOTE(review): _get_available_subtitles returns an error *tuple*
    # (message, None) on failure instead of a dict; that case does not
    # appear to be handled here -- confirm before relying on this path.
    self.report_video_subtitles_available(video_id, sub_lang_list)
399 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
402 (error_message, sub_lang, sub)
404 self.report_video_subtitles_request(video_id, sub_lang, format)
405 params = compat_urllib_parse.urlencode({
411 url = 'http://www.youtube.com/api/timedtext?' + params
413 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
417 return (u'Did not fetch video subtitles', None, None)
418 return (None, sub_lang, sub)
420 def _request_automatic_caption(self, video_id, webpage):
421 """We need the webpage for getting the captions url, pass it as an
422 argument to speed up the process."""
423 sub_lang = self._downloader.params.get('subtitleslang')
424 sub_format = self._downloader.params.get('subtitlesformat')
425 self.to_screen(u'%s: Looking for automatic captions' % video_id)
426 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
427 err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
429 return [(err_msg, None, None)]
430 player_config = json.loads(mobj.group(1))
432 args = player_config[u'args']
433 caption_url = args[u'ttsurl']
434 timestamp = args[u'timestamp']
435 params = compat_urllib_parse.urlencode({
442 subtitles_url = caption_url + '&' + params
443 sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
444 return [(None, sub_lang, sub)]
446 return [(err_msg, None, None)]
448 def _extract_subtitle(self, video_id):
450 Return a list with a tuple:
451 [(error_message, sub_lang, sub)]
453 sub_lang_list = self._get_available_subtitles(video_id)
454 sub_format = self._downloader.params.get('subtitlesformat')
455 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
456 return [(sub_lang_list[0], None, None)]
457 if self._downloader.params.get('subtitleslang', False):
458 sub_lang = self._downloader.params.get('subtitleslang')
459 elif 'en' in sub_lang_list:
462 sub_lang = list(sub_lang_list.keys())[0]
463 if not sub_lang in sub_lang_list:
464 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
466 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
469 def _extract_all_subtitles(self, video_id):
470 sub_lang_list = self._get_available_subtitles(video_id)
471 sub_format = self._downloader.params.get('subtitlesformat')
472 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
473 return [(sub_lang_list[0], None, None)]
475 for sub_lang in sub_lang_list:
476 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
477 subtitles.append(subtitle)
480 def _print_formats(self, formats):
481 print('Available formats:')
483 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
485 def _real_initialize(self):
486 if self._downloader is None:
491 downloader_params = self._downloader.params
493 # Attempt to use provided username and password or .netrc data
494 if downloader_params.get('username', None) is not None:
495 username = downloader_params['username']
496 password = downloader_params['password']
497 elif downloader_params.get('usenetrc', False):
499 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
504 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
505 except (IOError, netrc.NetrcParseError) as err:
506 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
510 request = compat_urllib_request.Request(self._LANG_URL)
513 compat_urllib_request.urlopen(request).read()
514 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
515 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
518 # No authentication to be performed
522 request = compat_urllib_request.Request(self._LOGIN_URL)
524 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
525 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
526 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
531 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
533 galx = match.group(1)
535 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
541 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
545 u'PersistentCookie': u'yes',
547 u'bgresponse': u'js_disabled',
548 u'checkConnection': u'',
549 u'checkedDomains': u'youtube',
555 u'signIn': u'Sign in',
557 u'service': u'youtube',
561 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
563 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
564 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
565 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
568 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
569 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
570 self._downloader.report_warning(u'unable to log in: bad username or password')
572 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
573 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
579 'action_confirm': 'Confirm',
581 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
583 self.report_age_confirmation()
584 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
585 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
586 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
588 def _extract_id(self, url):
589 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
591 raise ExtractorError(u'Invalid URL: %s' % url)
592 video_id = mobj.group(2)
595 def _real_extract(self, url):
596 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
597 mobj = re.search(self._NEXT_URL_RE, url)
599 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
600 video_id = self._extract_id(url)
603 self.report_video_webpage_download(video_id)
604 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
605 request = compat_urllib_request.Request(url)
607 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
608 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
609 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
611 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
613 # Attempt to extract SWF player URL
614 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
616 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
621 self.report_video_info_webpage_download(video_id)
622 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
623 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
624 % (video_id, el_type))
625 video_info_webpage = self._download_webpage(video_info_url, video_id,
627 errnote='unable to download video info webpage')
628 video_info = compat_parse_qs(video_info_webpage)
629 if 'token' in video_info:
631 if 'token' not in video_info:
632 if 'reason' in video_info:
633 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
635 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
637 # Check for "rental" videos
638 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
639 raise ExtractorError(u'"rental" videos not supported')
641 # Start extracting information
642 self.report_information_extraction(video_id)
645 if 'author' not in video_info:
646 raise ExtractorError(u'Unable to extract uploader name')
647 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
650 video_uploader_id = None
651 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
653 video_uploader_id = mobj.group(1)
655 self._downloader.report_warning(u'unable to extract uploader nickname')
658 if 'title' not in video_info:
659 raise ExtractorError(u'Unable to extract video title')
660 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
663 if 'thumbnail_url' not in video_info:
664 self._downloader.report_warning(u'unable to extract video thumbnail')
666 else: # don't panic if we can't find it
667 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
671 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
673 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
674 upload_date = unified_strdate(upload_date)
677 video_description = get_element_by_id("eow-description", video_webpage)
678 if video_description:
679 video_description = clean_html(video_description)
681 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
683 video_description = unescapeHTML(fd_mobj.group(1))
685 video_description = u''
688 video_subtitles = None
690 if self._downloader.params.get('writesubtitles', False):
691 video_subtitles = self._extract_subtitle(video_id)
693 (sub_error, sub_lang, sub) = video_subtitles[0]
695 # We try with the automatic captions
696 video_subtitles = self._request_automatic_caption(video_id, video_webpage)
697 (sub_error_auto, sub_lang, sub) = video_subtitles[0]
701 # We report the original error
702 self._downloader.report_error(sub_error)
704 if self._downloader.params.get('allsubtitles', False):
705 video_subtitles = self._extract_all_subtitles(video_id)
706 for video_subtitle in video_subtitles:
707 (sub_error, sub_lang, sub) = video_subtitle
709 self._downloader.report_error(sub_error)
711 if self._downloader.params.get('listsubtitles', False):
712 sub_lang_list = self._list_available_subtitles(video_id)
715 if 'length_seconds' not in video_info:
716 self._downloader.report_warning(u'unable to extract video duration')
719 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
722 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
724 # Decide which formats to download
725 req_format = self._downloader.params.get('format', None)
728 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
729 info = json.loads(mobj.group(1))
730 if 'dashmpd' in info['args']:
731 # Vevo videos with encrypted signatures
732 self.to_screen(u'Vevo video detected.')
733 video_info['url_encoded_fmt_stream_map'] = [info['args']['url_encoded_fmt_stream_map']]
737 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
738 self.report_rtmp_download()
739 video_url_list = [(None, video_info['conn'][0])]
740 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
742 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
743 url_data = compat_parse_qs(url_data_str)
744 if 'itag' in url_data and 'url' in url_data:
745 url = url_data['url'][0]
746 if 'sig' in url_data:
747 url += '&signature=' + url_data['sig'][0]
750 """Decrypt the key the two subkeys must have a length of 43"""
752 b = ''.join([b[:8],a[0],b[9:18],b[-4],b[19:39], b[18]])[0:40]
754 s_dec = '.'.join((a,b))[::-1]
756 key = k(url_data['s'][0])
757 url += '&signature=' + key
758 if 'ratebypass' not in url:
759 url += '&ratebypass=yes'
760 url_map[url_data['itag'][0]] = url
762 format_limit = self._downloader.params.get('format_limit', None)
763 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
764 if format_limit is not None and format_limit in available_formats:
765 format_list = available_formats[available_formats.index(format_limit):]
767 format_list = available_formats
768 existing_formats = [x for x in format_list if x in url_map]
769 if len(existing_formats) == 0:
770 raise ExtractorError(u'no known formats available for video')
771 if self._downloader.params.get('listformats', None):
772 self._print_formats(existing_formats)
774 if req_format is None or req_format == 'best':
775 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
776 elif req_format == 'worst':
777 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
778 elif req_format in ('-1', 'all'):
779 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
781 # Specific formats. We pick the first in a slash-delimited sequence.
782 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
783 req_formats = req_format.split('/')
784 video_url_list = None
785 for rf in req_formats:
787 video_url_list = [(rf, url_map[rf])]
789 if video_url_list is None:
790 raise ExtractorError(u'requested format not available')
792 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
795 for format_param, video_real_url in video_url_list:
797 video_extension = self._video_extensions.get(format_param, 'flv')
799 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
800 self._video_dimensions.get(format_param, '???'))
804 'url': video_real_url,
805 'uploader': video_uploader,
806 'uploader_id': video_uploader_id,
807 'upload_date': upload_date,
808 'title': video_title,
809 'ext': video_extension,
810 'format': video_format,
811 'thumbnail': video_thumbnail,
812 'description': video_description,
813 'player_url': player_url,
814 'subtitles': video_subtitles,
815 'duration': video_duration
820 class MetacafeIE(InfoExtractor):
821 """Information Extractor for metacafe.com."""
823 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
824 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
825 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
826 IE_NAME = u'metacafe'
def report_disclaimer(self):
    """Report disclaimer retrieval.

    Logged before fetching _DISCLAIMER during _real_initialize().
    """
    self.to_screen(u'Retrieving disclaimer')
832 def _real_initialize(self):
833 # Retrieve disclaimer
834 request = compat_urllib_request.Request(self._DISCLAIMER)
836 self.report_disclaimer()
837 disclaimer = compat_urllib_request.urlopen(request).read()
838 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
839 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
844 'submit': "Continue - I'm over 18",
846 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
848 self.report_age_confirmation()
849 disclaimer = compat_urllib_request.urlopen(request).read()
850 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
851 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
853 def _real_extract(self, url):
854 # Extract id and simplified title from URL
855 mobj = re.match(self._VALID_URL, url)
857 raise ExtractorError(u'Invalid URL: %s' % url)
859 video_id = mobj.group(1)
861 # Check if video comes from YouTube
862 mobj2 = re.match(r'^yt-(.*)$', video_id)
863 if mobj2 is not None:
864 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
866 # Retrieve video webpage to extract further information
867 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
869 # Extract URL, uploader and title from webpage
870 self.report_extraction(video_id)
871 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
873 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
874 video_extension = mediaURL[-3:]
876 # Extract gdaKey if available
877 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
881 gdaKey = mobj.group(1)
882 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
884 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
886 raise ExtractorError(u'Unable to extract media URL')
887 vardict = compat_parse_qs(mobj.group(1))
888 if 'mediaData' not in vardict:
889 raise ExtractorError(u'Unable to extract media URL')
890 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
892 raise ExtractorError(u'Unable to extract media URL')
893 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
894 video_extension = mediaURL[-3:]
895 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
897 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
899 raise ExtractorError(u'Unable to extract title')
900 video_title = mobj.group(1).decode('utf-8')
902 mobj = re.search(r'submitter=(.*?);', webpage)
904 raise ExtractorError(u'Unable to extract uploader nickname')
905 video_uploader = mobj.group(1)
908 'id': video_id.decode('utf-8'),
909 'url': video_url.decode('utf-8'),
910 'uploader': video_uploader.decode('utf-8'),
912 'title': video_title,
913 'ext': video_extension.decode('utf-8'),
916 class DailymotionIE(InfoExtractor):
917 """Information Extractor for Dailymotion"""
919 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
920 IE_NAME = u'dailymotion'
922 def _real_extract(self, url):
923 # Extract id and simplified title from URL
924 mobj = re.match(self._VALID_URL, url)
926 raise ExtractorError(u'Invalid URL: %s' % url)
928 video_id = mobj.group(1).split('_')[0].split('?')[0]
930 video_extension = 'mp4'
932 # Retrieve video webpage to extract further information
933 request = compat_urllib_request.Request(url)
934 request.add_header('Cookie', 'family_filter=off')
935 webpage = self._download_webpage(request, video_id)
937 # Extract URL, uploader and title from webpage
938 self.report_extraction(video_id)
939 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
941 raise ExtractorError(u'Unable to extract media URL')
942 flashvars = compat_urllib_parse.unquote(mobj.group(1))
944 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
947 self.to_screen(u'Using %s' % key)
950 raise ExtractorError(u'Unable to extract video URL')
952 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
954 raise ExtractorError(u'Unable to extract video URL')
956 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
958 # TODO: support choosing qualities
960 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
962 raise ExtractorError(u'Unable to extract title')
963 video_title = unescapeHTML(mobj.group('title'))
965 video_uploader = None
966 video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
967 # Looking for official user
968 r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
969 webpage, 'video uploader')
971 video_upload_date = None
972 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
974 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
979 'uploader': video_uploader,
980 'upload_date': video_upload_date,
981 'title': video_title,
982 'ext': video_extension,
986 class PhotobucketIE(InfoExtractor):
987 """Information extractor for photobucket.com."""
# NOTE(review): this extract appears line-sampled — the "if mobj is None:"
# guards that should precede the bare raise statements below are missing;
# confirm structure against the upstream youtube-dl source.
989 # TODO: the original _VALID_URL was:
990 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
991 # Check if it's necessary to keep the old extraction process
992 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
993 IE_NAME = u'photobucket'
995 def _real_extract(self, url):
996 # Extract id from URL
997 mobj = re.match(self._VALID_URL, url)
999 raise ExtractorError(u'Invalid URL: %s' % url)
1001 video_id = mobj.group('id')
# File extension comes straight from the matched URL (flv or mp4).
1003 video_extension = mobj.group('ext')
1005 # Retrieve video webpage to extract further information
1006 webpage = self._download_webpage(url, video_id)
1008 # Extract URL, uploader, and title from webpage
1009 self.report_extraction(video_id)
1010 # We try first by looking the javascript code:
1011 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
1012 if mobj is not None:
1013 info = json.loads(mobj.group('json'))
# Preferred path: the page's embedded MEDIA JSON carries the direct
# download URL and metadata; creationDate is a Unix timestamp.
1016 'url': info[u'downloadUrl'],
1017 'uploader': info[u'username'],
1018 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
1019 'title': info[u'title'],
1020 'ext': video_extension,
1021 'thumbnail': info[u'thumbUrl'],
# Fallback path: scrape <link rel="video_src"> and the <title> tag.
1024 # We try looking in other parts of the webpage
1025 video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
1026 webpage, u'video URL')
1028 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1030 raise ExtractorError(u'Unable to extract title')
# .decode('utf-8') calls mark this as Python-2-era byte-string handling.
1031 video_title = mobj.group(1).decode('utf-8')
1032 video_uploader = mobj.group(2).decode('utf-8')
1035 'id': video_id.decode('utf-8'),
1036 'url': video_url.decode('utf-8'),
1037 'uploader': video_uploader,
1038 'upload_date': None,
1039 'title': video_title,
1040 'ext': video_extension.decode('utf-8'),
1044 class YahooIE(InfoExtractor):
1045 """Information extractor for screen.yahoo.com."""
1046 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
# NOTE(review): several guard lines ("if mobj is None:" etc.) appear to be
# missing from this extract; verify against upstream before editing logic.
1048 def _real_extract(self, url):
1049 mobj = re.match(self._VALID_URL, url)
1051 raise ExtractorError(u'Invalid URL: %s' % url)
1052 video_id = mobj.group('id')
1053 webpage = self._download_webpage(url, video_id)
# Newer pages expose a long YUI content id; its presence selects between
# the legacy mrss/REST flow and the YQL JSON flow in the else-branch.
1054 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1057 # TODO: Check which url parameters are required
1058 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1059 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
# Verbose regex over the mrss XML: title, description, publish date and
# the LARGETHUMB thumbnail, in document order (re.DOTALL spans newlines).
1060 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1061 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1062 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1063 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1065 self.report_extraction(video_id)
1066 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1068 raise ExtractorError(u'Unable to extract video info')
1069 video_title = m_info.group('title')
1070 video_description = m_info.group('description')
1071 video_thumb = m_info.group('thumb')
1072 video_date = m_info.group('date')
# Normalize MM/DD/YYYY from the feed into the YYYYMMDD upload_date format.
1073 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1075 # TODO: Find a way to get mp4 videos
1076 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1077 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1078 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1079 video_url = m_rest.group('url')
1080 video_path = m_rest.group('path')
1082 raise ExtractorError(u'Unable to extract video url')
1084 else: # We have to use a different method if another id is defined
1085 long_id = m_id.group('new_id')
# YQL query (URL-encoded) against yahoo.media.video.streams for the long id.
1086 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1087 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
# Strip the JSONP callback wrapper before parsing the JSON payload.
1088 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1089 info = json.loads(json_str)
1090 res = info[u'query'][u'results'][u'mediaObj'][0]
# First stream entry supplies the rtmp host ('url') and play path.
1091 stream = res[u'streams'][0]
1092 video_path = stream[u'path']
1093 video_url = stream[u'host']
1095 video_title = meta[u'title']
1096 video_description = meta[u'description']
1097 video_thumb = meta[u'thumbnail']
1098 video_date = None # I can't find it
1103 'play_path': video_path,
1104 'title':video_title,
1105 'description': video_description,
1106 'thumbnail': video_thumb,
1107 'upload_date': video_date,
1112 class VimeoIE(InfoExtractor):
1113 """Information extractor for vimeo.com."""
1115 # _VALID_URL matches Vimeo URLs
1116 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1119 def _real_extract(self, url, new_video=True):
1120 # Extract ID from URL
1121 mobj = re.match(self._VALID_URL, url)
1123 raise ExtractorError(u'Invalid URL: %s' % url)
1125 video_id = mobj.group('id')
# Normalize: force https, and canonicalize pro/direct-link URLs so the
# config JSON is reachable on the main vimeo.com page.
1126 if not mobj.group('proto'):
1127 url = 'https://' + url
1128 if mobj.group('direct_link') or mobj.group('pro'):
1129 url = 'https://vimeo.com/' + video_id
1131 # Retrieve video webpage to extract further information
1132 request = compat_urllib_request.Request(url, None, std_headers)
1133 webpage = self._download_webpage(request, video_id)
1135 # Now we begin extracting as much information as we can from what we
1136 # retrieved. First we extract the information common to all extractors,
1137 # and later we extract those that are Vimeo specific.
1138 self.report_extraction(video_id)
1140 # Extract the config JSON
# Crude but effective: slice the page between ' = {config:' and ',assets:'
# to isolate the player's JSON configuration.
1142 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1143 config = json.loads(config)
1145 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1146 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1148 raise ExtractorError(u'Unable to extract info section')
1151 video_title = config["video"]["title"]
1153 # Extract uploader and uploader_id
1154 video_uploader = config["video"]["owner"]["name"]
# uploader_id is the last path segment of the owner URL, when present.
1155 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1157 # Extract video thumbnail
1158 video_thumbnail = config["video"]["thumbnail"]
1160 # Extract video description
1161 video_description = get_element_by_attribute("itemprop", "description", webpage)
1162 if video_description: video_description = clean_html(video_description)
1163 else: video_description = u''
1165 # Extract upload date
1166 video_upload_date = None
1167 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1168 if mobj is not None:
1169 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1171 # Vimeo specific: extract request signature and timestamp
# sig/timestamp authorize the play_redirect request built at the end.
1172 sig = config['request']['signature']
1173 timestamp = config['request']['timestamp']
1175 # Vimeo specific: extract video codec and quality information
1176 # First consider quality, then codecs, then take everything
1177 # TODO bind to format param
1178 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1179 files = { 'hd': [], 'sd': [], 'other': []}
# Bucket each available codec by best quality tier (hd > sd > other).
1180 for codec_name, codec_extension in codecs:
1181 if codec_name in config["video"]["files"]:
1182 if 'hd' in config["video"]["files"][codec_name]:
1183 files['hd'].append((codec_name, codec_extension, 'hd'))
1184 elif 'sd' in config["video"]["files"][codec_name]:
1185 files['sd'].append((codec_name, codec_extension, 'sd'))
1187 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first entry of the best non-empty tier.
1189 for quality in ('hd', 'sd', 'other'):
1190 if len(files[quality]) > 0:
1191 video_quality = files[quality][0][2]
1192 video_codec = files[quality][0][0]
1193 video_extension = files[quality][0][1]
1194 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1197 raise ExtractorError(u'No known codec found')
1199 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1200 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1205 'uploader': video_uploader,
1206 'uploader_id': video_uploader_id,
1207 'upload_date': video_upload_date,
1208 'title': video_title,
1209 'ext': video_extension,
1210 'thumbnail': video_thumbnail,
1211 'description': video_description,
1215 class ArteTvIE(InfoExtractor):
1216 """arte.tv information extractor."""
1218 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1219 _LIVE_URL = r'index-[0-9]+\.html$'
1221 IE_NAME = u'arte.tv'
# Fetch a webpage raw (bytes), wrapping network/URL errors as ExtractorError.
1223 def fetch_webpage(self, url):
1224 request = compat_urllib_request.Request(url)
1226 self.report_download_webpage(url)
1227 webpage = compat_urllib_request.urlopen(request).read()
1228 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1229 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1230 except ValueError as err:
1231 raise ExtractorError(u'Invalid URL: %s' % url)
# Fetch url, apply regex, and collect named groups into an info dict.
# matchTuples is a list of (group_index, key, error_message); a missing
# group raises ExtractorError(error_message).
1234 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1235 page = self.fetch_webpage(url)
1236 mobj = re.search(regex, page, regexFlags)
1240 raise ExtractorError(u'Invalid URL: %s' % url)
1242 for (i, key, err) in matchTuples:
1243 if mobj.group(i) is None:
1244 raise ExtractorError(err)
1246 info[key] = mobj.group(i)
# Live-stream flow: locate videothek_js, then resolve the rtmp path,
# SWF player and stream url for the requested language.
1250 def extractLiveStream(self, url):
# Language code is the 4th path component from the end (fr/de).
1251 video_lang = url.split('/')[-4]
1252 info = self.grep_webpage(
1254 r'src="(.*?/videothek_js.*?\.js)',
1257 (1, 'url', u'Invalid URL: %s' % url)
1260 http_host = url.split('/')[2]
1261 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1262 info = self.grep_webpage(
1264 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1265 '(http://.*?\.swf).*?' +
1269 (1, 'path', u'could not extract video path: %s' % url),
1270 (2, 'player', u'could not extract video player: %s' % url),
1271 (3, 'url', u'could not extract video url: %s' % url)
1274 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# "Plus 7" (catch-up) flow: follow movie param -> language <video> ref ->
# final XML with id/name/date and the hd-quality url.
1276 def extractPlus7Stream(self, url):
1277 video_lang = url.split('/')[-3]
1278 info = self.grep_webpage(
1280 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1283 (1, 'url', u'Invalid URL: %s' % url)
1286 next_url = compat_urllib_parse.unquote(info.get('url'))
1287 info = self.grep_webpage(
1289 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1292 (1, 'url', u'Could not find <video> tag: %s' % url)
1295 next_url = compat_urllib_parse.unquote(info.get('url'))
1297 info = self.grep_webpage(
1299 r'<video id="(.*?)".*?>.*?' +
1300 '<name>(.*?)</name>.*?' +
1301 '<dateVideo>(.*?)</dateVideo>.*?' +
1302 '<url quality="hd">(.*?)</url>',
1305 (1, 'id', u'could not extract video id: %s' % url),
1306 (2, 'title', u'could not extract video title: %s' % url),
1307 (3, 'date', u'could not extract video date: %s' % url),
1308 (4, 'url', u'could not extract video url: %s' % url)
1313 'id': info.get('id'),
1314 'url': compat_urllib_parse.unquote(info.get('url')),
1315 'uploader': u'arte.tv',
1316 'upload_date': unified_strdate(info.get('date')),
1317 'title': info.get('title').decode('utf-8'),
# Dispatch on URL shape: live index pages vs. Plus-7 video pages.
1323 def _real_extract(self, url):
1324 video_id = url.split('/')[-1]
1325 self.report_extraction(video_id)
1327 if re.search(self._LIVE_URL, video_id) is not None:
1328 self.extractLiveStream(url)
1331 info = self.extractPlus7Stream(url)
1336 class GenericIE(InfoExtractor):
1337 """Generic last-resort information extractor."""
1340 IE_NAME = u'generic'
1342 def report_download_webpage(self, video_id):
1343 """Report webpage download."""
# Warn that we fell through to the generic extractor (suppressed in tests).
1344 if not self._downloader.params.get('test', False):
1345 self._downloader.report_warning(u'Falling back on generic information extractor.')
1346 super(GenericIE, self).report_download_webpage(video_id)
1348 def report_following_redirect(self, new_url):
1349 """Report information extraction."""
1350 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1352 def _test_redirect(self, url):
1353 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Issue a HEAD request so shorteners can be resolved without downloading
# the body; the helper classes below keep HEAD semantics across redirects.
1354 class HeadRequest(compat_urllib_request.Request):
1355 def get_method(self):
1358 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1360 Subclass the HTTPRedirectHandler to make it use our
1361 HeadRequest also on the redirected URL
1363 def redirect_request(self, req, fp, code, msg, headers, newurl):
1364 if code in (301, 302, 303, 307):
1365 newurl = newurl.replace(' ', '%20')
# Drop body-related headers: a HEAD request carries no entity body.
1366 newheaders = dict((k,v) for k,v in req.headers.items()
1367 if k.lower() not in ("content-length", "content-type"))
1368 return HeadRequest(newurl,
1370 origin_req_host=req.get_origin_req_host(),
1373 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1375 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1377 Fallback to GET if HEAD is not allowed (405 HTTP error)
1379 def http_error_405(self, req, fp, code, msg, headers):
1383 newheaders = dict((k,v) for k,v in req.headers.items()
1384 if k.lower() not in ("content-length", "content-type"))
1385 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1387 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with exactly the handlers we need (order matters:
# the 405 fallback must run before the redirect handler).
1391 opener = compat_urllib_request.OpenerDirector()
1392 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1393 HTTPMethodFallback, HEADRedirectHandler,
1394 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1395 opener.add_handler(handler())
1397 response = opener.open(HeadRequest(url))
1398 if response is None:
1399 raise ExtractorError(u'Invalid URL protocol')
1400 new_url = response.geturl()
1405 self.report_following_redirect(new_url)
1408 def _real_extract(self, url):
1409 new_url = self._test_redirect(url)
1410 if new_url: return [self.url_result(new_url)]
1412 video_id = url.split('/')[-1]
1414 webpage = self._download_webpage(url, video_id)
1415 except ValueError as err:
1416 # since this is the last-resort InfoExtractor, if
1417 # this error is thrown, it'll be thrown here
1418 raise ExtractorError(u'Invalid URL: %s' % url)
1420 self.report_extraction(video_id)
1421 # Start with something easy: JW Player in SWFObject
1422 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1424 # Broaden the search a little bit
1425 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1427 # Broaden the search a little bit: JWPlayer JS loader
1428 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1430 # Try to find twitter cards info
1431 mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1433 raise ExtractorError(u'Invalid URL: %s' % url)
1435 # It's possible that one of the regexes
1436 # matched, but returned an empty group:
1437 if mobj.group(1) is None:
1438 raise ExtractorError(u'Invalid URL: %s' % url)
1440 video_url = compat_urllib_parse.unquote(mobj.group(1))
1441 video_id = os.path.basename(video_url)
1443 # here's a fun little line of code for you:
1444 video_extension = os.path.splitext(video_id)[1][1:]
1445 video_id = os.path.splitext(video_id)[0]
1447 # it's tempting to parse this further, but you would
1448 # have to take into account all the variations like
1449 # Video Title - Site Name
1450 # Site Name | Video Title
1451 # Video Title - Tagline | Site Name
1452 # and so on and so forth; it's just not practical
1453 video_title = self._html_search_regex(r'<title>(.*)</title>',
1454 webpage, u'video title')
1456 # video uploader is domain name
1457 video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
1458 url, u'video uploader')
1463 'uploader': video_uploader,
1464 'upload_date': None,
1465 'title': video_title,
1466 'ext': video_extension,
1470 class YoutubeSearchIE(SearchInfoExtractor):
1471 """Information Extractor for YouTube search queries."""
# GData v2 API, JSON-C output, 50 results per page.
1472 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1474 IE_NAME = u'youtube:search'
1475 _SEARCH_KEY = 'ytsearch'
1477 def report_download_page(self, query, pagenum):
1478 """Report attempt to download search page with given number."""
1479 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1481 def _get_n_results(self, query, n):
1482 """Get a specified number of results for a query"""
# Page through the API until we have collected n ids (or run out).
1488 while (50 * pagenum) < limit:
1489 self.report_download_page(query, pagenum+1)
# start-index is 1-based in the GData API.
1490 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1491 request = compat_urllib_request.Request(result_url)
1493 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1494 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1495 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1496 api_response = json.loads(data)['data']
1498 if not 'items' in api_response:
1499 raise ExtractorError(u'[youtube] No video results')
1501 new_ids = list(video['id'] for video in api_response['items'])
1502 video_ids += new_ids
# Cap at whichever is smaller: requested n or totalItems reported by API.
1504 limit = min(n, api_response['totalItems'])
1507 if len(video_ids) > n:
1508 video_ids = video_ids[:n]
1509 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1510 return self.playlist_result(videos, query)
1513 class GoogleSearchIE(SearchInfoExtractor):
1514 """Information Extractor for Google Video search queries."""
# Presence of the "next page" anchor signals more results are available.
1515 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1517 IE_NAME = u'video.google:search'
1518 _SEARCH_KEY = 'gvsearch'
1520 def _get_n_results(self, query, n):
1521 """Get a specified number of results for a query"""
1524 '_type': 'playlist',
# Scrape the regular google.com video-search results, 10 per page.
1529 for pagenum in itertools.count(1):
1530 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1531 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1532 note='Downloading result page ' + str(pagenum))
1534 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1537 'url': mobj.group(1)
1539 res['entries'].append(e)
# Stop when we have enough results or no further pages exist.
1541 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1544 class YahooSearchIE(SearchInfoExtractor):
1545 """Information Extractor for Yahoo! Video search queries."""
1548 IE_NAME = u'screen.yahoo:search'
1549 _SEARCH_KEY = 'yvsearch'
1551 def _get_n_results(self, query, n):
1552 """Get a specified number of results for a query"""
1555 '_type': 'playlist',
# Yahoo returns a JSON payload (o=js) with 30 results per page (b= offset).
1559 for pagenum in itertools.count(0):
1560 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1561 webpage = self._download_webpage(result_url, query,
1562 note='Downloading results page '+str(pagenum+1))
1563 info = json.loads(webpage)
1565 results = info[u'results']
1567 for (i, r) in enumerate(results):
1568 if (pagenum * 30) +i >= n:
# Each result snippet embeds a screen.yahoo.com page URL.
1570 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1571 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1572 res['entries'].append(e)
# Stop when n results collected or the pagination metadata says last page.
1573 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1579 class YoutubePlaylistIE(InfoExtractor):
1580 """Information Extractor for YouTube playlists."""
# Verbose regex: accepts playlist/view_play_list/course/artist/watch URLs
# with p=, a= or list= ids, and bare PL/EC/UU playlist ids.
1582 _VALID_URL = r"""(?:
1587 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1588 \? (?:.*?&)*? (?:p|a|list)=
1591 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1594 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1596 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1598 IE_NAME = u'youtube:playlist'
1601 def suitable(cls, url):
1602 """Receives a URL and returns True if suitable for this IE."""
# re.VERBOSE is required because _VALID_URL is written in verbose form.
1603 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1605 def _real_extract(self, url):
1606 # Extract playlist id
1607 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1609 raise ExtractorError(u'Invalid URL: %s' % url)
1611 # Download playlist videos from API
# Either capture group may have matched depending on the URL form.
1612 playlist_id = mobj.group(1) or mobj.group(2)
1617 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1618 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1621 response = json.loads(page)
1622 except ValueError as err:
1623 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1625 if 'feed' not in response:
1626 raise ExtractorError(u'Got a malformed response from YouTube API')
1627 playlist_title = response['feed']['title']['$t']
1628 if 'entry' not in response['feed']:
1629 # Number of videos is a multiple of self._MAX_RESULTS
# Keep (position, url) pairs so the playlist order can be restored below;
# entries without 'content' (e.g. deleted videos) are skipped.
1632 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1633 for entry in response['feed']['entry']
1634 if 'content' in entry ]
1636 if len(response['feed']['entry']) < self._MAX_RESULTS:
# Sort by playlist position, then drop the position component.
1640 videos = [v[1] for v in sorted(videos)]
1642 url_results = [self.url_result(url, 'Youtube') for url in videos]
1643 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1646 class YoutubeChannelIE(InfoExtractor):
1647 """Information Extractor for YouTube channels."""
1649 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1650 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# CSS class that marks the presence of a "load more" pagination widget.
1651 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1652 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1653 IE_NAME = u'youtube:channel'
# Return the watch-page video ids found in a page, de-duplicated in order.
1655 def extract_videos_from_page(self, page):
1657 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1658 if mobj.group(1) not in ids_in_page:
1659 ids_in_page.append(mobj.group(1))
1662 def _real_extract(self, url):
1663 # Extract channel id
1664 mobj = re.match(self._VALID_URL, url)
1666 raise ExtractorError(u'Invalid URL: %s' % url)
1668 # Download channel page
1669 channel_id = mobj.group(1)
1673 url = self._TEMPLATE_URL % (channel_id, pagenum)
1674 page = self._download_webpage(url, channel_id,
1675 u'Downloading page #%s' % pagenum)
1677 # Extract video identifiers
1678 ids_in_page = self.extract_videos_from_page(page)
1679 video_ids.extend(ids_in_page)
1681 # Download any subsequent channel pages using the json-based channel_ajax query
1682 if self._MORE_PAGES_INDICATOR in page:
1684 pagenum = pagenum + 1
1686 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1687 page = self._download_webpage(url, channel_id,
1688 u'Downloading page #%s' % pagenum)
# Ajax endpoint returns JSON; the video list HTML is in 'content_html'.
1690 page = json.loads(page)
1692 ids_in_page = self.extract_videos_from_page(page['content_html'])
1693 video_ids.extend(ids_in_page)
# Keep paging while the load-more widget is still advertised.
1695 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1698 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1700 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1701 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1702 return [self.playlist_result(url_entries, channel_id)]
1705 class YoutubeUserIE(InfoExtractor):
1706 """Information Extractor for YouTube users."""
1708 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1709 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps each uploads query at 50 results, hence the paging below.
1710 _GDATA_PAGE_SIZE = 50
1711 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1712 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1713 IE_NAME = u'youtube:user'
1715 def _real_extract(self, url):
1717 mobj = re.match(self._VALID_URL, url)
1719 raise ExtractorError(u'Invalid URL: %s' % url)
1721 username = mobj.group(1)
1723 # Download video ids using YouTube Data API. Result size per
1724 # query is limited (currently to 50 videos) so we need to query
1725 # page by page until there are no video ids - it means we got
# start-index is 1-based in the GData API.
1732 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1734 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1735 page = self._download_webpage(gdata_url, username,
1736 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1738 # Extract video identifiers
1741 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1742 if mobj.group(1) not in ids_in_page:
1743 ids_in_page.append(mobj.group(1))
1745 video_ids.extend(ids_in_page)
1747 # A little optimization - if current page is not
1748 # "full", ie. does not contain PAGE_SIZE video ids then
1749 # we can assume that this page is the last one - there
1750 # are no more ids on further pages - no need to query
1753 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1758 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1759 url_results = [self.url_result(url, 'Youtube') for url in urls]
1760 return [self.playlist_result(url_results, playlist_title = username)]
1763 class BlipTVUserIE(InfoExtractor):
1764 """Information Extractor for blip.tv users."""
1766 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1768 IE_NAME = u'blip.tv:user'
1770 def _real_extract(self, url):
1772 mobj = re.match(self._VALID_URL, url)
1774 raise ExtractorError(u'Invalid URL: %s' % url)
1776 username = mobj.group(1)
# Mobile episode-list endpoint; users_id is filled in from the user page.
1778 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1780 page = self._download_webpage(url, username, u'Downloading user page')
1781 mobj = re.search(r'data-users-id="([^"]+)"', page)
1782 page_base = page_base % mobj.group(1)
1785 # Download video ids using BlipTV Ajax calls. Result size per
1786 # query is limited (currently to 12 videos) so we need to query
1787 # page by page until there are no video ids - it means we got
1794 url = page_base + "&page=" + str(pagenum)
1795 page = self._download_webpage(url, username,
1796 u'Downloading video ids from page %d' % pagenum)
1798 # Extract video identifiers
# Collect hrefs, de-duplicated in order; entities are unescaped on append.
1801 for mobj in re.finditer(r'href="/([^"]+)"', page):
1802 if mobj.group(1) not in ids_in_page:
1803 ids_in_page.append(unescapeHTML(mobj.group(1)))
1805 video_ids.extend(ids_in_page)
1807 # A little optimization - if current page is not
1808 # "full", ie. does not contain PAGE_SIZE video ids then
1809 # we can assume that this page is the last one - there
1810 # are no more ids on further pages - no need to query
1813 if len(ids_in_page) < self._PAGE_SIZE:
1818 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1819 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1820 return [self.playlist_result(url_entries, playlist_title = username)]
1823 class DepositFilesIE(InfoExtractor):
1824 """Information extractor for depositfiles.com"""
1826 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1828 def _real_extract(self, url):
1829 file_id = url.split('/')[-1]
1830 # Rebuild url in english locale
1831 url = 'http://depositfiles.com/en/files/' + file_id
1833 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the free-download button.
1834 free_download_indication = { 'gateway_result' : '1' }
1835 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1837 self.report_download_webpage(file_id)
1838 webpage = compat_urllib_request.urlopen(request).read()
1839 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1840 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1842 # Search for the real file URL
1843 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1844 if (mobj is None) or (mobj.group(1) is None):
1845 # Try to figure out reason of the error.
# Surface the site's own restriction message when the link is absent.
1846 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1847 if (mobj is not None) and (mobj.group(1) is not None):
1848 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1849 raise ExtractorError(u'%s' % restriction_message)
1851 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1853 file_url = mobj.group(1)
1854 file_extension = os.path.splitext(file_url)[1][1:]
1856 # Search for file title
1857 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
1860 'id': file_id.decode('utf-8'),
1861 'url': file_url.decode('utf-8'),
1863 'upload_date': None,
1864 'title': file_title,
1865 'ext': file_extension.decode('utf-8'),
1869 class FacebookIE(InfoExtractor):
1870 """Information Extractor for Facebook"""
1872 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1873 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
# machine name looked up in the user's ~/.netrc for stored credentials
1874 _NETRC_MACHINE = 'facebook'
1875 IE_NAME = u'facebook'
1877 def report_login(self):
1878 """Report attempt to log in."""
1879 self.to_screen(u'Logging in')
# Optional login before extraction: credentials come from --username/
# --password or, with --netrc, from the 'facebook' netrc entry; login
# failures only warn (extraction of public videos may still succeed).
1881 def _real_initialize(self):
1882 if self._downloader is None:
1887 downloader_params = self._downloader.params
1889 # Attempt to use provided username and password or .netrc data
1890 if downloader_params.get('username', None) is not None:
1891 useremail = downloader_params['username']
1892 password = downloader_params['password']
1893 elif downloader_params.get('usenetrc', False):
1895 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1896 if info is not None:
1900 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1901 except (IOError, netrc.NetrcParseError) as err:
1902 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1905 if useremail is None:
1914 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1917 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means authentication failed.
1918 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1919 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1921 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1922 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1925 def _real_extract(self, url):
1926 mobj = re.match(self._VALID_URL, url)
1928 raise ExtractorError(u'Invalid URL: %s' % url)
1929 video_id = mobj.group('ID')
1931 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1932 webpage = self._download_webpage(url, video_id)
# The player parameters sit between these two literal SWF-setup snippets.
1934 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1935 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1936 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1938 raise ExtractorError(u'Cannot parse data')
1939 data = dict(json.loads(m.group(1)))
1940 params_raw = compat_urllib_parse.unquote(data['params'])
1941 params = json.loads(params_raw)
1942 video_data = params['video_data'][0]
# Prefer the HD source; fall back to SD when hd_src is absent.
1943 video_url = video_data.get('hd_src')
1945 video_url = video_data['sd_src']
1947 raise ExtractorError(u'Cannot find video URL')
1948 video_duration = int(video_data['video_duration'])
1949 thumbnail = video_data['thumbnail_src']
1951 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
1956 'title': video_title,
1959 'duration': video_duration,
1960 'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    # Regexp used to pull the filename extension out of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract blip.tv video info.

        Resolves api.swf# and /play/ URLs to canonical ones first, then
        queries blip.tv's JSON API (posing as iTunes, which unlocks the
        JSON skin).

        NOTE(review): several guard/`try:`/dict-assembly lines are missing
        from this snapshot; comments mark the gaps.
        """
        mobj = re.match(self._VALID_URL, url)
        # (missing `if mobj is None:` guard)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page whose URL fragment carries the
            # real file id; follow the redirect and re-run extraction.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # `cchar` (the query separator, '?' or '&') is computed on lines
        # not shown in this snapshot.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # (missing lines: the rest of the direct-download info dict)
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            # (missing `try:` line)
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']
            # (missing `else:` branch using the top-level object)

            # blip.tv dates look like '10-31-12 04:03PM'.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # (missing `if umobj is None:` guard)
                raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # (missing `info = {` head; keys of the result dict below)
                'id': data['item_id'],
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl'],
                'user_agent': 'iTunes/10.6.1',
        except (ValueError,KeyError) as err:
            raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        """RC4-decrypt `data` with `key` (standard KSA + PRGA).

        NOTE(review): the accumulator initialisations (x, y, out) and the
        PRGA `for char in data:` loop header are missing in this snapshot.
        """
        box = list(range(256))
        # Key-scheduling algorithm (KSA): permute the 256-entry box.
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
            # Pseudo-random generation (PRGA) steps below.
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        # NOTE(review): this return belongs to a separate __md5 helper
        # whose `def` line is missing from this snapshot.
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        """Extract a myvideo.de video, handling both the plain <source>
        case and the RC4-encrypted player-XML case (RTMP and HTTP/HLS).

        NOTE(review): many guard and assembly lines are missing from this
        snapshot; comments mark the gaps.
        """
        mobj = re.match(self._VALID_URL, url)
        # (missing `if mobj is None:` guard)
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Double-base64-encoded RC4 key material (missing `GK = (` head).
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'

        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Simple case: the page exposes a plain <source src='…'> element.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',

            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

            # (missing result-dict head; remaining keys below)
                'upload_date': None,
                'title': video_title,

        # Encrypted-player case: collect flashvars, fetch the player XML.
        mobj = re.search('var flashvars={(.+?)}', webpage)
        # (missing `if mobj is None:` guard)
            raise ExtractorError(u'Unable to extract video')

        # (missing `params = {}` / `encxml = ''` / `sec = mobj.group(1)` lines)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                # (missing `params[a] = b` assignment and `else:` line)
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            # (missing `xmldata_url = (` head for the replacement URL)
            'http://www.myvideo.de/dynamic/get_player_video_xml.php'
            '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'

        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # RC4 key = md5(md5-hex of double-b64-decoded GK + video id)
        # (missing `sk = self.__md5(` head)
            base64.b64decode(base64.b64decode(GK)) +
            str(video_id).encode('utf-8')
        dec_data = self.__rc4crypt(enc_data_b, sk)

        self.report_extraction(video_id)

        # RTMP branch: connectionurl present in the decrypted data.
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        video_url = compat_urllib_parse.unquote(mobj.group(1))
        if 'myvideo2flash' in video_url:
            self._downloader.report_warning(u'forcing RTMPT ...')
            video_url = video_url.replace('rtmpe://', 'rtmpt://')

        # extract non rtmp videos
        mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
        # (missing `if mobj is None:` guard)
            raise ExtractorError(u'unable to extract url')
        video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        # f4m manifests map onto m3u8 HLS playlists.
        video_hls_playlist = (
            video_filepath + video_file
        ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",

        # (missing result-dict head; remaining keys below)
            'tc_url': video_url,
            'upload_date': None,
            'title': video_title,
            'play_path': video_playpath,
            'video_file': video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url': video_swfobj,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))

    # Known bitrates, lowest to highest.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # (dict bodies missing from this snapshot)
    _video_extensions = {
    _video_dimensions = {

    # (missing `@classmethod` decorator line — `cls` parameter implies it)
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so override the base
        # class's plain re.match.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print the known bitrate/extension/dimension table."""
        print('Available formats:')
        # (missing `for x in formats:` loop header)
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        """Extract all parts of an episode/clip via the mrss index feed.

        NOTE(review): numerous guard, loop-header and dict-assembly lines
        are missing from this snapshot; comments mark the gaps.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # (missing `if mobj is None:` guard)
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            # (missing `else:` branch)
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            # (missing `else:` branch)
                epTitle = mobj.group('cntitle')
        # (missing `else:` branch handling full episodes)
            dlNewest = not mobj.group('episode')
            # (missing `if dlNewest:` / `else:` lines)
                epTitle = mobj.group('showname')
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        # (missing `if dlNewest:` re-resolution guard)
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            # (missing `if mobj is None:` guard)
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            # (missing `else:` line — this runs when alt params were found)
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        # One <item> per part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # (missing `turls = []` initialisation)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                # (missing `turls.append(finfo)` line)

            # (missing `if len(turls) == 0:` guard before this error)
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            # (missing `for f, v in turls:` / `if f == req_format:` lines)
                    format, rtmp_video_url = f, v

            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            # (missing `if not m:` guard)
                raise ExtractorError(u'Cannot transform RTMP url')
            # Map the RTMP URL onto a plain HTTP mirror.
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            # (missing `info = {` head; remaining keys below)
                'upload_date': officialDate,
                'description': officialTitle,
            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract a video via the JSON-ish config behind the og:video URL.

        NOTE(review): the `if mobj is None:` guard, the `try:` around the
        JSON parse and the head of the returned info dict are missing from
        this snapshot.
        """
        mobj = re.match(self._VALID_URL, url)
        # (missing `if mobj is None:` guard)
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # NOTE(review): the label below says 'player url' but this actually
        # extracts the title — looks like a copy-paste slip in the error
        # label only; behavior is otherwise correct.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'player url').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        # (missing `try:` line)
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        # (missing `info = {` head; remaining keys below)
            'uploader': showName,
            'upload_date': None,
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract the video by combining the moogaloop metadata XML with
        the Adobe f4m manifest it points at.

        NOTE(review): guard lines, `try:` lines and the `info = {` head are
        missing from this snapshot; comments mark the gaps.
        """
        mobj = re.match(self._VALID_URL, url)
        # (missing `if mobj is None:` guard)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # (missing `info = {` head with 'id' etc.; stray key below)
            'upload_date': None,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        # (missing `try:` line)
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # (missing `try:` line)
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        # (missing `except IndexError:` line)
            raise ExtractorError(u'Invalid metadata XML file')

        # hdcore parameter is required by Adobe HTTP Dynamic Streaming.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        # (missing `try:` line)
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        # (missing `try:` line)
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        # Rebuild the segment URL from the manifest's id and media node.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from the video page.

        NOTE(review): the `if mobj is None:` guard, a regex continuation
        line and the head of the returned info dict are missing from this
        snapshot.
        """
        mobj = re.match(self._VALID_URL, url)
        # (missing `if mobj is None:` guard)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (URL-encoded inside the flashvars query string)
        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
            webpage, u'video URL'))

        # Extract title
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',

        # Extract video thumbnail
        video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        # (missing `info = {` head; remaining keys below)
            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve the track through SoundCloud's API and return its
        128kbps MP3 stream.

        NOTE(review): the `if mobj is None:` guard and the head of the
        returned info dict are missing from this snapshot.
        """
        mobj = re.match(self._VALID_URL, url)
        # (missing `if mobj is None:` guard)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        # (missing `return [{` head; remaining keys below)
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'description': info['description'],
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a set through SoundCloud's API and return one info dict
        per track in the set.

        NOTE(review): guard lines and the per-track dict heads are missing
        from this snapshot.
        """
        mobj = re.match(self._VALID_URL, url)
        # (missing `if mobj is None:` guard)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            # (missing `return` after reporting errors)

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            # (missing per-track dict head; remaining keys below)
                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'description': track['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract an InfoQ presentation: the RTMP path is base64-encoded
        in the page's `jsclassref` variable.

        NOTE(review): guard lines and the head of the returned info dict
        are missing from this snapshot.
        """
        mobj = re.match(self._VALID_URL, url)
        # (missing `if mobj is None:` guard)
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        # (missing `if mobj is None:` guard)
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        video_title = self._search_regex(r'contentTitle = "(.*?)";',

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # (missing `info = {` head; remaining keys below)
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # (missing `try:` line)
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        # (missing `return url_list` line)

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # (missing `try:` line)
                compat_urllib_request.urlopen(url)
                # (missing `return url` on success)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # (missing `pass`/`return None` lines)

    def _print_formats(self, formats):
        """List every format/bitrate/extension combination available."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                # (missing `try:` line)
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        """Extract a cloudcast via Mixcloud's JSON API.

        NOTE(review): guard lines, `try:` lines, a loop `break`, an `else:`
        and the `return [{` head are missing from this snapshot.
        """
        mobj = re.match(self._VALID_URL, url)
        # (missing `if mobj is None:` guard)
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        # (missing `try:` line)
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            # (missing `return` after listing)

        if req_format is None or req_format == 'best':
            # Probe every format and keep the first live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    # (missing `break` line)
        # (missing `else:` branch for an explicit requested format)
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')
            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # (missing `return [{` head; remaining keys below)
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Handle three cases: a single video page, a course page (expands
        to its videos via recursive self.extract), and the site root
        (expands to all courses).

        NOTE(review): guard lines, `try:` lines, `info = {` heads and the
        list-building loop headers are missing from this snapshot.
        """
        mobj = re.match(self._VALID_URL, url)
        # (missing `if mobj is None:` guard)
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            # (missing `info = {` head; keys below)
                'id': course + '_' + video,
                'upload_date': None,

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            # (missing `try:` line)
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            # (missing `try:` line)
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            # (missing `except IndexError:` line)
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            # (missing `return [info]` line)
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            # (missing `info = {` head with 'id' etc.; stray key below)
                'upload_date': None,

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            # (missing `info['list'] = [` comprehension head over `links`)
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            # Recursively extract every referenced video page.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            # (missing `return results` and the root-page `else:` branch
            # with its `info = {` head; stray keys below)
                'id': 'Stanford OpenClassroom',
                'upload_date': None,

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            # (missing `try:` line)
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            # (missing `info['list'] = [` comprehension head over `links`)
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            # Recursively extract every referenced course page.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def _real_extract(self, url):
        """Extract an MTV video through the mediaGen metadata XML.

        NOTE(review): guard lines, `try:` lines, the `performer`
        extraction and the `info = {` head are missing from this snapshot.
        """
        mobj = re.match(self._VALID_URL, url)
        # (missing `if mobj is None:` guard)
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        # (missing `try:` line)
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # (missing `try:` line)
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        # (missing `except KeyError:` line)
            raise ExtractorError('Invalid rendition field.')

        # (missing `info = {` head; `performer` is set on missing lines)
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (segmented flv downloads)."""
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    # (missing `def _gen_sid(self):` header in this snapshot)
        # Session id: current ms timestamp plus two random components.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seed-shuffled alphabet used to decode file ids.

        NOTE(review): the `mixed = []` initialisation and the final
        `return` are missing in this snapshot.
        """
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        # Linear-congruential shuffle driven by the server-provided seed.
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated file id using the seeded mix string.

        NOTE(review): the `realId = []` initialisation and the loop header
        over `ids` are missing in this snapshot.
        """
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
            realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Extract all segments of a Youku video as separate file infos.

        NOTE(review): guard lines, `try:` lines, the format-selection
        branches and the per-segment `info = {` head are missing from
        this snapshot.
        """
        mobj = re.match(self._VALID_URL, url)
        # (missing `if mobj is None:` guard)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        # (missing `try:` line)
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                # (missing format assignment / fallback lines)
            elif format == 'worst':
            # (missing 'worst'/explicit-format assignment lines)

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        # (missing `files_info = []` initialisation)
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            # (missing `info = {` head; keys below)
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'upload_date': None,
                'title': video_title,
            files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'

    # Regexes applied to the watch page: flv URL, page title, thumbnail.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        # NOTE(review): this view is truncated — the "if mobj is None:" guard
        # and the info-dict opener near the end are missing.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded inside the page's flashvars.
        video_url = self._search_regex(self.VIDEO_URL_RE,
            webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        # NOTE(review): truncated view — the "if mobj is None:" guard, the
        # try: header before the unicode_escape decode, and the final
        # info-dict opener are missing here.
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)

        # Convert timestring to a format suitable for filename (YYYYMMDD)
        upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
        upload_date = upload_date.strftime('%Y%m%d')

        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Get the first line of the post description for the title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages; the mp4 URL is built
    from the page path against Turner's CDN."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'

    def _real_extract(self, url):
        # NOTE(review): truncated view — the "if mobj is None:" guard and the
        # rest of the info-dict assembly are missing here.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the page path — no parsing.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

            'id': shortened_video_id,
            # 'uploader_date': uploader_date,
            'description': description,
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # NOTE(review): truncated view — the closing of the triple-quoted
    # _VALID_URL, several guards ("if mobj is None:", "if m is None:"),
    # loop headers and info-dict openers are missing below.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?P<channelid>[^/]+)|
        (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
        (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
    # Justin.tv's archive API pages results 100 at a time.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        webpage = self._download_webpage(url, video_id,
            u'Downloading video info JSON',
            u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals errors with a dict instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        for clip in response:
            video_url = clip['video_file_url']
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO "YYYY-MM-DD..."; strip dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'

        if mobj.group('channelid'):
            # Whole channel: page through its archive below.
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # Single chapter of a broadcast.
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                note=u'Downloading chapter information',
                errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                note='Downloading chapter metadata',
                errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #   youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

                'id': u'c' + chapter_id,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            # Single archived broadcast.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # Page through the API; channels are paged, single videos are not.
        limit = self._JUSTIN_PAGE_LIMIT
            self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        # NOTE(review): truncated view — the "if mobj is None:" guard and the
        # info-dict opener are missing here.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The stream URL is the second <source> inside the <video> tag.
        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player headline; fall back to the page <title>.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

            'description': video_description,
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers;
    returns a playlist of every movie on the game's video page."""
    # NOTE(review): truncated view — the closing of the triple-quoted
    # _VALID_URL, the @classmethod decorator on suitable(), and the
    # per-video dict opener in the loop are missing below.
    _VALID_URL = r"""http://store\.steampowered\.com/
        (?P<urltype>video|app)/ #If the page is only for videos or for a game
        (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    # Age-gate bypass: submit a fixed 1970 birth date.
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is written in (?x) style.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            # Age-gated page: re-fetch through the agecheck endpoint.
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
            webpage, 'game title')

        # Movies, their display names and thumbnails appear in parallel
        # order on the page, so the three iterators are zipped below.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)

        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        # NOTE(review): truncated view — the info-dict opener near the end is
        # missing here.
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # The flv lives at a fixed CDN path derived from the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com (and the candy
    subdomain) flash-player pages."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        # NOTE(review): truncated view — the ext selection after the 'mp4'
        # check and the info-dict opener are missing here.
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        # The player URL is set via so.addVariable("file", ...).
        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        if 'mp4' in video_url:

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

            # Candy pages carry the real title in a candytitles span.
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

            'title' : video_title,
            'thumbnail' : thumbnail,
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows; metadata comes from a
    JSON blob embedded in the page (window.gon.show)."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        # NOTE(review): truncated view — the try: header before json.loads
        # and the info-dict opener are missing here.
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Force the 256 kbps rendition; the extension comes from the URL path.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]

            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the single format entry matching req_format, if any."""
        # NOTE(review): truncated view — the loop header over formats and the
        # return statements of this helper are missing here.
            if(x["format"]==req_format):

    def _real_extract(self, url):
        # NOTE(review): truncated view — the "if mobj is None:" guard, the
        # try: headers around the JSON parsing, the per-link loop header and
        # the info-dict opener are missing below.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Pretend age verification already happened.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
            params = json.loads(json_params)
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
            raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Format label "resolution-bitrate" is taken from the URL path.
            format = path.split('/')[4].split('_')[:2]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'thumbnail': thumbnail,
                'description': video_description

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # Formats are ordered best-first, so 'best' is the head and
        # 'worst' is the tail of the list.
        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            format = self._specific( req_format, formats )
            raise ExtractorError(u'Requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        # NOTE(review): truncated view — the "if mobj is None:" guard and
        # part of the info dict are missing here.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        # The title is taken straight from the URL path.
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL appears percent-encoded in the player config.
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'upload_date': upload_date,
                'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        # NOTE(review): truncated view — the "if mobj is None:" and
        # "if result is None:" guards and part of the info dict are missing.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page; the real stream URL only appears there.
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'title': video_title,
                'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes; walks the play/next API
    until the last track is reached."""
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        # NOTE(review): truncated view — the "if mobj is None:" guard, the
        # "mix_id = ..." assignment used below, the per-track info-dict
        # opener and the loop's break are missing here.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded in the page as "PAGE.mix = {...};".
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # Random session id for the play API.
        session = str(random.randint(0, 1000000000))
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url

        # Fetch one track per request until at_last_track is reported.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com; video and thumbnail URLs are
    built from fixed CDN paths."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        # NOTE(review): truncated view — the info-dict opener and some
        # continuation lines are missing here.
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
class TEDIE(InfoExtractor):
    """Information extractor for www.ted.com — handles both single talks and
    playlists (playlists are expanded into per-talk url_results)."""
    # NOTE(review): truncated view — the closing of the triple-quoted
    # _VALID_URL, the @classmethod decorator on suitable(), the else branch
    # of _real_extract, the video_RE opener in _playlist_videos_info and the
    # info-dict opener in _talk_info are missing below.
    _VALID_URL=r'''http://www\.ted\.com/
        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
        ((?P<type_talk>talks)) # We have a simple talk
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>\w+) # Here goes the name and then ".html"

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is written in (?x) style.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
            # Playlist branch.
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
            <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
            ([.\s]*?)data-playlist_item_id="(\d+)"
            ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
            webpage, 'playlist title')

        # Each playlist entry is delegated back to this IE as a url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
            webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
            webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
            webpage, 'thumbnail')
            # Last htmlStream is picked as the stream URL.
            'url': info['htmlStreams'][-1]['file'],
            'thumbnail': thumbnail,
            'description': desc,
class MySpassIE(InfoExtractor):
    """Information extractor for www.myspass.de; all metadata comes from the
    site's XML metadata endpoint."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        # NOTE(review): truncated view — the guard that retries the parent
        # path, the else-branches of the format/description/thumbnail
        # fallbacks and the info-dict opener are missing here.
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_el = None  # NOTE(review): placeholder comment removed; see note above
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
            'thumbnail': thumbnail,
            'description': description
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos; stream variants are
    listed in a per-video XML file, the last entry being used."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        # NOTE(review): truncated view — a continuation line of the title
        # regex call and the info-dict opener are missing here.
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last <type> element in the document is used as the variant.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
            'title': video_title,
            'duration': duration,
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        # NOTE(review): truncated view — the "if mobj is None:" guard and
        # the info-dict opener are missing here.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # Stream URL is set in the player config as file: "...".
        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Strip the site prefix from the og:title.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek; picks the default media
    type at the highest listed quality and handles both RTMP and HTTP."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # NOTE(review): truncated view — the branches choosing between the
        # documentId match and the URL group, the empty-streams guard, and
        # the final return are missing here.
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer an explicit documentId query parameter when present.
        numid = re.search(r'documentId=([0-9]+)', url)
            video_id = numid.group(1)
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
            # No streams + an "fsk" marker means the video is age-restricted.
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
            key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
class ZDFIE(InfoExtractor):
    """Information extractor for the ZDF Mediathek; resolves an mms:// or
    rtsp:// stream URL via an intermediate "play" page."""
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        # NOTE(review): truncated view — "if mobj is None:" guards, the
        # empty-streams check, loop break/assignment lines in the quality
        # selection, and the tail of the returned dict are missing here.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        for s in streams: #find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
        for s in streams: #find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
            raise ExtractorError(u'No stream found.')

        # The chosen link points at a playlist page; fetch it to get the
        # actual mms/rtsp URL.
        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        mobj = re.search(self._MMS_STREAM, media_link)
            # Fall back to rtsp when no mms URL is present.
            mobj = re.search(self._RTSP_STREAM, media_link)
            raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        # Derive the extension from the stream URL's final suffix.
        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
            raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        return [{'id': video_id,
# Information Extractor for video posts on Tumblr blogs.
4087 class TumblrIE(InfoExtractor):
4088 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4090 def _real_extract(self, url):
4091 m_url = re.match(self._VALID_URL, url)
4092 video_id = m_url.group('id')
4093 blog = m_url.group('blog_name')
# Normalize to the canonical post URL before downloading the page.
4095 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4096 webpage = self._download_webpage(url, video_id)
# The video URL/ext sit inside escaped (\x22-quoted) inline javascript.
4098 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4099 video = re.search(re_video, webpage)
4101 raise ExtractorError(u'Unable to extract video')
4102 video_url = video.group('video_url')
4103 ext = video.group('ext')
4105 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
4106 webpage, u'thumbnail', fatal=False) # We pick the first poster
# Strip the javascript escaping backslashes from the thumbnail URL.
4107 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
4109 # The only place where you can get a title, it's not complete,
4110 # but searching in other places doesn't work for all videos
4111 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
4112 webpage, u'title', flags=re.DOTALL)
4114 return [{'id': video_id,
4116 'title': video_title,
4117 'thumbnail': video_thumbnail,
# Information Extractor for free track downloads on bandcamp.com.
4121 class BandcampIE(InfoExtractor):
4122 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4124 def _real_extract(self, url):
4125 mobj = re.match(self._VALID_URL, url)
4126 title = mobj.group('title')
4127 webpage = self._download_webpage(url, title)
4128 # We get the link to the free download page
4129 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4130 if m_download is None:
4131 raise ExtractorError(u'No free songs found')
4133 download_link = m_download.group(1)
# NOTE(review): `id` shadows the builtin of the same name; rename when the
# full block can be safely edited.
4134 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4135 webpage, re.MULTILINE|re.DOTALL).group('id')
4137 download_webpage = self._download_webpage(download_link, id,
4138 'Downloading free downloads page')
4139 # We get the dictionary of the track from some javascrip code
4140 info = re.search(r'items: (.*?),$',
4141 download_webpage, re.MULTILINE).group(1)
4142 info = json.loads(info)[0]
4143 # We pick mp3-320 for now, until format selection can be easily implemented.
4144 mp3_info = info[u'downloads'][u'mp3-320']
4145 # If we try to use this url it says the link has expired
4146 initial_url = mp3_info[u'url']
4147 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4148 m_url = re.match(re_url, initial_url)
4149 #We build the url we will use to get the final track url
4150 # This url is build in Bandcamp in the script download_bunde_*.js
4151 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4152 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4153 # If we could correctly generate the .rand field the url would be
4154 #in the "download_url" key
4155 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4157 track_info = {'id':id,
4158 'title' : info[u'title'],
4161 'thumbnail' : info[u'thumb_url'],
4162 'uploader' : info[u'artist']
4167 class RedTubeIE(InfoExtractor):
4168 """Information Extractor for redtube"""
4169 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4171 def _real_extract(self,url):
4172 mobj = re.match(self._VALID_URL, url)
# (elided guard) raises when the URL does not match _VALID_URL.
4174 raise ExtractorError(u'Invalid URL: %s' % url)
4176 video_id = mobj.group('id')
# RedTube serves plain mp4 files via an HTML5 <source> tag.
4177 video_extension = 'mp4'
4178 webpage = self._download_webpage(url, video_id)
4180 self.report_extraction(video_id)
4182 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
4183 webpage, u'video URL')
4185 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
4191 'ext': video_extension,
4192 'title': video_title,
4195 class InaIE(InfoExtractor):
4196 """Information Extractor for Ina.fr"""
4197 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4199 def _real_extract(self,url):
4200 mobj = re.match(self._VALID_URL, url)
4202 video_id = mobj.group('id')
# Metadata (URL + title) comes from the player's MRSS feed, not the page.
4203 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4204 video_extension = 'mp4'
4205 webpage = self._download_webpage(mrss_url, video_id)
4207 self.report_extraction(video_id)
4209 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
4210 webpage, u'video URL')
# Title is wrapped in a CDATA section inside the feed's <title> element.
4212 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
4218 'ext': video_extension,
4219 'title': video_title,
4222 class HowcastIE(InfoExtractor):
4223 """Information Extractor for Howcast.com"""
4224 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
4226 def _real_extract(self, url):
4227 mobj = re.match(self._VALID_URL, url)
4229 video_id = mobj.group('id')
# Normalize to the canonical video page URL before downloading.
4230 webpage_url = 'http://www.howcast.com/videos/' + video_id
4231 webpage = self._download_webpage(webpage_url, video_id)
4233 self.report_extraction(video_id)
# The mobile mp4 URL is embedded in the player's javascript config.
4235 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
4236 webpage, u'video URL')
# og:title / description / og:image meta tags supply the metadata.
4238 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
4241 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
4242 webpage, u'description', fatal=False)
4244 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
4245 webpage, u'thumbnail', fatal=False)
4251 'title': video_title,
4252 'description': video_description,
4253 'thumbnail': thumbnail,
4256 class VineIE(InfoExtractor):
4257 """Information Extractor for Vine.co"""
4258 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
4260 def _real_extract(self, url):
4261 mobj = re.match(self._VALID_URL, url)
4263 video_id = mobj.group('id')
# Normalize to the canonical https page URL before downloading.
4264 webpage_url = 'https://vine.co/v/' + video_id
4265 webpage = self._download_webpage(webpage_url, video_id)
4267 self.report_extraction(video_id)
# The raw stream URL is exposed via a twitter:player meta tag.
4269 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
4270 webpage, u'video URL')
4272 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# The og:image URL may carry a query string; it is not captured.
4275 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
4276 webpage, u'thumbnail', fatal=False)
4278 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
4279 webpage, u'uploader', fatal=False, flags=re.DOTALL)
4285 'title': video_title,
4286 'thumbnail': thumbnail,
4287 'uploader': uploader,
4290 class FlickrIE(InfoExtractor):
4291 """Information Extractor for Flickr videos"""
4292 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
4294 def _real_extract(self, url):
4295 mobj = re.match(self._VALID_URL, url)
4297 video_id = mobj.group('id')
4298 video_uploader_id = mobj.group('uploader_id')
4299 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
4300 webpage = self._download_webpage(webpage_url, video_id)
# The per-photo secret is required by the two XML playlist endpoints below.
4302 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
4304 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
4305 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
# The first XML yields a node id used to request the actual playlist.
4307 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
4308 first_xml, u'node_id')
4310 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
4311 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
4313 self.report_extraction(video_id)
# Final URL = app base + (HTML-escaped) full path from the STREAM element.
4315 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
4317 raise ExtractorError(u'Unable to extract video url')
4318 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
4320 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
4321 webpage, u'video title')
4323 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
4324 webpage, u'description', fatal=False)
4326 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
4327 webpage, u'thumbnail', fatal=False)
4333 'title': video_title,
4334 'description': video_description,
4335 'thumbnail': thumbnail,
4336 'uploader_id': video_uploader_id,
# Information Extractor for teamcoco.com videos.
4339 class TeamcocoIE(InfoExtractor):
4340 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
4342 def _real_extract(self, url):
4343 mobj = re.match(self._VALID_URL, url)
4345 raise ExtractorError(u'Invalid URL: %s' % url)
# The URL only has a slug; the numeric id is scraped from the page.
4346 url_title = mobj.group('url_title')
4347 webpage = self._download_webpage(url, url_title)
4349 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
4350 webpage, u'video id')
4352 self.report_extraction(video_id)
4354 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
4357 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
4358 webpage, u'thumbnail', fatal=False)
4360 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
4361 webpage, u'description', fatal=False)
# The actual media URL lives in a per-video XML manifest ("high" quality).
4363 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
4364 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
4366 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
4373 'title': video_title,
4374 'thumbnail': thumbnail,
4375 'description': video_description,
4378 class XHamsterIE(InfoExtractor):
4379 """Information Extractor for xHamster"""
4380 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
4382 def _real_extract(self,url):
4383 mobj = re.match(self._VALID_URL, url)
4385 video_id = mobj.group('id')
# Normalize to a canonical movie URL before downloading.
4386 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
4387 webpage = self._download_webpage(mrss_url, video_id)
# Player config: 'srv' (server, may be empty) and 'file' (path or full URL).
4389 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
4391 raise ExtractorError(u'Unable to extract media URL')
4392 if len(mobj.group('server')) == 0:
# Empty server -> 'file' is already a (percent-encoded) absolute URL.
4393 video_url = compat_urllib_parse.unquote(mobj.group('file'))
4395 video_url = mobj.group('server')+'/key='+mobj.group('file')
4396 video_extension = video_url.split('.')[-1]
4398 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
4401 # Can't see the description anywhere in the UI
4402 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
4403 # webpage, u'description', fatal=False)
4404 # if video_description: video_description = unescapeHTML(video_description)
# Upload date is scraped from a tooltip hint and packed as YYYYMMDD.
4406 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
4408 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
4410 video_upload_date = None
4411 self._downloader.report_warning(u'Unable to extract upload date')
4413 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
4414 webpage, u'uploader id', default=u'anonymous')
4416 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
4417 webpage, u'thumbnail', fatal=False)
4422 'ext': video_extension,
4423 'title': video_title,
4424 # 'description': video_description,
4425 'upload_date': video_upload_date,
4426 'uploader_id': video_uploader_id,
4427 'thumbnail': video_thumbnail
4430 class HypemIE(InfoExtractor):
4431 """Information Extractor for hypem"""
4432 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
4434 def _real_extract(self, url):
4435 mobj = re.match(self._VALID_URL, url)
4437 raise ExtractorError(u'Invalid URL: %s' % url)
4438 track_id = mobj.group(1)
# A timestamped request is needed; the Set-Cookie header from this first
# response must be replayed on the /serve/source request below.
4440 data = { 'ax': 1, 'ts': time.time() }
4441 data_encoded = compat_urllib_parse.urlencode(data)
4442 complete_url = url + "?" + data_encoded
4443 request = compat_urllib_request.Request(complete_url)
4444 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
4445 cookie = urlh.headers.get('Set-Cookie', '')
4447 self.report_extraction(track_id)
# Track metadata is embedded as JSON in a <script> tag.
4449 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
4450 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
4452 track_list = json.loads(html_tracks)
4453 track = track_list[u'tracks'][0]
4455 raise ExtractorError(u'Hypemachine contained invalid JSON.')
4458 track_id = track[u"id"]
4459 artist = track[u"artist"]
4460 title = track[u"song"]
# NOTE(review): `key` is read here but assigned on an elided line
# (presumably key = track[u"key"]) — confirm against the full source.
4462 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
4463 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
4464 request.add_header('cookie', cookie)
4465 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
4467 song_data = json.loads(song_data_json)
4469 raise ExtractorError(u'Hypemachine contained invalid JSON.')
4470 final_url = song_data[u"url"]
4480 class Vbox7IE(InfoExtractor):
4481 """Information Extractor for Vbox7"""
4482 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
4484 def _real_extract(self,url):
4485 mobj = re.match(self._VALID_URL, url)
4487 raise ExtractorError(u'Invalid URL: %s' % url)
4488 video_id = mobj.group(1)
# The play page is a javascript redirect; follow it manually.
4490 redirect_page, urlh = self._download_webpage_handle(url, video_id)
4491 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
4492 redirect_url = urlh.geturl() + new_location
4493 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
# <title> is "<video title>/<extra>"; keep only the part before the slash.
4495 title = self._html_search_regex(r'<title>(.*)</title>',
4496 webpage, u'title').split('/')[0].strip()
# The media and thumbnail URLs come from a POST to the player endpoint,
# returned as a &-separated key=value string.
4499 info_url = "http://vbox7.com/play/magare.do"
4500 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
4501 info_request = compat_urllib_request.Request(info_url, data)
4502 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
4503 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
4504 if info_response is None:
4505 raise ExtractorError(u'Unable to extract the media url')
4506 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
4513 'thumbnail': thumbnail_url,
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com.

    Finds the MTV-style "mgid" content id on the page, then queries the
    site's mrss feed for metadata and the mediagen feed for the stream
    URLs, returning the highest-quality stream.
    """
    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # Full episodes embed the mgid differently from videos/reviews.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        # Title/description/thumbnail all live in the mrss feed's CDATA blocks.
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <url>(?P<thumb>.*?)</url>.*
                      '''
        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        # finditer() never returns None, so only emptiness needs checking.
        # BUG FIX: previously raised `ExtractError` (a NameError) with a
        # misspelled message instead of `ExtractorError`.
        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        if not m_urls:
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url': video_url,
                'id': video_id,
                'title': video_title,
                # Videos are actually flv not mp4
                'ext': 'flv',
                'thumbnail': video_thumb,
                'description': video_description,
                }
# Factory for the ordered list of all extractor instances; the downloader
# tries them in this order and the first whose suitable() matches wins.
4567 def gen_extractors():
4568 """ Return a list of an instance of every supported extractor.
4569 The order does matter; the first extractor matched is the one handling the URL.
# (elided) the rest of the docstring and most of the literal list of
# extractor instances (original lines 4570-4635) are missing from this view;
# only three representative entries remain below.
4572 YoutubePlaylistIE(),
4597 StanfordOpenClassroomIE(),
4607 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up and return the extractor class named ``<ie_name>IE``.

    Raises KeyError if no extractor with that name exists in this module.
    """
    class_name = ie_name + 'IE'
    return globals()[class_name]