2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information about the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title, author and
32 others. The information is stored in a dictionary which is then
33 passed to the FileDownloader. The FileDownloader processes this
34 information possibly downloading the video to the file system, among
35 other possible outcomes.
37 The dictionaries must include the following fields:
41 title: Video title, unescaped.
42 ext: Video filename extension.
44 The following fields are optional:
46 format: The video format, defaults to ext (used for --get-format)
47 thumbnail: Full URL to a video thumbnail image.
48 description: One-line video description.
49 uploader: Full name of the video uploader.
50 upload_date: Video upload date (YYYYMMDD).
51 uploader_id: Nickname or id of the video uploader.
52 location: Physical location of the video.
53 player_url: SWF Player URL (used for rtmpdump).
54 subtitles: The subtitle file contents.
55 urlhandle: [internal] The urlHandle to be used to download the file,
56 like returned by urllib.request.urlopen
58 The fields should all be Unicode strings.
60 Subclasses of this one should re-define the _real_initialize() and
61 _real_extract() methods and define a _VALID_URL regexp.
62 Probably, they should also be added to the list of extractors.
64 _real_extract() must return a *list* of information dictionaries as
67 Finally, the _WORKING attribute should be set to False for broken IEs
68 in order to warn the users and skip the tests.
75 def __init__(self, downloader=None):
76 """Constructor. Receives an optional downloader."""
78 self.set_downloader(downloader)
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # A URL is handled by this IE exactly when it matches _VALID_URL.
    matched = re.match(cls._VALID_URL, url)
    return matched is not None
87 """Getter method for _WORKING."""
91 """Initializes an instance (authentication, etc)."""
93 self._real_initialize()
96 def extract(self, url):
97 """Extracts URL information and returns it in list of dicts."""
99 return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE.

    `downloader` is the FileDownloader instance (or None) that will
    receive the extracted information.
    """
    self._downloader = downloader
105 def _real_initialize(self):
106 """Real initialization process. Redefine in subclasses."""
109 def _real_extract(self, url):
110 """Real extraction process. Redefine in subclasses."""
115 return type(self).__name__[:-2]
117 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
118 """ Returns the response handle """
120 self.report_download_webpage(video_id)
121 elif note is not False:
122 self.to_screen(u'%s: %s' % (video_id, note))
124 return compat_urllib_request.urlopen(url_or_request)
125 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
127 errnote = u'Unable to download webpage'
128 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
130 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
131 """ Returns a tuple (page content as string, URL handle) """
132 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
133 content_type = urlh.headers.get('Content-Type', '')
134 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
136 encoding = m.group(1)
139 webpage_bytes = urlh.read()
140 if self._downloader.params.get('dump_intermediate_pages', False):
142 url = url_or_request.get_full_url()
143 except AttributeError:
145 self.to_screen(u'Dumping request to ' + url)
146 dump = base64.b64encode(webpage_bytes).decode('ascii')
147 self._downloader.to_screen(dump)
148 content = webpage_bytes.decode(encoding, 'replace')
149 return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    prefixed = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Report information extraction."""
    message = u'%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Report webpage download."""
    message = u'%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    message = u'Confirming age'
    self.to_screen(message)
# Methods for following issue #608.
# They set the correct value of the '_type' key.
173 def video_result(self, video_info):
174 """Returns a video"""
175 video_info['_type'] = 'video'
177 def url_result(self, url, ie=None):
178 """Returns a url that points to a page that should be processed"""
179 #TODO: ie should be the class used for getting the info
180 video_info = {'_type': 'url',
184 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
185 """Returns a playlist"""
186 video_info = {'_type': 'playlist',
189 video_info['id'] = playlist_id
191 video_info['title'] = playlist_title
194 def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
196 Perform a regex search on the given string, using a single or a list of
197 patterns returning the first matching group.
198 In case of failure return a default value or raise a WARNING or a
199 ExtractorError, depending on fatal, specifying the field name.
201 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
202 mobj = re.search(pattern, string, flags)
205 mobj = re.search(p, string, flags)
208 if sys.stderr.isatty() and os.name != 'nt':
209 _name = u'\033[0;34m%s\033[0m' % name
214 # return the first matching group
215 return next(g for g in mobj.groups() if g is not None)
216 elif default is not None:
219 raise ExtractorError(u'Unable to extract %s' % _name)
221 self._downloader.report_warning(u'unable to extract %s; '
222 u'please report this issue on GitHub.' % _name)
225 class SearchInfoExtractor(InfoExtractor):
227 Base class for paged search queries extractors.
228 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
229 Instances should define _SEARCH_KEY and _MAX_RESULTS.
233 def _make_valid_url(cls):
234 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
def suitable(cls, url):
    """Return True when *url* matches this search extractor's query syntax."""
    pattern = cls._make_valid_url()
    return re.match(pattern, url) is not None
240 def _real_extract(self, query):
241 mobj = re.match(self._make_valid_url(), query)
243 raise ExtractorError(u'Invalid search query "%s"' % query)
245 prefix = mobj.group('prefix')
246 query = mobj.group('query')
248 return self._get_n_results(query, 1)
249 elif prefix == 'all':
250 return self._get_n_results(query, self._MAX_RESULTS)
254 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
255 elif n > self._MAX_RESULTS:
256 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
257 n = self._MAX_RESULTS
258 return self._get_n_results(query, n)
260 def _get_n_results(self, query, n):
261 """Get a specified number of results for a query"""
262 raise NotImplementedError("This method must be implemented by sublclasses")
265 class YoutubeIE(InfoExtractor):
266 """Information extractor for youtube.com."""
270 (?:https?://)? # http(s):// (optional)
271 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
272 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
273 (?:.*?\#/)? # handle anchor (#/) redirect urls
274 (?: # the various things that can precede the ID:
275 (?:(?:v|embed|e)/) # v/ or embed/ or e/
276 |(?: # or the v= param in all its forms
277 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
278 (?:\?|\#!?) # the params delimiter ? or # or #!
279 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
282 )? # optional -> youtube.com/xxxx is OK
283 )? # all until now is optional -> you can pass the naked ID
284 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
285 (?(1).+)? # if we found the ID, everything can follow
287 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
288 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
289 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
290 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
291 _NETRC_MACHINE = 'youtube'
292 # Listed in order of quality
293 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
294 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
295 _video_extensions = {
301 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
307 _video_dimensions = {
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Playlist URLs also match _VALID_URL; defer those to YoutubePlaylistIE.
    if YoutubePlaylistIE.suitable(url):
        return False
    return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def report_lang(self):
    """Report attempt to set language."""
    message = u'Setting language'
    self.to_screen(message)
def report_login(self):
    """Report attempt to log in."""
    message = u'Logging in'
    self.to_screen(message)
def report_video_webpage_download(self, video_id):
    """Report attempt to download video webpage."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_video_subtitles_download(self, video_id):
    """Report attempt to check which subtitles are available.

    (Docstring fixed: the old one was copy-pasted from the video info
    webpage download reporter and described the wrong action.)
    """
    self.to_screen(u'%s: Checking available subtitles' % video_id)
def report_video_subtitles_request(self, video_id, sub_lang, format):
    """Report attempt to download subtitles for one language/format.

    (Docstring fixed: the old one was copy-pasted from the video info
    webpage download reporter.)  The `format` parameter name shadows the
    builtin but is kept for keyword-call compatibility.
    """
    self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
def report_video_subtitles_available(self, video_id, sub_lang_list):
    """Report available subtitles.

    `sub_lang_list` is a dict mapping language codes to names; only the
    codes are shown.
    """
    # str.join accepts the dict view directly; the old list() wrapper was redundant.
    sub_lang = ",".join(sub_lang_list.keys())
    self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available for this video."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
372 def _get_available_subtitles(self, video_id):
373 self.report_video_subtitles_download(video_id)
374 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
376 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
377 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
378 return (u'unable to download video subtitles: %s' % compat_str(err), None)
379 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
380 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
381 if not sub_lang_list:
382 return (u'video doesn\'t have subtitles', None)
385 def _list_available_subtitles(self, video_id):
386 sub_lang_list = self._get_available_subtitles(video_id)
387 self.report_video_subtitles_available(video_id, sub_lang_list)
389 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
392 (error_message, sub_lang, sub)
394 self.report_video_subtitles_request(video_id, sub_lang, format)
395 params = compat_urllib_parse.urlencode({
401 url = 'http://www.youtube.com/api/timedtext?' + params
403 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
404 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
405 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
407 return (u'Did not fetch video subtitles', None, None)
408 return (None, sub_lang, sub)
410 def _request_automatic_caption(self, video_id, webpage):
411 """We need the webpage for getting the captions url, pass it as an
412 argument to speed up the process."""
413 sub_lang = self._downloader.params.get('subtitleslang')
414 sub_format = self._downloader.params.get('subtitlesformat')
415 self.to_screen(u'%s: Looking for automatic captions' % video_id)
416 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
417 err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
419 return [(err_msg, None, None)]
420 player_config = json.loads(mobj.group(1))
422 args = player_config[u'args']
423 caption_url = args[u'ttsurl']
424 timestamp = args[u'timestamp']
425 params = compat_urllib_parse.urlencode({
432 subtitles_url = caption_url + '&' + params
433 sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
434 return [(None, sub_lang, sub)]
436 return [(err_msg, None, None)]
438 def _extract_subtitle(self, video_id):
440 Return a list with a tuple:
441 [(error_message, sub_lang, sub)]
443 sub_lang_list = self._get_available_subtitles(video_id)
444 sub_format = self._downloader.params.get('subtitlesformat')
445 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
446 return [(sub_lang_list[0], None, None)]
447 if self._downloader.params.get('subtitleslang', False):
448 sub_lang = self._downloader.params.get('subtitleslang')
449 elif 'en' in sub_lang_list:
452 sub_lang = list(sub_lang_list.keys())[0]
453 if not sub_lang in sub_lang_list:
454 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
456 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
459 def _extract_all_subtitles(self, video_id):
460 sub_lang_list = self._get_available_subtitles(video_id)
461 sub_format = self._downloader.params.get('subtitlesformat')
462 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
463 return [(sub_lang_list[0], None, None)]
465 for sub_lang in sub_lang_list:
466 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
467 subtitles.append(subtitle)
470 def _print_formats(self, formats):
471 print('Available formats:')
473 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
475 def _real_initialize(self):
476 if self._downloader is None:
481 downloader_params = self._downloader.params
483 # Attempt to use provided username and password or .netrc data
484 if downloader_params.get('username', None) is not None:
485 username = downloader_params['username']
486 password = downloader_params['password']
487 elif downloader_params.get('usenetrc', False):
489 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
494 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
495 except (IOError, netrc.NetrcParseError) as err:
496 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
500 request = compat_urllib_request.Request(self._LANG_URL)
503 compat_urllib_request.urlopen(request).read()
504 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
505 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
508 # No authentication to be performed
512 request = compat_urllib_request.Request(self._LOGIN_URL)
514 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
515 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
516 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
521 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
523 galx = match.group(1)
525 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
531 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
535 u'PersistentCookie': u'yes',
537 u'bgresponse': u'js_disabled',
538 u'checkConnection': u'',
539 u'checkedDomains': u'youtube',
545 u'signIn': u'Sign in',
547 u'service': u'youtube',
551 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
553 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
554 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
555 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
558 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
559 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
560 self._downloader.report_warning(u'unable to log in: bad username or password')
562 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
563 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
569 'action_confirm': 'Confirm',
571 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
573 self.report_age_confirmation()
574 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
575 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
576 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
578 def _extract_id(self, url):
579 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
581 raise ExtractorError(u'Invalid URL: %s' % url)
582 video_id = mobj.group(2)
585 def _real_extract(self, url):
586 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
587 mobj = re.search(self._NEXT_URL_RE, url)
589 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
590 video_id = self._extract_id(url)
593 self.report_video_webpage_download(video_id)
594 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
595 request = compat_urllib_request.Request(url)
597 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
598 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
599 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
601 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
603 # Attempt to extract SWF player URL
604 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
606 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
611 self.report_video_info_webpage_download(video_id)
612 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
613 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
614 % (video_id, el_type))
615 video_info_webpage = self._download_webpage(video_info_url, video_id,
617 errnote='unable to download video info webpage')
618 video_info = compat_parse_qs(video_info_webpage)
619 if 'token' in video_info:
621 if 'token' not in video_info:
622 if 'reason' in video_info:
623 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
625 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
627 # Check for "rental" videos
628 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
629 raise ExtractorError(u'"rental" videos not supported')
631 # Start extracting information
632 self.report_information_extraction(video_id)
635 if 'author' not in video_info:
636 raise ExtractorError(u'Unable to extract uploader name')
637 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
640 video_uploader_id = None
641 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
643 video_uploader_id = mobj.group(1)
645 self._downloader.report_warning(u'unable to extract uploader nickname')
648 if 'title' not in video_info:
649 raise ExtractorError(u'Unable to extract video title')
650 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
653 if 'thumbnail_url' not in video_info:
654 self._downloader.report_warning(u'unable to extract video thumbnail')
656 else: # don't panic if we can't find it
657 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
661 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
663 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
664 upload_date = unified_strdate(upload_date)
667 video_description = get_element_by_id("eow-description", video_webpage)
668 if video_description:
669 video_description = clean_html(video_description)
671 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
673 video_description = unescapeHTML(fd_mobj.group(1))
675 video_description = u''
678 video_subtitles = None
680 if self._downloader.params.get('writesubtitles', False):
681 video_subtitles = self._extract_subtitle(video_id)
683 (sub_error, sub_lang, sub) = video_subtitles[0]
685 # We try with the automatic captions
686 video_subtitles = self._request_automatic_caption(video_id, video_webpage)
687 (sub_error_auto, sub_lang, sub) = video_subtitles[0]
691 # We report the original error
692 self._downloader.report_error(sub_error)
694 if self._downloader.params.get('allsubtitles', False):
695 video_subtitles = self._extract_all_subtitles(video_id)
696 for video_subtitle in video_subtitles:
697 (sub_error, sub_lang, sub) = video_subtitle
699 self._downloader.report_error(sub_error)
701 if self._downloader.params.get('listsubtitles', False):
702 sub_lang_list = self._list_available_subtitles(video_id)
705 if 'length_seconds' not in video_info:
706 self._downloader.report_warning(u'unable to extract video duration')
709 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
712 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
714 # Decide which formats to download
715 req_format = self._downloader.params.get('format', None)
717 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
718 self.report_rtmp_download()
719 video_url_list = [(None, video_info['conn'][0])]
720 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
722 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
723 url_data = compat_parse_qs(url_data_str)
724 if 'itag' in url_data and 'url' in url_data:
725 url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
726 if not 'ratebypass' in url: url += '&ratebypass=yes'
727 url_map[url_data['itag'][0]] = url
729 format_limit = self._downloader.params.get('format_limit', None)
730 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
731 if format_limit is not None and format_limit in available_formats:
732 format_list = available_formats[available_formats.index(format_limit):]
734 format_list = available_formats
735 existing_formats = [x for x in format_list if x in url_map]
736 if len(existing_formats) == 0:
737 raise ExtractorError(u'no known formats available for video')
738 if self._downloader.params.get('listformats', None):
739 self._print_formats(existing_formats)
741 if req_format is None or req_format == 'best':
742 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
743 elif req_format == 'worst':
744 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
745 elif req_format in ('-1', 'all'):
746 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
748 # Specific formats. We pick the first in a slash-delimeted sequence.
749 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
750 req_formats = req_format.split('/')
751 video_url_list = None
752 for rf in req_formats:
754 video_url_list = [(rf, url_map[rf])]
756 if video_url_list is None:
757 raise ExtractorError(u'requested format not available')
759 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
762 for format_param, video_real_url in video_url_list:
764 video_extension = self._video_extensions.get(format_param, 'flv')
766 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
767 self._video_dimensions.get(format_param, '???'))
771 'url': video_real_url,
772 'uploader': video_uploader,
773 'uploader_id': video_uploader_id,
774 'upload_date': upload_date,
775 'title': video_title,
776 'ext': video_extension,
777 'format': video_format,
778 'thumbnail': video_thumbnail,
779 'description': video_description,
780 'player_url': player_url,
781 'subtitles': video_subtitles,
782 'duration': video_duration
787 class MetacafeIE(InfoExtractor):
788 """Information Extractor for metacafe.com."""
790 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
791 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
792 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
793 IE_NAME = u'metacafe'
def report_disclaimer(self):
    """Report disclaimer retrieval."""
    message = u'Retrieving disclaimer'
    self.to_screen(message)
799 def _real_initialize(self):
800 # Retrieve disclaimer
801 request = compat_urllib_request.Request(self._DISCLAIMER)
803 self.report_disclaimer()
804 disclaimer = compat_urllib_request.urlopen(request).read()
805 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
806 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
811 'submit': "Continue - I'm over 18",
813 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
815 self.report_age_confirmation()
816 disclaimer = compat_urllib_request.urlopen(request).read()
817 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
818 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
820 def _real_extract(self, url):
821 # Extract id and simplified title from URL
822 mobj = re.match(self._VALID_URL, url)
824 raise ExtractorError(u'Invalid URL: %s' % url)
826 video_id = mobj.group(1)
828 # Check if video comes from YouTube
829 mobj2 = re.match(r'^yt-(.*)$', video_id)
830 if mobj2 is not None:
831 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
833 # Retrieve video webpage to extract further information
834 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
836 # Extract URL, uploader and title from webpage
837 self.report_extraction(video_id)
838 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
840 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
841 video_extension = mediaURL[-3:]
843 # Extract gdaKey if available
844 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
848 gdaKey = mobj.group(1)
849 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
851 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
853 raise ExtractorError(u'Unable to extract media URL')
854 vardict = compat_parse_qs(mobj.group(1))
855 if 'mediaData' not in vardict:
856 raise ExtractorError(u'Unable to extract media URL')
857 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
859 raise ExtractorError(u'Unable to extract media URL')
860 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
861 video_extension = mediaURL[-3:]
862 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
864 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
866 raise ExtractorError(u'Unable to extract title')
867 video_title = mobj.group(1).decode('utf-8')
869 mobj = re.search(r'submitter=(.*?);', webpage)
871 raise ExtractorError(u'Unable to extract uploader nickname')
872 video_uploader = mobj.group(1)
875 'id': video_id.decode('utf-8'),
876 'url': video_url.decode('utf-8'),
877 'uploader': video_uploader.decode('utf-8'),
879 'title': video_title,
880 'ext': video_extension.decode('utf-8'),
883 class DailymotionIE(InfoExtractor):
884 """Information Extractor for Dailymotion"""
886 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
887 IE_NAME = u'dailymotion'
889 def _real_extract(self, url):
890 # Extract id and simplified title from URL
891 mobj = re.match(self._VALID_URL, url)
893 raise ExtractorError(u'Invalid URL: %s' % url)
895 video_id = mobj.group(1).split('_')[0].split('?')[0]
897 video_extension = 'mp4'
899 # Retrieve video webpage to extract further information
900 request = compat_urllib_request.Request(url)
901 request.add_header('Cookie', 'family_filter=off')
902 webpage = self._download_webpage(request, video_id)
904 # Extract URL, uploader and title from webpage
905 self.report_extraction(video_id)
906 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
908 raise ExtractorError(u'Unable to extract media URL')
909 flashvars = compat_urllib_parse.unquote(mobj.group(1))
911 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
914 self.to_screen(u'Using %s' % key)
917 raise ExtractorError(u'Unable to extract video URL')
919 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
921 raise ExtractorError(u'Unable to extract video URL')
923 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
925 # TODO: support choosing qualities
927 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
929 raise ExtractorError(u'Unable to extract title')
930 video_title = unescapeHTML(mobj.group('title'))
932 video_uploader = None
933 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
935 # lookin for official user
936 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
937 if mobj_official is None:
938 self._downloader.report_warning(u'unable to extract uploader nickname')
940 video_uploader = mobj_official.group(1)
942 video_uploader = mobj.group(1)
944 video_upload_date = None
945 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
947 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
952 'uploader': video_uploader,
953 'upload_date': video_upload_date,
954 'title': video_title,
955 'ext': video_extension,
959 class PhotobucketIE(InfoExtractor):
960 """Information extractor for photobucket.com."""
# NOTE(review): this excerpt is incomplete -- gaps in the embedded original
# line numbers (961, 971, 985, 987-988, 995-996, ...) show that guard lines
# such as "if mobj is None:" and the "return [...]" statements are not
# visible here. Bytes below are preserved as-is.
962 # TODO: the original _VALID_URL was:
963 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
964 # Check if it's necessary to keep the old extraction process
965 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
966 IE_NAME = u'photobucket'
968 def _real_extract(self, url):
# Strategy: take id/extension from the URL itself, then prefer a JSON blob
# embedded in the page's javascript; fall back to scraping <link>/<title>.
969 # Extract id from URL
970 mobj = re.match(self._VALID_URL, url)
972 raise ExtractorError(u'Invalid URL: %s' % url)
974 video_id = mobj.group('id')
976 video_extension = mobj.group('ext')
978 # Retrieve video webpage to extract further information
979 webpage = self._download_webpage(url, video_id)
981 # Extract URL, uploader, and title from webpage
982 self.report_extraction(video_id)
983 # We try first by looking at the javascript code:
984 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
986 info = json.loads(mobj.group('json'))
989 'url': info[u'downloadUrl'],
990 'uploader': info[u'username'],
# creationDate is presumably a unix timestamp -- TODO confirm; formatted YYYYMMDD.
991 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
992 'title': info[u'title'],
993 'ext': video_extension,
994 'thumbnail': info[u'thumbUrl'],
997 # We try looking in other parts of the webpage
998 video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
999 webpage, u'video URL')
1001 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1003 raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode('utf-8') is a Python-2 byte-string idiom; on
# Python 3 these .group() results are already str and decode() would fail.
1004 video_title = mobj.group(1).decode('utf-8')
1005 video_uploader = mobj.group(2).decode('utf-8')
1008 'id': video_id.decode('utf-8'),
1009 'url': video_url.decode('utf-8'),
1010 'uploader': video_uploader,
1011 'upload_date': None,
1012 'title': video_title,
1013 'ext': video_extension.decode('utf-8'),
1017 class YahooIE(InfoExtractor):
1018 """Information extractor for screen.yahoo.com."""
1019 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
1021 def _real_extract(self, url):
# Two extraction paths: if the page exposes a YUI CONTENT_ID, query the YQL
# media-streams API (JSON); otherwise fall back to the older
# cosmos.bcst.yahoo.com mrss feeds parsed with regexes.
# NOTE(review): excerpt is incomplete (gaps at original lines 1023, 1028-1029,
# 1037, 1040, 1054, 1056, 1067, 1072-1075, ...), so guards and some
# assignments are not visible here.
1022 mobj = re.match(self._VALID_URL, url)
1024 raise ExtractorError(u'Invalid URL: %s' % url)
1025 video_id = mobj.group('id')
1026 webpage = self._download_webpage(url, video_id)
1027 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1030 # TODO: Check which url parameters are required
1031 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1032 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
1033 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1034 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1035 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1036 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1038 self.report_extraction(video_id)
1039 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1041 raise ExtractorError(u'Unable to extract video info')
1042 video_title = m_info.group('title')
1043 video_description = m_info.group('description')
1044 video_thumb = m_info.group('thumb')
1045 video_date = m_info.group('date')
# Feed date arrives as MM/DD/YYYY; normalized to the YYYYMMDD convention.
1046 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1048 # TODO: Find a way to get mp4 videos
1049 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1050 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1051 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1052 video_url = m_rest.group('url')
1053 video_path = m_rest.group('path')
1055 raise ExtractorError(u'Unable to extract video url')
1057 else: # We have to use a different method if another id is defined
1058 long_id = m_id.group('new_id')
1059 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1060 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
1061 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1062 info = json.loads(json_str)
1063 res = info[u'query'][u'results'][u'mediaObj'][0]
1064 stream = res[u'streams'][0]
1065 video_path = stream[u'path']
1066 video_url = stream[u'host']
# NOTE(review): `meta` is not assigned in the visible lines -- presumably set
# on the missing line 1067 (likely from `res`); verify against the full file.
1068 video_title = meta[u'title']
1069 video_description = meta[u'description']
1070 video_thumb = meta[u'thumbnail']
1071 video_date = None # I can't find it
1076 'play_path': video_path,
1077 'title':video_title,
1078 'description': video_description,
1079 'thumbnail': video_thumb,
1080 'upload_date': video_date,
1085 class VimeoIE(InfoExtractor):
1086 """Information extractor for vimeo.com."""
# NOTE(review): excerpt is incomplete -- missing original lines (1095, 1097,
# 1114, 1117, 1120, 1122-1123, 1159, 1168-1169, 1174-1177, ...) held the
# guard clauses, the try/except around the config parse, the else branches
# and the "return [...]" -- bytes below are preserved as-is.
1088 # _VALID_URL matches Vimeo URLs
1089 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1092 def _real_extract(self, url, new_video=True):
1093 # Extract ID from URL
1094 mobj = re.match(self._VALID_URL, url)
1096 raise ExtractorError(u'Invalid URL: %s' % url)
1098 video_id = mobj.group('id')
# Normalize: force https scheme, and canonicalize player/pro URLs to the
# plain vimeo.com/<id> page which carries the config JSON.
1099 if not mobj.group('proto'):
1100 url = 'https://' + url
1101 if mobj.group('direct_link') or mobj.group('pro'):
1102 url = 'https://vimeo.com/' + video_id
1104 # Retrieve video webpage to extract further information
1105 request = compat_urllib_request.Request(url, None, std_headers)
1106 webpage = self._download_webpage(request, video_id)
1108 # Now we begin extracting as much information as we can from what we
1109 # retrieved. First we extract the information common to all extractors,
1110 # and later we extract those that are Vimeo specific.
1111 self.report_extraction(video_id)
1113 # Extract the config JSON
# The config object is cut out of the page's inline javascript by plain
# string splitting, then parsed as JSON.
1115 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1116 config = json.loads(config)
1118 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1119 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1121 raise ExtractorError(u'Unable to extract info section')
1124 video_title = config["video"]["title"]
1126 # Extract uploader and uploader_id
1127 video_uploader = config["video"]["owner"]["name"]
1128 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1130 # Extract video thumbnail
1131 video_thumbnail = config["video"]["thumbnail"]
1133 # Extract video description
1134 video_description = get_element_by_attribute("itemprop", "description", webpage)
1135 if video_description: video_description = clean_html(video_description)
1136 else: video_description = u''
1138 # Extract upload date
1139 video_upload_date = None
1140 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1141 if mobj is not None:
1142 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1144 # Vimeo specific: extract request signature and timestamp
1145 sig = config['request']['signature']
1146 timestamp = config['request']['timestamp']
1148 # Vimeo specific: extract video codec and quality information
1149 # First consider quality, then codecs, then take everything
1150 # TODO bind to format param
1151 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
# Bucket each available codec by best quality tier; first non-empty tier
# (hd > sd > other) wins below.
1152 files = { 'hd': [], 'sd': [], 'other': []}
1153 for codec_name, codec_extension in codecs:
1154 if codec_name in config["video"]["files"]:
1155 if 'hd' in config["video"]["files"][codec_name]:
1156 files['hd'].append((codec_name, codec_extension, 'hd'))
1157 elif 'sd' in config["video"]["files"][codec_name]:
1158 files['sd'].append((codec_name, codec_extension, 'sd'))
1160 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1162 for quality in ('hd', 'sd', 'other'):
1163 if len(files[quality]) > 0:
1164 video_quality = files[quality][0][2]
1165 video_codec = files[quality][0][0]
1166 video_extension = files[quality][0][1]
1167 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1170 raise ExtractorError(u'No known codec found')
1172 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1173 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1178 'uploader': video_uploader,
1179 'uploader_id': video_uploader_id,
1180 'upload_date': video_upload_date,
1181 'title': video_title,
1182 'ext': video_extension,
1183 'thumbnail': video_thumbnail,
1184 'description': video_description,
1188 class ArteTvIE(InfoExtractor):
1189 """arte.tv information extractor."""
# NOTE(review): excerpt is incomplete -- the try: headers, the dict
# initialisation of `info` in grep_webpage, the return statements and the
# second argument of the grep_webpage calls (the regexFlags / matchTuples
# list literals) fall on missing original lines; bytes below preserved as-is.
1191 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1192 _LIVE_URL = r'index-[0-9]+\.html$'
1194 IE_NAME = u'arte.tv'
1196 def fetch_webpage(self, url):
# Downloads url and returns the raw page body, wrapping network and URL
# errors into ExtractorError.
1197 request = compat_urllib_request.Request(url)
1199 self.report_download_webpage(url)
1200 webpage = compat_urllib_request.urlopen(request).read()
1201 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1202 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1203 except ValueError as err:
1204 raise ExtractorError(u'Invalid URL: %s' % url)
1207 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetches url, applies regex, and builds a dict {key: group(i)} from
# matchTuples = [(group_index, key, error_message), ...]; each missing
# group raises its own error message.
1208 page = self.fetch_webpage(url)
1209 mobj = re.search(regex, page, regexFlags)
1213 raise ExtractorError(u'Invalid URL: %s' % url)
1215 for (i, key, err) in matchTuples:
1216 if mobj.group(i) is None:
1217 raise ExtractorError(err)
1219 info[key] = mobj.group(i)
1223 def extractLiveStream(self, url):
# Live streams: locate the videothek javascript, then grep the rtmp
# path/player/url triple for the URL's language (fr/de).
1224 video_lang = url.split('/')[-4]
1225 info = self.grep_webpage(
1227 r'src="(.*?/videothek_js.*?\.js)',
1230 (1, 'url', u'Invalid URL: %s' % url)
1233 http_host = url.split('/')[2]
1234 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1235 info = self.grep_webpage(
1237 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1238 '(http://.*?\.swf).*?' +
1242 (1, 'path', u'could not extract video path: %s' % url),
1243 (2, 'player', u'could not extract video player: %s' % url),
1244 (3, 'url', u'could not extract video url: %s' % url)
1247 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1249 def extractPlus7Stream(self, url):
# Arte+7 (catch-up) videos: follow the chain
# page -> videorefFileUrl -> per-language <video ref=...> -> hd stream XML.
1250 video_lang = url.split('/')[-3]
1251 info = self.grep_webpage(
1253 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1256 (1, 'url', u'Invalid URL: %s' % url)
1259 next_url = compat_urllib_parse.unquote(info.get('url'))
1260 info = self.grep_webpage(
1262 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1265 (1, 'url', u'Could not find <video> tag: %s' % url)
1268 next_url = compat_urllib_parse.unquote(info.get('url'))
1270 info = self.grep_webpage(
1272 r'<video id="(.*?)".*?>.*?' +
1273 '<name>(.*?)</name>.*?' +
1274 '<dateVideo>(.*?)</dateVideo>.*?' +
1275 '<url quality="hd">(.*?)</url>',
1278 (1, 'id', u'could not extract video id: %s' % url),
1279 (2, 'title', u'could not extract video title: %s' % url),
1280 (3, 'date', u'could not extract video date: %s' % url),
1281 (4, 'url', u'could not extract video url: %s' % url)
1286 'id': info.get('id'),
1287 'url': compat_urllib_parse.unquote(info.get('url')),
1288 'uploader': u'arte.tv',
1289 'upload_date': unified_strdate(info.get('date')),
# NOTE(review): .decode('utf-8') is a Python-2 byte-string idiom and would
# fail on a Python-3 str.
1290 'title': info.get('title').decode('utf-8'),
1296 def _real_extract(self, url):
# Dispatch on URL shape: index-NNN.html pages are live streams, everything
# else is treated as an Arte+7 catch-up video.
1297 video_id = url.split('/')[-1]
1298 self.report_extraction(video_id)
1300 if re.search(self._LIVE_URL, video_id) is not None:
1301 self.extractLiveStream(url)
1304 info = self.extractPlus7Stream(url)
1309 class GenericIE(InfoExtractor):
1310 """Generic last-resort information extractor."""
# NOTE(review): excerpt is incomplete -- missing original lines carried the
# _VALID_URL/_WORKING attributes, "if mobj is None:" guards, try: headers,
# HeadRequest's return value, handler bodies and the final return; bytes
# below preserved as-is.
1313 IE_NAME = u'generic'
1315 def report_download_webpage(self, video_id):
1316 """Report webpage download."""
# Warn (outside tests) that the generic fallback is being used, then defer
# to the base-class reporting.
1317 if not self._downloader.params.get('test', False):
1318 self._downloader.report_warning(u'Falling back on generic information extractor.')
1319 super(GenericIE, self).report_download_webpage(video_id)
1321 def report_following_redirect(self, new_url):
1322 """Report information extraction."""
1323 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1325 def _test_redirect(self, url):
1326 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Resolves shortened URLs by issuing a HEAD request through a custom opener:
# redirects keep using HEAD, and a 405 response falls back to GET.
1327 class HeadRequest(compat_urllib_request.Request):
1328 def get_method(self):
1331 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1333 Subclass the HTTPRedirectHandler to make it use our
1334 HeadRequest also on the redirected URL
1336 def redirect_request(self, req, fp, code, msg, headers, newurl):
1337 if code in (301, 302, 303, 307):
1338 newurl = newurl.replace(' ', '%20')
# Strip body-describing headers before re-issuing the request.
1339 newheaders = dict((k,v) for k,v in req.headers.items()
1340 if k.lower() not in ("content-length", "content-type"))
1341 return HeadRequest(newurl,
1343 origin_req_host=req.get_origin_req_host(),
1346 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1348 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1350 Fallback to GET if HEAD is not allowed (405 HTTP error)
1352 def http_error_405(self, req, fp, code, msg, headers):
1356 newheaders = dict((k,v) for k,v in req.headers.items()
1357 if k.lower() not in ("content-length", "content-type"))
1358 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1360 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with exactly the handlers needed for the HEAD probe.
1364 opener = compat_urllib_request.OpenerDirector()
1365 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1366 HTTPMethodFallback, HEADRedirectHandler,
1367 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1368 opener.add_handler(handler())
1370 response = opener.open(HeadRequest(url))
1371 if response is None:
1372 raise ExtractorError(u'Invalid URL protocol')
1373 new_url = response.geturl()
1378 self.report_following_redirect(new_url)
1381 def _real_extract(self, url):
# Last-resort extraction: resolve redirects, then probe the page for a
# direct media URL with progressively looser regexes.
1382 new_url = self._test_redirect(url)
1383 if new_url: return [self.url_result(new_url)]
1385 video_id = url.split('/')[-1]
1387 webpage = self._download_webpage(url, video_id)
1388 except ValueError as err:
1389 # since this is the last-resort InfoExtractor, if
1390 # this error is thrown, it'll be thrown here
1391 raise ExtractorError(u'Invalid URL: %s' % url)
1393 self.report_extraction(video_id)
1394 # Start with something easy: JW Player in SWFObject
1395 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1397 # Broaden the search a little bit
1398 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1400 # Broaden the search a little bit: JWPlayer JS loader
1401 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1403 raise ExtractorError(u'Invalid URL: %s' % url)
1405 # It's possible that one of the regexes
1406 # matched, but returned an empty group:
1407 if mobj.group(1) is None:
1408 raise ExtractorError(u'Invalid URL: %s' % url)
1410 video_url = compat_urllib_parse.unquote(mobj.group(1))
1411 video_id = os.path.basename(video_url)
1413 # here's a fun little line of code for you:
1414 video_extension = os.path.splitext(video_id)[1][1:]
1415 video_id = os.path.splitext(video_id)[0]
1417 # it's tempting to parse this further, but you would
1418 # have to take into account all the variations like
1419 # Video Title - Site Name
1420 # Site Name | Video Title
1421 # Video Title - Tagline | Site Name
1422 # and so on and so forth; it's just not practical
1423 mobj = re.search(r'<title>(.*)</title>', webpage)
1425 raise ExtractorError(u'Unable to extract title')
1426 video_title = mobj.group(1)
1428 # video uploader is domain name
1429 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): the message below says 'title' but this branch failed to
# match the uploader/domain -- runtime string left unchanged here.
1431 raise ExtractorError(u'Unable to extract title')
1432 video_uploader = mobj.group(1)
1437 'uploader': video_uploader,
1438 'upload_date': None,
1439 'title': video_title,
1440 'ext': video_extension,
1444 class YoutubeSearchIE(SearchInfoExtractor):
1445 """Information Extractor for YouTube search queries."""
# NOTE(review): excerpt is incomplete -- _MAX_RESULTS, the initialisation of
# video_ids/pagenum/limit, the try: header and pagenum increment sit on
# missing original lines (1447, 1458-1462, 1467, 1478, 1480-1481).
1446 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1448 IE_NAME = u'youtube:search'
1449 _SEARCH_KEY = 'ytsearch'
1451 def report_download_page(self, query, pagenum):
1452 """Report attempt to download search page with given number."""
# NOTE(review): .decode(preferredencoding()) assumes a byte-string query
# (Python 2) -- would fail on a Python-3 str.
1453 query = query.decode(preferredencoding())
1454 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1456 def _get_n_results(self, query, n):
1457 """Get a specified number of results for a query"""
# Pages through the GData API 50 results at a time, collecting video ids
# until the requested count (or the API's totalItems) is reached.
1463 while (50 * pagenum) < limit:
1464 self.report_download_page(query, pagenum+1)
1465 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1466 request = compat_urllib_request.Request(result_url)
1468 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1469 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1470 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1471 api_response = json.loads(data)['data']
1473 if not 'items' in api_response:
1474 raise ExtractorError(u'[youtube] No video results')
1476 new_ids = list(video['id'] for video in api_response['items'])
1477 video_ids += new_ids
# Cap the loop at whichever is smaller: requested n or what YouTube reports.
1479 limit = min(n, api_response['totalItems'])
1482 if len(video_ids) > n:
1483 video_ids = video_ids[:n]
1484 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1485 return self.playlist_result(videos, query)
1488 class GoogleSearchIE(SearchInfoExtractor):
1489 """Information Extractor for Google Video search queries."""
# NOTE(review): excerpt is incomplete -- _MAX_RESULTS, the full `res` dict
# literal, the entry dict construction and the final return fall on missing
# original lines (1491, 1497-1503, 1508, 1510-1511, 1513, 1517).
1490 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1492 IE_NAME = u'video.google:search'
1493 _SEARCH_KEY = 'gvsearch'
1495 def _get_n_results(self, query, n):
1496 """Get a specified number of results for a query"""
# Builds a playlist-typed result dict and scrapes Google's video-search
# result pages (10 hits each) until n results or the "next" link vanishes.
1499 '_type': 'playlist',
1504 for pagenum in itertools.count(1):
1505 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1506 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1507 note='Downloading result page ' + str(pagenum))
1509 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1512 'url': mobj.group(1)
1514 res['entries'].append(e)
# Stop once enough results are gathered or there is no "next page" control.
1516 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1519 class YahooSearchIE(SearchInfoExtractor):
1520 """Information Extractor for Yahoo! Video search queries."""
# NOTE(review): excerpt is incomplete -- _MAX_RESULTS, the full `res` dict,
# the loop-break statements and the final return sit on missing original
# lines (1521-1522, 1528-1533, 1539, 1541, 1544, 1549-1551).
1523 IE_NAME = u'screen.yahoo:search'
1524 _SEARCH_KEY = 'yvsearch'
1526 def _get_n_results(self, query, n):
1527 """Get a specified number of results for a query"""
# Pages through Yahoo's JSON search endpoint 30 hits at a time, turning
# each result row into a url_result entry for the Yahoo extractor.
1530 '_type': 'playlist',
1534 for pagenum in itertools.count(0):
1535 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1536 webpage = self._download_webpage(result_url, query,
1537 note='Downloading results page '+str(pagenum+1))
1538 info = json.loads(webpage)
1540 results = info[u'results']
1542 for (i, r) in enumerate(results):
1543 if (pagenum * 30) +i >= n:
1545 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1546 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1547 res['entries'].append(e)
# NOTE(review): `m` is not assigned in the visible lines -- presumably
# extracted from `info` on the missing line 1539/1541; verify in full file.
1548 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1554 class YoutubePlaylistIE(InfoExtractor):
1555 """Information Extractor for YouTube playlists."""
# NOTE(review): excerpt is incomplete -- parts of the verbose _VALID_URL,
# _MAX_RESULTS, the @classmethod decorator, loop headers and break/page_num
# bookkeeping fall on missing original lines; bytes below preserved as-is.
1557 _VALID_URL = r"""(?:
1562 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1563 \? (?:.*?&)*? (?:p|a|list)=
1566 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1569 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1571 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1573 IE_NAME = u'youtube:playlist'
1576 def suitable(cls, url):
1577 """Receives a URL and returns True if suitable for this IE."""
# re.VERBOSE is required because _VALID_URL is written in verbose syntax;
# this overrides the base-class suitable(), which matches without flags.
1578 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1580 def _real_extract(self, url):
1581 # Extract playlist id
1582 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1584 raise ExtractorError(u'Invalid URL: %s' % url)
1586 # Download playlist videos from API
# Either capture group may have matched, depending on the URL form.
1587 playlist_id = mobj.group(1) or mobj.group(2)
1592 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1593 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1596 response = json.loads(page)
1597 except ValueError as err:
1598 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1600 if 'feed' not in response:
1601 raise ExtractorError(u'Got a malformed response from YouTube API')
1602 playlist_title = response['feed']['title']['$t']
1603 if 'entry' not in response['feed']:
1604 # Number of videos is a multiple of self._MAX_RESULTS
# Collect (position, watch-url) pairs; entries without 'content' (e.g.
# deleted videos) are skipped.
1607 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1608 for entry in response['feed']['entry']
1609 if 'content' in entry ]
# A short page means this was the last one.
1611 if len(response['feed']['entry']) < self._MAX_RESULTS:
# Restore playlist order by the yt$position key, then drop the positions.
1615 videos = [v[1] for v in sorted(videos)]
1617 url_results = [self.url_result(url, 'Youtube') for url in videos]
1618 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1621 class YoutubeChannelIE(InfoExtractor):
1622 """Information Extractor for YouTube channels."""
# NOTE(review): excerpt is incomplete -- missing original lines held the
# ids_in_page initialisation/return in extract_videos_from_page, the
# video_ids/pagenum initialisation, the while/break statements and guards.
1624 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1625 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1626 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1627 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1628 IE_NAME = u'youtube:channel'
1630 def extract_videos_from_page(self, page):
# Scrapes /watch?v= ids out of a channel page's HTML, de-duplicated while
# preserving first-seen order.
1632 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1633 if mobj.group(1) not in ids_in_page:
1634 ids_in_page.append(mobj.group(1))
1637 def _real_extract(self, url):
1638 # Extract channel id
1639 mobj = re.match(self._VALID_URL, url)
1641 raise ExtractorError(u'Invalid URL: %s' % url)
1643 # Download channel page
1644 channel_id = mobj.group(1)
1648 url = self._TEMPLATE_URL % (channel_id, pagenum)
1649 page = self._download_webpage(url, channel_id,
1650 u'Downloading page #%s' % pagenum)
1652 # Extract video identifiers
1653 ids_in_page = self.extract_videos_from_page(page)
1654 video_ids.extend(ids_in_page)
1656 # Download any subsequent channel pages using the json-based channel_ajax query
1657 if self._MORE_PAGES_INDICATOR in page:
1659 pagenum = pagenum + 1
1661 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1662 page = self._download_webpage(url, channel_id,
1663 u'Downloading page #%s' % pagenum)
# Subsequent pages come back as JSON with the HTML embedded in fields.
1665 page = json.loads(page)
1667 ids_in_page = self.extract_videos_from_page(page['content_html'])
1668 video_ids.extend(ids_in_page)
# The "load more" widget disappearing signals the final page.
1670 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1673 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1675 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1676 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1677 return [self.playlist_result(url_entries, channel_id)]
1680 class YoutubeUserIE(InfoExtractor):
1681 """Information Extractor for YouTube users."""
# NOTE(review): excerpt is incomplete -- the "if mobj is None:" guard,
# video_ids/pagenum initialisation, the while header, the ids_in_page
# initialisation and break/increment statements sit on missing original
# lines (1682, 1691, 1693, 1701-1706, 1708, 1714-1715, 1726-1732).
1683 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1684 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1685 _GDATA_PAGE_SIZE = 50
1686 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1687 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1688 IE_NAME = u'youtube:user'
1690 def _real_extract(self, url):
1692 mobj = re.match(self._VALID_URL, url)
1694 raise ExtractorError(u'Invalid URL: %s' % url)
1696 username = mobj.group(1)
1698 # Download video ids using YouTube Data API. Result size per
1699 # query is limited (currently to 50 videos) so we need to query
1700 # page by page until there are no video ids - it means we got
# GData uses 1-based start indexing.
1707 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1709 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1710 page = self._download_webpage(gdata_url, username,
1711 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1713 # Extract video identifiers
# De-duplicate ids within the page while preserving first-seen order.
1716 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1717 if mobj.group(1) not in ids_in_page:
1718 ids_in_page.append(mobj.group(1))
1720 video_ids.extend(ids_in_page)
1722 # A little optimization - if current page is not
1723 # "full", ie. does not contain PAGE_SIZE video ids then
1724 # we can assume that this page is the last one - there
1725 # are no more ids on further pages - no need to query
1728 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1733 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1734 url_results = [self.url_result(url, 'Youtube') for url in urls]
1735 return [self.playlist_result(url_results, playlist_title = username)]
1738 class BlipTVUserIE(InfoExtractor):
1739 """Information Extractor for blip.tv users."""
# NOTE(review): excerpt is incomplete -- the _PAGE_SIZE class attribute
# referenced below, the "if mobj is None:" guards, the video_ids/pagenum
# initialisation, the while header and break/increment statements sit on
# missing original lines (1740, 1742, 1746, 1748, 1758, 1763-1768, ...).
1741 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1743 IE_NAME = u'blip.tv:user'
1745 def _real_extract(self, url):
1747 mobj = re.match(self._VALID_URL, url)
1749 raise ExtractorError(u'Invalid URL: %s' % url)
1751 username = mobj.group(1)
1753 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1755 page = self._download_webpage(url, username, u'Downloading user page')
# The numeric users_id needed by the episode-list endpoint is scraped from
# the user page's data attribute.
1756 mobj = re.search(r'data-users-id="([^"]+)"', page)
1757 page_base = page_base % mobj.group(1)
1760 # Download video ids using BlipTV Ajax calls. Result size per
1761 # query is limited (currently to 12 videos) so we need to query
1762 # page by page until there are no video ids - it means we got
1769 url = page_base + "&page=" + str(pagenum)
1770 page = self._download_webpage(url, username,
1771 u'Downloading video ids from page %d' % pagenum)
1773 # Extract video identifiers
# De-duplicate hrefs within the page while preserving first-seen order.
1776 for mobj in re.finditer(r'href="/([^"]+)"', page):
1777 if mobj.group(1) not in ids_in_page:
1778 ids_in_page.append(unescapeHTML(mobj.group(1)))
1780 video_ids.extend(ids_in_page)
1782 # A little optimization - if current page is not
1783 # "full", ie. does not contain PAGE_SIZE video ids then
1784 # we can assume that this page is the last one - there
1785 # are no more ids on further pages - no need to query
1788 if len(ids_in_page) < self._PAGE_SIZE:
1793 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1794 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1795 return [self.playlist_result(url_entries, playlist_title = username)]
1798 class DepositFilesIE(InfoExtractor):
1799 """Information extractor for depositfiles.com"""
# NOTE(review): excerpt is incomplete -- the try: header around the download,
# the else branch of the error-diagnosis block and the final return sit on
# missing original lines (1800, 1802, 1807, 1811, 1816, 1825, 1827, ...).
1801 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1803 def _real_extract(self, url):
1804 file_id = url.split('/')[-1]
1805 # Rebuild url in english locale
1806 url = 'http://depositfiles.com/en/files/' + file_id
1808 # Retrieve file webpage with 'Free download' button pressed
# Posting gateway_result=1 simulates the "Free download" form submission.
1809 free_download_indication = { 'gateway_result' : '1' }
1810 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1812 self.report_download_webpage(file_id)
1813 webpage = compat_urllib_request.urlopen(request).read()
1814 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1815 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1817 # Search for the real file URL
1818 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1819 if (mobj is None) or (mobj.group(1) is None):
1820 # Try to figure out reason of the error.
# The site renders restriction notices ("Attention...") in a <strong> tag;
# surface that text to the user when present.
1821 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1822 if (mobj is not None) and (mobj.group(1) is not None):
1823 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1824 raise ExtractorError(u'%s' % restriction_message)
1826 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1828 file_url = mobj.group(1)
1829 file_extension = os.path.splitext(file_url)[1][1:]
1831 # Search for file title
1832 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# NOTE(review): .decode('utf-8') is a Python-2 byte-string idiom; on
# Python 3 these values are already str and decode() would fail.
1835 'id': file_id.decode('utf-8'),
1836 'url': file_url.decode('utf-8'),
1838 'upload_date': None,
1839 'title': file_title,
1840 'ext': file_extension.decode('utf-8'),
1844 class FacebookIE(InfoExtractor):
1845 """Information Extractor for Facebook"""
# NOTE(review): excerpt is incomplete -- missing original lines held the
# early returns in _real_initialize, the netrc useremail/password unpacking,
# the login_form dict literal, try: headers, "if mobj is None:" guards and
# the final info-dict return; bytes below preserved as-is.
1847 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1848 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1849 _NETRC_MACHINE = 'facebook'
1850 IE_NAME = u'facebook'
1852 def report_login(self):
1853 """Report attempt to log in."""
1854 self.to_screen(u'Logging in')
1856 def _real_initialize(self):
# Best-effort login: credentials come from --username/--password or .netrc;
# login failures are reported as warnings, not fatal errors.
1857 if self._downloader is None:
1862 downloader_params = self._downloader.params
1864 # Attempt to use provided username and password or .netrc data
1865 if downloader_params.get('username', None) is not None:
1866 useremail = downloader_params['username']
1867 password = downloader_params['password']
1868 elif downloader_params.get('usenetrc', False):
1870 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1871 if info is not None:
1875 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1876 except (IOError, netrc.NetrcParseError) as err:
1877 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1880 if useremail is None:
1889 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1892 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means authentication failed.
# (typo "exceded" is inside the runtime string, left unchanged)
1893 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1894 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1896 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1897 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1900 def _real_extract(self, url):
1901 mobj = re.match(self._VALID_URL, url)
1903 raise ExtractorError(u'Invalid URL: %s' % url)
1904 video_id = mobj.group('ID')
1906 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1907 webpage = self._download_webpage(url, video_id)
# The player parameters are sandwiched between these two exact javascript
# fragments; everything in between is a JSON array of [key, value] pairs.
1909 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1910 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1911 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1913 raise ExtractorError(u'Cannot parse data')
1914 data = dict(json.loads(m.group(1)))
1915 params_raw = compat_urllib_parse.unquote(data['params'])
1916 params = json.loads(params_raw)
1917 video_data = params['video_data'][0]
# Prefer the HD source; fall back to SD.
1918 video_url = video_data.get('hd_src')
1920 video_url = video_data['sd_src']
1922 raise ExtractorError(u'Cannot find video URL')
1923 video_duration = int(video_data['video_duration'])
1924 thumbnail = video_data['thumbnail_src']
1926 video_title = self._search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
1928 video_title = unescapeHTML(video_title)
1932 'title': video_title,
1935 'duration': video_duration,
1936 'thumbnail': thumbnail,
1941 class BlipTVIE(InfoExtractor):
1942 """Information extractor for blip.tv"""
1944 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
# _URL_EXT pulls the trailing filename extension out of a media URL.
1945 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1946 IE_NAME = u'blip.tv'
1948 def report_direct_download(self, title):
1949 """Report information extraction."""
1950 self.to_screen(u'%s: Direct download detected' % title)
1952 def _real_extract(self, url):
1953 mobj = re.match(self._VALID_URL, url)
1955 raise ExtractorError(u'Invalid URL: %s' % url)
# api.swf#<id> URLs are rewritten to the /play/ redirect form first.
1957 # See https://github.com/rg3/youtube-dl/issues/857
1958 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1959 if api_mobj is not None:
1960 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1961 urlp = compat_urllib_parse_urlparse(url)
# /play/ URLs redirect; follow the redirect, read the file id out of the
# URL fragment, and recurse on the canonical /a/a-<id> page.
1962 if urlp.path.startswith('/play/'):
1963 request = compat_urllib_request.Request(url)
1964 response = compat_urllib_request.urlopen(request)
1965 redirecturl = response.geturl()
1966 rurlp = compat_urllib_parse_urlparse(redirecturl)
1967 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1968 url = 'http://blip.tv/a/a-' + file_id
1969 return self._real_extract(url)
# Ask the site for JSON metadata; the iTunes User-Agent is required for
# this endpoint to respond with usable data.
1976 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1977 request = compat_urllib_request.Request(json_url)
1978 request.add_header('User-Agent', 'iTunes/10.6.1')
1979 self.report_extraction(mobj.group(1))
1982 urlh = compat_urllib_request.urlopen(request)
# If the server answers with the media itself instead of JSON, build the
# info dict straight from the URL's basename.
1983 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1984 basename = url.split('/')[-1]
1985 title,ext = os.path.splitext(basename)
1986 title = title.decode('UTF-8')
1987 ext = ext.replace('.', '')
1988 self.report_direct_download(title)
1993 'upload_date': None,
1998 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1999 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
# Regular case: parse the JSON body returned by the skin=json endpoint.
2000 if info is None: # Regular URL
2002 json_code_bytes = urlh.read()
2003 json_code = json_code_bytes.decode('utf-8')
2004 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2005 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2008 json_data = json.loads(json_code)
# Some responses wrap the payload under a 'Post' key.
2009 if 'Post' in json_data:
2010 data = json_data['Post']
# Convert blip.tv's datestamp (e.g. '05-19-13 10:30AM') to YYYYMMDD.
2014 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2015 video_url = data['media']['url']
2016 umobj = re.match(self._URL_EXT, video_url)
2018 raise ValueError('Can not determine filename extension')
2019 ext = umobj.group(1)
2022 'id': data['item_id'],
2024 'uploader': data['display_name'],
2025 'upload_date': upload_date,
2026 'title': data['title'],
2028 'format': data['media']['mimeType'],
2029 'thumbnail': data['thumbnailUrl'],
2030 'description': data['description'],
2031 'player_url': data['embedUrl'],
# Downloader must keep using the iTunes UA or the CDN may refuse the file.
2032 'user_agent': 'iTunes/10.6.1',
2034 except (ValueError,KeyError) as err:
2035 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
2040 class MyVideoIE(InfoExtractor):
2041 """Information Extractor for myvideo.de."""
2043 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2044 IE_NAME = u'myvideo'
2046 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2047 # Released into the Public Domain by Tristan Fischer on 2013-05-19
2048 # https://github.com/rg3/youtube-dl/pull/842
# RC4 stream cipher used to decrypt the site's obfuscated XML payload.
# Standard KSA (key-scheduling) followed by PRGA keystream XOR.
2049 def __rc4crypt(self,data, key):
2051 box = list(range(256))
2052 for i in list(range(256)):
2053 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2054 box[i], box[x] = box[x], box[i]
2060 y = (y + box[x]) % 256
2061 box[x], box[y] = box[y], box[x]
2062 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# MD5 helper returning the hex digest as bytes (used to derive the RC4 key).
2066 return hashlib.md5(s).hexdigest().encode()
2068 def _real_extract(self,url):
2069 mobj = re.match(self._VALID_URL, url)
2071 raise ExtractorError(u'invalid URL: %s' % url)
2073 video_id = mobj.group(1)
# GK: double-base64-encoded secret that seeds the decryption key below.
2076 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2077 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2078 b'TnpsbA0KTVRkbU1tSTRNdz09'
2082 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2083 webpage = self._download_webpage(webpage_url, video_id)
# Easy path: the page exposes a plain <source src='...'> FLV directly.
2085 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2086 if mobj is not None:
2087 self.report_extraction(video_id)
2088 video_url = mobj.group(1) + '.flv'
2090 video_title = self._search_regex('<title>([^<]+)</title>',
2093 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
2099 'upload_date': None,
2100 'title': video_title,
# Hard path: rebuild the player request from the page's flashvars and
# decrypt the returned data with RC4.
2105 mobj = re.search('var flashvars={(.+?)}', webpage)
2107 raise ExtractorError(u'Unable to extract video')
# Collect all flashvars except '_encxml', which instead provides the
# (URL-quoted) endpoint that serves the encrypted XML.
2112 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2113 if not a == '_encxml':
2116 encxml = compat_urllib_parse.unquote(b)
2117 if not params.get('domain'):
2118 params['domain'] = 'www.myvideo.de'
2119 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
# The MTV player variant doesn't work here; force the 'D' player instead.
2120 if 'flash_playertype=MTV' in xmldata_url:
2121 self._downloader.report_warning(u'avoiding MTV player')
2123 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2124 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# Response looks like 'something=<hex>'; take the hex part and unhexlify.
2128 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2129 enc_data_b = binascii.unhexlify(enc_data)
# RC4 key = md5(decoded GK secret + video id) — see __md5 helper above.
2131 base64.b64decode(base64.b64decode(GK)) +
2133 str(video_id).encode('utf-8')
2136 dec_data = self.__rc4crypt(enc_data_b, sk)
2139 self.report_extraction(video_id)
# RTMP case: a connectionurl means streaming via rtmp/rtmpe.
2142 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2144 video_url = compat_urllib_parse.unquote(mobj.group(1))
# rtmpe handshake fails for these hosts; rtmpt works, so rewrite scheme.
2145 if 'myvideo2flash' in video_url:
2146 self._downloader.report_warning(u'forcing RTMPT ...')
2147 video_url = video_url.replace('rtmpe://', 'rtmpt://')
2150 # extract non rtmp videos
2151 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2153 raise ExtractorError(u'unable to extract url')
2154 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2156 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2157 video_file = compat_urllib_parse.unquote(video_file)
# Non-f4m files: build the RTMP playpath as '<ext>:<basename>'.
# f4m manifests: derive the corresponding HLS (.m3u8) playlist URL.
2159 if not video_file.endswith('f4m'):
2160 ppath, prefix = video_file.split('.')
2161 video_playpath = '%s:%s' % (prefix, ppath)
2162 video_hls_playlist = ''
2165 video_hls_playlist = (
2166 video_filepath + video_file
2167 ).replace('.f4m', '.m3u8')
# SWF player URL is needed by rtmpdump for swf verification.
2169 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2170 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2172 video_title = self._search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
2178 'tc_url': video_url,
2180 'upload_date': None,
2181 'title': video_title,
2183 'play_path': video_playpath,
2184 'video_file': video_file,
2185 'video_hls_playlist': video_hls_playlist,
2186 'player_url': video_swfobj,
2190 class ComedyCentralIE(InfoExtractor):
2191 """Information extractor for The Daily Show and Colbert Report """
2193 # urls can be abbreviations like :thedailyshow or :colbert
2194 # urls for episodes like:
2195 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2196 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2197 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex: must always be matched with re.VERBOSE (see suitable()).
2198 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2199 |(https?://)?(www\.)?
2200 (?P<showname>thedailyshow|colbertnation)\.com/
2201 (full-episodes/(?P<episode>.*)|
2203 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2204 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Known bitrates, ordered worst-to-best; last entry is picked by default.
2207 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2209 _video_extensions = {
2217 _video_dimensions = {
# Overrides the base class because _VALID_URL needs the re.VERBOSE flag.
2227 def suitable(cls, url):
2228 """Receives a URL and returns True if suitable for this IE."""
2229 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2231 def _print_formats(self, formats):
2232 print('Available formats:')
2234 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2237 def _real_extract(self, url):
2238 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2240 raise ExtractorError(u'Invalid URL: %s' % url)
# ':tds'-style shortcuts expand to the show's full-episodes index URL,
# then the URL is re-matched so the named groups are populated.
2242 if mobj.group('shortname'):
2243 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2244 url = u'http://www.thedailyshow.com/full-episodes/'
2246 url = u'http://www.colbertnation.com/full-episodes/'
2247 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2248 assert mobj is not None
# Clip URLs carry the episode title directly; episode URLs may require
# following a redirect to discover the concrete episode.
2250 if mobj.group('clip'):
2251 if mobj.group('showname') == 'thedailyshow':
2252 epTitle = mobj.group('tdstitle')
2254 epTitle = mobj.group('cntitle')
2257 dlNewest = not mobj.group('episode')
2259 epTitle = mobj.group('showname')
2261 epTitle = mobj.group('episode')
2263 self.report_extraction(epTitle)
2264 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# The index page redirects to the newest episode; take the final URL.
2266 url = htmlHandle.geturl()
2267 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2269 raise ExtractorError(u'Invalid redirected URL: ' + url)
2270 if mobj.group('episode') == '':
2271 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2272 epTitle = mobj.group('episode')
# Locate the mtvnservices media URI embedded in the page.
2274 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2276 if len(mMovieParams) == 0:
2277 # The Colbert Report embeds the information in a without
2278 # a URL prefix; so extract the alternate reference
2279 # and then add the URL prefix manually.
2281 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2282 if len(altMovieParams) == 0:
2283 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2285 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# Episodes are split into parts; the MRSS feed lists one <item> per part.
2287 uri = mMovieParams[0][1]
2288 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2289 indexXml = self._download_webpage(indexUrl, epTitle,
2290 u'Downloading show index',
2291 u'unable to download episode index')
2295 idoc = xml.etree.ElementTree.fromstring(indexXml)
2296 itemEls = idoc.findall('.//item')
2297 for partNum,itemEl in enumerate(itemEls):
# guid looks like '...:<showId>.com:...:<shortMediaId>'.
2298 mediaId = itemEl.findall('./guid')[0].text
2299 shortMediaId = mediaId.split(':')[-1]
2300 showId = mediaId.split(':')[-2].replace('.com', '')
2301 officialTitle = itemEl.findall('./title')[0].text
2302 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
# Per-part config XML lists one <rendition> per available bitrate.
2304 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2305 compat_urllib_parse.urlencode({'uri': mediaId}))
2306 configXml = self._download_webpage(configUrl, epTitle,
2307 u'Downloading configuration for %s' % shortMediaId)
2309 cdoc = xml.etree.ElementTree.fromstring(configXml)
2311 for rendition in cdoc.findall('.//rendition'):
2312 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2316 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2319 if self._downloader.params.get('listformats', None):
2320 self._print_formats([i[0] for i in turls])
2323 # For now, just pick the highest bitrate
2324 format,rtmp_video_url = turls[-1]
2326 # Get the format arg from the arg stream
2327 req_format = self._downloader.params.get('format', None)
2329 # Select format if we can find one
2332 format, rtmp_video_url = f, v
# The rtmp URL cannot be played directly; rewrite it to the equivalent
# progressive-download URL on the llnwd.net mirror.
2335 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2337 raise ExtractorError(u'Cannot transform RTMP url')
2338 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2339 video_url = base + m.group('finalid')
2341 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2346 'upload_date': officialDate,
2351 'description': officialTitle,
2353 results.append(info)
2358 class EscapistIE(InfoExtractor):
2359 """Information extractor for The Escapist """
2361 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2362 IE_NAME = u'escapist'
2364 def _real_extract(self, url):
2365 mobj = re.match(self._VALID_URL, url)
2367 raise ExtractorError(u'Invalid URL: %s' % url)
2368 showName = mobj.group('showname')
2369 videoId = mobj.group('episode')
2371 self.report_extraction(showName)
2372 webpage = self._download_webpage(url, showName)
# Description and thumbnail come from meta tags; both are optional
# (fatal=False), so guard the unescape with a truthiness check.
2374 videoDesc = self._search_regex('<meta name="description" content="([^"]*)"',
2375 webpage, u'description', fatal=False)
2376 if videoDesc: videoDesc = unescapeHTML(videoDesc)
2378 imgUrl = self._search_regex('<meta property="og:image" content="([^"]*)"',
2379 webpage, u'thumbnail', fatal=False)
2380 if imgUrl: imgUrl = unescapeHTML(imgUrl)
# The og:video player URL is mandatory: its 'config=' query parameter
# points at the (URL-quoted) playlist configuration.
2382 playerUrl = self._search_regex('<meta property="og:video" content="([^"]*)"',
2383 webpage, u'player url')
2384 playerUrl = unescapeHTML(playerUrl)
2386 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2387 configUrl = compat_urllib_parse.unquote(configUrl)
2389 configJSON = self._download_webpage(configUrl, showName,
2390 u'Downloading configuration',
2391 u'unable to download configuration')
2393 # Technically, it's JavaScript, not JSON
# The config uses single quotes; naively swap them so json.loads accepts
# it (breaks if any value contains an embedded quote).
2394 configJSON = configJSON.replace("'", '"')
2397 config = json.loads(configJSON)
2398 except (ValueError,) as err:
2399 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
# Playlist entry [1] holds the actual media URL.
2401 playlist = config['playlist']
2402 videoUrl = playlist[1]['url']
2407 'uploader': showName,
2408 'upload_date': None,
2411 'thumbnail': imgUrl,
2412 'description': videoDesc,
2413 'player_url': playerUrl,
2418 class CollegeHumorIE(InfoExtractor):
2419 """Information extractor for collegehumor.com"""
2422 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2423 IE_NAME = u'collegehumor'
2425 def report_manifest(self, video_id):
2426 """Report information extraction."""
2427 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2429 def _real_extract(self, url):
2430 mobj = re.match(self._VALID_URL, url)
2432 raise ExtractorError(u'Invalid URL: %s' % url)
2433 video_id = mobj.group('videoid')
2438 'upload_date': None,
2441 self.report_extraction(video_id)
# Step 1: fetch the moogaloop metadata XML for title/description/thumbnail
# and the f4m manifest location.
2442 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2444 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2445 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2446 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2448 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2450 videoNode = mdoc.findall('./video')[0]
2451 info['description'] = videoNode.findall('./description')[0].text
2452 info['title'] = videoNode.findall('./caption')[0].text
2453 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2454 manifest_url = videoNode.findall('./file')[0].text
2456 raise ExtractorError(u'Invalid metadata XML file')
# Step 2: fetch the Adobe HDS (f4m) manifest; hdcore is required by Akamai.
2458 manifest_url += '?hdcore=2.10.3'
2459 self.report_manifest(video_id)
2461 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2462 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2463 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
# The manifest is namespaced XML (Adobe f4m 1.0 namespace).
2465 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2467 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2468 node_id = media_node.attrib['url']
2469 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2470 except IndexError as err:
2471 raise ExtractorError(u'Invalid manifest file')
# Step 3: synthesize the first-segment URL from the manifest pieces.
2473 url_pr = compat_urllib_parse_urlparse(manifest_url)
2474 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2481 class XVideosIE(InfoExtractor):
2482 """Information extractor for xvideos.com"""
2484 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2485 IE_NAME = u'xvideos'
2487 def _real_extract(self, url):
2488 mobj = re.match(self._VALID_URL, url)
2490 raise ExtractorError(u'Invalid URL: %s' % url)
2491 video_id = mobj.group(1)
2493 webpage = self._download_webpage(url, video_id)
2495 self.report_extraction(video_id)
# The FLV URL is URL-quoted inside the page's flashvars.
2498 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2499 webpage, u'video URL'))
# Title is the <title> tag up to the ' - XVID' suffix.
2502 video_title = self._search_regex(r'<title>(.*?)\s+-\s+XVID',
2505 # Extract video thumbnail
2506 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2507 webpage, u'thumbnail', fatal=False)
2513 'upload_date': None,
2514 'title': video_title,
2516 'thumbnail': video_thumbnail,
2517 'description': None,
2523 class SoundcloudIE(InfoExtractor):
2524 """Information extractor for soundcloud.com
2525 To access the media, the uid of the song and a stream token
2526 must be extracted from the page source and the script must make
2527 a request to media.soundcloud.com/crossdomain.xml. Then
2528 the media can be grabbed by requesting from an url composed
2529 of the stream token and uid
2532 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2533 IE_NAME = u'soundcloud'
2535 def report_resolve(self, video_id):
2536 """Report information extraction."""
2537 self.to_screen(u'%s: Resolving id' % video_id)
2539 def _real_extract(self, url):
2540 mobj = re.match(self._VALID_URL, url)
2542 raise ExtractorError(u'Invalid URL: %s' % url)
2544 # extract uploader (which is in the url)
2545 uploader = mobj.group(1)
2546 # extract simple title (uploader + slug of song title)
2547 slug_title = mobj.group(2)
2548 simple_title = uploader + u'-' + slug_title
2549 full_title = '%s/%s' % (uploader, slug_title)
2551 self.report_resolve(full_title)
# Resolve the human-readable URL to the track's numeric API id via the
# resolve.json endpoint (client_id is a hard-coded API key).
2553 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2554 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2555 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2557 info = json.loads(info_json)
2558 video_id = info['id']
2559 self.report_extraction(full_title)
# The streams endpoint maps format names to direct media URLs; use the
# 128 kbit/s MP3 HTTP stream.
2561 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2562 stream_json = self._download_webpage(streams_url, full_title,
2563 u'Downloading stream definitions',
2564 u'unable to download stream definitions')
2566 streams = json.loads(stream_json)
2567 mediaURL = streams['http_mp3_128_url']
2568 upload_date = unified_strdate(info['created_at'])
2573 'uploader': info['user']['username'],
2574 'upload_date': upload_date,
2575 'title': info['title'],
2577 'description': info['description'],
2580 class SoundcloudSetIE(InfoExtractor):
2581 """Information extractor for soundcloud.com sets
2582 To access the media, the uid of the song and a stream token
2583 must be extracted from the page source and the script must make
2584 a request to media.soundcloud.com/crossdomain.xml. Then
2585 the media can be grabbed by requesting from an url composed
2586 of the stream token and uid
2589 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2590 IE_NAME = u'soundcloud:set'
2592 def report_resolve(self, video_id):
2593 """Report information extraction."""
2594 self.to_screen(u'%s: Resolving id' % video_id)
2596 def _real_extract(self, url):
2597 mobj = re.match(self._VALID_URL, url)
2599 raise ExtractorError(u'Invalid URL: %s' % url)
2601 # extract uploader (which is in the url)
2602 uploader = mobj.group(1)
2603 # extract simple title (uploader + slug of song title)
2604 slug_title = mobj.group(2)
2605 simple_title = uploader + u'-' + slug_title
2606 full_title = '%s/sets/%s' % (uploader, slug_title)
2608 self.report_resolve(full_title)
# Resolve the set URL to its API representation (same flow as
# SoundcloudIE, but the result contains a 'tracks' list).
2610 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2611 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2612 info_json = self._download_webpage(resolv_url, full_title)
2615 info = json.loads(info_json)
# API-level errors are reported per entry instead of raising.
2616 if 'errors' in info:
2617 for err in info['errors']:
2618 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2621 self.report_extraction(full_title)
# Build one info dict per track in the set.
2622 for track in info['tracks']:
2623 video_id = track['id']
2625 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2626 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2628 self.report_extraction(video_id)
2629 streams = json.loads(stream_json)
2630 mediaURL = streams['http_mp3_128_url']
2635 'uploader': track['user']['username'],
2636 'upload_date': unified_strdate(track['created_at']),
2637 'title': track['title'],
2639 'description': track['description'],
2644 class InfoQIE(InfoExtractor):
2645 """Information extractor for infoq.com"""
2646 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2648 def _real_extract(self, url):
2649 mobj = re.match(self._VALID_URL, url)
2651 raise ExtractorError(u'Invalid URL: %s' % url)
# No numeric id in the URL; use the URL itself as the download id.
2653 webpage = self._download_webpage(url, video_id=url)
2654 self.report_extraction(url)
# The real media path is base64-encoded in the page's 'jsclassref' var;
# decode it and mount it on the rtmpe streaming host.
2657 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2659 raise ExtractorError(u'Unable to extract video url')
2660 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2661 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2664 video_title = self._search_regex(r'contentTitle = "(.*?)";',
2667 # Extract description
2668 video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2669 webpage, u'description', fatal=False)
# Derive id and extension from the media filename in the decoded path.
2671 video_filename = video_url.split('/')[-1]
2672 video_id, extension = video_filename.split('.')
2678 'upload_date': None,
2679 'title': video_title,
2680 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2682 'description': video_description,
2687 class MixcloudIE(InfoExtractor):
2688 """Information extractor for www.mixcloud.com"""
# Disabled: the old API this code targets was replaced (see link below).
2690 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2691 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2692 IE_NAME = u'mixcloud'
2694 def report_download_json(self, file_id):
2695 """Report JSON download."""
2696 self.to_screen(u'Downloading json')
2698 def get_urls(self, jsonData, fmt, bitrate='best'):
2699 """Get urls from 'audio_formats' section in json"""
# 'best' (or an unknown bitrate) selects the highest available bitrate.
2702 bitrate_list = jsonData[fmt]
2703 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2704 bitrate = max(bitrate_list) # select highest
2706 url_list = jsonData[fmt][bitrate]
# Some formats have no per-bitrate sub-dict; fall back to the flat list.
2707 except TypeError: # we have no bitrate info.
2708 url_list = jsonData[fmt]
2711 def check_urls(self, url_list):
2712 """Returns 1st active url from list"""
# Probe each candidate URL; the first one that opens without error wins.
2713 for url in url_list:
2715 compat_urllib_request.urlopen(url)
2717 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2722 def _print_formats(self, formats):
2723 print('Available formats:')
2724 for fmt in formats.keys():
2725 for b in formats[fmt]:
2727 ext = formats[fmt][b][0]
2728 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2729 except TypeError: # we have no bitrate info
2730 ext = formats[fmt][0]
2731 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2734 def _real_extract(self, url):
2735 mobj = re.match(self._VALID_URL, url)
2737 raise ExtractorError(u'Invalid URL: %s' % url)
2738 # extract uploader & filename from url
# NOTE(review): .decode() on regex groups is Python-2-only; under
# Python 3 these are str and have no .decode — part of why _WORKING=False.
2739 uploader = mobj.group(1).decode('utf-8')
2740 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2742 # construct API request
2743 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2744 # retrieve .json file with links to files
2745 request = compat_urllib_request.Request(file_url)
2747 self.report_download_json(file_url)
2748 jsonData = compat_urllib_request.urlopen(request).read()
2749 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2750 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2753 json_data = json.loads(jsonData)
2754 player_url = json_data['player_swf_url']
2755 formats = dict(json_data['audio_formats'])
2757 req_format = self._downloader.params.get('format', None)
2760 if self._downloader.params.get('listformats', None):
2761 self._print_formats(formats)
# Default/best: scan formats until one yields a live URL; otherwise use
# the explicitly requested format.
2764 if req_format is None or req_format == 'best':
2765 for format_param in formats.keys():
2766 url_list = self.get_urls(formats, format_param)
2768 file_url = self.check_urls(url_list)
2769 if file_url is not None:
2772 if req_format not in formats:
2773 raise ExtractorError(u'Format is not available')
2775 url_list = self.get_urls(formats, req_format)
2776 file_url = self.check_urls(url_list)
2777 format_param = req_format
2780 'id': file_id.decode('utf-8'),
2781 'url': file_url.decode('utf-8'),
2782 'uploader': uploader.decode('utf-8'),
2783 'upload_date': None,
2784 'title': json_data['name'],
2785 'ext': file_url.split('.')[-1].decode('utf-8'),
2786 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2787 'thumbnail': json_data['thumbnail_url'],
2788 'description': json_data['description'],
2789 'player_url': player_url.decode('utf-8'),
2792 class StanfordOpenClassroomIE(InfoExtractor):
2793 """Information extractor for Stanford's Open ClassRoom"""
# Matches three URL shapes: a specific video page (course+video), a
# course page (course only), and the site root (neither).
2795 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2796 IE_NAME = u'stanfordoc'
2798 def _real_extract(self, url):
2799 mobj = re.match(self._VALID_URL, url)
2801 raise ExtractorError(u'Invalid URL: %s' % url)
# Case 1: a specific video — fetch its metadata XML for title and file.
2803 if mobj.group('course') and mobj.group('video'): # A specific video
2804 course = mobj.group('course')
2805 video = mobj.group('video')
2807 'id': course + '_' + video,
2809 'upload_date': None,
2812 self.report_extraction(info['id'])
2813 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2814 xmlUrl = baseUrl + video + '.xml'
2816 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2817 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2818 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2819 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2821 info['title'] = mdoc.findall('./title')[0].text
2822 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2824 raise ExtractorError(u'Invalid metadata XML file')
2825 info['ext'] = info['url'].rpartition('.')[2]
# Case 2: a course page — collect its VideoPage links and recurse on each
# via self.extract(), returning a playlist of per-video results.
2827 elif mobj.group('course'): # A course page
2828 course = mobj.group('course')
2833 'upload_date': None,
2836 coursepage = self._download_webpage(url, info['id'],
2837 note='Downloading course info page',
2838 errnote='Unable to download course info page')
2840 info['title'] = self._search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2841 info['title'] = unescapeHTML(info['title'])
2843 info['description'] = self._search_regex('<description>([^<]+)</description>',
2844 coursepage, u'description', fatal=False)
2845 if info['description']: info['description'] = unescapeHTML(info['description'])
# orderedSet keeps first occurrence order while dropping duplicates.
2847 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2850 'type': 'reference',
2851 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2855 for entry in info['list']:
2856 assert entry['type'] == 'reference'
2857 results += self.extract(entry['url'])
# Case 3: the site root — enumerate all CoursePage links and recurse.
2861 'id': 'Stanford OpenClassroom',
2864 'upload_date': None,
2867 self.report_download_webpage(info['id'])
2868 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2870 rootpage = compat_urllib_request.urlopen(rootURL).read()
2871 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2872 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2874 info['title'] = info['id']
2876 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2879 'type': 'reference',
2880 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2885 for entry in info['list']:
2886 assert entry['type'] == 'reference'
2887 results += self.extract(entry['url'])
2890 class MTVIE(InfoExtractor):
2891 """Information extractor for MTV.com"""
2893 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2896 def _real_extract(self, url):
2897 mobj = re.match(self._VALID_URL, url)
2899 raise ExtractorError(u'Invalid URL: %s' % url)
# The scheme is optional in _VALID_URL; default to http for the fetch.
2900 if not mobj.group('proto'):
2901 url = 'http://' + url
2902 video_id = mobj.group('videoid')
2904 webpage = self._download_webpage(url, video_id)
# Metadata lives in mtv_* meta tags; song name and title are optional.
2906 song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2907 webpage, u'song name', fatal=False)
2908 if song_name: song_name = unescapeHTML(song_name)
2910 video_title = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2912 video_title = unescapeHTML(video_title)
# mtvn_uri + playlist id are needed to build the mediaGen request below.
2914 mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2915 webpage, u'mtvn_uri', fatal=False)
2917 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2918 webpage, u'content id', fatal=False)
2920 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2921 self.report_extraction(video_id)
2922 request = compat_urllib_request.Request(videogen_url)
2924 metadataXml = compat_urllib_request.urlopen(request).read()
2925 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2926 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2928 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2929 renditions = mdoc.findall('.//rendition')
2931 # For now, always pick the highest quality.
2932 rendition = renditions[-1]
# Format string: '<ext>-<width>x<height>_<bitrate>' from rendition attrs.
2935 _,_,ext = rendition.attrib['type'].partition('/')
2936 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2937 video_url = rendition.find('./src').text
2939 raise ExtractorError('Invalid rendition field.')
2944 'uploader': performer,
2945 'upload_date': None,
2946 'title': video_title,
2954 class YoukuIE(InfoExtractor):
2955 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: current ms timestamp plus two random numbers, as required
# by Youku's getFlvPath endpoint.
2958 nowTime = int(time.time() * 1000)
2959 random1 = random.randint(1000,1998)
2960 random2 = random.randint(1000,9999)
2962 return "%d%d%d" %(nowTime,random1,random2)
2964 def _get_file_ID_mix_string(self, seed):
# Deterministically shuffle the alphabet using Youku's seeded PRNG; the
# result is the substitution table used to decode file ids.
2966 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2968 for i in range(len(source)):
2969 seed = (seed * 211 + 30031 ) % 65536
2970 index = math.floor(seed / 65536 * len(source) )
2971 mixed.append(source[int(index)])
2972 source.remove(source[int(index)])
2973 #return ''.join(mixed)
2976 def _get_file_id(self, fileId, seed):
# Map each '*'-separated index through the shuffled alphabet to recover
# the real file id.
2977 mixed = self._get_file_ID_mix_string(seed)
2978 ids = fileId.split('*')
2982 realId.append(mixed[int(ch)])
2983 return ''.join(realId)
2985 def _real_extract(self, url):
2986 mobj = re.match(self._VALID_URL, url)
2988 raise ExtractorError(u'Invalid URL: %s' % url)
2989 video_id = mobj.group('ID')
2991 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2993 jsondata = self._download_webpage(info_url, video_id)
2995 self.report_extraction(video_id)
2997 config = json.loads(jsondata)
2999 video_title = config['data'][0]['title']
3000 seed = config['data'][0]['seed']
# Pick a format: 'best' prefers hd2 when available; 'worst' the opposite.
3002 format = self._downloader.params.get('format', None)
3003 supported_format = list(config['data'][0]['streamfileids'].keys())
3005 if format is None or format == 'best':
3006 if 'hd2' in supported_format:
3011 elif format == 'worst':
# streamfileids holds the obfuscated file id; segs holds per-segment keys.
3019 fileid = config['data'][0]['streamfileids'][format]
3020 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3021 except (UnicodeDecodeError, ValueError, KeyError):
3022 raise ExtractorError(u'Unable to extract info section')
3025 sid = self._gen_sid()
3026 fileid = self._get_file_id(fileid, seed)
3028 #column 8,9 of fileid represent the segment number
3029 #fileid[7:9] should be changed
# One download URL (and info dict) per segment; the segment index is
# spliced into the file id as a two-digit hex number.
3030 for index, key in enumerate(keys):
3032 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3033 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3036 'id': '%s_part%02d' % (video_id, index),
3037 'url': download_url,
3039 'upload_date': None,
3040 'title': video_title,
3043 files_info.append(info)
# Scrapes video URL, title and thumbnail straight out of the watch page
# with the three class-level regexes below.
3048 class XNXXIE(InfoExtractor):
3049 """Information extractor for xnxx.com"""
3051 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3053 VIDEO_URL_RE = r'flv_url=(.*?)&'
3054 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3055 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3057 def _real_extract(self, url):
3058 mobj = re.match(self._VALID_URL, url)
3060 raise ExtractorError(u'Invalid URL: %s' % url)
3061 video_id = mobj.group(1)
3063 # Get webpage content
3064 webpage = self._download_webpage(url, video_id)
# The flv URL is percent-encoded inside the page source, so unquote it.
3066 video_url = self._search_regex(self.VIDEO_URL_RE,
3067 webpage, u'video URL')
3068 video_url = compat_urllib_parse.unquote(video_url)
3070 video_title = self._search_regex(self.VIDEO_TITLE_RE,
# Thumbnail is optional metadata: fatal=False keeps extraction going if
# the regex does not match.
3073 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3074 webpage, u'thumbnail', fatal=False)
3080 'upload_date': None,
3081 'title': video_title,
3083 'thumbnail': video_thumbnail,
3084 'description': None,
3088 class GooglePlusIE(InfoExtractor):
3089 """Information extractor for plus.google.com."""
3091 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3092 IE_NAME = u'plus.google'
3094 def _real_extract(self, url):
3095 # Extract id from URL
3096 mobj = re.match(self._VALID_URL, url)
3098 raise ExtractorError(u'Invalid URL: %s' % url)
3100 post_url = mobj.group(0)
3101 video_id = mobj.group(1)
3103 video_extension = 'flv'
3105 # Step 1, Retrieve post webpage to extract further information
3106 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3108 self.report_extraction(video_id)
3110 # Extract update date
3111 upload_date = self._search_regex('title="Timestamp">(.*?)</a>',
3112 webpage, u'upload date', fatal=False)
3114 # Convert timestring to a format suitable for filename
# Page timestamp is reformatted to the YYYYMMDD convention this module
# uses for 'upload_date'.
3115 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3116 upload_date = upload_date.strftime('%Y%m%d')
3119 uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>',
3120 webpage, u'uploader', fatal=False)
3123 # Get the first line for title
3124 video_title = self._search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3125 webpage, 'title', default=u'NA')
3127 # Step 2, Stimulate clicking the image box to launch video
# The post page embeds a photos/... URL which hosts the actual player;
# a second download of that page exposes the googlevideo links.
3128 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3129 webpage, u'video page URL')
3130 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3132 # Extract video links on video page
3133 """Extract video links of all sizes"""
3134 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3135 mobj = re.findall(pattern, webpage)
3137 raise ExtractorError(u'Unable to extract video links')
3139 # Sort in resolution
# Tuples sort by their first element (the numeric resolution string), so
# the last entry is the highest resolution.
3140 links = sorted(mobj)
3142 # Choose the lowest of the sort, i.e. highest resolution
3143 video_url = links[-1]
3144 # Only get the url. The resolution part in the tuple has no use anymore
3145 video_url = video_url[-1]
3146 # Treat escaped \u0026 style hex
# Python 2 str has .decode; on Python 3 the AttributeError path re-encodes
# and decodes via the unicode-escape codec instead.
3148 video_url = video_url.decode("unicode_escape")
3149 except AttributeError: # Python 3
3150 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3156 'uploader': uploader,
3157 'upload_date': upload_date,
3158 'title': video_title,
3159 'ext': video_extension,
# Extractor for nba.com video pages; the direct mp4 URL is derived from
# the URL path rather than scraped from the page.
3162 class NBAIE(InfoExtractor):
3163 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
3166 def _real_extract(self, url):
3167 mobj = re.match(self._VALID_URL, url)
3169 raise ExtractorError(u'Invalid URL: %s' % url)
3171 video_id = mobj.group(1)
3173 webpage = self._download_webpage(url, video_id)
# The CDN URL is fully determined by the path captured from the page URL.
3175 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Last path component doubles as a short id and the title fallback.
3177 shortened_video_id = video_id.rpartition('/')[2]
3178 title = self._search_regex(r'<meta property="og:title" content="(.*?)"',
3179 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3181 # It isn't there in the HTML it returns to us
3182 # uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3184 description = self._search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
3187 'id': shortened_video_id,
3191 # 'uploader_date': uploader_date,
3192 'description': description,
3196 class JustinTVIE(InfoExtractor):
3197 """Information extractor for justin.tv and twitch.tv"""
3198 # TODO: One broadcast may be split into multiple videos. The key
3199 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3200 # starts at 1 and increases. Can we treat all parts as one video?
# Three URL shapes are handled: a bare channel, /b/<videoid> (broadcast)
# and /c/<chapterid> (chapter).
3202 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3204 (?P<channelid>[^/]+)|
3205 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3206 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
# API pages are fetched in batches of this many clips.
3210 _JUSTIN_PAGE_LIMIT = 100
3211 IE_NAME = u'justin.tv'
3213 def report_download_page(self, channel, offset):
3214 """Report attempt to download a single page of videos."""
3215 self.to_screen(u'%s: Downloading video information from %d to %d' %
3216 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3218 # Return count of items, list of *valid* items
3219 def _parse_page(self, url, video_id):
# Fetches one API page and converts each clip dict into this module's
# info-dict shape. A non-list response is the API's error envelope.
3220 webpage = self._download_webpage(url, video_id,
3221 u'Downloading video info JSON',
3222 u'unable to download video info JSON')
3224 response = json.loads(webpage)
3225 if type(response) != list:
3226 error_text = response.get('error', 'unknown error')
3227 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3229 for clip in response:
3230 video_url = clip['video_file_url']
3232 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-like; strip the dashes from the date part to get
# YYYYMMDD.
3233 video_date = re.sub('-', '', clip['start_time'][:10])
3234 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3235 video_id = clip['id']
3236 video_title = clip.get('title', video_id)
3240 'title': video_title,
3241 'uploader': clip.get('channel_name', video_uploader_id),
3242 'uploader_id': video_uploader_id,
3243 'upload_date': video_date,
3244 'ext': video_extension,
3246 return (len(response), info)
3248 def _real_extract(self, url):
3249 mobj = re.match(self._VALID_URL, url)
3251 raise ExtractorError(u'invalid URL: %s' % url)
3253 api_base = 'http://api.justin.tv'
3255 if mobj.group('channelid'):
3257 video_id = mobj.group('channelid')
3258 api = api_base + '/channel/archives/%s.json' % video_id
3259 elif mobj.group('chapterid'):
# Chapter URLs need three requests: the HTML page (for the archive id),
# the by_chapter XML (for the file URL) and the kraken JSON (metadata).
3260 chapter_id = mobj.group('chapterid')
3262 webpage = self._download_webpage(url, chapter_id)
3263 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3265 raise ExtractorError(u'Cannot find archive of a chapter')
3266 archive_id = m.group(1)
3268 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3269 chapter_info_xml = self._download_webpage(api, chapter_id,
3270 note=u'Downloading chapter information',
3271 errnote=u'Chapter information download failed')
3272 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3273 for a in doc.findall('.//archive'):
3274 if archive_id == a.find('./id').text:
3277 raise ExtractorError(u'Could not find chapter in chapter information')
3279 video_url = a.find('./video_file_url').text
3280 video_ext = video_url.rpartition('.')[2] or u'flv'
3282 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3283 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3284 note='Downloading chapter metadata',
3285 errnote='Download of chapter metadata failed')
3286 chapter_info = json.loads(chapter_info_json)
3288 bracket_start = int(doc.find('.//bracket_start').text)
3289 bracket_end = int(doc.find('.//bracket_end').text)
3291 # TODO determine start (and probably fix up file)
3292 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3293 #video_url += u'?start=' + TODO:start_timestamp
3294 # bracket_start is 13290, but we want 51670615
3295 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3296 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3299 'id': u'c' + chapter_id,
3302 'title': chapter_info['title'],
3303 'thumbnail': chapter_info['preview'],
3304 'description': chapter_info['description'],
3305 'uploader': chapter_info['channel']['display_name'],
3306 'uploader_id': chapter_info['channel']['name'],
3310 video_id = mobj.group('videoid')
3311 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3313 self.report_extraction(video_id)
# Page through the API; a short page (count != limit) means we reached
# the end of the archive list.
3317 limit = self._JUSTIN_PAGE_LIMIT
3320 self.report_download_page(video_id, offset)
3321 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3322 page_count, page_info = self._parse_page(page_url, video_id)
3323 info.extend(page_info)
3324 if not paged or page_count != limit:
# Extractor for funnyordie.com: video URL from the <video>/<source> tags,
# title from the player heading with a <title>-tag fallback.
3329 class FunnyOrDieIE(InfoExtractor):
3330 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3332 def _real_extract(self, url):
3333 mobj = re.match(self._VALID_URL, url)
3335 raise ExtractorError(u'invalid URL: %s' % url)
3337 video_id = mobj.group('id')
3338 webpage = self._download_webpage(url, video_id)
3340 video_url = self._search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3341 webpage, u'video URL', flags=re.DOTALL)
3342 video_url = unescapeHTML(video_url)
# _search_regex accepts a tuple of patterns and uses the first that
# matches; the <title> tag is the fallback.
3344 title = self._search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3345 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3346 title = clean_html(title)
3348 video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3349 webpage, u'description', fatal=False, flags=re.DOTALL)
3350 if video_description: video_description = unescapeHTML(video_description)
3357 'description': video_description,
# Extractor for Steam store video pages; returns a playlist of all movies
# found on a game's agecheck-bypassed page.
3361 class SteamIE(InfoExtractor):
3362 _VALID_URL = r"""http://store\.steampowered\.com/
3364 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3366 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3370 def suitable(cls, url):
3371 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose-mode regex and needs the
# re.VERBOSE flag to match.
3372 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3374 def _real_extract(self, url):
3375 m = re.match(self._VALID_URL, url, re.VERBOSE)
3376 gameID = m.group('gameID')
# The agecheck URL with a fixed birth date bypasses Steam's age gate.
3377 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3378 self.report_age_confirmation()
3379 webpage = self._download_webpage(videourl, gameID)
3380 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3382 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3383 mweb = re.finditer(urlRE, webpage)
3384 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3385 titles = re.finditer(namesRE, webpage)
3386 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3387 thumbs = re.finditer(thumbsRE, webpage)
# The three finditer streams are assumed to be in the same page order, so
# zipping them pairs each movie with its title and thumbnail.
3389 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3390 video_id = vid.group('videoID')
3391 title = vtitle.group('videoName')
3392 video_url = vid.group('videoURL')
3393 video_thumb = thumb.group('thumbnail')
3395 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3400 'title': unescapeHTML(title),
3401 'thumbnail': video_thumb
3404 return [self.playlist_result(videos, gameID, game_title)]
# Extractor for recorded ustream.tv videos; the media URL is built from
# the video id, metadata is scraped from the page.
3406 class UstreamIE(InfoExtractor):
3407 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3408 IE_NAME = u'ustream'
3410 def _real_extract(self, url):
3411 m = re.match(self._VALID_URL, url)
3412 video_id = m.group('videoID')
# Direct CDN URL keyed only by video id — no scraping needed for the file.
3414 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3415 webpage = self._download_webpage(url, video_id)
3417 self.report_extraction(video_id)
3419 video_title = self._search_regex(r'data-title="(?P<title>.+)"',
3422 uploader = self._search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3423 webpage, u'uploader', fatal=False, flags=re.DOTALL)
3424 if uploader: uploader = unescapeHTML(uploader.strip())
3426 thumbnail = self._search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3427 webpage, u'thumbnail', fatal=False)
3433 'title': video_title,
3434 'uploader': uploader,
3435 'thumbnail': thumbnail,
3439 class WorldStarHipHopIE(InfoExtractor):
3440 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3441 IE_NAME = u'WorldStarHipHop'
3443 def _real_extract(self, url):
3444 m = re.match(self._VALID_URL, url)
3445 video_id = m.group('id')
3447 webpage_src = self._download_webpage(url, video_id)
# The flash player receives the file URL via so.addVariable("file", ...).
3449 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3450 webpage_src, u'video URL')
3452 if 'mp4' in video_url:
3457 video_title = self._search_regex(r"<title>(.*)</title>",
3458 webpage_src, u'title')
3460 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3461 thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />',
3462 webpage_src, u'thumbnail', fatal=False)
# WSHH "candy" pages keep the real title in a candytitles span; prefer it
# over the generic <title> when present.
3465 _title = r"""candytitles.*>(.*)</span>"""
3466 mobj = re.search(_title, webpage_src)
3467 if mobj is not None:
3468 video_title = mobj.group(1)
3473 'title' : video_title,
3474 'thumbnail' : thumbnail,
# Extractor for rbmaradio.com shows: all metadata comes from an inline
# JSON blob assigned to gon.show in a <script> tag.
3479 class RBMARadioIE(InfoExtractor):
3480 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3482 def _real_extract(self, url):
3483 m = re.match(self._VALID_URL, url)
3484 video_id = m.group('videoID')
3486 webpage = self._download_webpage(url, video_id)
3488 json_data = self._search_regex(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>',
3489 webpage, u'json data')
3492 data = json.loads(json_data)
3493 except ValueError as e:
3494 raise ExtractorError(u'Invalid JSON: ' + str(e))
# cbr=256 requests the 256 kbit/s stream from the Akamai URL.
3496 video_url = data['akamai_url'] + '&cbr=256'
3497 url_parts = compat_urllib_parse_urlparse(video_url)
3498 video_ext = url_parts.path.rpartition('.')[2]
3503 'title': data['title'],
# Optional fields use dict.get with nested defaults so missing metadata
# yields None instead of a KeyError.
3504 'description': data.get('teaser_text'),
3505 'location': data.get('country_of_origin'),
3506 'uploader': data.get('host', {}).get('name'),
3507 'uploader_id': data.get('host', {}).get('slug'),
3508 'thumbnail': data.get('image', {}).get('large_url_2x'),
3509 'duration': data.get('duration'),
3514 class YouPornIE(InfoExtractor):
3515 """Information extractor for youporn.com."""
3516 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3518 def _print_formats(self, formats):
3519 """Print all available formats"""
3520 print(u'Available formats:')
3521 print(u'ext\t\tformat')
3522 print(u'---------------------------------')
3523 for format in formats:
3524 print(u'%s\t\t%s' % (format['ext'], format['format']))
3526 def _specific(self, req_format, formats):
# Returns the entry from `formats` whose 'format' equals the requested
# format string (selection helper for --format).
3528 if(x["format"]==req_format):
3532 def _real_extract(self, url):
3533 mobj = re.match(self._VALID_URL, url)
3535 raise ExtractorError(u'Invalid URL: %s' % url)
3536 video_id = mobj.group('videoid')
# The site gates content behind an age check; the cookie pre-confirms it.
3538 req = compat_urllib_request.Request(url)
3539 req.add_header('Cookie', 'age_verified=1')
3540 webpage = self._download_webpage(req, video_id)
3542 # Get JSON parameters
3543 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
3545 params = json.loads(json_params)
3547 raise ExtractorError(u'Invalid JSON')
3549 self.report_extraction(video_id)
3551 video_title = params['title']
3552 upload_date = unified_strdate(params['release_date_f'])
3553 video_description = params['description']
3554 video_uploader = params['submitted_by']
3555 thumbnail = params['thumbnails'][0]['image']
3557 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
3559 # Get all of the formats available
3560 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3561 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3562 webpage, u'download list').strip()
3564 # Get all of the links from the page
3565 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3566 links = re.findall(LINK_RE, download_list_html)
3567 if(len(links) == 0):
3568 raise ExtractorError(u'ERROR: no known formats available for video')
3570 self.to_screen(u'Links found: %d' % len(links))
3575 # A link looks like this:
3576 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3577 # A path looks like this:
3578 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# The 5th path component encodes resolution and bitrate (e.g. 480p_370k);
# those two tokens become this entry's 'format' string.
3579 video_url = unescapeHTML( link )
3580 path = compat_urllib_parse_urlparse( video_url ).path
3581 extension = os.path.splitext( path )[1][1:]
3582 format = path.split('/')[4].split('_')[:2]
3585 format = "-".join( format )
3586 title = u'%s-%s-%s' % (video_title, size, bitrate)
3591 'uploader': video_uploader,
3592 'upload_date': upload_date,
3596 'thumbnail': thumbnail,
3597 'description': video_description
# --list-formats short-circuits extraction after printing the table.
3600 if self._downloader.params.get('listformats', None):
3601 self._print_formats(formats)
3604 req_format = self._downloader.params.get('format', None)
3605 self.to_screen(u'Format: %s' % req_format)
# Format selection: formats appear best-first, so 'worst' is the last
# entry; '-1'/'all' returns everything; otherwise match exactly.
3607 if req_format is None or req_format == 'best':
3609 elif req_format == 'worst':
3610 return [formats[-1]]
3611 elif req_format in ('-1', 'all'):
3614 format = self._specific( req_format, formats )
3616 raise ExtractorError(u'Requested format not available')
3621 class PornotubeIE(InfoExtractor):
3622 """Information extractor for pornotube.com."""
3623 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3625 def _real_extract(self, url):
3626 mobj = re.match(self._VALID_URL, url)
3628 raise ExtractorError(u'Invalid URL: %s' % url)
# Both the id and the title come straight from the URL itself.
3630 video_id = mobj.group('videoid')
3631 video_title = mobj.group('title')
3633 # Get webpage content
3634 webpage = self._download_webpage(url, video_id)
3637 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3638 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3639 video_url = compat_urllib_parse.unquote(video_url)
3641 #Get the uploaded date
3642 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
# unified_strdate normalises the scraped date to YYYYMMDD; skipped when
# the regex finds nothing (fatal=False above).
3643 upload_date = self._search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
3644 if upload_date: upload_date = unified_strdate(upload_date)
3646 info = {'id': video_id,
3649 'upload_date': upload_date,
3650 'title': video_title,
3656 class YouJizzIE(InfoExtractor):
3657 """Information extractor for youjizz.com."""
3658 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3660 def _real_extract(self, url):
3661 mobj = re.match(self._VALID_URL, url)
3663 raise ExtractorError(u'Invalid URL: %s' % url)
3665 video_id = mobj.group('videoid')
3667 # Get webpage content
3668 webpage = self._download_webpage(url, video_id)
3670 # Get the video title
3671 video_title = self._search_regex(r'<title>(?P<title>.*)</title>',
3672 webpage, u'title').strip()
3674 # Get the embed page
# The watch page only links to an embed page; the real file URL lives in
# the embed page's flash-player setup, fetched below.
3675 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3677 raise ExtractorError(u'ERROR: unable to extract embed page')
3679 embed_page_url = result.group(0).strip()
3680 video_id = result.group('videoid')
3682 webpage = self._download_webpage(embed_page_url, video_id)
3685 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3686 webpage, u'video URL')
3688 info = {'id': video_id,
3690 'title': video_title,
3693 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: walks the play/next API one track at a
# time until the server reports the last track.
3697 class EightTracksIE(InfoExtractor):
3699 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3701 def _real_extract(self, url):
3702 mobj = re.match(self._VALID_URL, url)
3704 raise ExtractorError(u'Invalid URL: %s' % url)
3705 playlist_id = mobj.group('id')
3707 webpage = self._download_webpage(url, playlist_id)
# Mix metadata is embedded as a JS assignment (PAGE.mix = {...};).
3709 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3710 data = json.loads(json_like)
# A random session token stands in for the player's session id.
3712 session = str(random.randint(0, 1000000000))
3714 track_count = data['tracks_count']
3715 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3716 next_url = first_url
3718 for i in itertools.count():
3719 api_json = self._download_webpage(next_url, playlist_id,
3720 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3721 errnote=u'Failed to download song information')
3722 api_data = json.loads(api_json)
3723 track_data = api_data[u'set']['track']
3725 'id': track_data['id'],
3726 'url': track_data['track_file_stream_url'],
3727 'title': track_data['performer'] + u' - ' + track_data['name'],
3728 'raw_title': track_data['name'],
3729 'uploader_id': data['user']['login'],
# 'at_last_track' in the API response terminates the loop; otherwise the
# next request passes the previous track id.
3733 if api_data['set']['at_last_track']:
3735 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: video and thumbnail URLs are derived from the
# id alone; title and uploader are scraped from the page.
3738 class KeekIE(InfoExtractor):
3739 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3742 def _real_extract(self, url):
3743 m = re.match(self._VALID_URL, url)
3744 video_id = m.group('videoID')
3746 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3747 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3748 webpage = self._download_webpage(url, video_id)
3750 video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3752 video_title = unescapeHTML(video_title)
3754 uploader = self._search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3755 webpage, u'uploader', fatal=False)
3756 if uploader: uploader = clean_html(uploader)
3762 'title': video_title,
3763 'thumbnail': thumbnail,
3764 'uploader': uploader
# Extractor for ted.com. Handles both single talks and playlists; the
# verbose _VALID_URL distinguishes them via the type_* named groups.
3768 class TEDIE(InfoExtractor):
3769 _VALID_URL=r'''http://www\.ted\.com/
3771 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3773 ((?P<type_talk>talks)) # We have a simple talk
3775 (/lang/(.*?))? # The url may contain the language
3776 /(?P<name>\w+) # Here goes the name and then ".html"
3780 def suitable(cls, url):
3781 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose-mode regex.
3782 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3784 def _real_extract(self, url):
3785 m=re.match(self._VALID_URL, url, re.VERBOSE)
3786 if m.group('type_talk'):
3787 return [self._talk_info(url)]
3789 playlist_id=m.group('playlist_id')
3790 name=m.group('name')
3791 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3792 return [self._playlist_videos_info(url,name,playlist_id)]
3794 def _talk_video_link(self,mediaSlug):
3795 '''Returns the video link for that mediaSlug'''
3796 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3798 def _playlist_videos_info(self,url,name,playlist_id=0):
3799 '''Returns the videos of the playlist'''
3801 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3802 ([.\s]*?)data-playlist_item_id="(\d+)"
3803 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3805 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3806 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3807 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3808 m_names=re.finditer(video_name_RE,webpage)
3810 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3811 m_playlist = re.search(playlist_RE, webpage)
3812 playlist_title = m_playlist.group('playlist_title')
# Each playlist entry is delegated back to this extractor ('TED') via a
# url_result, so talks are parsed uniformly.
3814 playlist_entries = []
3815 for m_video, m_name in zip(m_videos,m_names):
3816 video_id=m_video.group('video_id')
3817 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3818 playlist_entries.append(self.url_result(talk_url, 'TED'))
3819 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3821 def _talk_info(self, url, video_id=0):
3822 """Return the video for the talk in the url"""
3823 m=re.match(self._VALID_URL, url,re.VERBOSE)
3824 videoName=m.group('name')
3825 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3826 # If the url includes the language we get the title translated
3827 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3828 title=re.search(title_RE, webpage).group('title')
# talkDetails is an inline JS object; videoID and mediaSlug are pulled
# out of it, and the mediaSlug maps to the download.ted.com mp4 URL.
3829 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3830 "id":(?P<videoID>[\d]+).*?
3831 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3832 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3833 thumb_match=re.search(thumb_RE,webpage)
3834 info_match=re.search(info_RE,webpage,re.VERBOSE)
3835 video_id=info_match.group('videoID')
3836 mediaSlug=info_match.group('mediaSlug')
3837 video_url=self._talk_video_link(mediaSlug)
3843 'thumbnail': thumb_match.group('thumbnail')
# Extractor for myspass.de: all metadata comes from a dedicated XML
# endpoint keyed by the video id found at the end of the URL path.
3847 class MySpassIE(InfoExtractor):
3848 _VALID_URL = r'http://www.myspass.de/.*'
3850 def _real_extract(self, url):
3851 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3853 # video id is the last path element of the URL
3854 # usually there is a trailing slash, so also try the second but last
3855 url_path = compat_urllib_parse_urlparse(url).path
3856 url_parent_path, video_id = os.path.split(url_path)
3858 _, video_id = os.path.split(url_parent_path)
3861 metadata_url = META_DATA_URL_TEMPLATE % video_id
3862 metadata_text = self._download_webpage(metadata_url, video_id)
3863 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3865 # extract values from metadata
# url_flv and title are mandatory; missing elements abort extraction.
3866 url_flv_el = metadata.find('url_flv')
3867 if url_flv_el is None:
3868 raise ExtractorError(u'Unable to extract download url')
3869 video_url = url_flv_el.text
3870 extension = os.path.splitext(video_url)[1][1:]
3871 title_el = metadata.find('title')
3872 if title_el is None:
3873 raise ExtractorError(u'Unable to extract title')
3874 title = title_el.text
# format_id, description and imagePreview are optional extras.
3875 format_id_el = metadata.find('format_id')
3876 if format_id_el is None:
3879 format = format_id_el.text
3880 description_el = metadata.find('description')
3881 if description_el is not None:
3882 description = description_el.text
3885 imagePreview_el = metadata.find('imagePreview')
3886 if imagePreview_el is not None:
3887 thumbnail = imagePreview_el.text
3896 'thumbnail': thumbnail,
3897 'description': description
# Extractor for spiegel.de videos: title from the HTML page, file name and
# duration from a per-video XML document on video2.spiegel.de.
3901 class SpiegelIE(InfoExtractor):
3902 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3904 def _real_extract(self, url):
3905 m = re.match(self._VALID_URL, url)
3906 video_id = m.group('videoID')
3908 webpage = self._download_webpage(url, video_id)
3910 video_title = self._search_regex(r'<div class="module-title">(.*?)</div>',
3912 video_title = unescapeHTML(video_title)
3914 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3915 xml_code = self._download_webpage(xml_url, video_id,
3916 note=u'Downloading XML', errnote=u'Failed to download XML')
3918 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last child element of the document is taken as the variant to
# download; its filename/duration children describe the file.
3919 last_type = idoc[-1]
3920 filename = last_type.findall('./filename')[0].text
3921 duration = float(last_type.findall('./duration')[0].text)
3923 video_url = 'http://video2.spiegel.de/flash/' + filename
3924 video_ext = filename.rpartition('.')[2]
3929 'title': video_title,
3930 'duration': duration,
3934 class LiveLeakIE(InfoExtractor):
# Extractor for liveleak.com view pages; file URL comes from the player
# config, title/description from OpenGraph meta tags.
3936 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3937 IE_NAME = u'liveleak'
3939 def _real_extract(self, url):
3940 mobj = re.match(self._VALID_URL, url)
3942 raise ExtractorError(u'Invalid URL: %s' % url)
3944 video_id = mobj.group('video_id')
3946 webpage = self._download_webpage(url, video_id)
3948 video_url = self._search_regex(r'file: "(.*?)",',
3949 webpage, u'video URL')
# og:title carries a site prefix that is stripped for a clean title.
3951 video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3953 video_title = unescapeHTML(video_title).replace('LiveLeak.com -', '').strip()
3955 video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3956 webpage, u'description', fatal=False)
3957 if video_description: video_description = unescapeHTML(video_description)
3959 video_uploader = self._search_regex(r'By:.*?(\w+)</a>',
3960 webpage, u'uploader', fatal=False)
3966 'title': video_title,
3967 'description': video_description,
3968 'uploader': video_uploader
# Extractor for the ARD Mediathek. Streams are announced to the flash
# player via mediaCollection.addMediaStream(...) calls, parsed below.
3973 class ARDIE(InfoExtractor):
3974 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3975 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3976 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3978 def _real_extract(self, url):
3979 # determine video id from url
# Prefer the numeric documentId query parameter when present; otherwise
# fall back to the last path component.
3980 m = re.match(self._VALID_URL, url)
3982 numid = re.search(r'documentId=([0-9]+)', url)
3984 video_id = numid.group(1)
3986 video_id = m.group('video_id')
3988 # determine title and media streams from webpage
3989 html = self._download_webpage(url, video_id)
3990 title = re.search(self._TITLE, html).group('title')
3991 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# Pages with an "fsk" marker are age-restricted and only streamed after
# 20:00 German time.
3993 assert '"fsk"' in html
3994 raise ExtractorError(u'This video is only available after 8:00 pm')
3996 # choose default media type and highest quality for now
3997 stream = max([s for s in streams if int(s["media_type"]) == 0],
3998 key=lambda s: int(s["quality"]))
4000 # there's two possibilities: RTMP stream or HTTP download
4001 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4002 if stream['rtmp_url']:
4003 self.to_screen(u'RTMP download detected')
4004 assert stream['video_url'].startswith('mp4:')
# For RTMP, 'url' is the server and 'play_path' the mp4: stream name.
4005 info["url"] = stream["rtmp_url"]
4006 info["play_path"] = stream['video_url']
4008 assert stream["video_url"].endswith('.mp4')
4009 info["url"] = stream["video_url"]
4012 class TumblrIE(InfoExtractor):
4013 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4015 def _real_extract(self, url):
4016 m_url = re.match(self._VALID_URL, url)
4017 video_id = m_url.group('id')
4018 blog = m_url.group('blog_name')
# Normalise to the canonical post URL before downloading.
4020 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4021 webpage = self._download_webpage(url, video_id)
# The player markup is embedded JS-escaped (\x22 for quotes), hence the
# \\x22 delimiters in the pattern.
4023 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4024 video = re.search(re_video, webpage)
4026 raise ExtractorError(u'Unable to extract video')
4027 video_url = video.group('video_url')
4028 ext = video.group('ext')
4030 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
4031 webpage, u'thumbnail', fatal=False) # We pick the first poster
4032 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
4034 # The only place where you can get a title, it's not complete,
4035 # but searching in other places doesn't work for all videos
4036 video_title = self._search_regex(r'<title>(?P<title>.*?)</title>',
4037 webpage, u'title', flags=re.DOTALL)
4038 video_title = unescapeHTML(video_title)
4040 return [{'id': video_id,
4042 'title': video_title,
4043 'thumbnail': video_thumbnail,
class BandcampIE(InfoExtractor):
    """Information Extractor for free Bandcamp track downloads."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # The track id lives inside the inline TralbumData javascript object.
        # NOTE(review): `id` shadows the builtin of the same name.
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                       webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        #We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        #in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        # NOTE(review): this dict appears truncated — 'url'/'ext' entries, the
        # closing brace, and the return statement are not visible in this chunk.
        track_info = {'id':id,
                      'title' : info[u'title'],
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an `if mobj is None:` guard appears to be elided
        # before this raise in the visible chunk.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_extension = 'mp4'  # the site serves plain MP4 files
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Direct file URL from the HTML5 <source> tag.
        video_url = self._search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
        # NOTE(review): the tail of this call and the `return [{...}]` opening
        # are not visible in this chunk; only these dict entries remain.
            'ext': video_extension,
            'title': video_title,
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Metadata (including the MP4 URL) is published as an MRSS feed.
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
        # NOTE(review): the tail of this call and the `return [{...}]` opening
        # are not visible in this chunk; only these dict entries remain.
            'ext': video_extension,
            'title': video_title,
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Rebuild a canonical webpage URL from the numeric id.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # The og:title meta may use either double or single quotes.
        video_title = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
        # NOTE(review): the tail of this call is not visible in this chunk.
        video_description = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        # NOTE(review): the `return [{ 'id': ..., 'url': ...` opening appears
        # elided here; only these dict entries remain.
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # Direct stream URL comes from the twitter:player:stream meta tag.
        video_url = self._search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
        # NOTE(review): the tail of this call is not visible in this chunk.
        thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        # NOTE(review): the `return [{ 'id': ..., 'url': ...` opening appears
        # elided here; only these dict entries remain.
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-photo secret is required by both video XML endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        # NOTE(review): an `if mobj is None:` guard appears to be elided
        # before this raise in the visible chunk.
        raise ExtractorError(u'Unable to extract video url')
        # Final URL is the app prefix joined to the HTML-unescaped full path.
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        # NOTE(review): the `return [{ 'id': ..., 'url': ...` opening appears
        # elided here; only these dict entries remain.
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an `if mobj is None:` guard appears to be elided
        # before this raise in the visible chunk.
        raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id is only available inside the page markup.
        video_id = self._search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
        # NOTE(review): the tail of this call is not visible in this chunk.
        thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # Per-video XML manifest that carries the actual media URLs.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._search_regex(r'<file type="high".*?>(.*?)</file>',
        # NOTE(review): the tail of this call and the `return [{...}]` opening
        # are not visible in this chunk; only these dict entries remain.
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Canonical page URL rebuilt from the numeric movie id.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # Player config: optional 'srv' server prefix plus a 'file' component.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        # NOTE(review): an `if mobj is None:` guard appears to be elided
        # before this raise in the visible chunk.
        raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # Empty server: 'file' is already a percent-encoded absolute URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        # NOTE(review): an `else:` line appears to be elided here.
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
        # NOTE(review): the tail of this call is not visible in this chunk.
        video_title = unescapeHTML(video_title)

        # Can't see the description anywhere in the UI
        # video_description = self._search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        # webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        # NOTE(review): the `if mobj:` / `else:` lines wrapping the two
        # branches below appear to be elided; the second branch is the
        # match-failed fallback (YYYYMMDD vs. None + warning).
        video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        video_upload_date = None
        self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        # NOTE(review): the `return [{ 'id': ..., 'url': ...` opening appears
        # elided here; only these dict entries remain.
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an `if mobj is None:` guard appears to be elided
        # before this raise in the visible chunk.
        raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # A timestamped query string is appended to the track page request.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # Keep the session cookie; the /serve endpoint below requires it.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        # NOTE(review): a try/except wrapping this JSON parse appears to be
        # elided — as shown the raise below would be unconditional.
        track_list = json.loads(html_tracks)
        track = track_list[u'tracks'][0]
        raise ExtractorError(u'Hypemachine contained invalid JSON.')

        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # NOTE(review): `key` is not defined in the visible lines — its
        # assignment (presumably from `track`) appears to be elided.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        # NOTE(review): a try/except wrapping this parse also appears elided.
        song_data = json.loads(song_data_json)
        raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE(review): the `return [` opening and most list entries are not
    # visible in this chunk; only three extractor instantiations remain.
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name

    ``ie_name`` is the class name without the ``IE`` suffix (e.g.
    ``'Youtube'`` resolves the ``YoutubeIE`` class defined in this
    module).  Raises KeyError when no such extractor exists.
    """
    class_name = '%sIE' % ie_name
    return globals()[class_name]