2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
75 def __init__(self, downloader=None):
76 """Constructor. Receives an optional downloader."""
78 self.set_downloader(downloader)
81 def suitable(cls, url):
82 """Receives a URL and returns True if suitable for this IE."""
83 return re.match(cls._VALID_URL, url) is not None
87 """Getter method for _WORKING."""
91 """Initializes an instance (authentication, etc)."""
93 self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): upstream versions call self.initialize() here first
        # (lazy auth/age-gate setup) — that line appears elided from this copy;
        # confirm before relying on this method alone.
        return self._real_extract(url)
101 def set_downloader(self, downloader):
102 """Sets the downloader for this IE."""
103 self._downloader = downloader
105 def _real_initialize(self):
106 """Real initialization process. Redefine in subclasses."""
109 def _real_extract(self, url):
110 """Real extraction process. Redefine in subclasses."""
115 return type(self).__name__[:-2]
117 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
118 """ Returns the response handle """
120 self.report_download_webpage(video_id)
121 elif note is not False:
122 self.to_screen(u'%s: %s' % (video_id, note))
124 return compat_urllib_request.urlopen(url_or_request)
125 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
127 errnote = u'Unable to download webpage'
128 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
130 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
131 """ Returns a tuple (page content as string, URL handle) """
132 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
133 content_type = urlh.headers.get('Content-Type', '')
134 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
136 encoding = m.group(1)
139 webpage_bytes = urlh.read()
140 if self._downloader.params.get('dump_intermediate_pages', False):
142 url = url_or_request.get_full_url()
143 except AttributeError:
145 self.to_screen(u'Dumping request to ' + url)
146 dump = base64.b64encode(webpage_bytes).decode('ascii')
147 self._downloader.to_screen(dump)
148 content = webpage_bytes.decode(encoding, 'replace')
149 return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
155 def to_screen(self, msg):
156 """Print msg to screen, prefixing it with '[ie_name]'"""
157 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
159 def report_extraction(self, id_or_name):
160 """Report information extraction."""
161 self.to_screen(u'%s: Extracting information' % id_or_name)
163 def report_download_webpage(self, video_id):
164 """Report webpage download."""
165 self.to_screen(u'%s: Downloading webpage' % video_id)
167 def report_age_confirmation(self):
168 """Report attempt to confirm age."""
169 self.to_screen(u'Confirming age')
    # Methods for following #608.
    # They set the correct value of the '_type' key.
173 def video_result(self, video_info):
174 """Returns a video"""
175 video_info['_type'] = 'video'
177 def url_result(self, url, ie=None):
178 """Returns a url that points to a page that should be processed"""
179 #TODO: ie should be the class used for getting the info
180 video_info = {'_type': 'url',
184 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
185 """Returns a playlist"""
186 video_info = {'_type': 'playlist',
189 video_info['id'] = playlist_id
191 video_info['title'] = playlist_title
194 def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
196 Perform a regex search on the given string, using a single or a list of
197 patterns returning the first matching group.
198 In case of failure return a default value or raise a WARNING or a
199 ExtractorError, depending on fatal, specifying the field name.
201 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
202 mobj = re.search(pattern, string, flags)
205 mobj = re.search(p, string, flags)
208 if sys.stderr.isatty() and os.name != 'nt':
209 _name = u'\033[0;34m%s\033[0m' % name
214 # return the first matching group
215 return next(g for g in mobj.groups() if g is not None)
216 elif default is not None:
219 raise ExtractorError(u'Unable to extract %s; '
220 u'please report this issue on GitHub.' % _name)
222 self._downloader.report_warning(u'unable to extract %s; '
223 u'please report this issue on GitHub.' % _name)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
234 def _make_valid_url(cls):
235 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
238 def suitable(cls, url):
239 return re.match(cls._make_valid_url(), url) is not None
241 def _real_extract(self, query):
242 mobj = re.match(self._make_valid_url(), query)
244 raise ExtractorError(u'Invalid search query "%s"' % query)
246 prefix = mobj.group('prefix')
247 query = mobj.group('query')
249 return self._get_n_results(query, 1)
250 elif prefix == 'all':
251 return self._get_n_results(query, self._MAX_RESULTS)
255 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
256 elif n > self._MAX_RESULTS:
257 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
258 n = self._MAX_RESULTS
259 return self._get_n_results(query, n)
261 def _get_n_results(self, query, n):
262 """Get a specified number of results for a query"""
263 raise NotImplementedError("This method must be implemented by sublclasses")
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""
271 (?:https?://)? # http(s):// (optional)
272 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
273 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
274 (?:.*?\#/)? # handle anchor (#/) redirect urls
275 (?: # the various things that can precede the ID:
276 (?:(?:v|embed|e)/) # v/ or embed/ or e/
277 |(?: # or the v= param in all its forms
278 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
279 (?:\?|\#!?) # the params delimiter ? or # or #!
280 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
283 )? # optional -> youtube.com/xxxx is OK
284 )? # all until now is optional -> you can pass the naked ID
285 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
286 (?(1).+)? # if we found the ID, everything can follow
    # hl=en / gl=US pin the site to English so scraped strings are stable.
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the target of a next_url= redirect parameter.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
296 _video_extensions = {
302 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
308 _video_dimensions = {
327 def suitable(cls, url):
328 """Receives a URL and returns True if suitable for this IE."""
329 if YoutubePlaylistIE.suitable(url): return False
330 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
332 def report_lang(self):
333 """Report attempt to set language."""
334 self.to_screen(u'Setting language')
336 def report_login(self):
337 """Report attempt to log in."""
338 self.to_screen(u'Logging in')
340 def report_video_webpage_download(self, video_id):
341 """Report attempt to download video webpage."""
342 self.to_screen(u'%s: Downloading video webpage' % video_id)
344 def report_video_info_webpage_download(self, video_id):
345 """Report attempt to download video info webpage."""
346 self.to_screen(u'%s: Downloading video info webpage' % video_id)
    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitles."""
        # (Original docstring said "download video info webpage" — a
        # copy-paste error; the message below is what it actually reports.)
        self.to_screen(u'%s: Checking available subtitles' % video_id)
    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report the download of subtitles for one language/format."""
        # (Original docstring said "download video info webpage" — a
        # copy-paste error; corrected to match the message below.)
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
356 def report_video_subtitles_available(self, video_id, sub_lang_list):
357 """Report available subtitles."""
358 sub_lang = ",".join(list(sub_lang_list.keys()))
359 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
361 def report_information_extraction(self, video_id):
362 """Report attempt to extract video information."""
363 self.to_screen(u'%s: Extracting video information' % video_id)
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        # (Original docstring said "Report extracted video URL" — a
        # copy-paste error; corrected to match the message below.)
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
369 def report_rtmp_download(self):
370 """Indicate the download will use the RTMP protocol."""
371 self.to_screen(u'RTMP download detected')
373 def _get_available_subtitles(self, video_id):
374 self.report_video_subtitles_download(video_id)
375 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
377 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
378 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
379 return (u'unable to download video subtitles: %s' % compat_str(err), None)
380 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
381 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
382 if not sub_lang_list:
383 return (u'video doesn\'t have subtitles', None)
386 def _list_available_subtitles(self, video_id):
387 sub_lang_list = self._get_available_subtitles(video_id)
388 self.report_video_subtitles_available(video_id, sub_lang_list)
390 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
393 (error_message, sub_lang, sub)
395 self.report_video_subtitles_request(video_id, sub_lang, format)
396 params = compat_urllib_parse.urlencode({
402 url = 'http://www.youtube.com/api/timedtext?' + params
404 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
405 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
406 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
408 return (u'Did not fetch video subtitles', None, None)
409 return (None, sub_lang, sub)
411 def _request_automatic_caption(self, video_id, webpage):
412 """We need the webpage for getting the captions url, pass it as an
413 argument to speed up the process."""
414 sub_lang = self._downloader.params.get('subtitleslang')
415 sub_format = self._downloader.params.get('subtitlesformat')
416 self.to_screen(u'%s: Looking for automatic captions' % video_id)
417 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
418 err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
420 return [(err_msg, None, None)]
421 player_config = json.loads(mobj.group(1))
423 args = player_config[u'args']
424 caption_url = args[u'ttsurl']
425 timestamp = args[u'timestamp']
426 params = compat_urllib_parse.urlencode({
433 subtitles_url = caption_url + '&' + params
434 sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
435 return [(None, sub_lang, sub)]
437 return [(err_msg, None, None)]
439 def _extract_subtitle(self, video_id):
441 Return a list with a tuple:
442 [(error_message, sub_lang, sub)]
444 sub_lang_list = self._get_available_subtitles(video_id)
445 sub_format = self._downloader.params.get('subtitlesformat')
446 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
447 return [(sub_lang_list[0], None, None)]
448 if self._downloader.params.get('subtitleslang', False):
449 sub_lang = self._downloader.params.get('subtitleslang')
450 elif 'en' in sub_lang_list:
453 sub_lang = list(sub_lang_list.keys())[0]
454 if not sub_lang in sub_lang_list:
455 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
457 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
460 def _extract_all_subtitles(self, video_id):
461 sub_lang_list = self._get_available_subtitles(video_id)
462 sub_format = self._downloader.params.get('subtitlesformat')
463 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
464 return [(sub_lang_list[0], None, None)]
466 for sub_lang in sub_lang_list:
467 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
468 subtitles.append(subtitle)
471 def _print_formats(self, formats):
472 print('Available formats:')
474 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        # NOTE(review): this copy is missing many interior lines (try headers,
        # branch bodies, most of the login form dict). Visible statements are
        # reproduced as-is with the apparent gaps flagged; do not treat this
        # body as runnable without restoring them from upstream.
        if self._downloader is None:
            # [elided: 'return' presumably]

        # [elided: username/password defaults]
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # [elided: 'try:' and the branch reading login/password from info]
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Set language (best effort — failure only warns).
        request = compat_urllib_request.Request(self._LANG_URL)
        # [elided: 'try:' and self.report_lang()]
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        # [elided: early return when no username]

        request = compat_urllib_request.Request(self._LOGIN_URL)
        # [elided: 'try:']
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Scrape the hidden GALX / dsh form tokens out of the login page.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        # [elided: 'if match:' guard]
        galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        # [elided: dsh assignment and the opening of the login_form_strs dict]
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',
        # [elided: remaining form entries and closing brace]

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # [elided: rest of this explanatory comment]
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        # [elided: 'try:' and self.report_login()]
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # The login form reappearing in the response means auth failed.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # Confirm age
        # [elided: opening of the age_form dict]
            'action_confirm': 'Confirm',

        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        # [elided: 'try:']
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
579 def _extract_id(self, url):
580 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
582 raise ExtractorError(u'Invalid URL: %s' % url)
583 video_id = mobj.group(2)
    def _real_extract(self, url):
        # NOTE(review): many control-flow lines (if/else guards, try headers,
        # loop breaks, dict open/close) were elided from this copy. Visible
        # statements are reproduced as-is with gaps flagged; restore from
        # upstream before treating this as runnable.

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        # [elided: 'if mobj:' guard]
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        # [elided: 'try:']
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # [elided: 'if mobj is not None:' / 'else: player_url = None']
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info, trying several 'el' variants until one has a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                # [elided: 'break']
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
            # [elided: 'else:']
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        # [elided: 'if mobj is not None:' / 'else:' around the next two lines]
        video_uploader_id = mobj.group(1)
        self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            # [elided: fallback thumbnail assignment]
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        # [elided: 'upload_date = None' and 'if mobj is not None:' guard]
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        # [elided: 'else:' branch with its own 'if fd_mobj:'/'else:' nesting]
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        video_description = unescapeHTML(fd_mobj.group(1))
        video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            # [elided: 'if video_subtitles:' / 'if sub_error:' guards]
            (sub_error, sub_lang, sub) = video_subtitles[0]
            # We try with the automatic captions
            video_subtitles = self._request_automatic_caption(video_id, video_webpage)
            (sub_error_auto, sub_lang, sub) = video_subtitles[0]
            # [elided: success/failure branching]
            # We report the original error
            self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                # [elided: 'if sub_error:' guard]
                self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            # [elided: early return after listing]

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            # [elided: fallback duration assignment and 'else:']
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # [elided: 'url_map = {}' initialization]
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
                    if not 'ratebypass' in url: url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            # [elided: 'else:']
            format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                # [elided: early return after listing formats]
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # [elided: 'else:' for the specific-format branch below]
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    # [elided: 'if rf in url_map:' guard and 'break']
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        # [elided: 'else:']
        raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # [elided: 'results = []' initialization]
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            # [elided: 'results.append({' and the 'id' entry]
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
            # [elided: dict close, append close, and 'return results']
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Watch-page URL; group 1 is the video id, group 2 the title slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Fetched in _real_initialize to pick up the disclaimer page.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # Endpoint the family-filter form is POSTed to (see _real_initialize).
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
796 def report_disclaimer(self):
797 """Report disclaimer retrieval."""
798 self.to_screen(u'Retrieving disclaimer')
800 def _real_initialize(self):
801 # Retrieve disclaimer
802 request = compat_urllib_request.Request(self._DISCLAIMER)
804 self.report_disclaimer()
805 disclaimer = compat_urllib_request.urlopen(request).read()
806 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
807 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
812 'submit': "Continue - I'm over 18",
814 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
816 self.report_age_confirmation()
817 disclaimer = compat_urllib_request.urlopen(request).read()
818 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
819 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        # NOTE(review): several 'if mobj is None:' guards, else branches and
        # the final 'return [{' were elided from this copy. Visible statements
        # reproduced as-is with gaps flagged; restore before running.

        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # [elided: 'if mobj is None:' guard]
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # [elided: 'if mobj is not None:' — the direct-mediaURL branch]
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        # [elided: 'if mobj is None: video_url = mediaURL' / 'else:']
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # [elided: 'else:' — the flashvars fallback branch follows]
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        # [elided: 'if mobj is None:' guard]
        raise ExtractorError(u'Unable to extract media URL')
        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            raise ExtractorError(u'Unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
        # [elided: 'if mobj is None:' guard]
        raise ExtractorError(u'Unable to extract media URL')
        mediaURL = mobj.group('mediaURL').replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # [elided: 'if mobj is None:' guard]
        raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        # [elided: 'if mobj is None:' guard]
        raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # [elided: 'return [{' and some surrounding entries]
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Case-insensitive; group 1 captures the raw id segment after /video/.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    def _real_extract(self, url):
        # NOTE(review): several guards ('if mobj is None:', loop break/else,
        # 'return [{') were elided from this copy. Visible statements are
        # reproduced as-is with gaps flagged; restore before running.

        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # [elided: 'if mobj is None:' guard]
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-gated pages are reachable.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        # [elided: 'if mobj is None:' guard]
        raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            # [elided: 'if key in flashvars:' / 'max_quality = key']
            self.to_screen(u'Using %s' % key)
            # [elided: 'break' and the for-else raising on no match]
            raise ExtractorError(u'Unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        # [elided: 'if mobj is None:' guard]
        raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # [elided: 'if mobj is None:' guard]
        raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # [elided: 'if mobj is None:' — fall back to the official-user span]
        # looking for an official user
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.report_warning(u'unable to extract uploader nickname')
        # [elided: 'else:']
        video_uploader = mobj_official.group(1)
        # [elided: 'else:']
        video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # [elided: 'if mobj is not None:' guard]
        # DD-MM-YYYY page format rearranged to the YYYYMMDD convention.
        video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # [elided: 'return [{' and the 'id'/'url' entries]
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extraction process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'
    def _real_extract(self, url):
        # NOTE(review): 'if mobj is None/is not None:' guards and both
        # 'return [{' lines were elided from this copy. Visible statements
        # reproduced as-is with gaps flagged; restore before running.

        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # [elided: 'if mobj is None:' guard]
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        # [elided: 'if mobj is not None:' guard]
        info = json.loads(mobj.group('json'))
        # [elided: 'return [{' and the 'id' entry]
            'url': info[u'downloadUrl'],
            'uploader': info[u'username'],
            'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
            'title': info[u'title'],
            'ext': video_extension,
            'thumbnail': info[u'thumbUrl'],
        # [elided: dict/list close]

        # We try looking in other parts of the webpage
        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
            webpage, u'video URL')

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        # [elided: 'if mobj is None:' guard]
        raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_uploader = mobj.group(2).decode('utf-8')

        # [elided: 'return [{']
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract video info from a screen.yahoo.com page.

        Two back-ends exist: the legacy cosmos.bcst.yahoo.com MRSS feed,
        and (when the page defines a YUI CONTENT_ID) a YQL JSON API.
        Returns a single info dictionary.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # BUGFIX: check for a failed match BEFORE dereferencing the groups;
            # the original called m_rest.group(...) first and would crash with
            # AttributeError instead of the intended ExtractorError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # The response is a JSONP callback; strip the wrapper to get the JSON
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
            'id': video_id,
            'url': video_url,
            # rtmp play path for rtmpdump-style downloads
            'play_path': video_path,
            'title': video_title,
            'description': video_description,
            'thumbnail': video_thumb,
            'upload_date': video_date,
            'ext': 'flv',
        }
        return info_dict
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract a Vimeo video from its page's embedded config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        # vimeopro and direct player links only expose the config on the
        # canonical vimeo.com page, so normalize the URL first
        if mobj.group('direct_link') or mobj.group('pro'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON (embedded between ' = {config:' and ',assets:')
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # narrowed from a bare except: only split/json failures are expected here
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title':       video_title,
            'ext':         video_extension,
            'thumbnail':   video_thumbnail,
            'description': video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, wrapping network errors."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* and return {key: group} per matchTuples.

        matchTuples is a list of (group_index, key, error_message); a missing
        group raises ExtractorError with that message.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the rtmp path/player for a live stream page.

        NOTE(review): the computed video_url is discarded and nothing is
        returned — preserved as-is from the original; _real_extract also
        ignores the result for live URLs.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the videoref chain of an arte+7 page and return an info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':          info.get('id'),
            'url':         compat_urllib_parse.unquote(info.get('url')),
            'uploader':    u'arte.tv',
            'upload_date': unified_strdate(info.get('date')),
            'title':       info.get('title').decode('utf-8'),
            'ext':         u'mp4',
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # live streams are not supported end-to-end; see extractLiveStream
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download (and warn that we are in fallback mode)."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # drop body-related headers; a HEAD has no body
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener with the HEAD-aware handlers
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url:
            return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    video_uploader,
            'upload_date': None,
            'title':       video_title,
            'ext':         video_extension,
        }]
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # gdata pages hold 50 results each; loop until we have enough
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # never ask for more than the API says exist
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % vid, 'Youtube') for vid in video_ids]
        return self.playlist_result(videos, query)
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)

            # stop once we collected enough or there is no "next" link
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return res
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page ' + str(pagenum + 1))
            info = json.loads(webpage)
            # 'm' carries paging metadata (last index / total count)
            m = info[u'm']
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the default
        # InfoExtractor.suitable (which compiles without flags) cannot be used
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # keep (position, url) so the playlist can be ordered afterwards
            videos += [(entry['yt$position']['$t'], entry['content']['src'])
                       for entry in response['feed']['entry']
                       if 'content' in entry]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video ids linked from *page*, in page order."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # the ajax response tells us whether another page exists
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % vid for vid in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title=username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls]
        return [self.playlist_result(url_entries, playlist_title=username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        return [{
            'id':          file_id.decode('utf-8'),
            'url':         file_url.decode('utf-8'),
            'uploader':    None,
            'upload_date': None,
            'title':       file_title,
            'ext':         file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Optionally log in, using --username/--password or ~/.netrc.

        Best-effort: any failure only emits a warning, extraction proceeds
        anonymously.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
        }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # a login <form> in the response means authentication failed
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The video params are urlencoded JSON wedged between these two
        # literal javascript fragments
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # prefer HD, fall back to SD
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')
        video_title = unescapeHTML(video_title)

        info = {
            'id':        video_id,
            'title':     video_title,
            'url':       video_url,
            'ext':       'mp4',
            'duration':  video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
# NOTE(review): sampled listing — guard lines (`if mobj is None:`, `try:`,
# dict openers) are omitted between the numbered lines shown here.
1942 class BlipTVIE(InfoExtractor):
1943 """Information extractor for blip.tv"""
1945 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
1946 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1947 IE_NAME = u'blip.tv'
1949 def report_direct_download(self, title):
1950 """Report information extraction."""
1951 self.to_screen(u'%s: Direct download detected' % title)
1953 def _real_extract(self, url):
1954 mobj = re.match(self._VALID_URL, url)
1956 raise ExtractorError(u'Invalid URL: %s' % url)
1958 # See https://github.com/rg3/youtube-dl/issues/857
# api.swf#ID URLs are rewritten to the /play/ form first.
1959 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1960 if api_mobj is not None:
1961 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1962 urlp = compat_urllib_parse_urlparse(url)
# /play/ URLs redirect; the real file id is carried in the redirect
# target's URL fragment, and extraction restarts recursively on the
# resolved /a/a-<file_id> URL.
1963 if urlp.path.startswith('/play/'):
1964 request = compat_urllib_request.Request(url)
1965 response = compat_urllib_request.urlopen(request)
1966 redirecturl = response.geturl()
1967 rurlp = compat_urllib_parse_urlparse(redirecturl)
1968 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1969 url = 'http://blip.tv/a/a-' + file_id
1970 return self._real_extract(url)
# Ask the page for a JSON rendition of itself; blip.tv serves richer
# metadata to the iTunes user agent.
1977 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1978 request = compat_urllib_request.Request(json_url)
1979 request.add_header('User-Agent', 'iTunes/10.6.1')
1980 self.report_extraction(mobj.group(1))
1983 urlh = compat_urllib_request.urlopen(request)
1984 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
# The URL pointed straight at a media file: derive title/ext from
# the basename instead of parsing JSON.
1985 basename = url.split('/')[-1]
1986 title,ext = os.path.splitext(basename)
1987 title = title.decode('UTF-8')
1988 ext = ext.replace('.', '')
1989 self.report_direct_download(title)
1994 'upload_date': None,
1999 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2000 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2001 if info is None: # Regular URL
2003 json_code_bytes = urlh.read()
2004 json_code = json_code_bytes.decode('utf-8')
2005 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2006 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2009 json_data = json.loads(json_code)
# The JSON may be wrapped in a 'Post' envelope or be the data itself.
2010 if 'Post' in json_data:
2011 data = json_data['Post']
# Upload date arrives as e.g. '06-24-12 08:53AM' and is normalized to
# the YYYYMMDD form expected by the info dict.
2015 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2016 video_url = data['media']['url']
# Filename extension is taken from the media URL via _URL_EXT.
2017 umobj = re.match(self._URL_EXT, video_url)
2019 raise ValueError('Can not determine filename extension')
2020 ext = umobj.group(1)
2023 'id': data['item_id'],
2025 'uploader': data['display_name'],
2026 'upload_date': upload_date,
2027 'title': data['title'],
2029 'format': data['media']['mimeType'],
2030 'thumbnail': data['thumbnailUrl'],
2031 'description': data['description'],
2032 'player_url': data['embedUrl'],
2033 'user_agent': 'iTunes/10.6.1',
2035 except (ValueError,KeyError) as err:
2036 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# NOTE(review): sampled listing — several original lines (loop headers,
# `if mobj is None:` guards, return-dict openers) are omitted here.
2041 class MyVideoIE(InfoExtractor):
2042 """Information Extractor for myvideo.de."""
2044 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2045 IE_NAME = u'myvideo'
2047 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2048 # Released into the Public Domain by Tristan Fischer on 2013-05-19
2049 # https://github.com/rg3/youtube-dl/pull/842
# RC4 stream cipher (KSA below, PRGA lines partially omitted in this
# sampled listing); used to decrypt the site's encrypted player XML.
2050 def __rc4crypt(self,data, key):
2052 box = list(range(256))
# Key-scheduling algorithm: permute the 256-entry state box by the key.
2053 for i in list(range(256)):
2054 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2055 box[i], box[x] = box[x], box[i]
2061 y = (y + box[x]) % 256
2062 box[x], box[y] = box[y], box[x]
# XOR each input byte with the generated keystream byte.
2063 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# MD5 helper (enclosing def omitted in this listing); returns the hex
# digest encoded to bytes.
2067 return hashlib.md5(s).hexdigest().encode()
2069 def _real_extract(self,url):
2070 mobj = re.match(self._VALID_URL, url)
2072 raise ExtractorError(u'invalid URL: %s' % url)
2074 video_id = mobj.group(1)
# Doubly base64-encoded key material used to derive the RC4 key below.
2077 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2078 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2079 b'TnpsbA0KTVRkbU1tSTRNdz09'
2083 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2084 webpage = self._download_webpage(webpage_url, video_id)
# Easy case: the page exposes a plain <source src=...> — no decryption
# needed, just rewrite the extension to .flv.
2086 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2087 if mobj is not None:
2088 self.report_extraction(video_id)
2089 video_url = mobj.group(1) + '.flv'
2091 video_title = self._search_regex('<title>([^<]+)</title>',
2094 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
2100 'upload_date': None,
2101 'title': video_title,
# Hard case: parameters live in a JS `flashvars` object; the `_encxml`
# entry is the (URL-encoded) endpoint serving encrypted XML.
2106 mobj = re.search('var flashvars={(.+?)}', webpage)
2108 raise ExtractorError(u'Unable to extract video')
2113 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2114 if not a == '_encxml':
2117 encxml = compat_urllib_parse.unquote(b)
2118 if not params.get('domain'):
2119 params['domain'] = 'www.myvideo.de'
2120 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
# The MTV-flavored player endpoint is avoided in favor of the generic one.
2121 if 'flash_playertype=MTV' in xmldata_url:
2122 self._downloader.report_warning(u'avoiding MTV player')
2124 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2125 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# Response is 'something=<hex>'; the hex payload is RC4-decrypted with a
# key derived from GK (double-base64) and the video id.
2129 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2130 enc_data_b = binascii.unhexlify(enc_data)
2132 base64.b64decode(base64.b64decode(GK)) +
2134 str(video_id).encode('utf-8')
2137 dec_data = self.__rc4crypt(enc_data_b, sk)
2140 self.report_extraction(video_id)
# RTMP case: connectionurl is present in the decrypted XML.
2143 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2145 video_url = compat_urllib_parse.unquote(mobj.group(1))
2146 if 'myvideo2flash' in video_url:
2147 self._downloader.report_warning(u'forcing RTMPT ...')
2148 video_url = video_url.replace('rtmpe://', 'rtmpt://')
2151 # extract non rtmp videos
2152 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2154 raise ExtractorError(u'unable to extract url')
2155 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2157 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2158 video_file = compat_urllib_parse.unquote(video_file)
# Non-f4m files become an RTMP play path 'ext:path'; f4m manifests are
# mapped to their .m3u8 HLS playlist instead.
2160 if not video_file.endswith('f4m'):
2161 ppath, prefix = video_file.split('.')
2162 video_playpath = '%s:%s' % (prefix, ppath)
2163 video_hls_playlist = ''
2166 video_hls_playlist = (
2167 video_filepath + video_file
2168 ).replace('.f4m', '.m3u8')
2170 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2171 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2173 video_title = self._search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
2179 'tc_url': video_url,
2181 'upload_date': None,
2182 'title': video_title,
2184 'play_path': video_playpath,
2185 'video_file': video_file,
2186 'video_hls_playlist': video_hls_playlist,
2187 'player_url': video_swfobj,
# NOTE(review): sampled listing — try/except headers, loop bodies and dict
# openers are omitted between the numbered lines shown here.
2191 class ComedyCentralIE(InfoExtractor):
2192 """Information extractor for The Daily Show and Colbert Report """
2194 # urls can be abbreviations like :thedailyshow or :colbert
2195 # urls for episodes like:
2196 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2197 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2198 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex — suitable()/_real_extract below must pass re.VERBOSE.
2199 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2200 |(https?://)?(www\.)?
2201 (?P<showname>thedailyshow|colbertnation)\.com/
2202 (full-episodes/(?P<episode>.*)|
2204 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2205 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates in ascending order; turls[-1] below picks the highest.
2208 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2210 _video_extensions = {
2218 _video_dimensions = {
# Overrides the base class to match with re.VERBOSE (the pattern above
# contains whitespace/newlines).
2228 def suitable(cls, url):
2229 """Receives a URL and returns True if suitable for this IE."""
2230 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2232 def _print_formats(self, formats):
2233 print('Available formats:')
2235 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2238 def _real_extract(self, url):
2239 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2241 raise ExtractorError(u'Invalid URL: %s' % url)
# Shortname forms (:tds, :colbert, ...) expand to the show's
# full-episodes page, then the URL is re-matched.
2243 if mobj.group('shortname'):
2244 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2245 url = u'http://www.thedailyshow.com/full-episodes/'
2247 url = u'http://www.colbertnation.com/full-episodes/'
2248 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2249 assert mobj is not None
2251 if mobj.group('clip'):
2252 if mobj.group('showname') == 'thedailyshow':
2253 epTitle = mobj.group('tdstitle')
2255 epTitle = mobj.group('cntitle')
2258 dlNewest = not mobj.group('episode')
2260 epTitle = mobj.group('showname')
2262 epTitle = mobj.group('episode')
2264 self.report_extraction(epTitle)
# Follow redirects: the handle's final URL identifies the concrete episode.
2265 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2267 url = htmlHandle.geturl()
2268 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2270 raise ExtractorError(u'Invalid redirected URL: ' + url)
2271 if mobj.group('episode') == '':
2272 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2273 epTitle = mobj.group('episode')
# Locate the mtvnservices media URI embedded in the page.
2275 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2277 if len(mMovieParams) == 0:
2278 # The Colbert Report embeds the information in a without
2279 # a URL prefix; so extract the alternate reference
2280 # and then add the URL prefix manually.
2282 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2283 if len(altMovieParams) == 0:
2284 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2286 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# Download the MRSS index listing the episode's parts.
2288 uri = mMovieParams[0][1]
2289 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2290 indexXml = self._download_webpage(indexUrl, epTitle,
2291 u'Downloading show index',
2292 u'unable to download episode index')
2296 idoc = xml.etree.ElementTree.fromstring(indexXml)
2297 itemEls = idoc.findall('.//item')
# One <item> per part; guid encodes showId:mediaId.
2298 for partNum,itemEl in enumerate(itemEls):
2299 mediaId = itemEl.findall('./guid')[0].text
2300 shortMediaId = mediaId.split(':')[-1]
2301 showId = mediaId.split(':')[-2].replace('.com', '')
2302 officialTitle = itemEl.findall('./title')[0].text
2303 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
# Fetch the per-part mediaGen config listing (bitrate, rtmp url) pairs.
2305 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2306 compat_urllib_parse.urlencode({'uri': mediaId}))
2307 configXml = self._download_webpage(configUrl, epTitle,
2308 u'Downloading configuration for %s' % shortMediaId)
2310 cdoc = xml.etree.ElementTree.fromstring(configXml)
2312 for rendition in cdoc.findall('.//rendition'):
2313 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2317 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2320 if self._downloader.params.get('listformats', None):
2321 self._print_formats([i[0] for i in turls])
2324 # For now, just pick the highest bitrate
2325 format,rtmp_video_url = turls[-1]
2327 # Get the format arg from the arg stream
2328 req_format = self._downloader.params.get('format', None)
2330 # Select format if we can find one
2333 format, rtmp_video_url = f, v
# Translate the RTMP URL into a plain-HTTP mirror on llnwd.net.
2336 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2338 raise ExtractorError(u'Cannot transform RTMP url')
2339 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2340 video_url = base + m.group('finalid')
2342 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2347 'upload_date': officialDate,
2352 'description': officialTitle,
2354 results.append(info)
# NOTE(review): sampled listing — guard/`try:` lines and the return-dict
# opener are omitted between the numbered lines shown here.
2359 class EscapistIE(InfoExtractor):
2360 """Information extractor for The Escapist """
2362 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2363 IE_NAME = u'escapist'
2365 def _real_extract(self, url):
2366 mobj = re.match(self._VALID_URL, url)
2368 raise ExtractorError(u'Invalid URL: %s' % url)
2369 showName = mobj.group('showname')
2370 videoId = mobj.group('episode')
2372 self.report_extraction(showName)
2373 webpage = self._download_webpage(url, showName)
# Description and thumbnail come from <meta> tags; both optional
# (fatal=False), unescaped only when present.
2375 videoDesc = self._search_regex('<meta name="description" content="([^"]*)"',
2376 webpage, u'description', fatal=False)
2377 if videoDesc: videoDesc = unescapeHTML(videoDesc)
2379 imgUrl = self._search_regex('<meta property="og:image" content="([^"]*)"',
2380 webpage, u'thumbnail', fatal=False)
2381 if imgUrl: imgUrl = unescapeHTML(imgUrl)
# og:video points at the SWF player; its 'config=' query parameter is
# the (URL-encoded) JSON config location.
2383 playerUrl = self._search_regex('<meta property="og:video" content="([^"]*)"',
2384 webpage, u'player url')
2385 playerUrl = unescapeHTML(playerUrl)
2387 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2388 configUrl = compat_urllib_parse.unquote(configUrl)
2390 configJSON = self._download_webpage(configUrl, showName,
2391 u'Downloading configuration',
2392 u'unable to download configuration')
2394 # Technically, it's JavaScript, not JSON
# Naive quote swap to coerce the JS object literal into parseable JSON.
2395 configJSON = configJSON.replace("'", '"')
2398 config = json.loads(configJSON)
2399 except (ValueError,) as err:
2400 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
# The media URL is the second playlist entry.
2402 playlist = config['playlist']
2403 videoUrl = playlist[1]['url']
2408 'uploader': showName,
2409 'upload_date': None,
2412 'thumbnail': imgUrl,
2413 'description': videoDesc,
2414 'player_url': playerUrl,
# NOTE(review): sampled listing — `try:` headers, the info-dict opener and
# the final return are omitted between the numbered lines shown here.
2419 class CollegeHumorIE(InfoExtractor):
2420 """Information extractor for collegehumor.com"""
2423 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2424 IE_NAME = u'collegehumor'
2426 def report_manifest(self, video_id):
2427 """Report information extraction."""
2428 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2430 def _real_extract(self, url):
2431 mobj = re.match(self._VALID_URL, url)
2433 raise ExtractorError(u'Invalid URL: %s' % url)
2434 video_id = mobj.group('videoid')
2439 'upload_date': None,
2442 self.report_extraction(video_id)
# First pass: fetch the moogaloop metadata XML for the page's video id.
2443 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2445 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2446 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2447 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2449 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2451 videoNode = mdoc.findall('./video')[0]
2452 info['description'] = videoNode.findall('./description')[0].text
2453 info['title'] = videoNode.findall('./caption')[0].text
2454 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2455 manifest_url = videoNode.findall('./file')[0].text
2457 raise ExtractorError(u'Invalid metadata XML file')
# Second pass: fetch the Adobe HDS (f4m) manifest referenced by <file>.
2459 manifest_url += '?hdcore=2.10.3'
2460 self.report_manifest(video_id)
2462 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2463 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2464 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2466 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m elements live in the Adobe f4m XML namespace.
2468 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2469 node_id = media_node.attrib['url']
2470 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2471 except IndexError as err:
2472 raise ExtractorError(u'Invalid manifest file')
# Rebuild the segment URL from the manifest host + ids.
2474 url_pr = compat_urllib_parse_urlparse(manifest_url)
2475 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): sampled listing — guard lines and the return-dict opener
# are omitted between the numbered lines shown here.
2482 class XVideosIE(InfoExtractor):
2483 """Information extractor for xvideos.com"""
2485 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2486 IE_NAME = u'xvideos'
2488 def _real_extract(self, url):
2489 mobj = re.match(self._VALID_URL, url)
2491 raise ExtractorError(u'Invalid URL: %s' % url)
2492 video_id = mobj.group(1)
2494 webpage = self._download_webpage(url, video_id)
2496 self.report_extraction(video_id)
# Media URL is URL-encoded in the page's flv_url parameter.
2499 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2500 webpage, u'video URL'))
# Title is the <title> text up to the ' - XVID' suffix.
2503 video_title = self._search_regex(r'<title>(.*?)\s+-\s+XVID',
2506 # Extract video thumbnail
2507 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2508 webpage, u'thumbnail', fatal=False)
2514 'upload_date': None,
2515 'title': video_title,
2517 'thumbnail': video_thumbnail,
2518 'description': None,
# NOTE(review): sampled listing — guard lines and the return-dict opener
# are omitted between the numbered lines shown here.
2524 class SoundcloudIE(InfoExtractor):
2525 """Information extractor for soundcloud.com
2526 To access the media, the uid of the song and a stream token
2527 must be extracted from the page source and the script must make
2528 a request to media.soundcloud.com/crossdomain.xml. Then
2529 the media can be grabbed by requesting from an url composed
2530 of the stream token and uid
2533 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2534 IE_NAME = u'soundcloud'
2536 def report_resolve(self, video_id):
2537 """Report information extraction."""
2538 self.to_screen(u'%s: Resolving id' % video_id)
2540 def _real_extract(self, url):
2541 mobj = re.match(self._VALID_URL, url)
2543 raise ExtractorError(u'Invalid URL: %s' % url)
2545 # extract uploader (which is in the url)
2546 uploader = mobj.group(1)
2547 # extract simple title (uploader + slug of song title)
2548 slug_title = mobj.group(2)
2549 simple_title = uploader + u'-' + slug_title
2550 full_title = '%s/%s' % (uploader, slug_title)
2552 self.report_resolve(full_title)
# resolve.json maps the public page URL to the track's API record
# (hard-coded public client_id).
2554 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2555 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2556 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2558 info = json.loads(info_json)
2559 video_id = info['id']
2560 self.report_extraction(full_title)
# Second request: the streams endpoint yields the concrete mp3 URL.
2562 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2563 stream_json = self._download_webpage(streams_url, full_title,
2564 u'Downloading stream definitions',
2565 u'unable to download stream definitions')
2567 streams = json.loads(stream_json)
2568 mediaURL = streams['http_mp3_128_url']
2569 upload_date = unified_strdate(info['created_at'])
2574 'uploader': info['user']['username'],
2575 'upload_date': upload_date,
2576 'title': info['title'],
2578 'description': info['description'],
# NOTE(review): sampled listing — guard lines, the per-track dict opener and
# the final return are omitted between the numbered lines shown here.
# Mirrors SoundcloudIE but resolves a /sets/ URL and yields one info dict
# per track.
2581 class SoundcloudSetIE(InfoExtractor):
2582 """Information extractor for soundcloud.com sets
2583 To access the media, the uid of the song and a stream token
2584 must be extracted from the page source and the script must make
2585 a request to media.soundcloud.com/crossdomain.xml. Then
2586 the media can be grabbed by requesting from an url composed
2587 of the stream token and uid
2590 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2591 IE_NAME = u'soundcloud:set'
2593 def report_resolve(self, video_id):
2594 """Report information extraction."""
2595 self.to_screen(u'%s: Resolving id' % video_id)
2597 def _real_extract(self, url):
2598 mobj = re.match(self._VALID_URL, url)
2600 raise ExtractorError(u'Invalid URL: %s' % url)
2602 # extract uploader (which is in the url)
2603 uploader = mobj.group(1)
2604 # extract simple title (uploader + slug of song title)
2605 slug_title = mobj.group(2)
2606 simple_title = uploader + u'-' + slug_title
2607 full_title = '%s/sets/%s' % (uploader, slug_title)
2609 self.report_resolve(full_title)
# resolve.json maps the set's page URL to its API record (tracks list).
2611 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2612 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2613 info_json = self._download_webpage(resolv_url, full_title)
2616 info = json.loads(info_json)
# API-level errors are reported per entry rather than raised.
2617 if 'errors' in info:
2618 for err in info['errors']:
2619 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2622 self.report_extraction(full_title)
# One streams request per track to obtain its mp3 URL.
2623 for track in info['tracks']:
2624 video_id = track['id']
2626 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2627 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2629 self.report_extraction(video_id)
2630 streams = json.loads(stream_json)
2631 mediaURL = streams['http_mp3_128_url']
2636 'uploader': track['user']['username'],
2637 'upload_date': unified_strdate(track['created_at']),
2638 'title': track['title'],
2640 'description': track['description'],
# NOTE(review): sampled listing — guard lines and the return-dict opener
# are omitted between the numbered lines shown here.
2645 class InfoQIE(InfoExtractor):
2646 """Information extractor for infoq.com"""
2647 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2649 def _real_extract(self, url):
2650 mobj = re.match(self._VALID_URL, url)
2652 raise ExtractorError(u'Invalid URL: %s' % url)
# No id group in _VALID_URL: the full URL doubles as the video id here.
2654 webpage = self._download_webpage(url, video_id=url)
2655 self.report_extraction(url)
# The real media id is base64-encoded in the page's jsclassref variable,
# then URL-unquoted; it is appended to the rtmpe base to form the URL.
2658 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2660 raise ExtractorError(u'Unable to extract video url')
2661 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2662 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2665 video_title = self._search_regex(r'contentTitle = "(.*?)";',
2668 # Extract description
2669 video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2670 webpage, u'description', fatal=False)
# id/extension are recovered from the media URL's basename.
2672 video_filename = video_url.split('/')[-1]
2673 video_id, extension = video_filename.split('.')
2679 'upload_date': None,
2680 'title': video_title,
2681 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2683 'description': video_description,
# NOTE(review): sampled listing — `try:` headers, loop bodies, `break`/
# `return` lines and dict openers are omitted between the numbered lines.
# Marked _WORKING = False: broken against the newer Mixcloud API.
2688 class MixcloudIE(InfoExtractor):
2689 """Information extractor for www.mixcloud.com"""
2691 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2692 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2693 IE_NAME = u'mixcloud'
2695 def report_download_json(self, file_id):
2696 """Report JSON download."""
2697 self.to_screen(u'Downloading json')
2699 def get_urls(self, jsonData, fmt, bitrate='best'):
2700 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either a bitrate->urls mapping or a plain url list;
# the TypeError branch below handles the list case.
2703 bitrate_list = jsonData[fmt]
2704 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2705 bitrate = max(bitrate_list) # select highest
2707 url_list = jsonData[fmt][bitrate]
2708 except TypeError: # we have no bitrate info.
2709 url_list = jsonData[fmt]
2712 def check_urls(self, url_list):
2713 """Returns 1st active url from list"""
# Probe each candidate with a GET; network failures fall through to the
# next candidate (except body omitted in this sampled listing).
2714 for url in url_list:
2716 compat_urllib_request.urlopen(url)
2718 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2723 def _print_formats(self, formats):
2724 print('Available formats:')
2725 for fmt in formats.keys():
2726 for b in formats[fmt]:
2728 ext = formats[fmt][b][0]
2729 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2730 except TypeError: # we have no bitrate info
2731 ext = formats[fmt][0]
2732 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2735 def _real_extract(self, url):
2736 mobj = re.match(self._VALID_URL, url)
2738 raise ExtractorError(u'Invalid URL: %s' % url)
2739 # extract uploader & filename from url
# NOTE(review): .decode() on regex group results is Python-2-era bytes
# handling, consistent with the compat_* shims used throughout this file.
2740 uploader = mobj.group(1).decode('utf-8')
2741 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2743 # construct API request
2744 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2745 # retrieve .json file with links to files
2746 request = compat_urllib_request.Request(file_url)
2748 self.report_download_json(file_url)
2749 jsonData = compat_urllib_request.urlopen(request).read()
2750 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2751 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2754 json_data = json.loads(jsonData)
2755 player_url = json_data['player_swf_url']
2756 formats = dict(json_data['audio_formats'])
2758 req_format = self._downloader.params.get('format', None)
2761 if self._downloader.params.get('listformats', None):
2762 self._print_formats(formats)
# 'best' (or unspecified): probe formats until a live URL is found;
# otherwise require the user-requested format to exist.
2765 if req_format is None or req_format == 'best':
2766 for format_param in formats.keys():
2767 url_list = self.get_urls(formats, format_param)
2769 file_url = self.check_urls(url_list)
2770 if file_url is not None:
2773 if req_format not in formats:
2774 raise ExtractorError(u'Format is not available')
2776 url_list = self.get_urls(formats, req_format)
2777 file_url = self.check_urls(url_list)
2778 format_param = req_format
2781 'id': file_id.decode('utf-8'),
2782 'url': file_url.decode('utf-8'),
2783 'uploader': uploader.decode('utf-8'),
2784 'upload_date': None,
2785 'title': json_data['name'],
2786 'ext': file_url.split('.')[-1].decode('utf-8'),
2787 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2788 'thumbnail': json_data['thumbnail_url'],
2789 'description': json_data['description'],
2790 'player_url': player_url.decode('utf-8'),
# NOTE(review): sampled listing — `try:` headers, dict openers and `return`
# lines are omitted between the numbered lines shown here.
2793 class StanfordOpenClassroomIE(InfoExtractor):
2794 """Information extractor for Stanford's Open ClassRoom"""
# Three URL shapes: a specific video (course+video), a course page
# (course only), or the site root — handled by the three branches below.
2796 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2797 IE_NAME = u'stanfordoc'
2799 def _real_extract(self, url):
2800 mobj = re.match(self._VALID_URL, url)
2802 raise ExtractorError(u'Invalid URL: %s' % url)
2804 if mobj.group('course') and mobj.group('video'): # A specific video
2805 course = mobj.group('course')
2806 video = mobj.group('video')
2808 'id': course + '_' + video,
2810 'upload_date': None,
2813 self.report_extraction(info['id'])
# Per-video metadata XML lives next to the course's videos folder.
2814 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2815 xmlUrl = baseUrl + video + '.xml'
2817 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2818 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2819 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2820 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2822 info['title'] = mdoc.findall('./title')[0].text
2823 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2825 raise ExtractorError(u'Invalid metadata XML file')
2826 info['ext'] = info['url'].rpartition('.')[2]
2828 elif mobj.group('course'): # A course page
2829 course = mobj.group('course')
2834 'upload_date': None,
2837 coursepage = self._download_webpage(url, info['id'],
2838 note='Downloading course info page',
2839 errnote='Unable to download course info page')
2841 info['title'] = self._search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2842 info['title'] = unescapeHTML(info['title'])
2844 info['description'] = self._search_regex('<description>([^<]+)</description>',
2845 coursepage, u'description', fatal=False)
2846 if info['description']: info['description'] = unescapeHTML(info['description'])
# Collect VideoPage links and recurse on each via self.extract().
2848 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2851 'type': 'reference',
2852 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2856 for entry in info['list']:
2857 assert entry['type'] == 'reference'
2858 results += self.extract(entry['url'])
# Root page: enumerate all CoursePage links and recurse on each.
2862 'id': 'Stanford OpenClassroom',
2865 'upload_date': None,
2868 self.report_download_webpage(info['id'])
2869 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2871 rootpage = compat_urllib_request.urlopen(rootURL).read()
2872 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2873 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2875 info['title'] = info['id']
2877 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2880 'type': 'reference',
2881 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2886 for entry in info['list']:
2887 assert entry['type'] == 'reference'
2888 results += self.extract(entry['url'])
# NOTE(review): sampled listing — `try:` headers, guard lines and dict
# openers are omitted between the numbered lines shown here.
2891 class MTVIE(InfoExtractor):
2892 """Information extractor for MTV.com"""
2894 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2897 def _real_extract(self, url):
2898 mobj = re.match(self._VALID_URL, url)
2900 raise ExtractorError(u'Invalid URL: %s' % url)
# The protocol is optional in _VALID_URL; default to http:// when absent.
2901 if not mobj.group('proto'):
2902 url = 'http://' + url
2903 video_id = mobj.group('videoid')
2905 webpage = self._download_webpage(url, video_id)
# Metadata from <meta> tags: mtv_vt (song), mtv_an (title), mtvn_uri;
# the playlist/content id comes from inline player JS.
2907 song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2908 webpage, u'song name', fatal=False)
2909 if song_name: song_name = unescapeHTML(song_name)
2911 video_title = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2913 video_title = unescapeHTML(video_title)
2915 mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2916 webpage, u'mtvn_uri', fatal=False)
2918 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2919 webpage, u'content id', fatal=False)
# mediaGen returns XML listing the available renditions for this video.
2921 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2922 self.report_extraction(video_id)
2923 request = compat_urllib_request.Request(videogen_url)
2925 metadataXml = compat_urllib_request.urlopen(request).read()
2926 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2927 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2929 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2930 renditions = mdoc.findall('.//rendition')
2932 # For now, always pick the highest quality.
2933 rendition = renditions[-1]
# Format label is built from the MIME subtype plus WxH_bitrate.
2936 _,_,ext = rendition.attrib['type'].partition('/')
2937 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2938 video_url = rendition.find('./src').text
2940 raise ExtractorError('Invalid rendition field.')
2945 'uploader': performer,
2946 'upload_date': None,
2947 'title': video_title,
# NOTE(review): sampled listing — several method headers (e.g. _gen_sid's
# `def` line), guards and dict openers are omitted between numbered lines.
2955 class YoukuIE(InfoExtractor):
2956 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp concatenated with two random numbers
# (enclosing def omitted in this sampled listing).
2959 nowTime = int(time.time() * 1000)
2960 random1 = random.randint(1000,1998)
2961 random2 = random.randint(1000,9999)
2963 return "%d%d%d" %(nowTime,random1,random2)
2965 def _get_file_ID_mix_string(self, seed):
# Deterministically shuffle the alphabet using Youku's seeded LCG-style
# mixer; consumed source chars are removed so each appears once.
2967 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2969 for i in range(len(source)):
2970 seed = (seed * 211 + 30031 ) % 65536
2971 index = math.floor(seed / 65536 * len(source) )
2972 mixed.append(source[int(index)])
2973 source.remove(source[int(index)])
2974 #return ''.join(mixed)
2977 def _get_file_id(self, fileId, seed):
# Translate the '*'-separated index list into characters of the mixed
# alphabet to recover the real file id.
2978 mixed = self._get_file_ID_mix_string(seed)
2979 ids = fileId.split('*')
2983 realId.append(mixed[int(ch)])
2984 return ''.join(realId)
2986 def _real_extract(self, url):
2987 mobj = re.match(self._VALID_URL, url)
2989 raise ExtractorError(u'Invalid URL: %s' % url)
2990 video_id = mobj.group('ID')
2992 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2994 jsondata = self._download_webpage(info_url, video_id)
2996 self.report_extraction(video_id)
2998 config = json.loads(jsondata)
3000 video_title = config['data'][0]['title']
3001 seed = config['data'][0]['seed']
# Format selection: 'best' prefers hd2 when available; 'worst' branch
# (body omitted here) picks the lowest-quality stream.
3003 format = self._downloader.params.get('format', None)
3004 supported_format = list(config['data'][0]['streamfileids'].keys())
3006 if format is None or format == 'best':
3007 if 'hd2' in supported_format:
3012 elif format == 'worst':
3020 fileid = config['data'][0]['streamfileids'][format]
3021 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3022 except (UnicodeDecodeError, ValueError, KeyError):
3023 raise ExtractorError(u'Unable to extract info section')
3026 sid = self._gen_sid()
3027 fileid = self._get_file_id(fileid, seed)
3029 #column 8,9 of fileid represent the segment number
3030 #fileid[7:9] should be changed
# One download URL per segment: splice the segment index (hex) into the
# fileid and sign each request with its per-segment key.
3031 for index, key in enumerate(keys):
3033 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3034 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3037 'id': '%s_part%02d' % (video_id, index),
3038 'url': download_url,
3040 'upload_date': None,
3041 'title': video_title,
3044 files_info.append(info)
# XNXXIE: scrapes flv_url / title / thumbnail out of a video.xnxx.com page
# with the three class-level regexes below.
# NOTE(review): sampled dump; gaps in the embedded line numbers mean code
# lines are missing from this copy. Comments only.
3049 class XNXXIE(InfoExtractor):
3050 """Information extractor for xnxx.com"""
3052 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3054 VIDEO_URL_RE = r'flv_url=(.*?)&'
3055 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3056 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3058 def _real_extract(self, url):
3059 mobj = re.match(self._VALID_URL, url)
3061 raise ExtractorError(u'Invalid URL: %s' % url)
3062 video_id = mobj.group(1)
3064 # Get webpage content
3065 webpage = self._download_webpage(url, video_id)
# The flv URL is percent-encoded inside the page, hence the unquote below.
3067 video_url = self._search_regex(self.VIDEO_URL_RE,
3068 webpage, u'video URL')
3069 video_url = compat_urllib_parse.unquote(video_url)
3071 video_title = self._search_regex(self.VIDEO_TITLE_RE,
3074 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3075 webpage, u'thumbnail', fatal=False)
3081 'upload_date': None,
3082 'title': video_title,
3084 'thumbnail': video_thumbnail,
3085 'description': None,
# GooglePlusIE: two-step extraction -- post page for metadata, then the
# photos/video page for the actual googlevideo redirector links.
# NOTE(review): sampled dump; gaps in the embedded line numbers mean code
# lines are missing from this copy. Comments only.
3089 class GooglePlusIE(InfoExtractor):
3090 """Information extractor for plus.google.com."""
3092 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3093 IE_NAME = u'plus.google'
3095 def _real_extract(self, url):
3096 # Extract id from URL
3097 mobj = re.match(self._VALID_URL, url)
3099 raise ExtractorError(u'Invalid URL: %s' % url)
3101 post_url = mobj.group(0)
3102 video_id = mobj.group(1)
3104 video_extension = 'flv'
3106 # Step 1, Retrieve post webpage to extract further information
3107 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3109 self.report_extraction(video_id)
3111 # Extract update date
3112 upload_date = self._search_regex('title="Timestamp">(.*?)</a>',
3113 webpage, u'upload date', fatal=False)
3115 # Convert timestring to a format suitable for filename
# NOTE(review): strptime here would raise if upload_date is None (regex was
# fatal=False); a guarding if-line is presumably among the missing lines.
3116 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3117 upload_date = upload_date.strftime('%Y%m%d')
3120 uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>',
3121 webpage, u'uploader', fatal=False)
3124 # Get the first line for title
3125 video_title = self._search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3126 webpage, 'title', default=u'NA')
3128 # Step 2, Stimulate clicking the image box to launch video
3129 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3130 webpage, u'video page URL')
3131 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3133 # Extract video links on video page
3134 """Extract video links of all sizes"""
3135 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3136 mobj = re.findall(pattern, webpage)
3138 raise ExtractorError(u'Unable to extract video links')
3140 # Sort in resolution
3141 links = sorted(mobj)
3143 # Choose the lowest of the sort, i.e. highest resolution
3144 video_url = links[-1]
3145 # Only get the url. The resolution part in the tuple has no use anymore
3146 video_url = video_url[-1]
3147 # Treat escaped \u0026 style hex
# Python 2 path uses str.decode; the AttributeError fallback handles Python 3.
3149 video_url = video_url.decode("unicode_escape")
3150 except AttributeError: # Python 3
3151 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3157 'uploader': uploader,
3158 'upload_date': upload_date,
3159 'title': video_title,
3160 'ext': video_extension,
# NBAIE: builds the CDN mp4 URL directly from the path id; title/description
# come from meta tags on the page.
# NOTE(review): sampled dump; gaps in the embedded line numbers mean code
# lines are missing from this copy. Comments only.
3163 class NBAIE(InfoExtractor):
3164 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
3167 def _real_extract(self, url):
3168 mobj = re.match(self._VALID_URL, url)
3170 raise ExtractorError(u'Invalid URL: %s' % url)
3172 video_id = mobj.group(1)
3174 webpage = self._download_webpage(url, video_id)
# Video URL is deterministic: CDN path derived from the page path.
3176 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3178 shortened_video_id = video_id.rpartition('/')[2]
3179 title = self._search_regex(r'<meta property="og:title" content="(.*?)"',
3180 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3182 # It isn't there in the HTML it returns to us
3183 # uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3185 description = self._search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
3188 'id': shortened_video_id,
3192 # 'uploader_date': uploader_date,
3193 'description': description,
# JustinTVIE: justin.tv / twitch.tv extractor. Three URL shapes: a channel
# (paged archive listing), a /b/ broadcast, or a /c/ chapter (XML + kraken
# JSON metadata).
# NOTE(review): sampled dump; gaps in the embedded line numbers mean code
# lines are missing from this copy (e.g. the pagination loop header around
# 3315-3320). Comments only.
3197 class JustinTVIE(InfoExtractor):
3198 """Information extractor for justin.tv and twitch.tv"""
3199 # TODO: One broadcast may be split into multiple videos. The key
3200 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3201 # starts at 1 and increases. Can we treat all parts as one video?
3203 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3205 (?P<channelid>[^/]+)|
3206 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3207 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3211 _JUSTIN_PAGE_LIMIT = 100
3212 IE_NAME = u'justin.tv'
3214 def report_download_page(self, channel, offset):
3215 """Report attempt to download a single page of videos."""
3216 self.to_screen(u'%s: Downloading video information from %d to %d' %
3217 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3219 # Return count of items, list of *valid* items
3220 def _parse_page(self, url, video_id):
3221 webpage = self._download_webpage(url, video_id,
3222 u'Downloading video info JSON',
3223 u'unable to download video info JSON')
3225 response = json.loads(webpage)
# A non-list response is an API error object with an 'error' key.
3226 if type(response) != list:
3227 error_text = response.get('error', 'unknown error')
3228 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3230 for clip in response:
3231 video_url = clip['video_file_url']
3233 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-ish; strip the dashes from YYYY-MM-DD -> YYYYMMDD.
3234 video_date = re.sub('-', '', clip['start_time'][:10])
3235 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3236 video_id = clip['id']
3237 video_title = clip.get('title', video_id)
3241 'title': video_title,
3242 'uploader': clip.get('channel_name', video_uploader_id),
3243 'uploader_id': video_uploader_id,
3244 'upload_date': video_date,
3245 'ext': video_extension,
3247 return (len(response), info)
3249 def _real_extract(self, url):
3250 mobj = re.match(self._VALID_URL, url)
3252 raise ExtractorError(u'invalid URL: %s' % url)
3254 api_base = 'http://api.justin.tv'
3256 if mobj.group('channelid'):
3258 video_id = mobj.group('channelid')
3259 api = api_base + '/channel/archives/%s.json' % video_id
3260 elif mobj.group('chapterid'):
3261 chapter_id = mobj.group('chapterid')
3263 webpage = self._download_webpage(url, chapter_id)
3264 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3266 raise ExtractorError(u'Cannot find archive of a chapter')
3267 archive_id = m.group(1)
3269 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3270 chapter_info_xml = self._download_webpage(api, chapter_id,
3271 note=u'Downloading chapter information',
3272 errnote=u'Chapter information download failed')
3273 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
# Find the <archive> element matching the archive id scraped from the page;
# `a` is reused after the loop (for/else shape -- the else: is missing here).
3274 for a in doc.findall('.//archive'):
3275 if archive_id == a.find('./id').text:
3278 raise ExtractorError(u'Could not find chapter in chapter information')
3280 video_url = a.find('./video_file_url').text
3281 video_ext = video_url.rpartition('.')[2] or u'flv'
3283 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3284 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3285 note='Downloading chapter metadata',
3286 errnote='Download of chapter metadata failed')
3287 chapter_info = json.loads(chapter_info_json)
3289 bracket_start = int(doc.find('.//bracket_start').text)
3290 bracket_end = int(doc.find('.//bracket_end').text)
3292 # TODO determine start (and probably fix up file)
3293 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3294 #video_url += u'?start=' + TODO:start_timestamp
3295 # bracket_start is 13290, but we want 51670615
3296 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3297 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3300 'id': u'c' + chapter_id,
3303 'title': chapter_info['title'],
3304 'thumbnail': chapter_info['preview'],
3305 'description': chapter_info['description'],
3306 'uploader': chapter_info['channel']['display_name'],
3307 'uploader_id': chapter_info['channel']['name'],
3311 video_id = mobj.group('videoid')
3312 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3314 self.report_extraction(video_id)
# Paged download: fetch _JUSTIN_PAGE_LIMIT clips per request until a short
# page is returned (loop header / offset update are among the missing lines).
3318 limit = self._JUSTIN_PAGE_LIMIT
3321 self.report_download_page(video_id, offset)
3322 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3323 page_count, page_info = self._parse_page(page_url, video_id)
3324 info.extend(page_info)
3325 if not paged or page_count != limit:
# FunnyOrDieIE: pulls the <source> URL, title (two candidate patterns) and
# og:description from a funnyordie.com video page.
# NOTE(review): sampled dump; gaps in the embedded line numbers mean code
# lines are missing from this copy. Comments only.
3330 class FunnyOrDieIE(InfoExtractor):
3331 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3333 def _real_extract(self, url):
3334 mobj = re.match(self._VALID_URL, url)
3336 raise ExtractorError(u'invalid URL: %s' % url)
3338 video_id = mobj.group('id')
3339 webpage = self._download_webpage(url, video_id)
3341 video_url = self._search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3342 webpage, u'video URL', flags=re.DOTALL)
3343 video_url = unescapeHTML(video_url)
# Title: try the player header first, fall back to the <title> tag.
3345 title = self._search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3346 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3347 title = clean_html(title)
3349 video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3350 webpage, u'description', fatal=False, flags=re.DOTALL)
3351 if video_description: video_description = unescapeHTML(video_description)
3358 'description': video_description,
# SteamIE: passes the store age gate with fixed form values, then zips the
# movie URL / title / thumbnail matches into a playlist result.
# NOTE(review): sampled dump; gaps in the embedded line numbers mean code
# lines are missing from this copy. Comments only.
3362 class SteamIE(InfoExtractor):
3363 _VALID_URL = r"""http://store\.steampowered\.com/
3365 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3367 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# suitable() is overridden because _VALID_URL is a verbose (re.VERBOSE) regex.
3371 def suitable(cls, url):
3372 """Receives a URL and returns True if suitable for this IE."""
3373 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3375 def _real_extract(self, url):
3376 m = re.match(self._VALID_URL, url, re.VERBOSE)
3377 gameID = m.group('gameID')
# Fixed birthday query string bypasses the age-check interstitial.
3378 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3379 self.report_age_confirmation()
3380 webpage = self._download_webpage(videourl, gameID)
3381 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3383 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3384 mweb = re.finditer(urlRE, webpage)
3385 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3386 titles = re.finditer(namesRE, webpage)
3387 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3388 thumbs = re.finditer(thumbsRE, webpage)
# Assumes the three iterators stay in lockstep (same order on the page).
3390 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3391 video_id = vid.group('videoID')
3392 title = vtitle.group('videoName')
3393 video_url = vid.group('videoURL')
3394 video_thumb = thumb.group('thumbnail')
3396 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3401 'title': unescapeHTML(title),
3402 'thumbnail': video_thumb
3405 return [self.playlist_result(videos, gameID, game_title)]
# UstreamIE: recorded ustream.tv videos; direct CDN URL from the id, metadata
# scraped from the page.
# NOTE(review): sampled dump; gaps in the embedded line numbers mean code
# lines are missing from this copy. Comments only.
3407 class UstreamIE(InfoExtractor):
3408 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3409 IE_NAME = u'ustream'
3411 def _real_extract(self, url):
3412 m = re.match(self._VALID_URL, url)
3413 video_id = m.group('videoID')
3415 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3416 webpage = self._download_webpage(url, video_id)
3418 self.report_extraction(video_id)
3420 video_title = self._search_regex(r'data-title="(?P<title>.+)"',
3423 uploader = self._search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3424 webpage, u'uploader', fatal=False, flags=re.DOTALL)
3425 if uploader: uploader = unescapeHTML(uploader.strip())
3427 thumbnail = self._search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3428 webpage, u'thumbnail', fatal=False)
3434 'title': video_title,
3435 'uploader': uploader,
3436 'thumbnail': thumbnail,
# WorldStarHipHopIE: worldstarhiphop / worldstarcandy videos; URL comes from
# the flash player's addVariable("file", ...) call.
# NOTE(review): sampled dump; gaps in the embedded line numbers mean code
# lines are missing from this copy. Comments only.
3440 class WorldStarHipHopIE(InfoExtractor):
3441 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3442 IE_NAME = u'WorldStarHipHop'
3444 def _real_extract(self, url):
3445 m = re.match(self._VALID_URL, url)
3446 video_id = m.group('id')
3448 webpage_src = self._download_webpage(url, video_id)
3450 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3451 webpage_src, u'video URL')
# Extension is picked from the URL (mp4 vs flv); branch bodies missing here.
3453 if 'mp4' in video_url:
3458 video_title = self._search_regex(r"<title>(.*)</title>",
3459 webpage_src, u'title')
3461 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3462 thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />',
3463 webpage_src, u'thumbnail', fatal=False)
3466 _title = r"""candytitles.*>(.*)</span>"""
3467 mobj = re.search(_title, webpage_src)
3468 if mobj is not None:
3469 video_title = mobj.group(1)
3474 'title' : video_title,
3475 'thumbnail' : thumbnail,
# RBMARadioIE: show metadata is embedded as JSON in a window.gon script tag;
# the stream URL is the akamai_url with a bitrate parameter appended.
# NOTE(review): sampled dump; gaps in the embedded line numbers mean code
# lines are missing from this copy. Comments only.
3480 class RBMARadioIE(InfoExtractor):
3481 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3483 def _real_extract(self, url):
3484 m = re.match(self._VALID_URL, url)
3485 video_id = m.group('videoID')
3487 webpage = self._download_webpage(url, video_id)
3489 json_data = self._search_regex(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>',
3490 webpage, u'json data')
3493 data = json.loads(json_data)
3494 except ValueError as e:
3495 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force a 256 kbps stream; extension derived from the URL path.
3497 video_url = data['akamai_url'] + '&cbr=256'
3498 url_parts = compat_urllib_parse_urlparse(video_url)
3499 video_ext = url_parts.path.rpartition('.')[2]
3504 'title': data['title'],
3505 'description': data.get('teaser_text'),
3506 'location': data.get('country_of_origin'),
3507 'uploader': data.get('host', {}).get('name'),
3508 'uploader_id': data.get('host', {}).get('slug'),
3509 'thumbnail': data.get('image', {}).get('large_url_2x'),
3510 'duration': data.get('duration'),
# YouPornIE: reads the page's `new Video(...)` JSON blob for metadata, then
# parses the download list to build one format entry per link; honours
# --list-formats and -f best/worst/all/<specific>.
# NOTE(review): sampled dump; gaps in the embedded line numbers mean code
# lines are missing from this copy (try: headers, several returns). Comments
# only.
3515 class YouPornIE(InfoExtractor):
3516 """Information extractor for youporn.com."""
3517 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3519 def _print_formats(self, formats):
3520 """Print all available formats"""
3521 print(u'Available formats:')
3522 print(u'ext\t\tformat')
3523 print(u'---------------------------------')
3524 for format in formats:
3525 print(u'%s\t\t%s' % (format['ext'], format['format']))
# _specific: return the entry whose 'format' matches the requested one.
3527 def _specific(self, req_format, formats):
3529 if(x["format"]==req_format):
3533 def _real_extract(self, url):
3534 mobj = re.match(self._VALID_URL, url)
3536 raise ExtractorError(u'Invalid URL: %s' % url)
3537 video_id = mobj.group('videoid')
# Age gate is passed via a cookie rather than a form post.
3539 req = compat_urllib_request.Request(url)
3540 req.add_header('Cookie', 'age_verified=1')
3541 webpage = self._download_webpage(req, video_id)
3543 # Get JSON parameters
3544 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
3546 params = json.loads(json_params)
3548 raise ExtractorError(u'Invalid JSON')
3550 self.report_extraction(video_id)
3552 video_title = params['title']
3553 upload_date = unified_strdate(params['release_date_f'])
3554 video_description = params['description']
3555 video_uploader = params['submitted_by']
3556 thumbnail = params['thumbnails'][0]['image']
3558 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
3560 # Get all of the formats available
3561 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3562 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3563 webpage, u'download list').strip()
3565 # Get all of the links from the page
3566 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3567 links = re.findall(LINK_RE, download_list_html)
3568 if(len(links) == 0):
3569 raise ExtractorError(u'ERROR: no known formats available for video')
3571 self.to_screen(u'Links found: %d' % len(links))
3576 # A link looks like this:
3577 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3578 # A path looks like this:
3579 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3580 video_url = unescapeHTML( link )
3581 path = compat_urllib_parse_urlparse( video_url ).path
3582 extension = os.path.splitext( path )[1][1:]
# The 5th path component encodes resolution and bitrate, e.g. 480p_370k_<id>.
3583 format = path.split('/')[4].split('_')[:2]
3586 format = "-".join( format )
3587 title = u'%s-%s-%s' % (video_title, size, bitrate)
3592 'uploader': video_uploader,
3593 'upload_date': upload_date,
3597 'thumbnail': thumbnail,
3598 'description': video_description
3601 if self._downloader.params.get('listformats', None):
3602 self._print_formats(formats)
3605 req_format = self._downloader.params.get('format', None)
3606 self.to_screen(u'Format: %s' % req_format)
# Format selection: formats list appears ordered best-first (worst == [-1]).
3608 if req_format is None or req_format == 'best':
3610 elif req_format == 'worst':
3611 return [formats[-1]]
3612 elif req_format in ('-1', 'all'):
3615 format = self._specific( req_format, formats )
3617 raise ExtractorError(u'Requested format not available')
# PornotubeIE: flv URL and upload date scraped with fixed regexes; the title
# comes straight from the URL path.
# NOTE(review): sampled dump; gaps in the embedded line numbers mean code
# lines are missing from this copy. Comments only.
3622 class PornotubeIE(InfoExtractor):
3623 """Information extractor for pornotube.com."""
3624 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3626 def _real_extract(self, url):
3627 mobj = re.match(self._VALID_URL, url)
3629 raise ExtractorError(u'Invalid URL: %s' % url)
3631 video_id = mobj.group('videoid')
3632 video_title = mobj.group('title')
3634 # Get webpage content
3635 webpage = self._download_webpage(url, video_id)
3638 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3639 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3640 video_url = compat_urllib_parse.unquote(video_url)
3642 #Get the uploaded date
3643 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3644 upload_date = self._search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
3645 if upload_date: upload_date = unified_strdate(upload_date)
3647 info = {'id': video_id,
3650 'upload_date': upload_date,
3651 'title': video_title,
# YouJizzIE: two-page flow -- main page for the title, embed page for the
# encodeURIComponent'd file URL.
# NOTE(review): sampled dump; gaps in the embedded line numbers mean code
# lines are missing from this copy. Comments only.
3657 class YouJizzIE(InfoExtractor):
3658 """Information extractor for youjizz.com."""
3659 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3661 def _real_extract(self, url):
3662 mobj = re.match(self._VALID_URL, url)
3664 raise ExtractorError(u'Invalid URL: %s' % url)
3666 video_id = mobj.group('videoid')
3668 # Get webpage content
3669 webpage = self._download_webpage(url, video_id)
3671 # Get the video title
3672 video_title = self._search_regex(r'<title>(?P<title>.*)</title>',
3673 webpage, u'title').strip()
3675 # Get the embed page
3676 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3678 raise ExtractorError(u'ERROR: unable to extract embed page')
# video_id is replaced with the numeric embed-page id from here on.
3680 embed_page_url = result.group(0).strip()
3681 video_id = result.group('videoid')
3683 webpage = self._download_webpage(embed_page_url, video_id)
3686 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3687 webpage, u'video URL')
3689 info = {'id': video_id,
3691 'title': video_title,
3694 'player_url': embed_page_url}
# EightTracksIE: 8tracks mixes; walks the play/next API with a random session
# id, one track per request, until at_last_track is set.
# NOTE(review): sampled dump; gaps in the embedded line numbers mean code
# lines are missing from this copy (e.g. the mix_id assignment around 3714).
# Comments only.
3698 class EightTracksIE(InfoExtractor):
3700 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3702 def _real_extract(self, url):
3703 mobj = re.match(self._VALID_URL, url)
3705 raise ExtractorError(u'Invalid URL: %s' % url)
3706 playlist_id = mobj.group('id')
3708 webpage = self._download_webpage(url, playlist_id)
3710 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3711 data = json.loads(json_like)
# The API keys a playback session on an arbitrary client-chosen number.
3713 session = str(random.randint(0, 1000000000))
3715 track_count = data['tracks_count']
3716 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3717 next_url = first_url
3719 for i in itertools.count():
3720 api_json = self._download_webpage(next_url, playlist_id,
3721 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3722 errnote=u'Failed to download song information')
3723 api_data = json.loads(api_json)
3724 track_data = api_data[u'set']['track']
3726 'id': track_data['id'],
3727 'url': track_data['track_file_stream_url'],
3728 'title': track_data['performer'] + u' - ' + track_data['name'],
3729 'raw_title': track_data['name'],
3730 'uploader_id': data['user']['login'],
3734 if api_data['set']['at_last_track']:
3736 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# KeekIE: keek.com clips; video and thumbnail URLs are built directly from
# the id, title/uploader scraped from the page.
# NOTE(review): sampled dump; gaps in the embedded line numbers mean code
# lines are missing from this copy. Comments only.
3739 class KeekIE(InfoExtractor):
3740 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3743 def _real_extract(self, url):
3744 m = re.match(self._VALID_URL, url)
3745 video_id = m.group('videoID')
3747 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3748 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3749 webpage = self._download_webpage(url, video_id)
3751 video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3753 video_title = unescapeHTML(video_title)
3755 uploader = self._search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3756 webpage, u'uploader', fatal=False)
3757 if uploader: uploader = clean_html(uploader)
3763 'title': video_title,
3764 'thumbnail': thumbnail,
3765 'uploader': uploader
# TEDIE: handles both single talks and playlists on ted.com; talk pages yield
# a download.ted.com mp4 via the talkDetails mediaSlug.
# NOTE(review): sampled dump; gaps in the embedded line numbers mean code
# lines are missing from this copy. Comments only.
3769 class TEDIE(InfoExtractor):
3770 _VALID_URL=r'''http://www\.ted\.com/
3772 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3774 ((?P<type_talk>talks)) # We have a simple talk
3776 (/lang/(.*?))? # The url may contain the language
3777 /(?P<name>\w+) # Here goes the name and then ".html"
# suitable() is overridden because _VALID_URL is a verbose (re.VERBOSE) regex.
3781 def suitable(cls, url):
3782 """Receives a URL and returns True if suitable for this IE."""
3783 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3785 def _real_extract(self, url):
3786 m=re.match(self._VALID_URL, url, re.VERBOSE)
3787 if m.group('type_talk'):
3788 return [self._talk_info(url)]
3790 playlist_id=m.group('playlist_id')
3791 name=m.group('name')
3792 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3793 return [self._playlist_videos_info(url,name,playlist_id)]
3795 def _talk_video_link(self,mediaSlug):
3796 '''Returns the video link for that mediaSlug'''
3797 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3799 def _playlist_videos_info(self,url,name,playlist_id=0):
3800 '''Returns the videos of the playlist'''
3802 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3803 ([.\s]*?)data-playlist_item_id="(\d+)"
3804 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3806 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3807 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3808 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3809 m_names=re.finditer(video_name_RE,webpage)
3811 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3812 m_playlist = re.search(playlist_RE, webpage)
3813 playlist_title = m_playlist.group('playlist_title')
# Each playlist entry is delegated back to this IE as a plain talk URL.
3815 playlist_entries = []
3816 for m_video, m_name in zip(m_videos,m_names):
3817 video_id=m_video.group('video_id')
3818 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3819 playlist_entries.append(self.url_result(talk_url, 'TED'))
3820 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3822 def _talk_info(self, url, video_id=0):
3823 """Return the video for the talk in the url"""
3824 m=re.match(self._VALID_URL, url,re.VERBOSE)
3825 videoName=m.group('name')
3826 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3827 # If the url includes the language we get the title translated
3828 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3829 title=re.search(title_RE, webpage).group('title')
3830 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3831 "id":(?P<videoID>[\d]+).*?
3832 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3833 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3834 thumb_match=re.search(thumb_RE,webpage)
3835 info_match=re.search(info_RE,webpage,re.VERBOSE)
3836 video_id=info_match.group('videoID')
3837 mediaSlug=info_match.group('mediaSlug')
3838 video_url=self._talk_video_link(mediaSlug)
3844 'thumbnail': thumb_match.group('thumbnail')
# MySpassIE: fetches an XML metadata document for the id taken from the URL
# path and reads url_flv/title/format_id/description/imagePreview from it.
# NOTE(review): sampled dump; gaps in the embedded line numbers mean code
# lines are missing from this copy. Comments only.
3848 class MySpassIE(InfoExtractor):
3849 _VALID_URL = r'http://www.myspass.de/.*'
3851 def _real_extract(self, url):
3852 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3854 # video id is the last path element of the URL
3855 # usually there is a trailing slash, so also try the second but last
3856 url_path = compat_urllib_parse_urlparse(url).path
3857 url_parent_path, video_id = os.path.split(url_path)
3859 _, video_id = os.path.split(url_parent_path)
3862 metadata_url = META_DATA_URL_TEMPLATE % video_id
3863 metadata_text = self._download_webpage(metadata_url, video_id)
3864 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3866 # extract values from metadata
# url_flv and title are mandatory; format/description/thumbnail optional.
3867 url_flv_el = metadata.find('url_flv')
3868 if url_flv_el is None:
3869 raise ExtractorError(u'Unable to extract download url')
3870 video_url = url_flv_el.text
3871 extension = os.path.splitext(video_url)[1][1:]
3872 title_el = metadata.find('title')
3873 if title_el is None:
3874 raise ExtractorError(u'Unable to extract title')
3875 title = title_el.text
3876 format_id_el = metadata.find('format_id')
3877 if format_id_el is None:
3880 format = format_id_el.text
3881 description_el = metadata.find('description')
3882 if description_el is not None:
3883 description = description_el.text
3886 imagePreview_el = metadata.find('imagePreview')
3887 if imagePreview_el is not None:
3888 thumbnail = imagePreview_el.text
3897 'thumbnail': thumbnail,
3898 'description': description
# SpiegelIE: spiegel.de videos; an XML sidecar lists media variants and the
# last entry (assumed best) provides filename and duration.
# NOTE(review): sampled dump; gaps in the embedded line numbers mean code
# lines are missing from this copy. Comments only.
3902 class SpiegelIE(InfoExtractor):
3903 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3905 def _real_extract(self, url):
3906 m = re.match(self._VALID_URL, url)
3907 video_id = m.group('videoID')
3909 webpage = self._download_webpage(url, video_id)
3911 video_title = self._search_regex(r'<div class="module-title">(.*?)</div>',
3913 video_title = unescapeHTML(video_title)
3915 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3916 xml_code = self._download_webpage(xml_url, video_id,
3917 note=u'Downloading XML', errnote=u'Failed to download XML')
3919 idoc = xml.etree.ElementTree.fromstring(xml_code)
# idoc[-1]: last <type> element in the document is used as the best variant.
3920 last_type = idoc[-1]
3921 filename = last_type.findall('./filename')[0].text
3922 duration = float(last_type.findall('./duration')[0].text)
3924 video_url = 'http://video2.spiegel.de/flash/' + filename
3925 video_ext = filename.rpartition('.')[2]
3930 'title': video_title,
3931 'duration': duration,
# LiveLeakIE: file URL from the player config, title/description from
# OpenGraph meta tags (the "LiveLeak.com -" prefix is stripped).
# NOTE(review): sampled dump; gaps in the embedded line numbers mean code
# lines are missing from this copy. Comments only.
3935 class LiveLeakIE(InfoExtractor):
3937 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3938 IE_NAME = u'liveleak'
3940 def _real_extract(self, url):
3941 mobj = re.match(self._VALID_URL, url)
3943 raise ExtractorError(u'Invalid URL: %s' % url)
3945 video_id = mobj.group('video_id')
3947 webpage = self._download_webpage(url, video_id)
3949 video_url = self._search_regex(r'file: "(.*?)",',
3950 webpage, u'video URL')
3952 video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3954 video_title = unescapeHTML(video_title).replace('LiveLeak.com -', '').strip()
3956 video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3957 webpage, u'description', fatal=False)
3958 if video_description: video_description = unescapeHTML(video_description)
3960 video_uploader = self._search_regex(r'By:.*?(\w+)</a>',
3961 webpage, u'uploader', fatal=False)
3967 'title': video_title,
3968 'description': video_description,
3969 'uploader': video_uploader
# ARDIE: ARD Mediathek / daserste.de; collects mediaCollection.addMediaStream
# calls and picks the highest-quality media_type 0 stream, which is either an
# RTMP stream or a direct mp4 download.
# NOTE(review): sampled dump; gaps in the embedded line numbers mean code
# lines are missing from this copy (if/else headers). Comments only.
3974 class ARDIE(InfoExtractor):
3975 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3976 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3977 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3979 def _real_extract(self, url):
3980 # determine video id from url
# Prefer the numeric documentId query parameter when present.
3981 m = re.match(self._VALID_URL, url)
3983 numid = re.search(r'documentId=([0-9]+)', url)
3985 video_id = numid.group(1)
3987 video_id = m.group('video_id')
3989 # determine title and media streams from webpage
3990 html = self._download_webpage(url, video_id)
3991 title = re.search(self._TITLE, html).group('title')
3992 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams + "fsk" marker in the page => age-restricted broadcast window.
3994 assert '"fsk"' in html
3995 raise ExtractorError(u'This video is only available after 8:00 pm')
3997 # choose default media type and highest quality for now
3998 stream = max([s for s in streams if int(s["media_type"]) == 0],
3999 key=lambda s: int(s["quality"]))
4001 # there's two possibilities: RTMP stream or HTTP download
4002 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4003 if stream['rtmp_url']:
4004 self.to_screen(u'RTMP download detected')
4005 assert stream['video_url'].startswith('mp4:')
4006 info["url"] = stream["rtmp_url"]
4007 info["play_path"] = stream['video_url']
4009 assert stream["video_url"].endswith('.mp4')
4010 info["url"] = stream["video_url"]
# TumblrIE: rebuilds the canonical post URL from blog name + id, then scrapes
# the \x22-escaped video_file src and the first poster thumbnail.
# NOTE(review): sampled dump; gaps in the embedded line numbers mean code
# lines are missing from this copy. Comments only.
4013 class TumblrIE(InfoExtractor):
4014 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4016 def _real_extract(self, url):
4017 m_url = re.match(self._VALID_URL, url)
4018 video_id = m_url.group('id')
4019 blog = m_url.group('blog_name')
4021 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4022 webpage = self._download_webpage(url, video_id)
# \x22 is an escaped double quote inside the embedded player JavaScript.
4024 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4025 video = re.search(re_video, webpage)
4027 raise ExtractorError(u'Unable to extract video')
4028 video_url = video.group('video_url')
4029 ext = video.group('ext')
4031 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
4032 webpage, u'thumbnail', fatal=False) # We pick the first poster
4033 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
4035 # The only place where you can get a title, it's not complete,
4036 # but searching in other places doesn't work for all videos
4037 video_title = self._search_regex(r'<title>(?P<title>.*?)</title>',
4038 webpage, u'title', flags=re.DOTALL)
4039 video_title = unescapeHTML(video_title)
4041 return [{'id': video_id,
4043 'title': video_title,
4044 'thumbnail': video_thumbnail,
# BandcampIE: downloads the free-download variant of a Bandcamp track.
# Only tracks exposing a `freeDownloadPage` are supported; otherwise an
# ExtractorError is raised.  NOTE(review): excerpt has gaps (non-contiguous
# embedded line numbers); some dict entries of the final result are elided.
4048 class BandcampIE(InfoExtractor):
4049 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4051 def _real_extract(self, url):
4052 mobj = re.match(self._VALID_URL, url)
4053 title = mobj.group('title')
4054 webpage = self._download_webpage(url, title)
4055 # We get the link to the free download page
4056 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4057 if m_download is None:
4058 raise ExtractorError(u'No free songs found')
4060 download_link = m_download.group(1)
# Track id is scraped out of the inline `TralbumData` JS object.
# NOTE(review): local name `id` shadows the builtin — kept byte-identical.
4061 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4062 webpage, re.MULTILINE|re.DOTALL).group('id')
4064 download_webpage = self._download_webpage(download_link, id,
4065 'Downloading free downloads page')
4066 # We get the dictionary of the track from some javascrip code
4067 info = re.search(r'items: (.*?),$',
4068 download_webpage, re.MULTILINE).group(1)
4069 info = json.loads(info)[0]
4070 # We pick mp3-320 for now, until format selection can be easily implemented.
4071 mp3_info = info[u'downloads'][u'mp3-320']
4072 # If we try to use this url it says the link has expired
4073 initial_url = mp3_info[u'url']
# Decompose the expired URL to rebuild a fresh signed request below.
4074 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4075 m_url = re.match(re_url, initial_url)
4076 #We build the url we will use to get the final track url
4077 # This url is build in Bandcamp in the script download_bunde_*.js
4078 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4079 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4080 # If we could correctly generate the .rand field the url would be
4081 #in the "download_url" key
4082 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4084 track_info = {'id':id,
4085 'title' : info[u'title'],
4088 'thumbnail' : info[u'thumb_url'],
4089 'uploader' : info[u'artist']
# RedTubeIE: extracts the mp4 URL and title from a redtube.com watch page.
# NOTE(review): excerpt has gaps — the `if mobj is None:` guard before the
# raise and the closing of the returned dict are not visible here.
4094 class RedTubeIE(InfoExtractor):
4095 """Information Extractor for redtube"""
4096 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4098 def _real_extract(self,url):
4099 mobj = re.match(self._VALID_URL, url)
4101 raise ExtractorError(u'Invalid URL: %s' % url)
4103 video_id = mobj.group('id')
4104 video_extension = 'mp4'
4105 webpage = self._download_webpage(url, video_id)
4107 self.report_extraction(video_id)
# Direct stream URL sits in a <source> tag on the page.
4109 video_url = self._search_regex(r'<source src="(.+?)" type="video/mp4">',
4110 webpage, u'video URL')
4112 video_title = self._search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
4118 'ext': video_extension,
4119 'title': video_title,
# InaIE: extracts video URL and title for Ina.fr by fetching the player's
# MRSS feed instead of the HTML page.
# NOTE(review): excerpt has gaps (non-contiguous embedded line numbers);
# the tail of the returned info dict is elided.
4122 class InaIE(InfoExtractor):
4123 """Information Extractor for Ina.fr"""
4124 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4126 def _real_extract(self,url):
4127 mobj = re.match(self._VALID_URL, url)
4129 video_id = mobj.group('id')
# Metadata lives in a per-video MRSS document on the player host.
4130 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4131 video_extension = 'mp4'
4132 webpage = self._download_webpage(mrss_url, video_id)
4134 self.report_extraction(video_id)
4136 video_url = self._search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
4137 webpage, u'video URL')
# Title is wrapped in a CDATA section inside <title>.
4139 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
4145 'ext': video_extension,
4146 'title': video_title,
# HowcastIE: extracts the mobile mp4 URL, title, description and thumbnail
# from a Howcast.com video page (og:/meta tags).
# NOTE(review): excerpt has gaps; the start of the returned dict is elided.
4149 class HowcastIE(InfoExtractor):
4150 """Information Extractor for Howcast.com"""
4151 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
4153 def _real_extract(self, url):
4154 mobj = re.match(self._VALID_URL, url)
4156 video_id = mobj.group('id')
# Normalize to the canonical watch URL before downloading.
4157 webpage_url = 'http://www.howcast.com/videos/' + video_id
4158 webpage = self._download_webpage(webpage_url, video_id)
4160 self.report_extraction(video_id)
4162 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
4163 webpage, u'video URL')
# og:title / description meta tags may use either quote style.
4165 video_title = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
4168 video_description = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
4169 webpage, u'description', fatal=False)
4171 thumbnail = self._search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
4172 webpage, u'thumbnail', fatal=False)
4178 'title': video_title,
4179 'description': video_description,
4180 'thumbnail': thumbnail,
# VineIE: extracts the stream URL, title, thumbnail and uploader from a
# Vine.co video page via twitter:/og: meta tags.
# NOTE(review): excerpt has gaps; the start of the returned dict is elided.
4183 class VineIE(InfoExtractor):
4184 """Information Extractor for Vine.co"""
4185 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
4187 def _real_extract(self, url):
4188 mobj = re.match(self._VALID_URL, url)
4190 video_id = mobj.group('id')
4191 webpage_url = 'https://vine.co/v/' + video_id
4192 webpage = self._download_webpage(webpage_url, video_id)
4194 self.report_extraction(video_id)
# Direct stream advertised for the Twitter player card.
4196 video_url = self._search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
4197 webpage, u'video URL')
4199 video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
# Query string (if any) is split off the og:image URL by the second group.
4202 thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
4203 webpage, u'thumbnail', fatal=False)
4205 uploader = self._search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
4206 webpage, u'uploader', fatal=False, flags=re.DOTALL)
4212 'title': video_title,
4213 'thumbnail': thumbnail,
4214 'uploader': uploader,
# FlickrIE: resolves a Flickr video through its two-step XML API
# (photo secret -> node id -> playlist with the real stream path).
# NOTE(review): excerpt has gaps; the start of the returned dict is elided.
4217 class FlickrIE(InfoExtractor):
4218 """Information Extractor for Flickr videos"""
4219 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
4221 def _real_extract(self, url):
4222 mobj = re.match(self._VALID_URL, url)
4224 video_id = mobj.group('id')
4225 video_uploader_id = mobj.group('uploader_id')
4226 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
4227 webpage = self._download_webpage(webpage_url, video_id)
# The per-photo secret is required by both API calls below.
4229 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
4231 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
4232 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
4234 node_id = self._search_regex(r'<Item id="id">(\d+-\d+)</Item>',
4235 first_xml, u'node_id')
4237 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
4238 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
4240 self.report_extraction(video_id)
# Final URL = APP prefix + HTML-unescaped FULLPATH from the playlist XML.
4242 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
4244 raise ExtractorError(u'Unable to extract video url')
4245 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
4247 video_title = self._search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
4248 webpage, u'video title')
4250 video_description = self._search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
4251 webpage, u'description', fatal=False)
4253 thumbnail = self._search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
4254 webpage, u'thumbnail', fatal=False)
4260 'title': video_title,
4261 'description': video_description,
4262 'thumbnail': thumbnail,
4263 'uploader_id': video_uploader_id,
# TeamcocoIE: maps a teamcoco.com pretty URL to its numeric video id, then
# pulls the high-quality file URL from the site's CVP XML endpoint.
# NOTE(review): excerpt has gaps; guard lines and part of the returned
# dict are elided from this view.
4266 class TeamcocoIE(InfoExtractor):
4267 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
4269 def _real_extract(self, url):
4270 mobj = re.match(self._VALID_URL, url)
4272 raise ExtractorError(u'Invalid URL: %s' % url)
4273 url_title = mobj.group('url_title')
4274 webpage = self._download_webpage(url, url_title)
# The numeric id is only available inside the article markup.
4276 video_id = self._search_regex(r'<article class="video" data-id="(\d+?)"',
4277 webpage, u'video id')
4279 self.report_extraction(video_id)
4281 video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
4284 thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)"',
4285 webpage, u'thumbnail', fatal=False)
4287 video_description = self._search_regex(r'<meta property="og:description" content="(.*?)"',
4288 webpage, u'description', fatal=False)
# Second request: XML manifest listing the media files for this id.
4290 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
4291 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
4293 video_url = self._search_regex(r'<file type="high".*?>(.*?)</file>',
4300 'title': video_title,
4301 'thumbnail': thumbnail,
4302 'description': video_description,
# XHamsterIE: extracts media URL (plain HTTP or server/key form), title,
# upload date, uploader id and thumbnail from an xHamster movie page.
# NOTE(review): excerpt has gaps; `if mobj is None:` guards and the
# start/end of the returned dict are elided from this view.
4305 class XHamsterIE(InfoExtractor):
4306 """Information Extractor for xHamster"""
4307 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
4309 def _real_extract(self,url):
4310 mobj = re.match(self._VALID_URL, url)
4312 video_id = mobj.group('id')
4313 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
4314 webpage = self._download_webpage(mrss_url, video_id)
# Player config carries either a full URL in 'file' (empty 'srv') or a
# server + key pair that must be joined.
4316 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
4318 raise ExtractorError(u'Unable to extract media URL')
4319 if len(mobj.group('server')) == 0:
4320 video_url = compat_urllib_parse.unquote(mobj.group('file'))
4322 video_url = mobj.group('server')+'/key='+mobj.group('file')
# Extension is guessed from the final URL component.
4323 video_extension = video_url.split('.')[-1]
4325 video_title = self._search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
4327 video_title = unescapeHTML(video_title)
4329 # Can't see the description anywhere in the UI
4330 # video_description = self._search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
4331 # webpage, u'description', fatal=False)
4332 # if video_description: video_description = unescapeHTML(video_description)
# Upload date is reassembled as YYYYMMDD from the tooltip timestamp;
# missing date degrades to a warning rather than a failure.
4334 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
4336 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
4338 video_upload_date = None
4339 self._downloader.report_warning(u'Unable to extract upload date')
4341 video_uploader_id = self._search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)',
4342 webpage, u'uploader id', default=u'anonymous')
4344 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
4345 webpage, u'thumbnail', fatal=False)
4350 'ext': video_extension,
4351 'title': video_title,
4352 # 'description': video_description,
4353 'upload_date': video_upload_date,
4354 'uploader_id': video_uploader_id,
4355 'thumbnail': video_thumbnail
# HypemIE: resolves a Hype Machine track — fetches the page (keeping its
# Set-Cookie), reads the embedded JSON track list, then asks the /serve
# endpoint (with the cookie) for the final audio URL.
# NOTE(review): excerpt has gaps; try/except framing around the json.loads
# calls and the definition of `key` are elided from this view.
4358 class HypemIE(InfoExtractor):
4359 """Information Extractor for hypem"""
4360 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
4362 def _real_extract(self, url):
4363 mobj = re.match(self._VALID_URL, url)
4365 raise ExtractorError(u'Invalid URL: %s' % url)
4366 track_id = mobj.group(1)
# Cache-busting timestamp appended as a query parameter.
4368 data = { 'ax': 1, 'ts': time.time() }
4369 data_encoded = compat_urllib_parse.urlencode(data)
4370 complete_url = url + "?" + data_encoded
4371 request = compat_urllib_request.Request(complete_url)
4372 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# The session cookie must be replayed on the /serve request below.
4373 cookie = urlh.headers.get('Set-Cookie', '')
4375 self.report_extraction(track_id)
4377 html_tracks = self._search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
4378 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
4380 track_list = json.loads(html_tracks)
4381 track = track_list[u'tracks'][0]
4383 raise ExtractorError(u'Hypemachine contained invalid JSON.')
4386 track_id = track[u"id"]
4387 artist = track[u"artist"]
4388 title = track[u"song"]
# NOTE(review): `key` is defined on a line elided from this excerpt.
4390 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
4391 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
4392 request.add_header('cookie', cookie)
4393 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
4395 song_data = json.loads(song_data_json)
4397 raise ExtractorError(u'Hypemachine contained invalid JSON.')
4398 final_url = song_data[u"url"]
# Builds the ordered extractor list consulted for each input URL; earlier
# entries win, so more specific extractors must precede generic ones.
# NOTE(review): only a few entries of the list are visible in this excerpt.
4409 def gen_extractors():
4410 """ Return a list of an instance of every supported extractor.
4411 The order does matter; the first extractor matched is the one handling the URL.
4414 YoutubePlaylistIE(),
4439 StanfordOpenClassroomIE(),
4449 WorldStarHipHopIE(),
# Maps a short name like 'Youtube' to the class YoutubeIE defined in this
# module's namespace; an unknown name raises KeyError.
4475 def get_info_extractor(ie_name):
4476 """Returns the info extractor class with the given ie_name"""
4477 return globals()[ie_name+'IE']