2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): this view appears truncated — the original likely also
        # initialized a readiness flag here; TODO confirm against full source.
        self.set_downloader(downloader)
81 def suitable(cls, url):
82 """Receives a URL and returns True if suitable for this IE."""
83 return re.match(cls._VALID_URL, url) is not None
        # NOTE(review): the `def working(self):` header is missing from this view.
        """Getter method for _WORKING."""

        # NOTE(review): the `def initialize(self):` header and its run-once
        # guard are missing from this view.
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): the original called self.initialize() here before
        # extracting — line missing from this view.
        return self._real_extract(url)
101 def set_downloader(self, downloader):
102 """Sets the downloader for this IE."""
103 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # NOTE(review): the `@property def IE_NAME(self):` header is missing from
    # this view; the line below derives the name by stripping the trailing "IE".
        return type(self).__name__[:-2]
117 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
118 """ Returns the response handle """
120 self.report_download_webpage(video_id)
121 elif note is not False:
122 self.to_screen(u'%s: %s' % (video_id, note))
124 return compat_urllib_request.urlopen(url_or_request)
125 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
127 errnote = u'Unable to download webpage'
128 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
130 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
131 """ Returns a tuple (page content as string, URL handle) """
132 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
133 content_type = urlh.headers.get('Content-Type', '')
134 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
136 encoding = m.group(1)
139 webpage_bytes = urlh.read()
140 if self._downloader.params.get('dump_intermediate_pages', False):
142 url = url_or_request.get_full_url()
143 except AttributeError:
145 self.to_screen(u'Dumping request to ' + url)
146 dump = base64.b64encode(webpage_bytes).decode('ascii')
147 self._downloader.to_screen(dump)
148 content = webpage_bytes.decode(encoding, 'replace')
149 return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
155 def to_screen(self, msg):
156 """Print msg to screen, prefixing it with '[ie_name]'"""
157 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
159 def report_extraction(self, id_or_name):
160 """Report information extraction."""
161 self.to_screen(u'%s: Extracting information' % id_or_name)
163 def report_download_webpage(self, video_id):
164 """Report webpage download."""
165 self.to_screen(u'%s: Downloading webpage' % video_id)
167 def report_age_confirmation(self):
168 """Report attempt to confirm age."""
169 self.to_screen(u'Confirming age')
    # Methods for following #608
    # They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        # NOTE(review): the original returned video_info here — line missing
        # from this view.
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        # NOTE(review): the remaining dict entries ('url', 'ie_key') and the
        # return statement are missing from this view.
        video_info = {'_type': 'url',
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        # NOTE(review): the 'entries' dict entry, the id/title `if` guards and
        # the return statement are missing from this view.
        video_info = {'_type': 'playlist',
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
202 def _make_valid_url(cls):
203 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
206 def suitable(cls, url):
207 return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        # Parse "<key><prefix>:<query>"; the prefix selects how many results.
        # NOTE(review): several guard lines (mobj None check, empty-prefix
        # branch, `else: n = int(prefix)` and `if n <= 0:`) are missing from
        # this view.
        mobj = re.match(self._make_valid_url(), query)
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)
229 def _get_n_results(self, query, n):
230 """Get a specified number of results for a query"""
231 raise NotImplementedError("This method must be implemented by sublclasses")
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""
    # NOTE(review): the verbose-regexp opening/closing delimiters of
    # _VALID_URL, some alternation lines, and most entries of
    # _video_extensions/_video_dimensions are missing from this view.
                     (?:https?://)? # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)? # handle anchor (#/) redirect urls
                     (?: # the various things that can precede the ID:
                         (?:(?:v|embed|e)/) # v/ or embed/ or e/
                         |(?: # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?) # the params delimiter ? or # or #!
                             (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
                         )? # optional -> youtube.com/xxxx is OK
                     )? # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
                     (?(1).+)? # if we found the ID, everything can follow
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    _video_dimensions = {
295 def suitable(cls, url):
296 """Receives a URL and returns True if suitable for this IE."""
297 if YoutubePlaylistIE.suitable(url): return False
298 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
300 def report_lang(self):
301 """Report attempt to set language."""
302 self.to_screen(u'Setting language')
304 def report_login(self):
305 """Report attempt to log in."""
306 self.to_screen(u'Logging in')
308 def report_video_webpage_download(self, video_id):
309 """Report attempt to download video webpage."""
310 self.to_screen(u'%s: Downloading video webpage' % video_id)
312 def report_video_info_webpage_download(self, video_id):
313 """Report attempt to download video info webpage."""
314 self.to_screen(u'%s: Downloading video info webpage' % video_id)
316 def report_video_subtitles_download(self, video_id):
317 """Report attempt to download video info webpage."""
318 self.to_screen(u'%s: Checking available subtitles' % video_id)
320 def report_video_subtitles_request(self, video_id, sub_lang, format):
321 """Report attempt to download video info webpage."""
322 self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
324 def report_video_subtitles_available(self, video_id, sub_lang_list):
325 """Report available subtitles."""
326 sub_lang = ",".join(list(sub_lang_list.keys()))
327 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
329 def report_information_extraction(self, video_id):
330 """Report attempt to extract video information."""
331 self.to_screen(u'%s: Extracting video information' % video_id)
333 def report_unavailable_format(self, video_id, format):
334 """Report extracted video URL."""
335 self.to_screen(u'%s: Format %s not available' % (video_id, format))
337 def report_rtmp_download(self):
338 """Indicate the download will use the RTMP protocol."""
339 self.to_screen(u'RTMP download detected')
    def _get_available_subtitles(self, video_id):
        """List available subtitle tracks for *video_id*."""
        # NOTE(review): the `try:` opener and the final success return are
        # missing from this view; on error this returns an (error, None) tuple
        # rather than raising.
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        # Map lang_code -> human-readable track name.
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
354 def _list_available_subtitles(self, video_id):
355 sub_lang_list = self._get_available_subtitles(video_id)
356 self.report_video_subtitles_available(video_id, sub_lang_list)
    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        # NOTE(review): the docstring delimiters, the urlencode field lines,
        # the `try:` opener and the empty-subtitle guard are missing from this
        # view. On success returns (None, sub_lang, sub); on failure an
        # (error_message, None, None) tuple.
            (error_message, sub_lang, sub)
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)
    def _request_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        # NOTE(review): the mobj-is-None guard, the `try:` opener, several
        # urlencode field lines and the `except` clause are missing from this
        # view.
        sub_lang = self._downloader.params.get('subtitleslang')
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
            return [(err_msg, None, None)]
        player_config = json.loads(mobj.group(1))
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            params = compat_urllib_parse.urlencode({
            subtitles_url = caption_url + '&' + params
            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
            return [(None, sub_lang, sub)]
            return [(err_msg, None, None)]
    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        # NOTE(review): the `sub_lang = 'en'` branch, its `else:` line and the
        # final `return [subtitle]` are missing from this view.
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track for *video_id*."""
        # NOTE(review): the `subtitles = []` initialization and the final
        # `return subtitles` are missing from this view.
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
    def _print_formats(self, formats):
        """Print one line per itag: extension and dimensions."""
        # NOTE(review): the `for x in formats:` loop header is missing from
        # this view.
        print('Available formats:')
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Set language, optionally log in (credentials or .netrc), confirm age."""
        # NOTE(review): many interior lines are missing from this view (`try:`
        # openers, if/else branches, credential defaults, the login/age form
        # dict openers and `return` statements); code below is as-seen.
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Extract the bare video id from a YouTube URL."""
        # NOTE(review): the mobj-is-None guard body placement and the final
        # `return video_id` are missing from this view.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _real_extract(self, url):
        # NOTE(review): this view of the method is missing many interior lines
        # (guards, `try:`/`else:` openers, `break`/`return` statements, the
        # url_map/results initializations and dict open/close lines); code
        # below is as-seen.
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: retry with successively less restrictive el= params.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
                video_description = unescapeHTML(fd_mobj.group(1))
                video_description = u''

        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
                (sub_error, sub_lang, sub) = video_subtitles[0]
                    # We try with the automatic captions
                    video_subtitles = self._request_automatic_caption(video_id, video_webpage)
                    (sub_error_auto, sub_lang, sub) = video_subtitles[0]
                        # We report the original error
                        self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
                    if not 'ratebypass' in url: url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Endpoints used by _real_initialize to accept the family-filter disclaimer.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
764 def report_disclaimer(self):
765 """Report disclaimer retrieval."""
766 self.to_screen(u'Retrieving disclaimer')
    def _real_initialize(self):
        # Retrieve disclaimer
        # NOTE(review): the `try:` openers and the disclaimer/age form dict
        # opener lines are missing from this view.
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        # NOTE(review): several guard lines (mobj-is-None checks, the
        # gdaKey if/else branches and the return-dict open/close) are missing
        # from this view.
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
                raise ExtractorError(u'Unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    def _real_extract(self, url):
        # NOTE(review): several guard lines (mobj-is-None checks, the
        # quality-probe loop body, `else:` branches and the return-dict
        # open/close) are missing from this view.
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-gated videos are reachable.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self.to_screen(u'Using %s' % key)
            raise ExtractorError(u'Unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extraction process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'
    def _real_extract(self, url):
        # NOTE(review): guard lines (mobj-is-None checks), the JSON-branch
        # return-dict open/close and the fallback video_url assignment are
        # missing from this view.
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
            info = json.loads(mobj.group('json'))
                'url': info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title': info[u'title'],
                'ext': video_extension,
                'thumbnail': info[u'thumbUrl'],

        # We try looking in other parts of the webpage
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            raise ExtractorError(u'Unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
# Extractor for screen.yahoo.com. Two strategies: when the page embeds a
# YUI Media CONTENT_ID, metadata comes from a YQL JSON endpoint; otherwise it
# is scraped from an mrss XML feed on cosmos.bcst.yahoo.com.
991 class YahooIE(InfoExtractor):
992 """Information extractor for screen.yahoo.com."""
993 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
995 def _real_extract(self, url):
996 mobj = re.match(self._VALID_URL, url)
# Reached when the URL does not match _VALID_URL.
998 raise ExtractorError(u'Invalid URL: %s' % url)
999 video_id = mobj.group('id')
1000 webpage = self._download_webpage(url, video_id)
# Presence/absence of this alternative id selects the strategy below.
1001 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1004 # TODO: Check which url parameters are required
1005 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1006 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
# Verbose regex over the mrss feed: title, description, publication date
# (date portion only; time after the space is discarded) and large thumbnail.
1007 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1008 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1009 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1010 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1012 self.report_extraction(video_id)
1013 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1015 raise ExtractorError(u'Unable to extract video info')
1016 video_title = m_info.group('title')
1017 video_description = m_info.group('description')
1018 video_thumb = m_info.group('thumb')
1019 video_date = m_info.group('date')
# Normalize MM/DD/YYYY to the YYYYMMDD form expected in 'upload_date'.
1020 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1022 # TODO: Find a way to get mp4 videos
1023 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1024 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
# The stream is described as a host ('url') plus a play path ('path') —
# the two pieces an rtmp download needs.
1025 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1026 video_url = m_rest.group('url')
1027 video_path = m_rest.group('path')
1029 raise ExtractorError(u'Unable to extract video url')
1031 else: # We have to use a different method if another id is defined
1032 long_id = m_id.group('new_id')
# YQL query against yahoo.media.video.streams; the response is JSONP,
# so the JSON payload is peeled out of the callback invocation below.
1033 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1034 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
1035 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1036 info = json.loads(json_str)
1037 res = info[u'query'][u'results'][u'mediaObj'][0]
# Only the first listed stream is used.
1038 stream = res[u'streams'][0]
1039 video_path = stream[u'path']
1040 video_url = stream[u'host']
1042 video_title = meta[u'title']
1043 video_description = meta[u'description']
1044 video_thumb = meta[u'thumbnail']
1045 video_date = None # I can't find it
1050 'play_path': video_path,
1051 'title':video_title,
1052 'description': video_description,
1053 'thumbnail': video_thumb,
1054 'upload_date': video_date,
# Extractor for vimeo.com. Parses the player "config" JSON embedded in the
# page, handles password-protected videos via a form POST, and picks the best
# available codec/quality combination before building the final play URL.
1059 class VimeoIE(InfoExtractor):
1060 """Information extractor for vimeo.com."""
1062 # _VALID_URL matches Vimeo URLs
1063 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
# POST the user-supplied password plus the page's xsrft token to
# <url>/password so the subsequent extraction sees the unlocked page.
1066 def _verify_video_password(self, url, video_id, webpage):
1067 password = self._downloader.params.get('password', None)
1068 if password is None:
1069 raise ExtractorError(u'This video is protected by a password, use the --password option')
# Anti-CSRF token scraped from the page JavaScript; also sent as a cookie.
1070 token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
1071 data = compat_urllib_parse.urlencode({'password': password,
1073 # I didn't manage to use the password with https
1074 if url.startswith('https'):
1075 pass_url = url.replace('https','http')
1078 password_request = compat_urllib_request.Request(pass_url+'/password', data)
1079 password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1080 password_request.add_header('Cookie', 'xsrft=%s' % token)
1081 pass_web = self._download_webpage(password_request, video_id,
1082 u'Verifying the password',
1085 def _real_extract(self, url, new_video=True):
1086 # Extract ID from URL
1087 mobj = re.match(self._VALID_URL, url)
# Reached when the URL does not match _VALID_URL.
1089 raise ExtractorError(u'Invalid URL: %s' % url)
1091 video_id = mobj.group('id')
# Normalize protocol-less, "pro" and direct-link URLs to a canonical form.
1092 if not mobj.group('proto'):
1093 url = 'https://' + url
1094 if mobj.group('direct_link') or mobj.group('pro'):
1095 url = 'https://vimeo.com/' + video_id
1097 # Retrieve video webpage to extract further information
1098 request = compat_urllib_request.Request(url, None, std_headers)
1099 webpage = self._download_webpage(request, video_id)
1101 # Now we begin extracting as much information as we can from what we
1102 # retrieved. First we extract the information common to all extractors,
1103 # and latter we extract those that are Vimeo specific.
1104 self.report_extraction(video_id)
1106 # Extract the config JSON
# The config object is carved out of the page script by plain string
# splitting, then parsed as JSON.
1108 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1109 config = json.loads(config)
# Distinguish embed-restriction from password protection to give the
# user an actionable error (or retry after verifying the password).
1111 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1112 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1114 if re.search('If so please provide the correct password.', webpage):
1115 self._verify_video_password(url, video_id, webpage)
1116 return self._real_extract(url)
1118 raise ExtractorError(u'Unable to extract info section')
1121 video_title = config["video"]["title"]
1123 # Extract uploader and uploader_id
1124 video_uploader = config["video"]["owner"]["name"]
1125 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1127 # Extract video thumbnail
1128 video_thumbnail = config["video"]["thumbnail"]
1130 # Extract video description
1131 video_description = get_element_by_attribute("itemprop", "description", webpage)
1132 if video_description: video_description = clean_html(video_description)
1133 else: video_description = u''
1135 # Extract upload date
1136 video_upload_date = None
1137 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1138 if mobj is not None:
# Reassemble ISO date parts into the YYYYMMDD form used by 'upload_date'.
1139 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1141 # Vimeo specific: extract request signature and timestamp
1142 sig = config['request']['signature']
1143 timestamp = config['request']['timestamp']
1145 # Vimeo specific: extract video codec and quality information
1146 # First consider quality, then codecs, then take everything
1147 # TODO bind to format param
# Codec preference order: h264 first, then vp8, then vp6. For each codec,
# hd beats sd beats whatever else the config lists first.
1148 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1149 files = { 'hd': [], 'sd': [], 'other': []}
1150 for codec_name, codec_extension in codecs:
1151 if codec_name in config["video"]["files"]:
1152 if 'hd' in config["video"]["files"][codec_name]:
1153 files['hd'].append((codec_name, codec_extension, 'hd'))
1154 elif 'sd' in config["video"]["files"][codec_name]:
1155 files['sd'].append((codec_name, codec_extension, 'sd'))
1156 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Take the first (best) entry from the highest non-empty quality bucket.
1159 for quality in ('hd', 'sd', 'other'):
1160 if len(files[quality]) > 0:
1161 video_quality = files[quality][0][2]
1162 video_codec = files[quality][0][0]
1163 video_extension = files[quality][0][1]
1164 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1167 raise ExtractorError(u'No known codec found')
# Final URL is the play_redirect endpoint signed with sig + timestamp.
1169 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1170 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1175 'uploader': video_uploader,
1176 'uploader_id': video_uploader_id,
1177 'upload_date': video_upload_date,
1178 'title': video_title,
1179 'ext': video_extension,
1180 'thumbnail': video_thumbnail,
1181 'description': video_description,
# Extractor for videos.arte.tv (fr/de). Uses a small grep_webpage() helper to
# fetch a page and pull named groups out of it, and handles two page kinds:
# live streams (extractLiveStream) and "+7" catch-up pages (extractPlus7Stream).
1186 class ArteTvIE(InfoExtractor):
1187 """arte.tv information extractor."""
1189 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
# A trailing "index-<n>.html" path component marks a live-stream page.
1190 _LIVE_URL = r'index-[0-9]+\.html$'
1192 IE_NAME = u'arte.tv'
# Download a page body, wrapping network and URL errors in ExtractorError.
1194 def fetch_webpage(self, url):
1195 request = compat_urllib_request.Request(url)
1197 self.report_download_webpage(url)
1198 webpage = compat_urllib_request.urlopen(request).read()
1199 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1200 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1201 except ValueError as err:
1202 raise ExtractorError(u'Invalid URL: %s' % url)
# Fetch url, apply regex, and map match groups into a dict keyed by the
# matchTuples entries (group index, key name, error message on miss).
1205 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1206 page = self.fetch_webpage(url)
1207 mobj = re.search(regex, page, regexFlags)
1211 raise ExtractorError(u'Invalid URL: %s' % url)
1213 for (i, key, err) in matchTuples:
1214 if mobj.group(i) is None:
1215 raise ExtractorError(err)
1217 info[key] = mobj.group(i)
# Live streams: locate the videothek JS, then grep the geo-specific stream
# path, player SWF and base url out of it.
1221 def extractLiveStream(self, url):
# Language code is taken from the URL path (fr/de), 4 segments from the end.
1222 video_lang = url.split('/')[-4]
1223 info = self.grep_webpage(
1225 r'src="(.*?/videothek_js.*?\.js)',
1228 (1, 'url', u'Invalid URL: %s' % url)
1231 http_host = url.split('/')[2]
1232 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1233 info = self.grep_webpage(
1235 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1236 '(http://.*?\.swf).*?' +
1240 (1, 'path', u'could not extract video path: %s' % url),
1241 (2, 'player', u'could not extract video player: %s' % url),
1242 (3, 'url', u'could not extract video url: %s' % url)
1245 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# "+7" catch-up pages: three hops — page -> videoref XML -> per-language
# <video> ref -> final metadata (id, title, date, hd url).
1247 def extractPlus7Stream(self, url):
# Here the language code sits 3 segments from the end of the path.
1248 video_lang = url.split('/')[-3]
1249 info = self.grep_webpage(
1251 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1254 (1, 'url', u'Invalid URL: %s' % url)
1257 next_url = compat_urllib_parse.unquote(info.get('url'))
1258 info = self.grep_webpage(
1260 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1263 (1, 'url', u'Could not find <video> tag: %s' % url)
1266 next_url = compat_urllib_parse.unquote(info.get('url'))
1268 info = self.grep_webpage(
1270 r'<video id="(.*?)".*?>.*?' +
1271 '<name>(.*?)</name>.*?' +
1272 '<dateVideo>(.*?)</dateVideo>.*?' +
1273 '<url quality="hd">(.*?)</url>',
1276 (1, 'id', u'could not extract video id: %s' % url),
1277 (2, 'title', u'could not extract video title: %s' % url),
1278 (3, 'date', u'could not extract video date: %s' % url),
1279 (4, 'url', u'could not extract video url: %s' % url)
1284 'id': info.get('id'),
1285 'url': compat_urllib_parse.unquote(info.get('url')),
1286 'uploader': u'arte.tv',
1287 'upload_date': unified_strdate(info.get('date')),
1288 'title': info.get('title').decode('utf-8'),
# Route to the live or "+7" extraction path based on the URL shape.
1294 def _real_extract(self, url):
1295 video_id = url.split('/')[-1]
1296 self.report_extraction(video_id)
1298 if re.search(self._LIVE_URL, video_id) is not None:
1299 self.extractLiveStream(url)
1302 info = self.extractPlus7Stream(url)
# Last-resort extractor: follows URL-shortener redirects via HEAD requests,
# then tries a cascade of regexes (JW Player flashvars, file=/source= params,
# JWPlayer JS loader, Twitter cards) to dig a direct media URL out of the page.
1307 class GenericIE(InfoExtractor):
1308 """Generic last-resort information extractor."""
1311 IE_NAME = u'generic'
1313 def report_download_webpage(self, video_id):
1314 """Report webpage download."""
# Warn (outside test mode) that we fell through to the generic extractor.
1315 if not self._downloader.params.get('test', False):
1316 self._downloader.report_warning(u'Falling back on generic information extractor.')
1317 super(GenericIE, self).report_download_webpage(video_id)
1319 def report_following_redirect(self, new_url):
1320 """Report information extraction."""
1321 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1323 def _test_redirect(self, url):
1324 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Issue HEAD requests so redirect targets are discovered without
# downloading response bodies.
1325 class HeadRequest(compat_urllib_request.Request):
1326 def get_method(self):
1329 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1331 Subclass the HTTPRedirectHandler to make it use our
1332 HeadRequest also on the redirected URL
1334 def redirect_request(self, req, fp, code, msg, headers, newurl):
1335 if code in (301, 302, 303, 307):
1336 newurl = newurl.replace(' ', '%20')
# Drop entity headers that no longer apply to the redirected request.
1337 newheaders = dict((k,v) for k,v in req.headers.items()
1338 if k.lower() not in ("content-length", "content-type"))
1339 return HeadRequest(newurl,
1341 origin_req_host=req.get_origin_req_host(),
1344 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1346 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1348 Fallback to GET if HEAD is not allowed (405 HTTP error)
1350 def http_error_405(self, req, fp, code, msg, headers):
1354 newheaders = dict((k,v) for k,v in req.headers.items()
1355 if k.lower() not in ("content-length", "content-type"))
1356 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1358 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener with only the handlers needed for the HEAD probe.
1362 opener = compat_urllib_request.OpenerDirector()
1363 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1364 HTTPMethodFallback, HEADRedirectHandler,
1365 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1366 opener.add_handler(handler())
1368 response = opener.open(HeadRequest(url))
1369 if response is None:
1370 raise ExtractorError(u'Invalid URL protocol')
1371 new_url = response.geturl()
1376 self.report_following_redirect(new_url)
1379 def _real_extract(self, url):
# If the URL was just a redirect, hand the target back as a url_result.
1380 new_url = self._test_redirect(url)
1381 if new_url: return [self.url_result(new_url)]
1383 video_id = url.split('/')[-1]
1385 webpage = self._download_webpage(url, video_id)
1386 except ValueError as err:
1387 # since this is the last-resort InfoExtractor, if
1388 # this error is thrown, it'll be thrown here
1389 raise ExtractorError(u'Invalid URL: %s' % url)
1391 self.report_extraction(video_id)
1392 # Start with something easy: JW Player in SWFObject
1393 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1395 # Broaden the search a little bit
1396 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1398 # Broaden the search a little bit: JWPlayer JS loader
1399 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1401 # Try to find twitter cards info
1402 mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1404 raise ExtractorError(u'Invalid URL: %s' % url)
1406 # It's possible that one of the regexes
1407 # matched, but returned an empty group:
1408 if mobj.group(1) is None:
1409 raise ExtractorError(u'Invalid URL: %s' % url)
1411 video_url = compat_urllib_parse.unquote(mobj.group(1))
1412 video_id = os.path.basename(video_url)
1414 # here's a fun little line of code for you:
# Split "<name>.<ext>" from the media URL basename: extension first,
# then strip it from the id.
1415 video_extension = os.path.splitext(video_id)[1][1:]
1416 video_id = os.path.splitext(video_id)[0]
1418 # it's tempting to parse this further, but you would
1419 # have to take into account all the variations like
1420 # Video Title - Site Name
1421 # Site Name | Video Title
1422 # Video Title - Tagline | Site Name
1423 # and so on and so forth; it's just not practical
1424 mobj = re.search(r'<title>(.*)</title>', webpage)
1426 raise ExtractorError(u'Unable to extract title')
1427 video_title = mobj.group(1)
1429 # video uploader is domain name
1430 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1432 raise ExtractorError(u'Unable to extract title')
1433 video_uploader = mobj.group(1)
1438 'uploader': video_uploader,
1439 'upload_date': None,
1440 'title': video_title,
1441 'ext': video_extension,
# Search extractor for "ytsearch" queries: pages through the YouTube GData
# API (50 results per page, jsonc format) until n results are collected,
# then wraps them as a playlist of youtube watch URLs.
1445 class YoutubeSearchIE(SearchInfoExtractor):
1446 """Information Extractor for YouTube search queries."""
# start-index is 1-based; max-results is capped at 50 by the API.
1447 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1449 IE_NAME = u'youtube:search'
1450 _SEARCH_KEY = 'ytsearch'
1452 def report_download_page(self, query, pagenum):
1453 """Report attempt to download search page with given number."""
1454 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1456 def _get_n_results(self, query, n):
1457 """Get a specified number of results for a query"""
# Loop one API page at a time until we have reached the effective limit.
1463 while (50 * pagenum) < limit:
1464 self.report_download_page(query, pagenum+1)
1465 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1466 request = compat_urllib_request.Request(result_url)
1468 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1469 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1470 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1471 api_response = json.loads(data)['data']
1473 if not 'items' in api_response:
1474 raise ExtractorError(u'[youtube] No video results')
1476 new_ids = list(video['id'] for video in api_response['items'])
1477 video_ids += new_ids
# Never ask for more than the API says exists.
1479 limit = min(n, api_response['totalItems'])
# Trim any overshoot from the last page before building results.
1482 if len(video_ids) > n:
1483 video_ids = video_ids[:n]
1484 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1485 return self.playlist_result(videos, query)
# Search extractor for "gvsearch" queries: scrapes Google Video search result
# pages (10 results each), collecting result links into a playlist dict until
# n results are reached or the "next page" marker disappears.
1488 class GoogleSearchIE(SearchInfoExtractor):
1489 """Information Extractor for Google Video search queries."""
# Presence of the "pnnext" pagination link means more pages exist.
1490 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1492 IE_NAME = u'video.google:search'
1493 _SEARCH_KEY = 'gvsearch'
1495 def _get_n_results(self, query, n):
1496 """Get a specified number of results for a query"""
1499 '_type': 'playlist',
1504 for pagenum in itertools.count(1):
# start=<pagenum*10> pages through results 10 at a time.
1505 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1506 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1507 note='Downloading result page ' + str(pagenum))
# Each result link lives in an <h3 class="r"> heading.
1509 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1512 'url': mobj.group(1)
1514 res['entries'].append(e)
# Stop when enough results were gathered or no further pages exist.
1516 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
# Search extractor for "yvsearch" queries: pages through Yahoo video search
# JSON responses (30 results each) and collects screen.yahoo.com result URLs
# into a playlist dict.
1519 class YahooSearchIE(SearchInfoExtractor):
1520 """Information Extractor for Yahoo! Video search queries."""
1523 IE_NAME = u'screen.yahoo:search'
1524 _SEARCH_KEY = 'yvsearch'
1526 def _get_n_results(self, query, n):
1527 """Get a specified number of results for a query"""
1530 '_type': 'playlist',
1534 for pagenum in itertools.count(0):
# b=<offset> pages through results 30 at a time (o=js returns JSON).
1535 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1536 webpage = self._download_webpage(result_url, query,
1537 note='Downloading results page '+str(pagenum+1))
1538 info = json.loads(webpage)
1540 results = info[u'results']
1542 for (i, r) in enumerate(results):
# Stop mid-page once the requested count n is reached.
1543 if (pagenum * 30) +i >= n:
1545 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1546 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1547 res['entries'].append(e)
# Also stop when the result metadata says this was the last page.
1548 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
# Extractor for YouTube playlists (also courses, artist pages, user uploads).
# Pages through the GData playlist feed, orders entries by playlist position,
# and returns a playlist of youtube watch URLs.
1554 class YoutubePlaylistIE(InfoExtractor):
1555 """Information Extractor for YouTube playlists."""
# Verbose regex (note the re.VERBOSE matches below): accepts full playlist
# URLs with p=/a=/list= parameters as well as bare playlist ids.
1557 _VALID_URL = r"""(?:
1562 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1563 \? (?:.*?&)*? (?:p|a|list)=
1566 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1569 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1571 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1573 IE_NAME = u'youtube:playlist'
1576 def suitable(cls, url):
1577 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL needs the re.VERBOSE flag.
1578 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1580 def _real_extract(self, url):
1581 # Extract playlist id
1582 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1584 raise ExtractorError(u'Invalid URL: %s' % url)
1586 # Download playlist videos from API
# Group 1 is the URL-parameter form; group 2 is the bare-id form.
1587 playlist_id = mobj.group(1) or mobj.group(2)
# GData start-index is 1-based, hence the "+ 1".
1592 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1593 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1596 response = json.loads(page)
1597 except ValueError as err:
1598 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1600 if 'feed' not in response:
1601 raise ExtractorError(u'Got a malformed response from YouTube API')
1602 playlist_title = response['feed']['title']['$t']
1603 if 'entry' not in response['feed']:
1604 # Number of videos is a multiple of self._MAX_RESULTS
# Keep (position, url) pairs so entries can be re-sorted by playlist
# position below; entries without 'content' (e.g. deleted) are skipped.
1607 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1608 for entry in response['feed']['entry']
1609 if 'content' in entry ]
# A short page means this was the last one.
1611 if len(response['feed']['entry']) < self._MAX_RESULTS:
1615 videos = [v[1] for v in sorted(videos)]
1617 url_results = [self.url_result(url, 'Youtube') for url in videos]
1618 return [self.playlist_result(url_results, playlist_id, playlist_title)]
# Extractor for YouTube channels: scrapes video ids from the first channel
# HTML page, then pages through the channel_ajax JSON endpoint while the
# "load more" marker is present.
1621 class YoutubeChannelIE(InfoExtractor):
1622 """Information Extractor for YouTube channels."""
1624 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1625 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# Marker string whose presence in a page/JSON snippet signals more pages.
1626 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1627 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1628 IE_NAME = u'youtube:channel'
# Collect unique video ids from watch links in a chunk of channel HTML.
1630 def extract_videos_from_page(self, page):
1632 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1633 if mobj.group(1) not in ids_in_page:
1634 ids_in_page.append(mobj.group(1))
1637 def _real_extract(self, url):
1638 # Extract channel id
1639 mobj = re.match(self._VALID_URL, url)
1641 raise ExtractorError(u'Invalid URL: %s' % url)
1643 # Download channel page
1644 channel_id = mobj.group(1)
1648 url = self._TEMPLATE_URL % (channel_id, pagenum)
1649 page = self._download_webpage(url, channel_id,
1650 u'Downloading page #%s' % pagenum)
1652 # Extract video identifiers
1653 ids_in_page = self.extract_videos_from_page(page)
1654 video_ids.extend(ids_in_page)
1656 # Download any subsequent channel pages using the json-based channel_ajax query
1657 if self._MORE_PAGES_INDICATOR in page:
1659 pagenum = pagenum + 1
1661 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1662 page = self._download_webpage(url, channel_id,
1663 u'Downloading page #%s' % pagenum)
# Subsequent pages arrive as JSON with the HTML under 'content_html'.
1665 page = json.loads(page)
1667 ids_in_page = self.extract_videos_from_page(page['content_html'])
1668 video_ids.extend(ids_in_page)
# The load-more widget disappears from the JSON when pages run out.
1670 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1673 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1675 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1676 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1677 return [self.playlist_result(url_entries, channel_id)]
# Extractor for YouTube user upload feeds: pages through the GData uploads
# feed 50 ids at a time, stopping as soon as a page comes back short.
1680 class YoutubeUserIE(InfoExtractor):
1681 """Information Extractor for YouTube users."""
# Accepts user-page URLs as well as the internal "ytuser:<name>" form.
1683 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1684 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1685 _GDATA_PAGE_SIZE = 50
1686 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1687 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1688 IE_NAME = u'youtube:user'
1690 def _real_extract(self, url):
1692 mobj = re.match(self._VALID_URL, url)
1694 raise ExtractorError(u'Invalid URL: %s' % url)
1696 username = mobj.group(1)
1698 # Download video ids using YouTube Data API. Result size per
1699 # query is limited (currently to 50 videos) so we need to query
1700 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1707 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1709 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1710 page = self._download_webpage(gdata_url, username,
1711 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1713 # Extract video identifiers
# De-duplicate within the page while preserving feed order.
1716 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1717 if mobj.group(1) not in ids_in_page:
1718 ids_in_page.append(mobj.group(1))
1720 video_ids.extend(ids_in_page)
1722 # A little optimization - if current page is not
1723 # "full", ie. does not contain PAGE_SIZE video ids then
1724 # we can assume that this page is the last one - there
1725 # are no more ids on further pages - no need to query
1728 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1733 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1734 url_results = [self.url_result(url, 'Youtube') for url in urls]
1735 return [self.playlist_result(url_results, playlist_title = username)]
# Extractor for blip.tv user pages: resolves the numeric user id from the
# user page, then pages through the mobile episode-list AJAX endpoint,
# stopping when a page comes back short.
1737 class BlipTVUserIE(InfoExtractor):
1738 """Information Extractor for blip.tv users."""
# Accepts user-page URLs as well as the internal "bliptvuser:<name>" form.
1740 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1742 IE_NAME = u'blip.tv:user'
1744 def _real_extract(self, url):
1746 mobj = re.match(self._VALID_URL, url)
1748 raise ExtractorError(u'Invalid URL: %s' % url)
1750 username = mobj.group(1)
1752 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1754 page = self._download_webpage(url, username, u'Downloading user page')
# The numeric users_id needed by the AJAX endpoint is embedded in the page.
1755 mobj = re.search(r'data-users-id="([^"]+)"', page)
1756 page_base = page_base % mobj.group(1)
1759 # Download video ids using BlipTV Ajax calls. Result size per
1760 # query is limited (currently to 12 videos) so we need to query
1761 # page by page until there are no video ids - it means we got
1768 url = page_base + "&page=" + str(pagenum)
1769 page = self._download_webpage(url, username,
1770 u'Downloading video ids from page %d' % pagenum)
1772 # Extract video identifiers
# De-duplicate within the page; hrefs are HTML-unescaped before storing.
1775 for mobj in re.finditer(r'href="/([^"]+)"', page):
1776 if mobj.group(1) not in ids_in_page:
1777 ids_in_page.append(unescapeHTML(mobj.group(1)))
1779 video_ids.extend(ids_in_page)
1781 # A little optimization - if current page is not
1782 # "full", ie. does not contain PAGE_SIZE video ids then
1783 # we can assume that this page is the last one - there
1784 # are no more ids on further pages - no need to query
1787 if len(ids_in_page) < self._PAGE_SIZE:
1792 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1793 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1794 return [self.playlist_result(url_entries, playlist_title = username)]
# Extractor for depositfiles.com: simulates pressing the "Free download"
# button, then scrapes the real file URL (or a human-readable restriction
# message) from the response.
1797 class DepositFilesIE(InfoExtractor):
1798 """Information extractor for depositfiles.com"""
1800 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1802 def _real_extract(self, url):
1803 file_id = url.split('/')[-1]
1804 # Rebuild url in english locale
# Forces the /en/ locale so the scraped strings below are predictable.
1805 url = 'http://depositfiles.com/en/files/' + file_id
1807 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 emulates the free-download form submission.
1808 free_download_indication = { 'gateway_result' : '1' }
1809 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1811 self.report_download_webpage(file_id)
1812 webpage = compat_urllib_request.urlopen(request).read()
1813 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1814 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1816 # Search for the real file URL
1817 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1818 if (mobj is None) or (mobj.group(1) is None):
1819 # Try to figure out reason of the error.
# Surface the site's own restriction notice (whitespace-collapsed)
# instead of a generic failure when possible.
1820 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1821 if (mobj is not None) and (mobj.group(1) is not None):
1822 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1823 raise ExtractorError(u'%s' % restriction_message)
1825 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1827 file_url = mobj.group(1)
1828 file_extension = os.path.splitext(file_url)[1][1:]
1830 # Search for file title
1831 mobj = re.search(r'<b title="(.*?)">', webpage)
1833 raise ExtractorError(u'Unable to extract title')
1834 file_title = mobj.group(1).decode('utf-8')
1837 'id': file_id.decode('utf-8'),
1838 'url': file_url.decode('utf-8'),
1840 'upload_date': None,
1841 'title': file_title,
1842 'ext': file_extension.decode('utf-8'),
# Extractor for Facebook videos. Optionally logs in (credentials from CLI
# options or .netrc) during _real_initialize, then pulls the video parameters
# out of a JSON blob between two known SWF-setup markers in the page.
1846 class FacebookIE(InfoExtractor):
1847 """Information Extractor for Facebook"""
1849 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1850 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1851 _NETRC_MACHINE = 'facebook'
1852 IE_NAME = u'facebook'
1854 def report_login(self):
1855 """Report attempt to log in."""
1856 self.to_screen(u'Logging in')
# Optional login: failures here only warn; extraction proceeds anonymously.
1858 def _real_initialize(self):
1859 if self._downloader is None:
1864 downloader_params = self._downloader.params
1866 # Attempt to use provided username and password or .netrc data
1867 if downloader_params.get('username', None) is not None:
1868 useremail = downloader_params['username']
1869 password = downloader_params['password']
1870 elif downloader_params.get('usenetrc', False):
1872 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1873 if info is not None:
1877 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1878 except (IOError, netrc.NetrcParseError) as err:
1879 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# No credentials available: skip login entirely.
1882 if useremail is None:
1891 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1894 login_results = compat_urllib_request.urlopen(request).read()
# A login form in the response means authentication did not succeed.
1895 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1896 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1898 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1899 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1902 def _real_extract(self, url):
1903 mobj = re.match(self._VALID_URL, url)
1905 raise ExtractorError(u'Invalid URL: %s' % url)
1906 video_id = mobj.group('ID')
1908 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1909 webpage = self._download_webpage(url, video_id)
# The SWF variable table sits between these two literal script fragments.
1911 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1912 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1913 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1915 raise ExtractorError(u'Cannot parse data')
1916 data = dict(json.loads(m.group(1)))
# 'params' is URL-encoded JSON nested inside the outer JSON blob.
1917 params_raw = compat_urllib_parse.unquote(data['params'])
1918 params = json.loads(params_raw)
1919 video_data = params['video_data'][0]
# Prefer the HD source; fall back to SD.
1920 video_url = video_data.get('hd_src')
1922 video_url = video_data['sd_src']
1924 raise ExtractorError(u'Cannot find video URL')
1925 video_duration = int(video_data['video_duration'])
1926 thumbnail = video_data['thumbnail_src']
1928 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
1930 raise ExtractorError(u'Cannot find title in webpage')
1931 video_title = unescapeHTML(m.group(1))
1935 'title': video_title,
1938 'duration': video_duration,
1939 'thumbnail': thumbnail,
# Extractor for blip.tv video pages, /play/ embed URLs and a.blip.tv/api.swf# links.
# NOTE(review): this chunk is an elided view -- embedded original line numbers are
# preserved and intermediate lines (guards, try:, returns) are missing. Code is kept
# byte-identical; comments only.
1944 class BlipTVIE(InfoExtractor):
1945 """Information extractor for blip.tv"""
# Accepts any blip.tv path, a /play/ embed, or an api.swf# fragment link.
1947 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
# Pulls the filename extension off a media URL (group 1 = extension).
1948 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1949 IE_NAME = u'blip.tv'
1951 def report_direct_download(self, title):
1952 """Report information extraction."""
1953 self.to_screen(u'%s: Direct download detected' % title)
1955 def _real_extract(self, url):
1956 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
1958 raise ExtractorError(u'Invalid URL: %s' % url)
1960 # See https://github.com/rg3/youtube-dl/issues/857
# Rewrite api.swf#<id> links to the equivalent /play/ URL.
1961 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1962 if api_mobj is not None:
1963 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1964 urlp = compat_urllib_parse_urlparse(url)
# /play/ URLs redirect; the real file id lives in the redirect URL's fragment
# ('file=...'). Re-enter _real_extract with the canonical /a/a-<id> URL.
1965 if urlp.path.startswith('/play/'):
1966 request = compat_urllib_request.Request(url)
1967 response = compat_urllib_request.urlopen(request)
1968 redirecturl = response.geturl()
1969 rurlp = compat_urllib_parse_urlparse(redirecturl)
1970 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1971 url = 'http://blip.tv/a/a-' + file_id
1972 return self._real_extract(url)
# Ask blip.tv for the JSON description of the page (cchar is '?' or '&',
# chosen in an elided line depending on whether url already has a query).
1979 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1980 request = compat_urllib_request.Request(json_url)
# blip.tv serves different content to iTunes; spoof its User-Agent.
1981 request.add_header('User-Agent', 'iTunes/10.6.1')
1982 self.report_extraction(mobj.group(1))
1985 urlh = compat_urllib_request.urlopen(request)
# If the server answers with the media itself instead of JSON, synthesize
# the info dict from the URL's basename (direct-download case).
1986 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1987 basename = url.split('/')[-1]
1988 title,ext = os.path.splitext(basename)
# NOTE(review): .decode on a str is Python-2-only here -- confirm target runtime.
1989 title = title.decode('UTF-8')
1990 ext = ext.replace('.', '')
1991 self.report_direct_download(title)
1996 'upload_date': None,
2001 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2002 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
# Regular (JSON) path: parse the 'Post' payload into the info dict.
2003 if info is None: # Regular URL
2005 json_code_bytes = urlh.read()
2006 json_code = json_code_bytes.decode('utf-8')
2007 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2008 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2011 json_data = json.loads(json_code)
2012 if 'Post' in json_data:
2013 data = json_data['Post']
# datestamp arrives as e.g. '12-31-12 11:59PM'; normalized to YYYYMMDD.
2017 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2018 video_url = data['media']['url']
2019 umobj = re.match(self._URL_EXT, video_url)
# (elided guard) raised when the media URL has no recognizable extension
2021 raise ValueError('Can not determine filename extension')
2022 ext = umobj.group(1)
2025 'id': data['item_id'],
2027 'uploader': data['display_name'],
2028 'upload_date': upload_date,
2029 'title': data['title'],
2031 'format': data['media']['mimeType'],
2032 'thumbnail': data['thumbnailUrl'],
2033 'description': data['description'],
2034 'player_url': data['embedUrl'],
# Downloader must keep spoofing iTunes when fetching the media itself.
2035 'user_agent': 'iTunes/10.6.1',
2037 except (ValueError,KeyError) as err:
2038 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# Extractor for myvideo.de. Handles both a plain-<source> fast path and an
# RC4-encrypted flashvars XML path used for RTMP/HLS delivery.
# NOTE(review): elided view -- embedded original line numbers preserved,
# intermediate lines missing; code byte-identical, comments only.
2043 class MyVideoIE(InfoExtractor):
2044 """Information Extractor for myvideo.de."""
2046 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2047 IE_NAME = u'myvideo'
2049 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2050 # Released into the Public Domain by Tristan Fischer on 2013-05-19
2051 # https://github.com/rg3/youtube-dl/pull/842
# RC4 stream cipher: KSA over a 256-entry box, then PRGA XORing the data.
# (The x initialisation, PRGA loop header and return are elided here.)
2052 def __rc4crypt(self,data, key):
2054 box = list(range(256))
2055 for i in list(range(256)):
2056 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2057 box[i], box[x] = box[x], box[i]
2063 y = (y + box[x]) % 256
2064 box[x], box[y] = box[y], box[x]
2065 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# (elided def, presumably __md5(s)) hex MD5 digest, returned as bytes.
2069 return hashlib.md5(s).hexdigest().encode()
2071 def _real_extract(self,url):
2072 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
2074 raise ExtractorError(u'invalid URL: %s' % url)
2076 video_id = mobj.group(1)
# GK: doubly-base64-encoded secret used to derive the RC4 key below.
2079 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2080 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2081 b'TnpsbA0KTVRkbU1tSTRNdz09'
2085 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2086 webpage = self._download_webpage(webpage_url, video_id)
# Fast path: page exposes a direct <source src='...'> -- take it as .flv.
2088 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2089 if mobj is not None:
2090 self.report_extraction(video_id)
2091 video_url = mobj.group(1) + '.flv'
2093 mobj = re.search('<title>([^<]+)</title>', webpage)
# (elided guard) raised when no <title> is found
2095 raise ExtractorError(u'Unable to extract title')
2096 video_title = mobj.group(1)
2098 mobj = re.search('[.](.+?)$', video_url)
# (elided guard) raised when the URL carries no extension
# NOTE(review): 'extention' is a typo in the user-facing message; a doc-only
# edit cannot change runtime strings -- candidate for a future fix.
2100 raise ExtractorError(u'Unable to extract extention')
2101 video_ext = mobj.group(1)
2107 'upload_date': None,
2108 'title': video_title,
# Encrypted path: flashvars carry URL-quoted parameters, one of which
# ('_encxml') points at the encrypted playlist XML.
2113 mobj = re.search('var flashvars={(.+?)}', webpage)
# (elided guard) raised when flashvars are absent
2115 raise ExtractorError(u'Unable to extract video')
2120 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2121 if not a == '_encxml':
# (elided) non-_encxml pairs go into params; _encxml is unquoted below.
2124 encxml = compat_urllib_parse.unquote(b)
2125 if not params.get('domain'):
2126 params['domain'] = 'www.myvideo.de'
2127 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
# MTV-branded player variant is not supported; fall back to the generic
# player XML endpoint instead.
2128 if 'flash_playertype=MTV' in xmldata_url:
2129 self._downloader.report_warning(u'avoiding MTV player')
2131 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2132 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# The response is 'something=<hex>'; keep only the hex payload.
2136 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2137 enc_data_b = binascii.unhexlify(enc_data)
# RC4 key material: decoded GK secret combined with the video id
# (full derivation partially elided; likely hashed via __md5).
2139 base64.b64decode(base64.b64decode(GK)) +
2141 str(video_id).encode('utf-8')
2144 dec_data = self.__rc4crypt(enc_data_b, sk)
2147 self.report_extraction(video_id)
# rtmpurl comes URL-quoted inside the decrypted XML.
2149 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
# (elided guard) raised when no connectionurl is present
2151 raise ExtractorError(u'unable to extract rtmpurl')
2152 video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1))
# Some servers only work over RTMPT; rewrite the scheme in that case.
2153 if 'myvideo2flash' in video_rtmpurl:
2154 self._downloader.report_warning(u'forcing RTMPT ...')
2155 video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://')
2157 # extract non rtmp videos
2158 if (video_rtmpurl is None) or (video_rtmpurl == ''):
2159 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
# (elided guard) raised when neither path nor source are present
2161 raise ExtractorError(u'unable to extract url')
2162 video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2164 mobj = re.search('source=\'(.*?)\'', dec_data)
# (elided guard) raised when source is absent
2166 raise ExtractorError(u'unable to extract swfobj')
2167 video_file = compat_urllib_parse.unquote(mobj.group(1))
# Non-f4m: build an RTMP play path 'ext:path'. f4m: derive the .m3u8
# HLS playlist URL from the manifest URL instead.
2169 if not video_file.endswith('f4m'):
2170 ppath, prefix = video_file.split('.')
2171 video_playpath = '%s:%s' % (prefix, ppath)
2172 video_hls_playlist = ''
2175 video_hls_playlist = (
2176 video_filepath + video_file
2177 ).replace('.f4m', '.m3u8')
2179 mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage)
# (elided guard) raised when the SWF player URL is absent
2181 raise ExtractorError(u'unable to extract swfobj')
2182 video_swfobj = compat_urllib_parse.unquote(mobj.group(1))
2184 mobj = re.search("<h1(?: class='globalHd')?>(.*?)</h1>", webpage)
# (elided guard) raised when no <h1> title is found
2186 raise ExtractorError(u'unable to extract title')
2187 video_title = mobj.group(1)
2191 'url': video_rtmpurl,
2192 'tc_url': video_rtmpurl,
2194 'upload_date': None,
2195 'title': video_title,
2197 'play_path': video_playpath,
2198 'video_file': video_file,
2199 'video_hls_playlist': video_hls_playlist,
2200 'player_url': video_swfobj,
# Extractor for The Daily Show / The Colbert Report: resolves shortcuts,
# follows the MRSS feed index, then picks an RTMP rendition per episode part.
# NOTE(review): elided view -- embedded original line numbers preserved,
# intermediate lines missing; code byte-identical, comments only.
2203 class ComedyCentralIE(InfoExtractor):
2204 """Information extractor for The Daily Show and Colbert Report """
2206 # urls can be abbreviations like :thedailyshow or :colbert
2207 # urls for episodes like:
2208 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2209 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2210 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex; must be matched with re.VERBOSE (see suitable()).
2211 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2212 |(https?://)?(www\.)?
2213 (?P<showname>thedailyshow|colbertnation)\.com/
2214 (full-episodes/(?P<episode>.*)|
2216 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2217 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Known bitrates, lowest-quality last in _print_formats, highest picked by default.
2220 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2222 _video_extensions = {
2230 _video_dimensions = {
# Overrides the base class because _VALID_URL needs the VERBOSE flag.
2240 def suitable(cls, url):
2241 """Receives a URL and returns True if suitable for this IE."""
2242 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Pretty-print the bitrate/extension/dimensions table for --list-formats.
2244 def _print_formats(self, formats):
2245 print('Available formats:')
2247 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2250 def _real_extract(self, url):
2251 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
# (elided guard) raised when the URL does not match _VALID_URL
2253 raise ExtractorError(u'Invalid URL: %s' % url)
# ':tds' / ':colbert' shortcuts expand to the shows' full-episodes pages.
2255 if mobj.group('shortname'):
2256 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2257 url = u'http://www.thedailyshow.com/full-episodes/'
2259 url = u'http://www.colbertnation.com/full-episodes/'
2260 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2261 assert mobj is not None
# Clip URLs carry the title in tdstitle/cntitle depending on the show.
2263 if mobj.group('clip'):
2264 if mobj.group('showname') == 'thedailyshow':
2265 epTitle = mobj.group('tdstitle')
2267 epTitle = mobj.group('cntitle')
2270 dlNewest = not mobj.group('episode')
2272 epTitle = mobj.group('showname')
2274 epTitle = mobj.group('episode')
2276 self.report_extraction(epTitle)
# The page may redirect to the newest episode; re-validate the final URL.
2277 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2279 url = htmlHandle.geturl()
2280 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
# (elided guard) raised when the redirect target no longer matches
2282 raise ExtractorError(u'Invalid redirected URL: ' + url)
2283 if mobj.group('episode') == '':
2284 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2285 epTitle = mobj.group('episode')
# Find the mtvnservices media URI embedded in the player markup.
2287 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2289 if len(mMovieParams) == 0:
2290 # The Colbert Report embeds the information in a without
2291 # a URL prefix; so extract the alternate reference
2292 # and then add the URL prefix manually.
2294 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2295 if len(altMovieParams) == 0:
2296 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2298 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# Resolve the URI to the MRSS index listing each episode part.
2300 uri = mMovieParams[0][1]
2301 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2302 indexXml = self._download_webpage(indexUrl, epTitle,
2303 u'Downloading show index',
2304 u'unable to download episode index')
2308 idoc = xml.etree.ElementTree.fromstring(indexXml)
2309 itemEls = idoc.findall('.//item')
# One info dict per <item> (episode part).
2310 for partNum,itemEl in enumerate(itemEls):
2311 mediaId = itemEl.findall('./guid')[0].text
2312 shortMediaId = mediaId.split(':')[-1]
2313 showId = mediaId.split(':')[-2].replace('.com', '')
2314 officialTitle = itemEl.findall('./title')[0].text
2315 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
# mediaGen config lists the available renditions for this part.
2317 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2318 compat_urllib_parse.urlencode({'uri': mediaId}))
2319 configXml = self._download_webpage(configUrl, epTitle,
2320 u'Downloading configuration for %s' % shortMediaId)
2322 cdoc = xml.etree.ElementTree.fromstring(configXml)
2324 for rendition in cdoc.findall('.//rendition'):
2325 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
# (elided guard) no renditions at all -> report and skip this part
2329 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2332 if self._downloader.params.get('listformats', None):
2333 self._print_formats([i[0] for i in turls])
2336 # For now, just pick the highest bitrate
2337 format,rtmp_video_url = turls[-1]
2339 # Get the format arg from the arg stream
2340 req_format = self._downloader.params.get('format', None)
2342 # Select format if we can find one
2345 format, rtmp_video_url = f, v
# rtmpdump of this CDN is broken; translate the RTMP path to the
# equivalent progressive-HTTP mirror instead.
2348 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
# (elided guard) raised when the URL does not fit the expected shape
2350 raise ExtractorError(u'Cannot transform RTMP url')
2351 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2352 video_url = base + m.group('finalid')
2354 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2359 'upload_date': officialDate,
2364 'description': officialTitle,
2366 results.append(info)
# Extractor for escapistmagazine.com: reads og:/meta tags, then the player's
# 'config=' JSON(ish) blob to find the media URL.
# NOTE(review): elided view -- embedded original line numbers preserved,
# intermediate lines missing; code byte-identical, comments only.
2371 class EscapistIE(InfoExtractor):
2372 """Information extractor for The Escapist """
2374 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2375 IE_NAME = u'escapist'
2377 def _real_extract(self, url):
2378 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
2380 raise ExtractorError(u'Invalid URL: %s' % url)
2381 showName = mobj.group('showname')
2382 videoId = mobj.group('episode')
2384 self.report_extraction(showName)
2385 webPage = self._download_webpage(url, showName)
# Description, thumbnail and player URL all come from <meta> tags.
# NOTE(review): each *Match is used unguarded -- a missing tag would raise
# AttributeError rather than a clean ExtractorError.
2387 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2388 description = unescapeHTML(descMatch.group(1))
2389 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2390 imgUrl = unescapeHTML(imgMatch.group(1))
2391 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2392 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL embeds a URL-quoted 'config=' parameter pointing at the
# playlist configuration.
2393 configUrlMatch = re.search('config=(.*)$', playerUrl)
2394 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2396 configJSON = self._download_webpage(configUrl, showName,
2397 u'Downloading configuration',
2398 u'unable to download configuration')
2400 # Technically, it's JavaScript, not JSON
# Crude JS->JSON fixup: swap single quotes for double quotes before parsing.
2401 configJSON = configJSON.replace("'", '"')
2404 config = json.loads(configJSON)
2405 except (ValueError,) as err:
2406 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
# playlist[1] holds the actual media entry (index 0 is presumably an
# ad/intro slot -- not verifiable from this view).
2408 playlist = config['playlist']
2409 videoUrl = playlist[1]['url']
2414 'uploader': showName,
2415 'upload_date': None,
2418 'thumbnail': imgUrl,
2419 'description': description,
2420 'player_url': playerUrl,
# Extractor for collegehumor.com: fetches the moogaloop metadata XML, then the
# Adobe HDS (f4m) manifest, and synthesizes the segment URL from both.
# NOTE(review): elided view -- embedded original line numbers preserved,
# intermediate lines missing; code byte-identical, comments only.
2425 class CollegeHumorIE(InfoExtractor):
2426 """Information extractor for collegehumor.com"""
2429 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2430 IE_NAME = u'collegehumor'
2432 def report_manifest(self, video_id):
2433 """Report information extraction."""
2434 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2436 def _real_extract(self, url):
2437 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
2439 raise ExtractorError(u'Invalid URL: %s' % url)
2440 video_id = mobj.group('videoid')
2445 'upload_date': None,
2448 self.report_extraction(video_id)
# Step 1: metadata XML (title/description/thumbnail + manifest URL).
2449 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2451 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2452 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2453 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2455 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2457 videoNode = mdoc.findall('./video')[0]
2458 info['description'] = videoNode.findall('./description')[0].text
2459 info['title'] = videoNode.findall('./caption')[0].text
2460 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2461 manifest_url = videoNode.findall('./file')[0].text
# (elided except IndexError) raised when a required node is missing
2463 raise ExtractorError(u'Invalid metadata XML file')
# hdcore parameter is required by the HDS origin to serve the manifest.
2465 manifest_url += '?hdcore=2.10.3'
2466 self.report_manifest(video_id)
# Step 2: f4m manifest -- gives the media node id and the real video id.
2468 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2469 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2470 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2472 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2474 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2475 node_id = media_node.attrib['url']
2476 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2477 except IndexError as err:
2478 raise ExtractorError(u'Invalid manifest file')
# Build the first-segment URL on the manifest's host ('/z<id>/<node>Seg1-Frag1').
2480 url_pr = compat_urllib_parse_urlparse(manifest_url)
2481 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# Extractor for xvideos.com: scrapes flv_url, title and thumbnail directly
# from the watch page.
# NOTE(review): elided view -- embedded original line numbers preserved,
# intermediate lines missing; code byte-identical, comments only.
2488 class XVideosIE(InfoExtractor):
2489 """Information extractor for xvideos.com"""
2491 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2492 IE_NAME = u'xvideos'
2494 def _real_extract(self, url):
2495 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
2497 raise ExtractorError(u'Invalid URL: %s' % url)
2498 video_id = mobj.group(1)
2500 webpage = self._download_webpage(url, video_id)
2502 self.report_extraction(video_id)
# Media URL is URL-quoted in the page's flashvars ('flv_url=...&').
2506 mobj = re.search(r'flv_url=(.+?)&', webpage)
# (elided guard) raised when flv_url is absent
2508 raise ExtractorError(u'Unable to extract video url')
2509 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title is the <title> tag, minus the trailing ' - XVID...' suffix.
2513 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
# (elided guard) raised when the title is absent
2515 raise ExtractorError(u'Unable to extract video title')
2516 video_title = mobj.group(1)
2519 # Extract video thumbnail
2520 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
# (elided guard) raised when no thumbnail URL is found
2522 raise ExtractorError(u'Unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail, not the captured name.
2523 video_thumbnail = mobj.group(0)
2529 'upload_date': None,
2530 'title': video_title,
2532 'thumbnail': video_thumbnail,
2533 'description': None,
# Extractor for a single soundcloud.com track: resolves the page URL via the
# public API, then reads the stream-definition JSON for the mp3 URL.
# NOTE(review): elided view -- embedded original line numbers preserved,
# intermediate lines missing; code byte-identical, comments only.
2539 class SoundcloudIE(InfoExtractor):
2540 """Information extractor for soundcloud.com
2541 To access the media, the uid of the song and a stream token
2542 must be extracted from the page source and the script must make
2543 a request to media.soundcloud.com/crossdomain.xml. Then
2544 the media can be grabbed by requesting from an url composed
2545 of the stream token and uid
# group(1)=uploader slug, group(2)=track slug.
2548 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2549 IE_NAME = u'soundcloud'
2551 def report_resolve(self, video_id):
2552 """Report information extraction."""
2553 self.to_screen(u'%s: Resolving id' % video_id)
2555 def _real_extract(self, url):
2556 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
2558 raise ExtractorError(u'Invalid URL: %s' % url)
2560 # extract uploader (which is in the url)
2561 uploader = mobj.group(1)
2562 # extract simple title (uploader + slug of song title)
2563 slug_title = mobj.group(2)
2564 simple_title = uploader + u'-' + slug_title
2565 full_title = '%s/%s' % (uploader, slug_title)
2567 self.report_resolve(full_title)
# resolve.json maps the human URL to the API track object (numeric id).
# NOTE(review): client_id is hard-coded; it will break if revoked.
2569 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2570 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2571 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2573 info = json.loads(info_json)
2574 video_id = info['id']
2575 self.report_extraction(full_title)
# streams endpoint lists per-format URLs; the 128kbps mp3 one is used.
2577 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2578 stream_json = self._download_webpage(streams_url, full_title,
2579 u'Downloading stream definitions',
2580 u'unable to download stream definitions')
2582 streams = json.loads(stream_json)
2583 mediaURL = streams['http_mp3_128_url']
2584 upload_date = unified_strdate(info['created_at'])
2589 'uploader': info['user']['username'],
2590 'upload_date': upload_date,
2591 'title': info['title'],
2593 'description': info['description'],
# Extractor for soundcloud.com sets (playlists): resolves the set, then emits
# one info dict per contained track (same API flow as SoundcloudIE).
# NOTE(review): elided view -- embedded original line numbers preserved,
# intermediate lines missing; code byte-identical, comments only.
2596 class SoundcloudSetIE(InfoExtractor):
2597 """Information extractor for soundcloud.com sets
2598 To access the media, the uid of the song and a stream token
2599 must be extracted from the page source and the script must make
2600 a request to media.soundcloud.com/crossdomain.xml. Then
2601 the media can be grabbed by requesting from an url composed
2602 of the stream token and uid
# Like SoundcloudIE._VALID_URL but with a mandatory '/sets/' segment.
2605 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2606 IE_NAME = u'soundcloud:set'
2608 def report_resolve(self, video_id):
2609 """Report information extraction."""
2610 self.to_screen(u'%s: Resolving id' % video_id)
2612 def _real_extract(self, url):
2613 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
2615 raise ExtractorError(u'Invalid URL: %s' % url)
2617 # extract uploader (which is in the url)
2618 uploader = mobj.group(1)
2619 # extract simple title (uploader + slug of song title)
2620 slug_title = mobj.group(2)
2621 simple_title = uploader + u'-' + slug_title
2622 full_title = '%s/sets/%s' % (uploader, slug_title)
2624 self.report_resolve(full_title)
# Resolve the set URL to its API object (contains a 'tracks' list).
# NOTE(review): hard-coded client_id, shared with SoundcloudIE.
2626 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2627 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2628 info_json = self._download_webpage(resolv_url, full_title)
2631 info = json.loads(info_json)
# API-level errors are reported per entry before (elided) bail-out.
2632 if 'errors' in info:
2633 for err in info['errors']:
2634 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2637 self.report_extraction(full_title)
# One stream lookup + info dict per track in the set.
2638 for track in info['tracks']:
2639 video_id = track['id']
2641 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2642 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2644 self.report_extraction(video_id)
2645 streams = json.loads(stream_json)
2646 mediaURL = streams['http_mp3_128_url']
2651 'uploader': track['user']['username'],
2652 'upload_date': unified_strdate(track['created_at']),
2653 'title': track['title'],
2655 'description': track['description'],
# Extractor for infoq.com presentations: the media path is base64-encoded in a
# 'jsclassref' JS variable and served over RTMPE.
# NOTE(review): elided view -- embedded original line numbers preserved,
# intermediate lines missing; code byte-identical, comments only.
2660 class InfoQIE(InfoExtractor):
2661 """Information extractor for infoq.com"""
2662 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2664 def _real_extract(self, url):
2665 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
2667 raise ExtractorError(u'Invalid URL: %s' % url)
# No numeric id available; the URL itself doubles as the display id.
2669 webpage = self._download_webpage(url, video_id=url)
2670 self.report_extraction(url)
# jsclassref holds the base64 (then URL-quoted) real media id.
2673 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
# (elided guard) raised when jsclassref is absent
2675 raise ExtractorError(u'Unable to extract video url')
2676 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2677 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2680 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
# (elided guard) raised when contentTitle is absent
2682 raise ExtractorError(u'Unable to extract video title')
2683 video_title = mobj.group(1)
2685 # Extract description
# Description is optional; fall back to a fixed placeholder string.
2686 video_description = u'No description available.'
2687 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2688 if mobj is not None:
2689 video_description = mobj.group(1)
# Derive id and extension from the media path's final filename component.
2691 video_filename = video_url.split('/')[-1]
2692 video_id, extension = video_filename.split('.')
2698 'upload_date': None,
2699 'title': video_title,
2700 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2702 'description': video_description,
# Extractor for mixcloud.com (disabled: _WORKING = False). Uses the old
# /api/1/cloudcast JSON and probes each candidate media URL until one answers.
# NOTE(review): elided view -- embedded original line numbers preserved,
# intermediate lines missing; code byte-identical, comments only. The
# .decode() calls on str in _real_extract are Python-2-only.
2707 class MixcloudIE(InfoExtractor):
2708 """Information extractor for www.mixcloud.com"""
2710 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2711 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2712 IE_NAME = u'mixcloud'
2714 def report_download_json(self, file_id):
2715 """Report JSON download."""
2716 self.to_screen(u'Downloading json')
# Returns the URL list for one format, honouring a requested bitrate when
# the format actually carries per-bitrate sub-dicts.
2718 def get_urls(self, jsonData, fmt, bitrate='best'):
2719 """Get urls from 'audio_formats' section in json"""
2722 bitrate_list = jsonData[fmt]
2723 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2724 bitrate = max(bitrate_list) # select highest
2726 url_list = jsonData[fmt][bitrate]
# Formats without bitrate sub-dicts raise TypeError on the lookup above.
2727 except TypeError: # we have no bitrate info.
2728 url_list = jsonData[fmt]
# Probe candidates in order; first URL that opens wins.
2731 def check_urls(self, url_list):
2732 """Returns 1st active url from list"""
2733 for url in url_list:
2735 compat_urllib_request.urlopen(url)
2737 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# (elided) failures fall through to the next candidate
# --list-formats output: format / bitrate / extension per entry.
2742 def _print_formats(self, formats):
2743 print('Available formats:')
2744 for fmt in formats.keys():
2745 for b in formats[fmt]:
2747 ext = formats[fmt][b][0]
2748 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2749 except TypeError: # we have no bitrate info
2750 ext = formats[fmt][0]
2751 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2754 def _real_extract(self, url):
2755 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
2757 raise ExtractorError(u'Invalid URL: %s' % url)
2758 # extract uploader & filename from url
2759 uploader = mobj.group(1).decode('utf-8')
2760 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2762 # construct API request
2763 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2764 # retrieve .json file with links to files
2765 request = compat_urllib_request.Request(file_url)
2767 self.report_download_json(file_url)
2768 jsonData = compat_urllib_request.urlopen(request).read()
2769 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2770 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2773 json_data = json.loads(jsonData)
2774 player_url = json_data['player_swf_url']
2775 formats = dict(json_data['audio_formats'])
2777 req_format = self._downloader.params.get('format', None)
2780 if self._downloader.params.get('listformats', None):
2781 self._print_formats(formats)
# 'best' (or unspecified): take the first format whose URLs respond.
2784 if req_format is None or req_format == 'best':
2785 for format_param in formats.keys():
2786 url_list = self.get_urls(formats, format_param)
2788 file_url = self.check_urls(url_list)
2789 if file_url is not None:
# Explicit format request: validate, then probe only that format.
2792 if req_format not in formats:
2793 raise ExtractorError(u'Format is not available')
2795 url_list = self.get_urls(formats, req_format)
2796 file_url = self.check_urls(url_list)
2797 format_param = req_format
2800 'id': file_id.decode('utf-8'),
2801 'url': file_url.decode('utf-8'),
2802 'uploader': uploader.decode('utf-8'),
2803 'upload_date': None,
2804 'title': json_data['name'],
2805 'ext': file_url.split('.')[-1].decode('utf-8'),
2806 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2807 'thumbnail': json_data['thumbnail_url'],
2808 'description': json_data['description'],
2809 'player_url': player_url.decode('utf-8'),
# Extractor for Stanford Open Classroom. Three URL shapes: a specific video
# (course+video), a course page (list of videos), or the root page (list of
# courses); the latter two recurse via self.extract on reference entries.
# NOTE(review): elided view -- embedded original line numbers preserved,
# intermediate lines missing; code byte-identical, comments only.
2812 class StanfordOpenClassroomIE(InfoExtractor):
2813 """Information extractor for Stanford's Open ClassRoom"""
2815 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2816 IE_NAME = u'stanfordoc'
2818 def _real_extract(self, url):
2819 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
2821 raise ExtractorError(u'Invalid URL: %s' % url)
# Case 1: a single video -- metadata comes from <video>.xml next to the media.
2823 if mobj.group('course') and mobj.group('video'): # A specific video
2824 course = mobj.group('course')
2825 video = mobj.group('video')
2827 'id': course + '_' + video,
2829 'upload_date': None,
2832 self.report_extraction(info['id'])
2833 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2834 xmlUrl = baseUrl + video + '.xml'
2836 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2837 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2838 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2839 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2841 info['title'] = mdoc.findall('./title')[0].text
2842 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
# (elided except IndexError) raised when title/videoFile nodes are missing
2844 raise ExtractorError(u'Invalid metadata XML file')
2845 info['ext'] = info['url'].rpartition('.')[2]
# Case 2: a course page -- collect its VideoPage links and recurse.
2847 elif mobj.group('course'): # A course page
2848 course = mobj.group('course')
2853 'upload_date': None,
2856 coursepage = self._download_webpage(url, info['id'],
2857 note='Downloading course info page',
2858 errnote='Unable to download course info page')
2860 m = re.search('<h1>([^<]+)</h1>', coursepage)
2862 info['title'] = unescapeHTML(m.group(1))
# Fallback when the page has no <h1>: reuse the id as title.
2864 info['title'] = info['id']
2866 m = re.search('<description>([^<]+)</description>', coursepage)
2868 info['description'] = unescapeHTML(m.group(1))
# orderedSet dedupes while preserving page order of the video links.
2870 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2873 'type': 'reference',
2874 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2878 for entry in info['list']:
2879 assert entry['type'] == 'reference'
2880 results += self.extract(entry['url'])
# Case 3: the root page -- collect CoursePage links and recurse likewise.
2884 'id': 'Stanford OpenClassroom',
2887 'upload_date': None,
2890 self.report_download_webpage(info['id'])
2891 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2893 rootpage = compat_urllib_request.urlopen(rootURL).read()
2894 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2895 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2897 info['title'] = info['id']
2899 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2902 'type': 'reference',
2903 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2908 for entry in info['list']:
2909 assert entry['type'] == 'reference'
2910 results += self.extract(entry['url'])
# Extractor for MTV.com music videos: reads mtv_* meta tags, then the mediaGen
# XML for the available renditions.
# NOTE(review): elided view -- embedded original line numbers preserved,
# intermediate lines missing; code byte-identical, comments only. The
# .decode('iso-8859-1') calls on str are Python-2-only.
2913 class MTVIE(InfoExtractor):
2914 """Information extractor for MTV.com"""
2916 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2919 def _real_extract(self, url):
2920 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
2922 raise ExtractorError(u'Invalid URL: %s' % url)
# Scheme is optional in _VALID_URL; default to http for the fetch.
2923 if not mobj.group('proto'):
2924 url = 'http://' + url
2925 video_id = mobj.group('videoid')
2927 webpage = self._download_webpage(url, video_id)
# Song and artist come from mtv_vt / mtv_an meta tags.
2929 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
# (elided guard) raised when mtv_vt is absent
2931 raise ExtractorError(u'Unable to extract song name')
2932 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2933 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
# (elided guard) raised when mtv_an is absent
2935 raise ExtractorError(u'Unable to extract performer')
2936 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2937 video_title = performer + ' - ' + song_name
2939 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# (elided guard) raised when mtvn_uri is absent
# NOTE(review): message reads 'Unable to mtvn_uri' -- missing the word
# 'extract'; runtime string, not changeable in a doc-only edit.
2941 raise ExtractorError(u'Unable to mtvn_uri')
2942 mtvn_uri = mobj.group(1)
2944 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
# (elided guard) raised when the playlist id is absent
2946 raise ExtractorError(u'Unable to extract content id')
2947 content_id = mobj.group(1)
# mediaGen endpoint returns the rendition list for this uri/id pair.
2949 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2950 self.report_extraction(video_id)
2951 request = compat_urllib_request.Request(videogen_url)
2953 metadataXml = compat_urllib_request.urlopen(request).read()
2954 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2955 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2957 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2958 renditions = mdoc.findall('.//rendition')
2960 # For now, always pick the highest quality.
2961 rendition = renditions[-1]
# type attr is 'video/<ext>'; format label is '<ext>-<w>x<h>_<bitrate>'.
2964 _,_,ext = rendition.attrib['type'].partition('/')
2965 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2966 video_url = rendition.find('./src').text
# (elided except) raised when required rendition attributes are missing
2968 raise ExtractorError('Invalid rendition field.')
2973 'uploader': performer,
2974 'upload_date': None,
2975 'title': video_title,
# Extractor for v.youku.com: fetches the getPlayList JSON, de-obfuscates the
# segment file ids with a seeded mixing scheme, and yields one info dict per
# video segment.
# NOTE(review): elided view -- embedded original line numbers preserved,
# intermediate lines missing; code byte-identical, comments only.
2983 class YoukuIE(InfoExtractor):
2984 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# (elided def, presumably _gen_sid) session id: ms timestamp + two random parts.
2987 nowTime = int(time.time() * 1000)
2988 random1 = random.randint(1000,1998)
2989 random2 = random.randint(1000,9999)
2991 return "%d%d%d" %(nowTime,random1,random2)
# Deterministically shuffles an alphabet using Youku's LCG-style PRNG so
# that numeric file-id digits can be mapped back to real characters.
2993 def _get_file_ID_mix_string(self, seed):
2995 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2997 for i in range(len(source)):
2998 seed = (seed * 211 + 30031 ) % 65536
2999 index = math.floor(seed / 65536 * len(source) )
3000 mixed.append(source[int(index)])
3001 source.remove(source[int(index)])
3002 #return ''.join(mixed)
# Translates the '*'-separated obfuscated id into the real file id using
# the seed-shuffled alphabet above.
3005 def _get_file_id(self, fileId, seed):
3006 mixed = self._get_file_ID_mix_string(seed)
3007 ids = fileId.split('*')
3011 realId.append(mixed[int(ch)])
3012 return ''.join(realId)
3014 def _real_extract(self, url):
3015 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
3017 raise ExtractorError(u'Invalid URL: %s' % url)
3018 video_id = mobj.group('ID')
3020 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3022 jsondata = self._download_webpage(info_url, video_id)
3024 self.report_extraction(video_id)
3026 config = json.loads(jsondata)
3028 video_title = config['data'][0]['title']
3029 seed = config['data'][0]['seed']
# Format selection: 'best' prefers hd2 when offered; 'worst' and explicit
# formats handled in elided branches.
3031 format = self._downloader.params.get('format', None)
3032 supported_format = list(config['data'][0]['streamfileids'].keys())
3034 if format is None or format == 'best':
3035 if 'hd2' in supported_format:
3040 elif format == 'worst':
# streamfileids gives the obfuscated id; segs gives one key per segment.
3048 fileid = config['data'][0]['streamfileids'][format]
3049 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3050 except (UnicodeDecodeError, ValueError, KeyError):
3051 raise ExtractorError(u'Unable to extract info section')
3054 sid = self._gen_sid()
3055 fileid = self._get_file_id(fileid, seed)
3057 #column 8,9 of fileid represent the segment number
3058 #fileid[7:9] should be changed
# Each segment gets its own download URL: the segment index is spliced
# into the file id (hex, two digits) and paired with that segment's key.
3059 for index, key in enumerate(keys):
3061 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3062 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3065 'id': '%s_part%02d' % (video_id, index),
3066 'url': download_url,
3068 'upload_date': None,
3069 'title': video_title,
3072 files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""
    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'

    # Patterns applied against the watch-page HTML.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided in this excerpt: 'if mobj is None:' guard]
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        result = re.search(self.VIDEO_URL_RE, webpage)
        # [elided in this excerpt: 'if result is None:' guard]
            raise ExtractorError(u'Unable to extract video url')
        # flv_url value is percent-encoded in the page.
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        # [elided in this excerpt: 'if result is None:' guard]
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        # [elided in this excerpt: 'if result is None:' guard]
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = result.group(1)

        # [elided in this excerpt: 'return [{' and id/url/ext entries]
            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading of the post entry."""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date."""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader."""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title."""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # [elided in this excerpt: 'if mobj is None:' guard]
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        # [elided in this excerpt: branch handling a missing timestamp]
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        # [elided in this excerpt: branch handling a missing author]
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Get the first line for title
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        # [elided in this excerpt: branch handling a missing description]
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        # [elided in this excerpt: 'if mobj is None:' guard]
            raise ExtractorError(u'Unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        # [elided in this excerpt: empty-result guard]
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        # [elided in this excerpt: 'try:' opening]
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        # [elided in this excerpt: 'return [{' and id/url entries]
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
3232 class NBAIE(InfoExtractor):
3233 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3236 def _real_extract(self, url):
3237 mobj = re.match(self._VALID_URL, url)
3239 raise ExtractorError(u'Invalid URL: %s' % url)
3241 video_id = mobj.group(1)
3242 if video_id.endswith('/index.html'):
3243 video_id = video_id[:-len('/index.html')]
3245 webpage = self._download_webpage(url, video_id)
3247 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3248 def _findProp(rexp, default=None):
3249 m = re.search(rexp, webpage)
3251 return unescapeHTML(m.group(1))
3255 shortened_video_id = video_id.rpartition('/')[2]
3256 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3258 'id': shortened_video_id,
3262 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3263 'description': _findProp(r'<div class="description">(.*?)</h1>'),
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?P<channelid>[^/]+)|
        (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
        (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
    # [elided in this excerpt: closing of the verbose pattern]
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        webpage = self._download_webpage(url, video_id,
                u'Downloading video info JSON',
                u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # API signals errors with a dict instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        # [elided in this excerpt: 'info' accumulator initialisation]
        for clip in response:
            video_url = clip['video_file_url']
            # [elided in this excerpt: filter skipping clips without a url]
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time looks like YYYY-MM-DD...; strip dashes -> YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                # [elided in this excerpt: 'info.append({' with id/url entries]
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided in this excerpt: 'if mobj is None:' guard]
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        # [elided in this excerpt: 'paged' flag initialisation]
        if mobj.group('channelid'):
            # Whole-channel URL: archives must be fetched page by page.
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            # [elided in this excerpt: 'if m is None:' guard]
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                note=u'Downloading chapter information',
                errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
            # [elided in this excerpt: 'break' and for-else clause]
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                note='Downloading chapter metadata',
                errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            # [elided in this excerpt: 'info = {' opening]
                'id': u'c' + chapter_id,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
        # [elided in this excerpt: chapter return and 'else:' for plain videos]
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # [elided in this excerpt: 'info'/'offset' initialisation]
        limit = self._JUSTIN_PAGE_LIMIT
        # [elided in this excerpt: paging loop header]
            self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the archive is exhausted.
            if not paged or page_count != limit:
        # [elided in this excerpt: 'break', offset increment, and return]
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided in this excerpt: 'if mobj is None:' guard]
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        # [elided in this excerpt: 'if not m:' guard]
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        # [elided in this excerpt: fallback branch when the h1 is missing]
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            # [elided in this excerpt: 'if not m:' guard]
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        # [elided in this excerpt: branch handling a missing description]
            desc = unescapeHTML(m.group('desc'))

        # [elided in this excerpt: 'info = {' with id/url/ext/title entries]
            'description': desc,
        # [elided in this excerpt: closing and return]
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com trailers."""
    _VALID_URL = r"""http://store\.steampowered\.com/
        (?P<urltype>video|app)/ #If the page is only for videos or for a game
        (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
    # [elided in this excerpt: remainder of the verbose pattern]

    # [elided in this excerpt: @classmethod decorator]
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Bypass the age gate by requesting the check URL with a fixed date.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        # [elided in this excerpt: 'videos' accumulator initialisation]
        # Pair up movie entries, titles and thumbnails positionally.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            # [elided in this excerpt: missing-url guard]
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            # [elided in this excerpt: 'info = {' with id/url/ext entries]
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            # [elided in this excerpt: closing and videos.append(info)]
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on www.ustream.tv."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Direct CDN URL derived from the numeric id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)
        # [elided in this excerpt: 'try:' opening]
            m = re.search(r'data-title="(?P<title>.+)"',webpage)
            title = m.group('title')
            m = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            # [elided in this excerpt: continuation of the re.search call]
            uploader = unescapeHTML(m.group('uploader').strip())
            m = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage)
            thumb = m.group('thumb')
        except AttributeError:
            # Any failed search above returns None -> .group raises here.
            raise ExtractorError(u'Unable to extract info')
        # [elided in this excerpt: 'info = {' with id/url/title entries]
            'uploader': uploader,
        # [elided in this excerpt: thumbnail entry and return]
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        # Flash player parameter carrying the media URL.
        _src_url = r'so\.addVariable\("file","(.*?)"\)'

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        mobj = re.search(_src_url, webpage_src)

        if mobj is not None:
            video_url = mobj.group(1)
            if 'mp4' in video_url:
            # [elided in this excerpt: ext selection branches]
        # [elided in this excerpt: 'else:' branch]
            raise ExtractorError(u'Cannot find video url for %s' % video_id)

        mobj = re.search(r"<title>(.*)</title>", webpage_src)
        # [elided in this excerpt: 'if mobj is None:' guard]
            raise ExtractorError(u'Cannot determine title')
        title = mobj.group(1)

        mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        # [elided in this excerpt: 'else:' branch re-extracting the candy title]
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
        # [elided in this excerpt: 'results = [{' with id/url entries]
            'thumbnail' : thumbnail,
        # [elided in this excerpt: remaining entries and return]
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded as a JSON assignment in a script tag.
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        # [elided in this excerpt: 'if not m:' guard]
            raise ExtractorError(u'Cannot find metadata')
        json_data = m.group(1)

        # [elided in this excerpt: 'try:' opening]
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Force the 256 kbps variant via the cbr query parameter.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        # [elided in this excerpt: 'info = {' with id/url/ext entries]
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        # [elided in this excerpt: closing and return]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # Return the format dict matching req_format, if any.
        # [elided in this excerpt: 'for x in formats:' loop header]
            if(x["format"]==req_format):
        # [elided in this excerpt: 'return x' and fallthrough return]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided in this excerpt: 'if mobj is None:' guard]
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Bypass the age gate with a pre-set cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        # [elided in this excerpt: 'if result is None:' guard]
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        # [elided in this excerpt: guard and default assignment]
            self._downloader.report_warning(u'unable to extract video date')
        # [elided in this excerpt: 'else:' branch]
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        # [elided in this excerpt: 'if result is None:' guard]
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        # [elided in this excerpt: 'else:' branch]
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        # [elided in this excerpt: 'if result is None:' guard]
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        # [elided in this excerpt: 'formats' accumulator and 'for link in links:' header]

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Fifth path segment encodes resolution and bitrate, e.g. 480p_370k.
            format = path.split('/')[4].split('_')[:2]
            # [elided in this excerpt: size/bitrate unpacking]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            # [elided in this excerpt: 'formats.append({' with id/url/ext entries]
                'uploader': video_uploader,
                'upload_date': upload_date,
                # [elided in this excerpt: title/format entries]
                'description': None,
            # [elided in this excerpt: closing of the append]

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            # [elided in this excerpt: early return]

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
        # [elided in this excerpt: return of the best format]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
        # [elided in this excerpt: return of all formats]
        # [elided in this excerpt: 'else:' branch]
            format = self._specific( req_format, formats )
            # [elided in this excerpt: missing-format guard]
                raise ExtractorError(u'Requested format not available')
            # [elided in this excerpt: return of the selected format]
3715 class PornotubeIE(InfoExtractor):
3716 """Information extractor for pornotube.com."""
3717 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3719 def _real_extract(self, url):
3720 mobj = re.match(self._VALID_URL, url)
3722 raise ExtractorError(u'Invalid URL: %s' % url)
3724 video_id = mobj.group('videoid')
3725 video_title = mobj.group('title')
3727 # Get webpage content
3728 webpage = self._download_webpage(url, video_id)
3731 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3732 result = re.search(VIDEO_URL_RE, webpage)
3734 raise ExtractorError(u'Unable to extract video url')
3735 video_url = compat_urllib_parse.unquote(result.group('url'))
3737 #Get the uploaded date
3738 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3739 result = re.search(VIDEO_UPLOADED_RE, webpage)
3741 raise ExtractorError(u'Unable to extract video title')
3742 upload_date = unified_strdate(result.group('date'))
3744 info = {'id': video_id,
3747 'upload_date': upload_date,
3748 'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided in this excerpt: 'if mobj is None:' guard]
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        # [elided in this excerpt: 'if result is None:' guard]
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        # [elided in this excerpt: 'if result is None:' guard]
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        # Note: video_id is rebound to the embed page's numeric id.
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the media URL from the Flash player parameters.
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        # [elided in this excerpt: 'if result is None:' guard]
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                # [elided in this excerpt: url/uploader/upload_date entries]
                'title': video_title,
                # [elided in this excerpt: ext/format entries]
                'player_url': embed_page_url}
        # [elided in this excerpt: return]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided in this excerpt: 'if mobj is None:' guard]
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JS assignment.
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        # [elided in this excerpt: 'if not m:' guard]
            raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        # Random session id for the play API.
        session = str(random.randint(0, 1000000000))
        # [elided in this excerpt: 'mix_id' assignment from 'data']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        # [elided in this excerpt: entries accumulator initialisation]
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            # [elided in this excerpt: 'info = {' opening]
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
            # [elided in this excerpt: ext entry and append]
            if api_data['set']['at_last_track']:
            # [elided in this excerpt: 'break']
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        # [elided in this excerpt: return]
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Media and thumbnail URLs are derived directly from the id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(m.group('title'))
        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(m.group('uploader'))
        # [elided in this excerpt: 'info = {' with id/url/ext/title entries]
            'thumbnail': thumbnail,
            'uploader': uploader
        # [elided in this excerpt: closing and return]
class TEDIE(InfoExtractor):
    """Information extractor for www.ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
        # [elided in this excerpt: alternation between playlist and talk]
        ((?P<type_talk>talks)) # We have a simple talk
        # [elided in this excerpt: group closing]
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>\w+) # Here goes the name and then ".html"
    # [elided in this excerpt: closing of the verbose pattern]

    # [elided in this excerpt: @classmethod decorator]
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL uses re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        # [elided in this excerpt: 'else:' branch for playlists]
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # [elided in this excerpt: 'video_RE=r' opening of the verbose pattern]
            <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
            ([.\s]*?)data-playlist_item_id="(\d+)"
            ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        # [elided in this excerpt: pattern closing]
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        playlist_entries = []
        # Each talk is delegated back through the TED extractor via url_result.
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        # [elided in this excerpt: 'info = {' with id/url/ext/title entries]
            'thumbnail': thumb_match.group('thumbnail')
        # [elided in this excerpt: closing and return]
class MySpassIE(InfoExtractor):
    """Information extractor for www.myspass.de (metadata via XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        # [elided in this excerpt: guard falling back to the parent element]
            _, video_id = os.path.split(url_parent_path)

        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
        # [elided in this excerpt: default-format branch and 'else:']
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        # [elided in this excerpt: 'else:' default]
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        # [elided in this excerpt: 'else:' default and 'info = {' opening]
            'thumbnail': thumbnail,
            'description': description
        # [elided in this excerpt: closing and return]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<div class="module-title">(.*?)</div>', webpage)
        # [elided in this excerpt: 'if not m:' guard]
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(m.group(1))

        # Per-video XML manifest listing the available variants.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Picks the last child of the manifest root; presumably the best
        # quality variant -- TODO confirm against a live manifest.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        # [elided in this excerpt: 'info = {' with id/url/ext entries]
            'title': video_title,
            'duration': duration,
        # [elided in this excerpt: closing and return]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided in this excerpt: 'if mobj is None:' guard]
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # Media URL comes from the embedded player config.
        m = re.search(r'file: "(.*?)",', webpage)
        # [elided in this excerpt: 'if not m:' guard]
            raise ExtractorError(u'Unable to find video url')
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        # [elided in this excerpt: 'if not m:' guard]
            raise ExtractorError(u'Cannot find video title')
        # Strip the site-name prefix from the og:title.
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        # [elided in this excerpt: branch handling a missing description]
            desc = unescapeHTML(m.group('desc'))

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        # [elided in this excerpt: branch handling a missing uploader]
            uploader = clean_html(m.group(1))

        # [elided in this excerpt: 'info = {' with id/url/ext/title entries]
            'description': desc,
            'uploader': uploader
        # [elided in this excerpt: closing and return]
class ARDIE(InfoExtractor):
    """Information Extractor for the ARD/Das Erste Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url: prefer an explicit documentId query
        # parameter, fall back to the last URL path component
        m = re.match(self._VALID_URL, url)
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # No streams at all: the page carries an FSK (age-rating) marker
            # and the video is only served in the evening.
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
class ZDFIE(InfoExtractor):
    """Information Extractor for the ZDF Mediathek."""
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        # BUGFIX: a list literal is never None, so the old `streams is None`
        # check could not fire; test for emptiness instead.
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        # BUGFIX: initialize stream_ so the final check cannot raise NameError
        # when neither quality matches.
        stream_ = None
        for s in streams: #find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
                stream_ = s
                break
        for s in streams: #find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
                stream_ = s
                break
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        # The stream URL points at an ASX-style page containing the real link.
        media_link = self._download_webpage(stream_['video_url'], video_id, 'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # Prefer an mms:// URL, fall back to rtsp://.
        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
            if mobj is None:
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        return [{'id': video_id,
                 'url': mms_url,
                 'title': title,
                 'ext': ext
                 }]
class TumblrIE(InfoExtractor):
    """Information Extractor for Tumblr video posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL regardless of /post/ vs /video/.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is JS-escaped in the page source (\x22 == '"').
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            # Not every post contains a video; report and yield nothing.
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
class BandcampIE(InfoExtractor):
    """Information Extractor for free Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs founded')

        download_link = m_download.group(1)
        # Renamed from `id`: avoid shadowing the builtin.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        #We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        #in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title' : info[u'title'],
                      'ext' : 'mp3',
                      'url' : final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Direct <source> tag carries the media URL.
        mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = mobj.group(1)

        mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   video_extension,
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Metadata (including the mp4 URL) lives in an MRSS feed per video.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = mobj.group(1)

        mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   video_extension,
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Normalize to the canonical page URL.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        mobj = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = mobj.group(1)

        # og:title may be quoted with either " or '.
        mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1) or mobj.group(2)

        # Description is optional: warn instead of failing.
        mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage)
        if mobj is None:
            self._downloader.report_warning(u'unable to extract description')
            video_description = None
        else:
            video_description = mobj.group(1) or mobj.group(2)

        mobj = re.search(r'<meta content=\'(.+?)\' property=\'og:image\'', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # The twitter:player:stream meta tag holds the direct media URL.
        mobj = re.search(r'<meta property="twitter:player:stream" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = mobj.group(1)

        mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # Strip any query string from the thumbnail URL via the second group.
        mobj = re.search(r'<meta property="og:image" content="(.+?)(\?.*?)?"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = mobj.group(1)

        mobj = re.search(r'<div class="user">.*?<h2>(.+?)</h2>', webpage, re.DOTALL)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader')
        uploader = mobj.group(1)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-photo "secret" is required by the video API endpoints below.
        mobj = re.search(r"photo_secret: '(\w+)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video secret')
        secret = mobj.group(1)

        # First request: resolve the internal node id for this video.
        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        mobj = re.search(r'<Item id="id">(\d+-\d+)</Item>', first_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract node_id')
        node_id = mobj.group(1)

        # Second request: the playlist XML with the actual stream location.
        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        mobj = re.search(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1) or mobj.group(2)

        # Description is optional: warn instead of failing.
        mobj = re.search(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if mobj is None:
            self._downloader.report_warning(u'unable to extract description')
            video_description = None
        else:
            video_description = mobj.group(1) or mobj.group(2)

        mobj = re.search(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = mobj.group(1) or mobj.group(2)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id lives in the <article> tag; guard added so a layout
        # change raises a clear ExtractorError instead of an AttributeError.
        mobj = re.search(r'<article class="video" data-id="(\d+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video id')
        video_id = mobj.group(1)

        self.report_extraction(video_id)

        mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        mobj = re.search(r'<meta property="og:image" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = mobj.group(1)

        mobj = re.search(r'<meta property="og:description" content="(.*?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract description')
        description = mobj.group(1)

        # The actual media URL comes from a per-video XML document.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
        mobj = re.search(r'<file type="high".*?>(.*?)</file>', data)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Canonical page URL for this movie id.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server prefix: 'file' is already a full (percent-encoded) URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        # Description is optional; default to the empty string.
        mobj = re.search(r'<span>Description: </span>(?P<description>[^<]+)', webpage)
        if mobj is None:
            video_description = u''
        else:
            video_description = unescapeHTML(mobj.group('description'))

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract upload date')
        # YYYYMMDD, as required by the upload_date contract.
        video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')

        mobj = re.search(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)', webpage)
        if mobj is None:
            video_uploader_id = u'anonymous'
        else:
            video_uploader_id = mobj.group('uploader_id')

        mobj = re.search(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail URL')
        video_thumbnail = mobj.group('thumbnail')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         video_extension,
            'title':       video_title,
            'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail':   video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # A timestamped request; the Set-Cookie from this response is needed
        # for the /serve/source call below.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)
        mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE|re.DOTALL)
        if mobj is None:
            raise ExtractorError(u'Unable to extrack tracks')
        html_tracks = mobj.group(1).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':     track_id,
            'url':    final_url,
            'ext':    'mp3',
            'title':  title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page issues a JS redirect; follow it manually.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        redirect_url = urlh.geturl() + re.search(r'window\.location = \'(.*)\';', redirect_page).group(1)
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = re.search(r'<title>(.*)</title>', webpage)
        title = (title.group(1)).split('/')[0].strip()

        # The media URL comes from a form-encoded POST to the player backend.
        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response looks like "key1=<url>&key2=<thumb>" — split on '&' and '='.
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE(review): this excerpt shows only three of the instantiated
    # extractors; the full ordered list is elided here — do not treat this
    # as the complete registry.
    YoutubePlaylistIE(),
    StanfordOpenClassroomIE(),
    WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention, so the
    # class can be resolved from this module's namespace by name.
    class_name = '%sIE' % ie_name
    return globals()[class_name]