2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
# Base class for all site-specific information extractors below.
# NOTE(review): this listing is excerpted — the class docstring below is missing
# several lines (and its closing quotes are not visible here).
23 class InfoExtractor(object):
24 """Information Extractor class.
26 Information extractors are the classes that, given a URL, extract
27 information about the video (or videos) the URL refers to. This
28 information includes the real video URL, the video title, author and
29 others. The information is stored in a dictionary which is then
30 passed to the FileDownloader. The FileDownloader processes this
31 information possibly downloading the video to the file system, among
32 other possible outcomes.
34 The dictionaries must include the following fields:
38 title: Video title, unescaped.
39 ext: Video filename extension.
41 The following fields are optional:
43 format: The video format, defaults to ext (used for --get-format)
44 thumbnail: Full URL to a video thumbnail image.
45 description: One-line video description.
46 uploader: Full name of the video uploader.
47 upload_date: Video upload date (YYYYMMDD).
48 uploader_id: Nickname or id of the video uploader.
49 location: Physical location of the video.
50 player_url: SWF Player URL (used for rtmpdump).
51 subtitles: The subtitle file contents.
52 urlhandle: [internal] The urlHandle to be used to download the file,
53 like returned by urllib.request.urlopen
55 The fields should all be Unicode strings.
57 Subclasses of this one should re-define the _real_initialize() and
58 _real_extract() methods and define a _VALID_URL regexp.
59 Probably, they should also be added to the list of extractors.
61 _real_extract() must return a *list* of information dictionaries as
64 Finally, the _WORKING attribute should be set to False for broken IEs
65 in order to warn the users and skip the tests.
72 def __init__(self, downloader=None):
73 """Constructor. Receives an optional downloader."""
# NOTE(review): excerpt gap — initialization line(s) between the docstring and
# the set_downloader() call are not visible here.
# The downloader may be None; it can be attached later via set_downloader().
75 self.set_downloader(downloader)
# NOTE(review): takes `cls` — presumably decorated with @classmethod in the
# full file; the decorator line is not visible in this excerpt.
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
93 def extract(self, url):
94 """Extracts URL information and returns it in list of dicts."""
# NOTE(review): excerpt gap — a call between the docstring and the return
# (presumably self.initialize()) is not visible here.
96 return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach the FileDownloader instance this extractor reports through."""
    self._downloader = downloader
102 def _real_initialize(self):
103 """Real initialization process. Redefine in subclasses."""
# NOTE(review): the method body (likely a bare `pass`) is not visible in this
# excerpt.
106 def _real_extract(self, url):
107 """Real extraction process. Redefine in subclasses."""
# NOTE(review): the line below belongs to a different member (an IE_NAME
# property, whose `def` line is not visible here) — it derives the extractor
# name from the class name by dropping the trailing "IE".
112 return type(self).__name__[:-2]
114 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115 """ Returns the response handle """
# NOTE(review): excerpt gap — the branch opening above this line (presumably
# `if note is None:`) is not visible here.
117 self.report_download_webpage(video_id)
118 elif note is not False:
# A custom progress note is printed as "<video_id>: <note>".
119 self.to_screen(u'%s: %s' % (video_id, note))
# NOTE(review): the `try:` matching the `except` below is not visible here.
121 return compat_urllib_request.urlopen(url_or_request)
122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# NOTE(review): the `if errnote is None:` guard above this default is not
# visible in this excerpt.
124 errnote = u'Unable to download webpage'
# Re-raise any network-level failure as an ExtractorError, keeping the traceback.
125 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
127 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
128 """ Returns a tuple (page content as string, URL handle) """
129 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
# Try to pick the charset out of the Content-Type response header.
130 content_type = urlh.headers.get('Content-Type', '')
131 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
133 encoding = m.group(1)
# NOTE(review): the `if m:` / fallback-encoding branch around the line above is
# not visible in this excerpt.
136 webpage_bytes = urlh.read()
137 if self._downloader.params.get('dump_intermediate_pages', False):
139 url = url_or_request.get_full_url()
140 except AttributeError:
# NOTE(review): the `try:` and the plain-string fallback for `url` are not
# visible in this excerpt.
142 self.to_screen(u'Dumping request to ' + url)
# Dump the raw page bytes as base64 so binary responses survive the terminal.
143 dump = base64.b64encode(webpage_bytes).decode('ascii')
144 self._downloader.to_screen(dump)
# 'replace' keeps going on decode errors instead of raising.
145 content = webpage_bytes.decode(encoding, 'replace')
146 return (content, urlh)
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
    """Fetch a webpage and return its decoded content as a string."""
    content, _urlh = self._download_webpage_handle(url_or_request, video_id, note, errnote)
    return content
def to_screen(self, msg):
    """Write msg to the screen, prefixed with this extractor's name in brackets."""
    prefixed = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Announce that information extraction has started for a video or name."""
    message = u'%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Announce that the webpage for video_id is being downloaded."""
    message = u'%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Announce the attempt to confirm the user's age."""
    message = u'Confirming age'
    self.to_screen(message)
168 #Methods for following #608
169 #They set the correct value of the '_type' key
170 def video_result(self, video_info):
171 """Returns a video"""
172 video_info['_type'] = 'video'
# NOTE(review): the trailing `return video_info` is not visible in this excerpt.
174 def url_result(self, url, ie=None):
175 """Returns a url that points to a page that should be processed"""
176 #TODO: ie should be the class used for getting the info
# NOTE(review): the dict literal below is truncated in this excerpt — its
# 'url'/'ie' entries and the return are not visible.
177 video_info = {'_type': 'url',
181 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
182 """Returns a playlist"""
# NOTE(review): the dict literal, the `if` guards around the optional id/title
# assignments, and the return are truncated in this excerpt.
183 video_info = {'_type': 'playlist',
186 video_info['id'] = playlist_id
188 video_info['title'] = playlist_title
# YouTube video extractor (excerpt — parts of the _VALID_URL pattern string and
# of the format tables below are omitted from this listing, so several literals
# appear unclosed here).
192 class YoutubeIE(InfoExtractor):
193 """Information extractor for youtube.com."""
197 (?:https?://)? # http(s):// (optional)
198 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
199 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
200 (?:.*?\#/)? # handle anchor (#/) redirect urls
201 (?: # the various things that can precede the ID:
202 (?:(?:v|embed|e)/) # v/ or embed/ or e/
203 |(?: # or the v= param in all its forms
204 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
205 (?:\?|\#!?) # the params delimiter ? or # or #!
206 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
209 )? # optional -> youtube.com/xxxx is OK
210 )? # all until now is optional -> you can pass the naked ID
211 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
212 (?(1).+)? # if we found the ID, everything can follow
214 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
215 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
216 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
217 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
218 _NETRC_MACHINE = 'youtube'
219 # Listed in order of quality
220 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
221 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
222 _video_extensions = {
228 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
234 _video_dimensions = {
# NOTE(review): takes `cls` — presumably decorated with @classmethod in the
# full file; the decorator line is not visible in this excerpt.
253 def suitable(cls, url):
254 """Receives a URL and returns True if suitable for this IE."""
# Playlist URLs also match _VALID_URL, so defer them to YoutubePlaylistIE.
255 if YoutubePlaylistIE.suitable(url): return False
256 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def report_lang(self):
    """Announce the attempt to force the site language."""
    message = u'Setting language'
    self.to_screen(message)
def report_login(self):
    """Announce the attempt to log in."""
    message = u'Logging in'
    self.to_screen(message)
def report_video_webpage_download(self, video_id):
    """Announce that the video's watch page is being downloaded."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Announce that the get_video_info page is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_video_subtitles_download(self, video_id):
    """Report that the list of available subtitles is being checked.

    (Docstring fixed: it was copy-pasted from the video-info method and
    wrongly said "download video info webpage".)
    """
    self.to_screen(u'%s: Checking available subtitles' % video_id)
def report_video_subtitles_request(self, video_id, sub_lang, format):
    """Report the download of the subtitle track sub_lang in the given format.

    (Docstring fixed: it was copy-pasted from the video-info method and
    wrongly said "download video info webpage".)
    """
    self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
def report_video_subtitles_available(self, video_id, sub_lang_list):
    """Print the subtitle languages available for the given video."""
    langs = ",".join(list(sub_lang_list.keys()))
    self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, langs))
def report_information_extraction(self, video_id):
    """Announce that video information extraction has started."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available for this video.

    (Docstring fixed: it wrongly said "Report extracted video URL.",
    a copy-paste error — the message printed is about an unavailable format.)
    """
    self.to_screen(u'%s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
    """Indicate that the download will use the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
299 def _get_available_subtitles(self, video_id):
300 self.report_video_subtitles_download(video_id)
301 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
# NOTE(review): the `try:` matching the `except` below is not visible here.
303 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
304 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# On network failure, return an (error_message, None) tuple instead of raising.
305 return (u'unable to download video subtitles: %s' % compat_str(err), None)
# Parse the name/lang_code attribute pairs into a {lang_code: name} dict.
306 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
307 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
308 if not sub_lang_list:
309 return (u'video doesn\'t have subtitles', None)
# NOTE(review): the success-path return (presumably the language dict) is not
# visible in this excerpt.
def _list_available_subtitles(self, video_id):
    """Look up which subtitle languages exist for video_id and print them."""
    available = self._get_available_subtitles(video_id)
    self.report_video_subtitles_available(video_id, available)
316 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
# NOTE(review): the docstring around the line below is truncated in this
# excerpt; the method returns a (error_message, sub_lang, sub) tuple.
319 (error_message, sub_lang, sub)
321 self.report_video_subtitles_request(video_id, sub_lang, format)
# NOTE(review): the urlencode dict entries between the two lines below are not
# visible in this excerpt.
322 params = compat_urllib_parse.urlencode({
328 url = 'http://www.youtube.com/api/timedtext?' + params
# NOTE(review): the `try:` matching the `except` below is not visible here.
330 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
331 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
332 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
# NOTE(review): the emptiness check guarding the return below (`if not sub:`)
# is not visible in this excerpt.
334 return (u'Did not fetch video subtitles', None, None)
# Success: (no error, language code, subtitle contents).
335 return (None, sub_lang, sub)
337 def _extract_subtitle(self, video_id):
# NOTE(review): docstring opening/closing quotes are not visible in this
# excerpt; the method returns a single-element list of tuples.
339 Return a list with a tuple:
340 [(error_message, sub_lang, sub)]
342 sub_lang_list = self._get_available_subtitles(video_id)
343 sub_format = self._downloader.params.get('subtitlesformat')
344 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
345 return [(sub_lang_list[0], None, None)]
# Language preference: user-requested, then English, then the first available.
346 if self._downloader.params.get('subtitleslang', False):
347 sub_lang = self._downloader.params.get('subtitleslang')
348 elif 'en' in sub_lang_list:
# NOTE(review): the `sub_lang = 'en'` branch and the trailing `else:` are not
# visible in this excerpt.
351 sub_lang = list(sub_lang_list.keys())[0]
352 if not sub_lang in sub_lang_list:
353 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
355 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
# NOTE(review): the trailing `return [subtitle]` is not visible in this excerpt.
358 def _extract_all_subtitles(self, video_id):
359 sub_lang_list = self._get_available_subtitles(video_id)
360 sub_format = self._downloader.params.get('subtitlesformat')
361 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
362 return [(sub_lang_list[0], None, None)]
# NOTE(review): the `subtitles = []` initialization and the trailing
# `return subtitles` are not visible in this excerpt.
364 for sub_lang in sub_lang_list:
365 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
366 subtitles.append(subtitle)
369 def _print_formats(self, formats):
370 print('Available formats:')
# NOTE(review): the `for x in formats:` loop header above the line below is
# not visible in this excerpt.
372 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
374 def _real_initialize(self):
# Sets the site language, logs in (explicit credentials or .netrc), then
# confirms age. NOTE(review): this excerpt omits many guard/`try:` lines.
375 if self._downloader is None:
380 downloader_params = self._downloader.params
382 # Attempt to use provided username and password or .netrc data
383 if downloader_params.get('username', None) is not None:
384 username = downloader_params['username']
385 password = downloader_params['password']
386 elif downloader_params.get('usenetrc', False):
388 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
393 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
394 except (IOError, netrc.NetrcParseError) as err:
# .netrc problems are a warning, not fatal — extraction can proceed anonymously.
395 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# Force the interface language to English so the scraping regexps match.
399 request = compat_urllib_request.Request(self._LANG_URL)
402 compat_urllib_request.urlopen(request).read()
403 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
404 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
407 # No authentication to be performed
# Fetch the login page to obtain the GALX/dsh form tokens.
411 request = compat_urllib_request.Request(self._LOGIN_URL)
413 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
420 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
422 galx = match.group(1)
424 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
# NOTE(review): most of the login form dict (login_form_strs) is omitted from
# this excerpt; only some of its entries are visible below.
430 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
434 u'PersistentCookie': u'yes',
436 u'bgresponse': u'js_disabled',
437 u'checkConnection': u'',
438 u'checkedDomains': u'youtube',
444 u'signIn': u'Sign in',
446 u'service': u'youtube',
450 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
452 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
453 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
454 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
457 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# If the login form is still present in the response, authentication failed.
458 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
459 self._downloader.report_warning(u'unable to log in: bad username or password')
461 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
462 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Confirm age to access age-restricted videos (form dict partly omitted here).
468 'action_confirm': 'Confirm',
470 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
472 self.report_age_confirmation()
473 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
474 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
475 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
477 def _extract_id(self, url):
478 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
# NOTE(review): the `if mobj is None:` guard above this raise is not visible
# in this excerpt.
480 raise ExtractorError(u'Invalid URL: %s' % url)
# Group 2 of _VALID_URL captures the video ID ([0-9A-Za-z_-]+).
481 video_id = mobj.group(2)
# NOTE(review): the trailing `return video_id` is not visible in this excerpt.
484 def _real_extract(self, url):
# Extracts the info dict(s) for a YouTube watch URL.
# NOTE(review): this excerpt omits many guard/`try:`/`else:` lines throughout.
485 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
486 mobj = re.search(self._NEXT_URL_RE, url)
488 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
489 video_id = self._extract_id(url)
492 self.report_video_webpage_download(video_id)
493 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
494 request = compat_urllib_request.Request(url)
496 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
497 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
498 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
500 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
502 # Attempt to extract SWF player URL
503 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Un-escape the JSON-escaped slashes in the SWF URL.
505 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Query get_video_info, trying several 'el' parameters until a token appears.
510 self.report_video_info_webpage_download(video_id)
511 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
512 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
513 % (video_id, el_type))
514 video_info_webpage = self._download_webpage(video_info_url, video_id,
516 errnote='unable to download video info webpage')
517 video_info = compat_parse_qs(video_info_webpage)
518 if 'token' in video_info:
520 if 'token' not in video_info:
521 if 'reason' in video_info:
522 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
524 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
526 # Check for "rental" videos
527 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
528 raise ExtractorError(u'"rental" videos not supported')
530 # Start extracting information
531 self.report_information_extraction(video_id)
534 if 'author' not in video_info:
535 raise ExtractorError(u'Unable to extract uploader name')
536 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
539 video_uploader_id = None
540 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
542 video_uploader_id = mobj.group(1)
544 self._downloader.report_warning(u'unable to extract uploader nickname')
547 if 'title' not in video_info:
548 raise ExtractorError(u'Unable to extract video title')
549 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
552 if 'thumbnail_url' not in video_info:
553 self._downloader.report_warning(u'unable to extract video thumbnail')
555 else: # don't panic if we can't find it
556 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# Upload date is scraped from the watch page and normalized to YYYYMMDD.
560 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
562 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
563 upload_date = unified_strdate(upload_date)
566 video_description = get_element_by_id("eow-description", video_webpage)
567 if video_description:
568 video_description = clean_html(video_description)
# Fall back to the meta description tag, then to an empty string.
570 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
572 video_description = unescapeHTML(fd_mobj.group(1))
574 video_description = u''
577 video_subtitles = None
579 if self._downloader.params.get('writesubtitles', False):
580 video_subtitles = self._extract_subtitle(video_id)
582 (sub_error, sub_lang, sub) = video_subtitles[0]
584 self._downloader.report_error(sub_error)
586 if self._downloader.params.get('allsubtitles', False):
587 video_subtitles = self._extract_all_subtitles(video_id)
588 for video_subtitle in video_subtitles:
589 (sub_error, sub_lang, sub) = video_subtitle
591 self._downloader.report_error(sub_error)
593 if self._downloader.params.get('listsubtitles', False):
594 sub_lang_list = self._list_available_subtitles(video_id)
597 if 'length_seconds' not in video_info:
598 self._downloader.report_warning(u'unable to extract video duration')
601 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
604 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
606 # Decide which formats to download
607 req_format = self._downloader.params.get('format', None)
609 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
610 self.report_rtmp_download()
611 video_url_list = [(None, video_info['conn'][0])]
612 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
# Build an itag -> direct URL map from the comma-separated stream map.
614 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
615 url_data = compat_parse_qs(url_data_str)
616 if 'itag' in url_data and 'url' in url_data:
617 url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
618 if not 'ratebypass' in url: url += '&ratebypass=yes'
619 url_map[url_data['itag'][0]] = url
621 format_limit = self._downloader.params.get('format_limit', None)
622 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
623 if format_limit is not None and format_limit in available_formats:
624 format_list = available_formats[available_formats.index(format_limit):]
626 format_list = available_formats
627 existing_formats = [x for x in format_list if x in url_map]
628 if len(existing_formats) == 0:
629 raise ExtractorError(u'no known formats available for video')
630 if self._downloader.params.get('listformats', None):
631 self._print_formats(existing_formats)
633 if req_format is None or req_format == 'best':
634 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
635 elif req_format == 'worst':
636 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
637 elif req_format in ('-1', 'all'):
638 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
640 # Specific formats. We pick the first in a slash-delimeted sequence.
641 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
642 req_formats = req_format.split('/')
643 video_url_list = None
644 for rf in req_formats:
646 video_url_list = [(rf, url_map[rf])]
648 if video_url_list is None:
649 raise ExtractorError(u'requested format not available')
651 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
# Build one info dict per selected (format, url) pair.
# NOTE(review): the results-list initialization, the dict opening, and the
# append/return at the end are not visible in this excerpt.
654 for format_param, video_real_url in video_url_list:
656 video_extension = self._video_extensions.get(format_param, 'flv')
658 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
659 self._video_dimensions.get(format_param, '???'))
663 'url': video_real_url,
664 'uploader': video_uploader,
665 'uploader_id': video_uploader_id,
666 'upload_date': upload_date,
667 'title': video_title,
668 'ext': video_extension,
669 'format': video_format,
670 'thumbnail': video_thumbnail,
671 'description': video_description,
672 'player_url': player_url,
673 'subtitles': video_subtitles,
674 'duration': video_duration
# Metacafe extractor (excerpt).
679 class MetacafeIE(InfoExtractor):
680 """Information Extractor for metacafe.com."""
682 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
683 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
684 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
685 IE_NAME = u'metacafe'
def report_disclaimer(self):
    """Announce retrieval of the family-filter disclaimer page."""
    message = u'Retrieving disclaimer'
    self.to_screen(message)
691 def _real_initialize(self):
692 # Retrieve disclaimer
693 request = compat_urllib_request.Request(self._DISCLAIMER)
# NOTE(review): the `try:` lines matching the two `except` clauses below are
# not visible in this excerpt.
695 self.report_disclaimer()
696 disclaimer = compat_urllib_request.urlopen(request).read()
697 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
698 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
# Confirm the family-filter disclaimer so filtered videos are reachable.
# NOTE(review): the disclaimer_form dict opening is not visible here.
703 'submit': "Continue - I'm over 18",
705 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
707 self.report_age_confirmation()
708 disclaimer = compat_urllib_request.urlopen(request).read()
709 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
710 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
712 def _real_extract(self, url):
# NOTE(review): this excerpt omits several `if mobj is None:` guards — the
# bare raises below belong to those guards.
713 # Extract id and simplified title from URL
714 mobj = re.match(self._VALID_URL, url)
716 raise ExtractorError(u'Invalid URL: %s' % url)
718 video_id = mobj.group(1)
720 # Check if video comes from YouTube
721 mobj2 = re.match(r'^yt-(.*)$', video_id)
722 if mobj2 is not None:
# Delegate "yt-" prefixed IDs to the YouTube extractor via a url result.
723 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
725 # Retrieve video webpage to extract further information
726 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
728 # Extract URL, uploader and title from webpage
729 self.report_extraction(video_id)
730 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
732 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
733 video_extension = mediaURL[-3:]
735 # Extract gdaKey if available
736 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
740 gdaKey = mobj.group(1)
741 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars blob for mediaURL/key.
743 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
745 raise ExtractorError(u'Unable to extract media URL')
746 vardict = compat_parse_qs(mobj.group(1))
747 if 'mediaData' not in vardict:
748 raise ExtractorError(u'Unable to extract media URL')
749 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
751 raise ExtractorError(u'Unable to extract media URL')
752 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
753 video_extension = mediaURL[-3:]
754 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
756 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
758 raise ExtractorError(u'Unable to extract title')
759 video_title = mobj.group(1).decode('utf-8')
761 mobj = re.search(r'submitter=(.*?);', webpage)
763 raise ExtractorError(u'Unable to extract uploader nickname')
764 video_uploader = mobj.group(1)
# NOTE(review): the returned list/dict opening and closing are not visible in
# this excerpt; only the dict entries below remain.
767 'id': video_id.decode('utf-8'),
768 'url': video_url.decode('utf-8'),
769 'uploader': video_uploader.decode('utf-8'),
771 'title': video_title,
772 'ext': video_extension.decode('utf-8'),
# Dailymotion extractor (excerpt).
775 class DailymotionIE(InfoExtractor):
776 """Information Extractor for Dailymotion"""
778 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
779 IE_NAME = u'dailymotion'
781 def _real_extract(self, url):
# NOTE(review): this excerpt omits several guard lines (`if mobj is None:`
# etc.); the bare raises below belong to those guards.
782 # Extract id and simplified title from URL
783 mobj = re.match(self._VALID_URL, url)
785 raise ExtractorError(u'Invalid URL: %s' % url)
787 video_id = mobj.group(1).split('_')[0].split('?')[0]
789 video_extension = 'mp4'
791 # Retrieve video webpage to extract further information
792 request = compat_urllib_request.Request(url)
# Disable the family filter so restricted videos are returned.
793 request.add_header('Cookie', 'family_filter=off')
794 webpage = self._download_webpage(request, video_id)
796 # Extract URL, uploader and title from webpage
797 self.report_extraction(video_id)
798 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
800 raise ExtractorError(u'Unable to extract media URL')
801 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Pick the best available quality key, from 1080p down to the plain video URL.
803 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
806 self.to_screen(u'Using %s' % key)
809 raise ExtractorError(u'Unable to extract video URL')
811 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
813 raise ExtractorError(u'Unable to extract video URL')
815 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
817 # TODO: support choosing qualities
819 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
821 raise ExtractorError(u'Unable to extract title')
822 video_title = unescapeHTML(mobj.group('title'))
824 video_uploader = None
825 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
827 # Fall back to looking for an official user account.
828 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
829 if mobj_official is None:
830 self._downloader.report_warning(u'unable to extract uploader nickname')
832 video_uploader = mobj_official.group(1)
834 video_uploader = mobj.group(1)
836 video_upload_date = None
837 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
# Reassemble DD-MM-YYYY into the YYYYMMDD format used by upload_date.
839 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
# NOTE(review): the returned list/dict opening and closing are not visible in
# this excerpt; only the dict entries below remain.
844 'uploader': video_uploader,
845 'upload_date': video_upload_date,
846 'title': video_title,
847 'ext': video_extension,
# Photobucket extractor (excerpt).
851 class PhotobucketIE(InfoExtractor):
852 """Information extractor for photobucket.com."""
854 # TODO: the original _VALID_URL was:
855 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
856 # Check if it's necessary to keep the old extraction process
857 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
858 IE_NAME = u'photobucket'
860 def _real_extract(self, url):
# NOTE(review): this excerpt omits several guard and literal-delimiter lines;
# the bare raises below belong to omitted `if mobj is None:` guards.
861 # Extract id from URL
862 mobj = re.match(self._VALID_URL, url)
864 raise ExtractorError(u'Invalid URL: %s' % url)
866 video_id = mobj.group('id')
868 video_extension = mobj.group('ext')
870 # Retrieve video webpage to extract further information
871 webpage = self._download_webpage(url, video_id)
873 # Extract URL, uploader, and title from webpage
874 self.report_extraction(video_id)
875 # We try first by looking the javascript code:
876 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
878 info = json.loads(mobj.group('json'))
# NOTE(review): the returned dict opening/closing around the entries below is
# not visible in this excerpt.
881 'url': info[u'downloadUrl'],
882 'uploader': info[u'username'],
883 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
884 'title': info[u'title'],
885 'ext': video_extension,
886 'thumbnail': info[u'thumbUrl'],
889 # We try looking in other parts of the webpage
890 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
892 raise ExtractorError(u'Unable to extract media URL')
893 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
897 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
899 raise ExtractorError(u'Unable to extract title')
900 video_title = mobj.group(1).decode('utf-8')
902 video_uploader = mobj.group(2).decode('utf-8')
# NOTE(review): the fallback return's list/dict delimiters are also omitted.
905 'id': video_id.decode('utf-8'),
906 'url': video_url.decode('utf-8'),
907 'uploader': video_uploader,
909 'title': video_title,
910 'ext': video_extension.decode('utf-8'),
# Yahoo Screen extractor (excerpt).
914 class YahooIE(InfoExtractor):
915 """Information extractor for screen.yahoo.com."""
916 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
918 def _real_extract(self, url):
# NOTE(review): this excerpt omits several guard lines and literal delimiters;
# the bare raises below belong to omitted `if ... is None:` guards.
919 mobj = re.match(self._VALID_URL, url)
921 raise ExtractorError(u'Invalid URL: %s' % url)
922 video_id = mobj.group('id')
923 webpage = self._download_webpage(url, video_id)
# Look for an alternative content ID embedded in the page's JS.
924 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
927 # TODO: Check which url parameters are required
928 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
929 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
930 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
931 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
932 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
933 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
935 self.report_extraction(video_id)
936 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
938 raise ExtractorError(u'Unable to extract video info')
939 video_title = m_info.group('title')
940 video_description = m_info.group('description')
941 video_thumb = m_info.group('thumb')
942 video_date = m_info.group('date')
# Normalize MM/DD/YYYY to the YYYYMMDD upload_date convention.
943 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
945 # TODO: Find a way to get mp4 videos
946 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
947 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
948 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
949 video_url = m_rest.group('url')
950 video_path = m_rest.group('path')
952 raise ExtractorError(u'Unable to extract video url')
954 else: # We have to use a different method if another id is defined
955 long_id = m_id.group('new_id')
956 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
957 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
# Strip the JSONP callback wrapper before parsing the JSON payload.
958 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
959 info = json.loads(json_str)
960 res = info[u'query'][u'results'][u'mediaObj'][0]
961 stream = res[u'streams'][0]
962 video_path = stream[u'path']
963 video_url = stream[u'host']
# NOTE(review): the assignment of `meta` (presumably from `res`) is not
# visible in this excerpt.
965 video_title = meta[u'title']
966 video_description = meta[u'description']
967 video_thumb = meta[u'thumbnail']
968 video_date = None # I can't find it
# NOTE(review): the returned info dict's opening/closing and remaining entries
# are not visible in this excerpt.
973 'play_path': video_path,
975 'description': video_description,
976 'thumbnail': video_thumb,
977 'upload_date': video_date,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Downloads the video page, pulls the embedded `config` JSON out of the
    page source, and reads title/uploader/thumbnail/date plus the
    signature+timestamp pair needed to build the play_redirect URL.
    """
    # NOTE(review): gaps in the original line numbering show that some lines
    # (guard clauses, try/except headers, the final `return` scaffolding)
    # are elided from this listing; they are flagged inline below.

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # (elided guard — presumably `if mobj is None:`; verify against the
        # full source)
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Normalize the URL: force https, and map the player redirect form
        # back to the canonical video page.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        # (elided: originally wrapped in a try/except — line numbers jump)
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        # Distinguish an embed restriction from a generally unparsable page,
        # so the user gets an actionable hint.
        if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
            raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
        raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (YYYYMMDD, per the upload_date convention
        # documented on InfoExtractor)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # (elided `else:` branch header — line numbers jump) fall back
                # to the first quality listed for this codec
                files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first populated bucket, preferring hd > sd > other.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                # (elided: presumably a `break` plus an `else:` before the
                # raise — TODO confirm against the full source)
        raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # (elided `return [{ 'id': ..., 'url': ... }]` header — the remaining
        # info-dict entries follow the InfoExtractor field conventions)
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles both the live-stream pages (URL ending in index-NN.html) and
    regular "Plus 7" catch-up videos, by chaining regex scrapes across
    several intermediate pages.
    """
    # NOTE(review): gaps in the original line numbering show elided lines
    # (try headers, call arguments, returns); flagged inline below.

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live pages are recognized by their index-NN.html basename.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download `url` and return the raw page body, mapping network
        errors to ExtractorError."""
        request = compat_urllib_request.Request(url)
        # (elided `try:` header — line numbers jump)
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # (elided: `return webpage`)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch `url`, apply `regex` with `regexFlags`, and return a dict
        built from `matchTuples` — (group index, key, error message) triples.
        Raises ExtractorError with the triple's message when a group is
        missing."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # (elided: `info = {}` initialization — line numbers jump)
        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            # (elided `else:` header)
            info[key] = mobj.group(i)
        # (elided: `return info`)

    def extractLiveStream(self, url):
        # Language code is a fixed path component of live URLs.
        video_lang = url.split('/')[-4]
        # Step 1: locate the videothek JS file referenced by the page.
        info = self.grep_webpage(
            # (elided leading call arguments — line numbers jump)
            r'src="(.*?/videothek_js.*?\.js)',
            (1, 'url', u'Invalid URL: %s' % url)
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Step 2: from the JS, pull the rtmp path, player SWF and stream url.
        info = self.grep_webpage(
            # (elided leading call arguments — line numbers jump)
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            (1, 'path', u'could not extract video path: %s' % url),
            (2, 'player', u'could not extract video player: %s' % url),
            (3, 'url', u'could not extract video url: %s' % url)
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # (elided: remainder of the method, including its return)

    def extractPlus7Stream(self, url):
        # Language code sits at a different path depth for Plus 7 URLs.
        video_lang = url.split('/')[-3]
        # Step 1: the page embeds a videorefFileUrl pointing at an XML ref.
        info = self.grep_webpage(
            # (elided leading call arguments — line numbers jump)
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'Invalid URL: %s' % url)
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: pick the <video> ref for the requested language.
        info = self.grep_webpage(
            # (elided leading call arguments)
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'Could not find <video> tag: %s' % url)
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: final XML carries id, title, date and the HD url.
        info = self.grep_webpage(
            # (elided leading call arguments)
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            (1, 'id', u'could not extract video id: %s' % url),
            (2, 'title', u'could not extract video title: %s' % url),
            (3, 'date', u'could not extract video date: %s' % url),
            (4, 'url', u'could not extract video url: %s' % url)
        )

        # (elided `return {` header — info-dict entries follow)
        'id': info.get('id'),
        'url': compat_urllib_parse.unquote(info.get('url')),
        'uploader': u'arte.tv',
        'upload_date': unified_strdate(info.get('date')),
        'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Dispatch on URL shape: live pages vs Plus 7 catch-up videos.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            # (elided: presumably a `return` here — TODO confirm)
        # (elided `else:` header)
        info = self.extractPlus7Stream(url)
        # (elided: `return [info]`)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Used when no site-specific IE matched: first follows URL-shortener
    redirects (via HEAD requests), then scrapes the page for common
    embedded-player patterns (JW Player / SWFObject style `file=` URLs).
    """
    # NOTE(review): line-number gaps show elided lines throughout this
    # listing; they are flagged inline below.

    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn outside of test runs, since the generic IE is a heuristic
        # fallback and may produce wrong results.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that issues HEAD so only headers are fetched.
            def get_method(self):
                # (elided body — presumably `return "HEAD"`; TODO confirm)

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unencoded spaces in Location headers.
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    # (elided keyword arguments inside this call — line
                    # numbers jump)
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                # (elided `else:` header)
                raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # (elided lines at the top of this handler — line numbers jump)
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # (elided keyword arguments inside this call)
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      origin_req_host=req.get_origin_req_host(),

        # Build a minimal opener chain with HEAD-aware redirect handling
        # and a GET fallback for servers rejecting HEAD.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # (elided: comparison of new_url against url / early return)
        self.report_following_redirect(new_url)
        # (elided: `return new_url`)

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        # (elided `try:` header — line numbers jump)
        webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # (elided `if mobj is None:` guards between the fallback searches —
        # line numbers jump)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit: JWPlayer JS loader
        mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        # (elided `return [{ 'id': ..., 'url': ... }]` header — info-dict
        # entries follow)
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension,
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Accepts pseudo-URLs of the form ytsearch[N|all]:<terms> and queries
    the GData API, 50 results per page, up to _max_youtube_results.
    """
    # NOTE(review): line-number gaps show elided lines (guards, try
    # headers, loop setup); flagged inline below.

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Invalid search query "%s"' % query)

        # Split the "ytsearchN" prefix from the search terms.
        prefix, query = query.split(':')
        # (elided lines — line numbers jump)
        query = query.encode('utf-8')
        # (elided: the `if prefix == '':` branch header)
        return self._get_n_results(query, 1)
        elif prefix == 'all':
            self._get_n_results(query, self._max_youtube_results)
        # (elided `else:`/`try:` headers plus `n = int(prefix)` and the
        # `if n <= 0:` guard — line numbers jump)
        raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            # Clamp to the API's practical maximum rather than failing.
            self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        return self._get_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # (elided initialization of video_ids/pagenum/limit — line numbers
        # jump)
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            # GData start-index is 1-based, hence the +1.
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            # (elided `try:` header)
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Cap the loop at whatever the API reports as the total.
            limit = min(n, api_response['totalItems'])
            # (elided: pagenum increment — line numbers jump)

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        # (elided: `return videos`)
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Accepts gvsearch[N|all]:<terms> pseudo-URLs and scrapes the result
    pages, returning a playlist of matched result URLs.
    """
    # NOTE(review): line-number gaps show elided lines (guards, result-dict
    # setup); flagged inline below.

    _VALID_URL = r'gvsearch(?P<prefix>|\d+|all):(?P<query>[\s\S]+)'
    # Presence of the "next page" link tells us to keep paginating.
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # (elided: the `if prefix == '':` branch header — line numbers jump)
        return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._max_google_results)
        # (elided `else:` header, `n = int(prefix)` and the `if n <= 0:`
        # guard — line numbers jump)
        raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            # Clamp instead of failing when the user over-asks.
            self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # (elided `res = {` header — playlist result-dict entries follow)
        '_type': 'playlist',
        # (elided remaining dict entries, e.g. id/entries — line numbers
        # jump)

        for pagenum in itertools.count(1):
            # start is 0-based and advances 10 results per page.
            result_url = u'http://video.google.com/videosearch?q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                # (elided `e = {` header — entry-dict line follows)
                'url': mobj.group(1)
                res['entries'].append(e)

            # Stop once we have enough results or there is no next page.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                # (elided: `return res`)
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Accepts yvsearch[N|all]:<terms> pseudo-URLs; unlike the other search
    IEs, it feeds matched watch URLs straight to the downloader instead of
    returning results.
    """
    # NOTE(review): line-number gaps show elided lines (guards, try
    # headers, loop setup); flagged inline below.

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # (elided lines — line numbers jump)
        query = query.encode('utf-8')
        # (elided: the `if prefix == '':` branch header)
        self._download_n_results(query, 1)
        # (elided: `return` — line numbers jump)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        # (elided `else:`/`try:` headers plus `n = int(prefix)` and the
        # `if n <= 0:` guard — line numbers jump)
        raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            # Clamp to the service maximum rather than failing.
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)
        # (elided: `return`)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            # (elided: `return`)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # (elided: video_ids/pagenum initialization — line numbers jump)
        already_seen = set()

        # (elided loop header — presumably `while True:`; TODO confirm)
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
        request = compat_urllib_request.Request(result_url)
        # (elided `try:` header)
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            # De-duplicate: result pages can repeat the same watch link.
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                    # (elided: `return`)

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # No next page: flush whatever was collected and stop.
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
            # (elided: `return`)

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Matches playlist/album/course/user-upload URLs, pages through the
    GData playlist feed, and returns the entries ordered by playlist
    position.
    """
    # NOTE(review): line-number gaps show elided lines, including interior
    # lines of the verbose _VALID_URL pattern and its closing quotes, loop
    # setup, and several guards; flagged inline below.

    _VALID_URL = r"""(?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    IE_NAME = u'youtube:playlist'

    # (elided: presumably a @classmethod decorator here — TODO confirm)
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE whitespace, so the flag is
        # required here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        # (elided: videos list / page loop header — line numbers jump)
        url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
        page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

        # (elided `try:` header)
        response = json.loads(page)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

        if 'feed' not in response:
            raise ExtractorError(u'Got a malformed response from YouTube API')
        playlist_title = response['feed']['title']['$t']
        if 'entry' not in response['feed']:
            # Number of videos is a multiple of self._MAX_RESULTS
            # (elided: `break` — line numbers jump)

        # Collect (position, watch-url) pairs; skip entries without content
        # (e.g. deleted/private videos).
        videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                    for entry in response['feed']['entry']
                    if 'content' in entry ]

        if len(response['feed']['entry']) < self._MAX_RESULTS:
            # Short page means we reached the end of the playlist.
            # (elided: `break`)

        # Sort by playlist position, then keep only the URLs.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Scrapes the channel's list view; the first page comes as HTML, later
    pages via the JSON channel_ajax endpoint.
    """
    # NOTE(review): line-number gaps show elided lines (loop setup,
    # break statements); flagged inline below.

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker in the page HTML indicating more pages are available.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids found in `page`, first-seen order, de-duplicated."""
        # (elided: `ids_in_page = []` initialization — line numbers jump)
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        # (elided: `return ids_in_page`)

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        # (elided: video_ids/pagenum initialization — line numbers jump)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            # (elided loop header — presumably `while True:`; TODO confirm)
            pagenum = pagenum + 1

            url = self._MORE_PAGES_URL % (pagenum, channel_id)
            page = self._download_webpage(url, channel_id,
                                          u'Downloading page #%s' % pagenum)

            # Ajax responses are JSON with the HTML embedded in
            # 'content_html'.
            page = json.loads(page)

            ids_in_page = self.extract_videos_from_page(page['content_html'])
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                # (elided: `break`)

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through the user's uploads feed via the GData API,
    _GDATA_PAGE_SIZE ids at a time, and returns a playlist of watch URLs.
    """
    # NOTE(review): line-number gaps show elided lines (loop setup, breaks);
    # flagged inline below.

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # (elided remainder of this comment and the video_ids/pagenum
        # initialization plus loop header — line numbers jump)

        # GData start-index is 1-based.
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1

        gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
        page = self._download_webpage(gdata_url, username,
                                      u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

        # Extract video identifiers
        # (elided: `ids_in_page = []` initialization — line numbers jump)
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # (elided remainder of this comment — line numbers jump)

        if len(ids_in_page) < self._GDATA_PAGE_SIZE:
            # (elided: `break` and pagenum increment — line numbers jump)

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the numeric users_id from the user page, then pages through
    the mobile episode-list endpoint to collect video URLs.
    """
    # NOTE(review): line-number gaps show elided lines (loop setup, breaks,
    # _PAGE_SIZE constant); flagged inline below.

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        # The numeric user id is embedded in the page markup; the listing
        # endpoint needs it rather than the username.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # (elided remainder of this comment and the video_ids/pagenum
        # initialization plus loop header — line numbers jump)

        url = page_base + "&page=" + str(pagenum)
        page = self._download_webpage(url, username,
                                      u'Downloading video ids from page %d' % pagenum)

        # Extract video identifiers
        # (elided: `ids_in_page = []` initialization — line numbers jump)
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # (elided remainder of this comment — line numbers jump)

        if len(ids_in_page) < self._PAGE_SIZE:
            # (elided: `break` and pagenum increment — line numbers jump)

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com.

    Rebuilds the URL in the English locale, simulates pressing the
    "Free download" button, then scrapes the real file URL and title.
    """
    # NOTE(review): line-number gaps show elided lines (try headers, guards,
    # the final return scaffolding); flagged inline below.

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # POSTing gateway_result=1 mimics the free-download form submit.
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        # (elided `try:` header — line numbers jump)
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse the multi-line site message into one line.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            # (elided `else:` header)
            raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # (elided `return [{` header — info-dict entries follow)
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in during initialization (credentials from the
    downloader params or ~/.netrc), then extracts the video URL from the
    SWF parameter blob embedded in the video page.
    """
    # NOTE(review): line-number gaps show elided lines (returns, guards,
    # the login_form construction, try headers); flagged inline below.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name used to look up credentials in ~/.netrc.
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Attempt to log in before extraction; failures only warn."""
        if self._downloader is None:
            # (elided: `return` — line numbers jump)

        # (elided: useremail/password initialization — line numbers jump)
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # (elided `try:` header)
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # (elided: unpacking of login/password from `info` — line
                # numbers jump)
            # (elided `else:` header)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                # (elided: `return`)

        # No credentials available: skip login silently.
        if useremail is None:
            # (elided: `return` and the login_form dict construction —
            # line numbers jump)

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # (elided `try:` header)
        self.report_login()
        login_results = compat_urllib_request.urlopen(request).read()
        # A login form in the response means authentication failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
            # (elided: `return`)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            # (elided: `return`)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flashvars JSON sits between these two literal JS fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        # (elided guard — presumably `if m is None:`)
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-encoded JSON carrying the stream descriptors.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream, fall back to SD.
        video_url = video_data.get('hd_src')
        # (elided guard — presumably `if not video_url:`)
        video_url = video_data['sd_src']
        # (elided guard — presumably `if not video_url:`)
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        # (elided guard — presumably `if m is None:`)
        raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        # (elided `info = {` / `return` scaffolding — info-dict entries
        # follow)
        'title': video_title,
        'duration': video_duration,
        'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Resolves /play/ short links via their redirect, then queries the
    site's JSON API (masquerading as iTunes); also handles the case where
    the URL is already a direct media download.
    """
    # NOTE(review): line-number gaps show elided lines (cchar computation,
    # try headers, info-dict scaffolding); flagged inline below.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Short /play/ links redirect to a page whose fragment names the
        # real file id; re-enter extraction with the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # (elided: selection of `cchar` ('?' or '&') — line numbers jump)
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # The API serves different data depending on the user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        # (elided: `info = None` initialization and `try:` header —
        # line numbers jump)
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # (elided `info = {` header — info-dict entries follow)
            'upload_date': None,
            # (elided remaining entries of this dict)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            # (elided `try:` header)
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            # (elided `try:` header)
            json_data = json.loads(json_code)
            # Some responses wrap the payload in a 'Post' envelope.
            if 'Post' in json_data:
                data = json_data['Post']
            # (elided `else:` branch — line numbers jump)

            # datestamp format observed in the API: MM-DD-YY HH:MM(am/pm).
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # (elided guard — presumably `if umobj is None:`)
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # (elided `info = {` header — info-dict entries follow)
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
        # (elided: `return [info]`)
2022 class MyVideoIE(InfoExtractor):
# Extractor for myvideo.de watch pages. Scrapes the page HTML: the media
# URL is derived from the 'image_src' thumbnail link, the title from <title>.
2023 """Information Extractor for myvideo.de."""
2025 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2026 IE_NAME = u'myvideo'
2028 def _real_extract(self,url):
2029 mobj = re.match(self._VALID_URL, url)
2031 raise ExtractorError(u'Invalid URL: %s' % url)
2033 video_id = mobj.group(1)
# Re-fetch via the canonical watch URL (drops the slug/query from the input).
2036 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2037 webpage = self._download_webpage(webpage_url, video_id)
2039 self.report_extraction(video_id)
# The thumbnail's base path doubles as the movie directory for the .flv file.
2040 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2043 raise ExtractorError(u'Unable to extract media URL')
2044 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2046 mobj = re.search('<title>([^<]+)</title>', webpage)
2048 raise ExtractorError(u'Unable to extract title')
2050 video_title = mobj.group(1)
2056 'upload_date': None,
2057 'title': video_title,
2061 class ComedyCentralIE(InfoExtractor):
# Extractor for The Daily Show / The Colbert Report. Accepts ':tds'-style
# shortcuts, full-episode URLs, and clip URLs; resolves content through
# the mtvnservices MRSS feed and a per-media configuration XML, then
# rewrites the chosen RTMP rendition URL to an HTTP mirror.
2062 """Information extractor for The Daily Show and Colbert Report """
2064 # urls can be abbreviations like :thedailyshow or :colbert
2065 # urls for episodes like:
2066 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2067 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2068 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# NOTE: written with re.VERBOSE, so whitespace in the pattern is ignored.
2069 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2070 |(https?://)?(www\.)?
2071 (?P<showname>thedailyshow|colbertnation)\.com/
2072 (full-episodes/(?P<episode>.*)|
2074 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2075 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Known bitrates, lowest preference last-to-first (highest bitrate wins below).
2078 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2080 _video_extensions = {
2088 _video_dimensions = {
# Overridden because _VALID_URL must be matched with re.VERBOSE.
2098 def suitable(cls, url):
2099 """Receives a URL and returns True if suitable for this IE."""
2100 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2102 def _print_formats(self, formats):
2103 print('Available formats:')
2105 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2108 def _real_extract(self, url):
2109 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2111 raise ExtractorError(u'Invalid URL: %s' % url)
# Expand ':tds'/':colbert' style shortcuts into the shows' full-episodes URL.
2113 if mobj.group('shortname'):
2114 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2115 url = u'http://www.thedailyshow.com/full-episodes/'
2117 url = u'http://www.colbertnation.com/full-episodes/'
2118 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2119 assert mobj is not None
2121 if mobj.group('clip'):
2122 if mobj.group('showname') == 'thedailyshow':
2123 epTitle = mobj.group('tdstitle')
2125 epTitle = mobj.group('cntitle')
2128 dlNewest = not mobj.group('episode')
2130 epTitle = mobj.group('showname')
2132 epTitle = mobj.group('episode')
2134 self.report_extraction(epTitle)
# The page may redirect (e.g. to the newest episode); re-match the final URL.
2135 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2137 url = htmlHandle.geturl()
2138 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2140 raise ExtractorError(u'Invalid redirected URL: ' + url)
2141 if mobj.group('episode') == '':
2142 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2143 epTitle = mobj.group('episode')
2145 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2147 if len(mMovieParams) == 0:
2148 # The Colbert Report embeds the information in a without
2149 # a URL prefix; so extract the alternate reference
2150 # and then add the URL prefix manually.
2152 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2153 if len(altMovieParams) == 0:
2154 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2156 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# The MRSS index lists one <item> per episode part.
2158 uri = mMovieParams[0][1]
2159 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2160 indexXml = self._download_webpage(indexUrl, epTitle,
2161 u'Downloading show index',
2162 u'unable to download episode index')
2166 idoc = xml.etree.ElementTree.fromstring(indexXml)
2167 itemEls = idoc.findall('.//item')
2168 for partNum,itemEl in enumerate(itemEls):
# guid format is e.g. '...:<show>.com:<mediaId>' — split on ':' to decompose.
2169 mediaId = itemEl.findall('./guid')[0].text
2170 shortMediaId = mediaId.split(':')[-1]
2171 showId = mediaId.split(':')[-2].replace('.com', '')
2172 officialTitle = itemEl.findall('./title')[0].text
2173 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2175 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2176 compat_urllib_parse.urlencode({'uri': mediaId}))
2177 configXml = self._download_webpage(configUrl, epTitle,
2178 u'Downloading configuration for %s' % shortMediaId)
2180 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, rtmp-url) pairs from each <rendition>.
2182 for rendition in cdoc.findall('.//rendition'):
2183 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2187 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2190 if self._downloader.params.get('listformats', None):
2191 self._print_formats([i[0] for i in turls])
2194 # For now, just pick the highest bitrate
2195 format,rtmp_video_url = turls[-1]
2197 # Get the format arg from the arg stream
2198 req_format = self._downloader.params.get('format', None)
2200 # Select format if we can find one
2203 format, rtmp_video_url = f, v
# Rewrite the RTMP URL's media path onto a known HTTP download mirror.
2206 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2208 raise ExtractorError(u'Cannot transform RTMP url')
2209 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2210 video_url = base + m.group('finalid')
2212 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2217 'upload_date': officialDate,
2222 'description': officialTitle,
2224 results.append(info)
2229 class EscapistIE(InfoExtractor):
# Extractor for escapistmagazine.com. Reads description/thumbnail/player
# from <meta> tags, then fetches the player's config (JS-style JSON) to
# find the playlist entry holding the actual media URL.
2230 """Information extractor for The Escapist """
2232 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2233 IE_NAME = u'escapist'
2235 def _real_extract(self, url):
2236 mobj = re.match(self._VALID_URL, url)
2238 raise ExtractorError(u'Invalid URL: %s' % url)
2239 showName = mobj.group('showname')
2240 videoId = mobj.group('episode')
2242 self.report_extraction(showName)
2243 webPage = self._download_webpage(url, showName)
2245 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2246 description = unescapeHTML(descMatch.group(1))
2247 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2248 imgUrl = unescapeHTML(imgMatch.group(1))
2249 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2250 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The config file URL is passed to the player as a 'config=' query parameter.
2251 configUrlMatch = re.search('config=(.*)$', playerUrl)
2252 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2254 configJSON = self._download_webpage(configUrl, showName,
2255 u'Downloading configuration',
2256 u'unable to download configuration')
2258 # Technically, it's JavaScript, not JSON
# NOTE(review): this blanket quote swap breaks if any value contains an
# apostrophe or a double quote — fragile, but matches the site's output.
2259 configJSON = configJSON.replace("'", '"')
2262 config = json.loads(configJSON)
2263 except (ValueError,) as err:
2264 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2266 playlist = config['playlist']
# The media URL lives in the second playlist entry (index 1).
2267 videoUrl = playlist[1]['url']
2272 'uploader': showName,
2273 'upload_date': None,
2276 'thumbnail': imgUrl,
2277 'description': description,
2278 'player_url': playerUrl,
2283 class CollegeHumorIE(InfoExtractor):
# Extractor for collegehumor.com. Two-step: fetch the moogaloop metadata
# XML for titles/thumbnail and a manifest URL, then parse the (Adobe f4m)
# manifest to assemble the final segment URL.
2284 """Information extractor for collegehumor.com"""
2287 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2288 IE_NAME = u'collegehumor'
2290 def report_manifest(self, video_id):
2291 """Report information extraction."""
2292 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2294 def _real_extract(self, url):
2295 mobj = re.match(self._VALID_URL, url)
2297 raise ExtractorError(u'Invalid URL: %s' % url)
2298 video_id = mobj.group('videoid')
2303 'upload_date': None,
2306 self.report_extraction(video_id)
2307 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2309 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2310 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2311 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2313 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2315 videoNode = mdoc.findall('./video')[0]
2316 info['description'] = videoNode.findall('./description')[0].text
2317 info['title'] = videoNode.findall('./caption')[0].text
2318 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2319 manifest_url = videoNode.findall('./file')[0].text
2321 raise ExtractorError(u'Invalid metadata XML file')
# 'hdcore' query parameter appended to the manifest request (presumably an
# Akamai HDS requirement — unverified).
2323 manifest_url += '?hdcore=2.10.3'
2324 self.report_manifest(video_id)
2326 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2327 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2328 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
# The manifest is f4m XML (note the Adobe f4m namespace on the lookups).
2330 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2332 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2333 node_id = media_node.attrib['url']
2334 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2335 except IndexError as err:
2336 raise ExtractorError(u'Invalid manifest file')
# Build the segment URL from the manifest host, trimmed id, and media node.
2338 url_pr = compat_urllib_parse_urlparse(manifest_url)
2339 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2346 class XVideosIE(InfoExtractor):
# Extractor for xvideos.com. Scrapes the watch page: media URL from the
# 'flv_url' player parameter, title from <title>, thumbnail by URL pattern.
2347 """Information extractor for xvideos.com"""
2349 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2350 IE_NAME = u'xvideos'
2352 def _real_extract(self, url):
2353 mobj = re.match(self._VALID_URL, url)
2355 raise ExtractorError(u'Invalid URL: %s' % url)
2356 video_id = mobj.group(1)
2358 webpage = self._download_webpage(url, video_id)
2360 self.report_extraction(video_id)
# The flv URL is percent-encoded inside the player's parameter string.
2364 mobj = re.search(r'flv_url=(.+?)&', webpage)
2366 raise ExtractorError(u'Unable to extract video url')
2367 video_url = compat_urllib_parse.unquote(mobj.group(1))
2371 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2373 raise ExtractorError(u'Unable to extract video title')
2374 video_title = mobj.group(1)
2377 # Extract video thumbnail
2378 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2380 raise ExtractorError(u'Unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail, not just the filename.
2381 video_thumbnail = mobj.group(0)
2387 'upload_date': None,
2388 'title': video_title,
2390 'thumbnail': video_thumbnail,
2391 'description': None,
2397 class SoundcloudIE(InfoExtractor):
# Extractor for individual soundcloud.com tracks. Resolves the page URL to
# a track id via the public resolve API, then queries the streams endpoint
# for the MP3 URL. The client_id below is hard-coded for this API.
2398 """Information extractor for soundcloud.com
2399 To access the media, the uid of the song and a stream token
2400 must be extracted from the page source and the script must make
2401 a request to media.soundcloud.com/crossdomain.xml. Then
2402 the media can be grabbed by requesting from an url composed
2403 of the stream token and uid
2406 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2407 IE_NAME = u'soundcloud'
2409 def report_resolve(self, video_id):
2410 """Report information extraction."""
2411 self.to_screen(u'%s: Resolving id' % video_id)
2413 def _real_extract(self, url):
2414 mobj = re.match(self._VALID_URL, url)
2416 raise ExtractorError(u'Invalid URL: %s' % url)
2418 # extract uploader (which is in the url)
2419 uploader = mobj.group(1)
2420 # extract simple title (uploader + slug of song title)
2421 slug_title = mobj.group(2)
2422 simple_title = uploader + u'-' + slug_title
2423 full_title = '%s/%s' % (uploader, slug_title)
2425 self.report_resolve(full_title)
# resolve.json maps a public page URL to the track's API metadata (incl. id).
2427 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2428 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2429 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2431 info = json.loads(info_json)
2432 video_id = info['id']
2433 self.report_extraction(full_title)
# The streams endpoint returns direct media URLs keyed by format.
2435 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2436 stream_json = self._download_webpage(streams_url, full_title,
2437 u'Downloading stream definitions',
2438 u'unable to download stream definitions')
2440 streams = json.loads(stream_json)
2441 mediaURL = streams['http_mp3_128_url']
2442 upload_date = unified_strdate(info['created_at'])
2447 'uploader': info['user']['username'],
2448 'upload_date': upload_date,
2449 'title': info['title'],
2451 'description': info['description'],
2454 class SoundcloudSetIE(InfoExtractor):
# Extractor for soundcloud.com sets (playlists). Mirrors SoundcloudIE but
# resolves a '/sets/' URL and iterates over every track in the set.
2455 """Information extractor for soundcloud.com sets
2456 To access the media, the uid of the song and a stream token
2457 must be extracted from the page source and the script must make
2458 a request to media.soundcloud.com/crossdomain.xml. Then
2459 the media can be grabbed by requesting from an url composed
2460 of the stream token and uid
2463 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2464 IE_NAME = u'soundcloud:set'
2466 def report_resolve(self, video_id):
2467 """Report information extraction."""
2468 self.to_screen(u'%s: Resolving id' % video_id)
2470 def _real_extract(self, url):
2471 mobj = re.match(self._VALID_URL, url)
2473 raise ExtractorError(u'Invalid URL: %s' % url)
2475 # extract uploader (which is in the url)
2476 uploader = mobj.group(1)
2477 # extract simple title (uploader + slug of song title)
2478 slug_title = mobj.group(2)
2479 simple_title = uploader + u'-' + slug_title
2480 full_title = '%s/sets/%s' % (uploader, slug_title)
2482 self.report_resolve(full_title)
# Same resolve API (and hard-coded client_id) as the single-track extractor.
2484 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2485 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2486 info_json = self._download_webpage(resolv_url, full_title)
2489 info = json.loads(info_json)
# The resolve response may carry an 'errors' list instead of set metadata.
2490 if 'errors' in info:
2491 for err in info['errors']:
2492 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2495 self.report_extraction(full_title)
# One streams lookup per track in the set.
2496 for track in info['tracks']:
2497 video_id = track['id']
2499 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2500 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2502 self.report_extraction(video_id)
2503 streams = json.loads(stream_json)
2504 mediaURL = streams['http_mp3_128_url']
2509 'uploader': track['user']['username'],
2510 'upload_date': unified_strdate(track['created_at']),
2511 'title': track['title'],
2513 'description': track['description'],
2518 class InfoQIE(InfoExtractor):
# Extractor for infoq.com presentations. The real media id is base64-encoded
# in the page's 'jsclassref' variable; the media is served over RTMPE.
2519 """Information extractor for infoq.com"""
2520 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2522 def _real_extract(self, url):
2523 mobj = re.match(self._VALID_URL, url)
2525 raise ExtractorError(u'Invalid URL: %s' % url)
2527 webpage = self._download_webpage(url, video_id=url)
2528 self.report_extraction(url)
# 'jsclassref' holds the base64 of a percent-encoded media path.
2531 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2533 raise ExtractorError(u'Unable to extract video url')
2534 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2535 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2538 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2540 raise ExtractorError(u'Unable to extract video title')
2541 video_title = mobj.group(1)
2543 # Extract description
2544 video_description = u'No description available.'
2545 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2546 if mobj is not None:
2547 video_description = mobj.group(1)
# Derive the id and extension from the media path's final component.
2549 video_filename = video_url.split('/')[-1]
2550 video_id, extension = video_filename.split('.')
2556 'upload_date': None,
2557 'title': video_title,
2558 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2560 'description': video_description,
2565 class MixcloudIE(InfoExtractor):
# Extractor for mixcloud.com (disabled via _WORKING = False). Fetches the
# legacy /api/1/cloudcast JSON and probes the listed per-format URLs until
# one responds.
2566 """Information extractor for www.mixcloud.com"""
2568 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2569 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2570 IE_NAME = u'mixcloud'
2572 def report_download_json(self, file_id):
2573 """Report JSON download."""
2574 self.to_screen(u'Downloading json')
2576 def get_urls(self, jsonData, fmt, bitrate='best'):
# Returns the URL list for a format; 'best' (or an unknown bitrate) picks
# the highest available bitrate. Formats without bitrate sub-keys map
# directly to a URL list (handled via the TypeError fallback).
2577 """Get urls from 'audio_formats' section in json"""
2580 bitrate_list = jsonData[fmt]
2581 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2582 bitrate = max(bitrate_list) # select highest
2584 url_list = jsonData[fmt][bitrate]
2585 except TypeError: # we have no bitrate info.
2586 url_list = jsonData[fmt]
2589 def check_urls(self, url_list):
# Probes each candidate URL and returns the first one that opens.
2590 """Returns 1st active url from list"""
2591 for url in url_list:
2593 compat_urllib_request.urlopen(url)
2595 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2600 def _print_formats(self, formats):
2601 print('Available formats:')
2602 for fmt in formats.keys():
2603 for b in formats[fmt]:
2605 ext = formats[fmt][b][0]
2606 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2607 except TypeError: # we have no bitrate info
2608 ext = formats[fmt][0]
2609 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2612 def _real_extract(self, url):
2613 mobj = re.match(self._VALID_URL, url)
2615 raise ExtractorError(u'Invalid URL: %s' % url)
2616 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on regex groups is Python-2-only; under
# Python 3 these are already str and decode() would fail — confirm compat.
2617 uploader = mobj.group(1).decode('utf-8')
2618 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2620 # construct API request
2621 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2622 # retrieve .json file with links to files
2623 request = compat_urllib_request.Request(file_url)
2625 self.report_download_json(file_url)
2626 jsonData = compat_urllib_request.urlopen(request).read()
2627 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2628 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2631 json_data = json.loads(jsonData)
2632 player_url = json_data['player_swf_url']
2633 formats = dict(json_data['audio_formats'])
2635 req_format = self._downloader.params.get('format', None)
2638 if self._downloader.params.get('listformats', None):
2639 self._print_formats(formats)
# 'best' (or no request): take the first format whose URLs actually work.
2642 if req_format is None or req_format == 'best':
2643 for format_param in formats.keys():
2644 url_list = self.get_urls(formats, format_param)
2646 file_url = self.check_urls(url_list)
2647 if file_url is not None:
2650 if req_format not in formats:
2651 raise ExtractorError(u'Format is not available')
2653 url_list = self.get_urls(formats, req_format)
2654 file_url = self.check_urls(url_list)
2655 format_param = req_format
2658 'id': file_id.decode('utf-8'),
2659 'url': file_url.decode('utf-8'),
2660 'uploader': uploader.decode('utf-8'),
2661 'upload_date': None,
2662 'title': json_data['name'],
2663 'ext': file_url.split('.')[-1].decode('utf-8'),
2664 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2665 'thumbnail': json_data['thumbnail_url'],
2666 'description': json_data['description'],
2667 'player_url': player_url.decode('utf-8'),
2670 class StanfordOpenClassroomIE(InfoExtractor):
# Extractor for Stanford Open Classroom. Three URL shapes are handled:
# a specific video (course+video), a course page (list of video references),
# and the root page (list of course references). List results are expanded
# recursively via self.extract on each reference URL.
2671 """Information extractor for Stanford's Open ClassRoom"""
2673 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2674 IE_NAME = u'stanfordoc'
2676 def _real_extract(self, url):
2677 mobj = re.match(self._VALID_URL, url)
2679 raise ExtractorError(u'Invalid URL: %s' % url)
2681 if mobj.group('course') and mobj.group('video'): # A specific video
2682 course = mobj.group('course')
2683 video = mobj.group('video')
2685 'id': course + '_' + video,
2687 'upload_date': None,
2690 self.report_extraction(info['id'])
# Per-video metadata XML provides the title and the media file name.
2691 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2692 xmlUrl = baseUrl + video + '.xml'
2694 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2695 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2696 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2697 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2699 info['title'] = mdoc.findall('./title')[0].text
2700 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2702 raise ExtractorError(u'Invalid metadata XML file')
2703 info['ext'] = info['url'].rpartition('.')[2]
2705 elif mobj.group('course'): # A course page
2706 course = mobj.group('course')
2711 'upload_date': None,
2714 coursepage = self._download_webpage(url, info['id'],
2715 note='Downloading course info page',
2716 errnote='Unable to download course info page')
2718 m = re.search('<h1>([^<]+)</h1>', coursepage)
2720 info['title'] = unescapeHTML(m.group(1))
# Fall back to the id when the page has no <h1> title.
2722 info['title'] = info['id']
2724 m = re.search('<description>([^<]+)</description>', coursepage)
2726 info['description'] = unescapeHTML(m.group(1))
# Collect the course's VideoPage links (de-duplicated, order preserved).
2728 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2731 'type': 'reference',
2732 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2736 for entry in info['list']:
2737 assert entry['type'] == 'reference'
2738 results += self.extract(entry['url'])
2742 'id': 'Stanford OpenClassroom',
2745 'upload_date': None,
2748 self.report_download_webpage(info['id'])
2749 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2751 rootpage = compat_urllib_request.urlopen(rootURL).read()
2752 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2753 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2755 info['title'] = info['id']
# Collect every CoursePage link from the root page and recurse into each.
2757 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2760 'type': 'reference',
2761 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2766 for entry in info['list']:
2767 assert entry['type'] == 'reference'
2768 results += self.extract(entry['url'])
2771 class MTVIE(InfoExtractor):
# Extractor for mtv.com video pages. Reads song/performer/uri/content-id
# from <meta> tags, then fetches the mediaGen XML and picks the last
# (highest-quality) rendition.
2772 """Information extractor for MTV.com"""
2774 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2777 def _real_extract(self, url):
2778 mobj = re.match(self._VALID_URL, url)
2780 raise ExtractorError(u'Invalid URL: %s' % url)
# The scheme is optional in _VALID_URL; normalize before downloading.
2781 if not mobj.group('proto'):
2782 url = 'http://' + url
2783 video_id = mobj.group('videoid')
2785 webpage = self._download_webpage(url, video_id)
2787 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2789 raise ExtractorError(u'Unable to extract song name')
# NOTE(review): .decode('iso-8859-1') assumes a bytes group (Python 2 str);
# under Python 3 this would fail on str — confirm compat handling.
2790 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2791 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2793 raise ExtractorError(u'Unable to extract performer')
2794 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2795 video_title = performer + ' - ' + song_name
2797 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message looks truncated — likely meant 'Unable to extract
# mtvn_uri' (runtime string left untouched here).
2799 raise ExtractorError(u'Unable to mtvn_uri')
2800 mtvn_uri = mobj.group(1)
2802 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2804 raise ExtractorError(u'Unable to extract content id')
2805 content_id = mobj.group(1)
2807 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2808 self.report_extraction(video_id)
2809 request = compat_urllib_request.Request(videogen_url)
2811 metadataXml = compat_urllib_request.urlopen(request).read()
2812 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2813 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2815 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2816 renditions = mdoc.findall('.//rendition')
2818 # For now, always pick the highest quality.
2819 rendition = renditions[-1]
# Format string combines container, dimensions, and bitrate attributes.
2822 _,_,ext = rendition.attrib['type'].partition('/')
2823 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2824 video_url = rendition.find('./src').text
2826 raise ExtractorError('Invalid rendition field.')
2831 'uploader': performer,
2832 'upload_date': None,
2833 'title': video_title,
2841 class YoukuIE(InfoExtractor):
# Extractor for v.youku.com. Fetches the getPlayList JSON, descrambles the
# stream file id with a seeded pseudo-random alphabet, then emits one info
# dict per video segment.
2842 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Builds a session id: current time in ms followed by two random numbers.
2845 nowTime = int(time.time() * 1000)
2846 random1 = random.randint(1000,1998)
2847 random2 = random.randint(1000,9999)
2849 return "%d%d%d" %(nowTime,random1,random2)
2851 def _get_file_ID_mix_string(self, seed):
# Deterministically shuffles the source alphabet using a linear
# congruential generator seeded with the server-provided 'seed'.
2853 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2855 for i in range(len(source)):
2856 seed = (seed * 211 + 30031 ) % 65536
2857 index = math.floor(seed / 65536 * len(source) )
2858 mixed.append(source[int(index)])
2859 source.remove(source[int(index)])
2860 #return ''.join(mixed)
2863 def _get_file_id(self, fileId, seed):
# Decodes the '*'-separated index list into characters of the mixed
# alphabet, reconstructing the real file id.
2864 mixed = self._get_file_ID_mix_string(seed)
2865 ids = fileId.split('*')
2869 realId.append(mixed[int(ch)])
2870 return ''.join(realId)
2872 def _real_extract(self, url):
2873 mobj = re.match(self._VALID_URL, url)
2875 raise ExtractorError(u'Invalid URL: %s' % url)
2876 video_id = mobj.group('ID')
2878 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2880 jsondata = self._download_webpage(info_url, video_id)
2882 self.report_extraction(video_id)
2884 config = json.loads(jsondata)
2886 video_title = config['data'][0]['title']
2887 seed = config['data'][0]['seed']
# Pick a stream format: honor --format, else prefer HD when available.
2889 format = self._downloader.params.get('format', None)
2890 supported_format = list(config['data'][0]['streamfileids'].keys())
2892 if format is None or format == 'best':
2893 if 'hd2' in supported_format:
2898 elif format == 'worst':
2906 fileid = config['data'][0]['streamfileids'][format]
2907 keys = [s['k'] for s in config['data'][0]['segs'][format]]
2908 except (UnicodeDecodeError, ValueError, KeyError):
2909 raise ExtractorError(u'Unable to extract info section')
2912 sid = self._gen_sid()
2913 fileid = self._get_file_id(fileid, seed)
2915 #column 8,9 of fileid represent the segment number
2916 #fileid[7:9] should be changed
2917 for index, key in enumerate(keys):
# Splice the zero-padded hex segment index into positions 8-9 of the id.
2919 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
2920 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
2923 'id': '%s_part%02d' % (video_id, index),
2924 'url': download_url,
2926 'upload_date': None,
2927 'title': video_title,
2930 files_info.append(info)
2935 class XNXXIE(InfoExtractor):
# Extractor for video.xnxx.com. Scrapes media URL, title, and thumbnail
# from the watch page using the class-level regexes below.
2936 """Information extractor for xnxx.com"""
2938 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: player flv URL, <title> prefix, big thumbnail.
2940 VIDEO_URL_RE = r'flv_url=(.*?)&'
2941 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
2942 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
2944 def _real_extract(self, url):
2945 mobj = re.match(self._VALID_URL, url)
2947 raise ExtractorError(u'Invalid URL: %s' % url)
2948 video_id = mobj.group(1)
2950 # Get webpage content
2951 webpage = self._download_webpage(url, video_id)
# The flv URL is percent-encoded inside the player parameter string.
2953 result = re.search(self.VIDEO_URL_RE, webpage)
2955 raise ExtractorError(u'Unable to extract video url')
2956 video_url = compat_urllib_parse.unquote(result.group(1))
2958 result = re.search(self.VIDEO_TITLE_RE, webpage)
2960 raise ExtractorError(u'Unable to extract video title')
2961 video_title = result.group(1)
2963 result = re.search(self.VIDEO_THUMB_RE, webpage)
2965 raise ExtractorError(u'Unable to extract video thumbnail')
2966 video_thumbnail = result.group(1)
2972 'upload_date': None,
2973 'title': video_title,
2975 'thumbnail': video_thumbnail,
2976 'description': None,
2980 class GooglePlusIE(InfoExtractor):
# Extractor for Google+ post videos. Two steps: scrape the post page for
# date/uploader/title and the photo-viewer URL, then scrape that video
# page for all resolution variants and pick the highest.
2981 """Information extractor for plus.google.com."""
2983 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
2984 IE_NAME = u'plus.google'
2986 def report_extract_entry(self, url):
2987 """Report the entry being downloaded."""
2988 self.to_screen(u'Downloading entry: %s' % url)
2990 def report_date(self, upload_date):
2991 """Report the extracted upload date."""
2992 self.to_screen(u'Entry date: %s' % upload_date)
2994 def report_uploader(self, uploader):
2995 """Report the extracted uploader."""
2996 self.to_screen(u'Uploader: %s' % uploader)
2998 def report_title(self, video_title):
2999 """Report the extracted title."""
3000 self.to_screen(u'Title: %s' % video_title)
3002 def report_extract_vid_page(self, video_page):
3003 """Report information extraction."""
3004 self.to_screen(u'Extracting video page: %s' % video_page)
3006 def _real_extract(self, url):
3007 # Extract id from URL
3008 mobj = re.match(self._VALID_URL, url)
3010 raise ExtractorError(u'Invalid URL: %s' % url)
3012 post_url = mobj.group(0)
3013 video_id = mobj.group(1)
3015 video_extension = 'flv'
3017 # Step 1, Retrieve post webpage to extract further information
3018 self.report_extract_entry(post_url)
3019 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3021 # Extract update date
3023 pattern = 'title="Timestamp">(.*?)</a>'
3024 mobj = re.search(pattern, webpage)
3026 upload_date = mobj.group(1)
3027 # Convert timestring to a format suitable for filename
3028 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3029 upload_date = upload_date.strftime('%Y%m%d')
3030 self.report_date(upload_date)
3034 pattern = r'rel\="author".*?>(.*?)</a>'
3035 mobj = re.search(pattern, webpage)
3037 uploader = mobj.group(1)
3038 self.report_uploader(uploader)
3041 # Get the first line for title
3043 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3044 mobj = re.search(pattern, webpage)
3046 video_title = mobj.group(1)
3047 self.report_title(video_title)
3049 # Step 2, Stimulate clicking the image box to launch video
3050 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3051 mobj = re.search(pattern, webpage)
3053 raise ExtractorError(u'Unable to extract video page URL')
3055 video_page = mobj.group(1)
3056 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3057 self.report_extract_vid_page(video_page)
3060 # Extract video links on video page
3061 """Extract video links of all sizes"""
3062 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3063 mobj = re.findall(pattern, webpage)
3065 raise ExtractorError(u'Unable to extract video links')
3067 # Sort in resolution
3068 links = sorted(mobj)
3070 # Choose the lowest of the sort, i.e. highest resolution
3071 video_url = links[-1]
3072 # Only get the url. The resolution part in the tuple has no use anymore
3073 video_url = video_url[-1]
3074 # Treat escaped \u0026 style hex
# Python 2: bytes-like str decodes directly; Python 3 path is the except.
3076 video_url = video_url.decode("unicode_escape")
3077 except AttributeError: # Python 3
3078 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3084 'uploader': uploader,
3085 'upload_date': upload_date,
3086 'title': video_title,
3087 'ext': video_extension,
3090 class NBAIE(InfoExtractor):
3091 """Information extractor for nba.com video pages."""
3091 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3094 def _real_extract(self, url):
3095 mobj = re.match(self._VALID_URL, url)
3097 raise ExtractorError(u'Invalid URL: %s' % url)
3099 video_id = mobj.group(1)
3100 if video_id.endswith('/index.html'):
3101 video_id = video_id[:-len('/index.html')]
3103 webpage = self._download_webpage(url, video_id)
# Direct CDN URL is built from the path; the page itself is only used for metadata.
3105 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3106 def _findProp(rexp, default=None):
3107 m = re.search(rexp, webpage)
3109 return unescapeHTML(m.group(1))
3113 shortened_video_id = video_id.rpartition('/')[2]
3114 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3116 'id': shortened_video_id,
# NOTE(review): key 'uploader_date' looks like a typo for 'upload_date'
# (the field documented on InfoExtractor) — confirm downstream use.
3120 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3121 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3125 class JustinTVIE(InfoExtractor):
3126 """Information extractor for justin.tv and twitch.tv"""
3127 # TODO: One broadcast may be split into multiple videos. The key
3128 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3129 # starts at 1 and increases. Can we treat all parts as one video?
3131 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3133 (?P<channelid>[^/]+)|
3134 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3135 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3139 _JUSTIN_PAGE_LIMIT = 100
3140 IE_NAME = u'justin.tv'
3142 def report_download_page(self, channel, offset):
3143 """Report attempt to download a single page of videos."""
3144 self.to_screen(u'%s: Downloading video information from %d to %d' %
3145 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3147 # Return count of items, list of *valid* items
3148 def _parse_page(self, url, video_id):
3149 webpage = self._download_webpage(url, video_id,
3150 u'Downloading video info JSON',
3151 u'unable to download video info JSON')
3153 response = json.loads(webpage)
# The API returns a list on success and a dict carrying 'error' on failure.
3154 if type(response) != list:
3155 error_text = response.get('error', 'unknown error')
3156 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3158 for clip in response:
3159 video_url = clip['video_file_url']
3161 video_extension = os.path.splitext(video_url)[1][1:]
3162 video_date = re.sub('-', '', clip['start_time'][:10])
3163 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3164 video_id = clip['id']
3165 video_title = clip.get('title', video_id)
3169 'title': video_title,
3170 'uploader': clip.get('channel_name', video_uploader_id),
3171 'uploader_id': video_uploader_id,
3172 'upload_date': video_date,
3173 'ext': video_extension,
3175 return (len(response), info)
3177 def _real_extract(self, url):
3178 mobj = re.match(self._VALID_URL, url)
3180 raise ExtractorError(u'invalid URL: %s' % url)
3182 api_base = 'http://api.justin.tv'
3184 if mobj.group('channelid'):
3186 video_id = mobj.group('channelid')
3187 api = api_base + '/channel/archives/%s.json' % video_id
3188 elif mobj.group('chapterid'):
# Chapters require a second lookup: the page only carries the archive id.
3189 chapter_id = mobj.group('chapterid')
3191 webpage = self._download_webpage(url, chapter_id)
3192 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3194 raise ExtractorError(u'Cannot find archive of a chapter')
3195 archive_id = m.group(1)
3197 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3198 chapter_info_xml = self._download_webpage(api, chapter_id,
3199 note=u'Downloading chapter information',
3200 errnote=u'Chapter information download failed')
3201 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3202 for a in doc.findall('.//archive'):
3203 if archive_id == a.find('./id').text:
3206 raise ExtractorError(u'Could not find chapter in chapter information')
3208 video_url = a.find('./video_file_url').text
3209 video_ext = video_url.rpartition('.')[2] or u'flv'
3211 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3212 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3213 note='Downloading chapter metadata',
3214 errnote='Download of chapter metadata failed')
3215 chapter_info = json.loads(chapter_info_json)
3217 bracket_start = int(doc.find('.//bracket_start').text)
3218 bracket_end = int(doc.find('.//bracket_end').text)
3220 # TODO determine start (and probably fix up file)
3221 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3222 #video_url += u'?start=' + TODO:start_timestamp
3223 # bracket_start is 13290, but we want 51670615
3224 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3225 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3228 'id': u'c' + chapter_id,
3231 'title': chapter_info['title'],
3232 'thumbnail': chapter_info['preview'],
3233 'description': chapter_info['description'],
3234 'uploader': chapter_info['channel']['display_name'],
3235 'uploader_id': chapter_info['channel']['name'],
3239 video_id = mobj.group('videoid')
3240 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3242 self.report_extraction(video_id)
# Page through the archive API; a short page (count != limit) signals the end.
3246 limit = self._JUSTIN_PAGE_LIMIT
3249 self.report_download_page(video_id, offset)
3250 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3251 page_count, page_info = self._parse_page(page_url, video_id)
3252 info.extend(page_info)
3253 if not paged or page_count != limit:
3258 class FunnyOrDieIE(InfoExtractor):
3258 """Information extractor for funnyordie.com."""
3259 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3261 def _real_extract(self, url):
3262 mobj = re.match(self._VALID_URL, url)
3264 raise ExtractorError(u'invalid URL: %s' % url)
3266 video_id = mobj.group('id')
3267 webpage = self._download_webpage(url, video_id)
3269 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3271 raise ExtractorError(u'Unable to find video information')
3272 video_url = unescapeHTML(m.group('url'))
# Prefer the player headline; fall back to the page <title>.
3274 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3276 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3278 raise ExtractorError(u'Cannot find video title')
3279 title = clean_html(m.group('title'))
3281 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3283 desc = unescapeHTML(m.group('desc'))
3292 'description': desc,
3296 class SteamIE(InfoExtractor):
3297 """Information extractor for store.steampowered.com game trailers."""
3297 _VALID_URL = r"""http://store\.steampowered\.com/
3299 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3301 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3305 def suitable(cls, url):
3306 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose-mode pattern.
3307 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3309 def _real_extract(self, url):
3310 m = re.match(self._VALID_URL, url, re.VERBOSE)
3311 gameID = m.group('gameID')
# The agecheck URL with a fixed (1970) birth date bypasses the age gate.
3312 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3313 self.report_age_confirmation()
3314 webpage = self._download_webpage(videourl, gameID)
3315 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3317 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3318 mweb = re.finditer(urlRE, webpage)
3319 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3320 titles = re.finditer(namesRE, webpage)
3321 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3322 thumbs = re.finditer(thumbsRE, webpage)
# Movie urls, titles and thumbnails are matched positionally, in page order.
3324 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3325 video_id = vid.group('videoID')
3326 title = vtitle.group('videoName')
3327 video_url = vid.group('videoURL')
3328 video_thumb = thumb.group('thumbnail')
3330 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3335 'title': unescapeHTML(title),
3336 'thumbnail': video_thumb
3339 return [self.playlist_result(videos, gameID, game_title)]
3341 class UstreamIE(InfoExtractor):
3341 """Information extractor for ustream.tv recorded videos."""
3342 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3343 IE_NAME = u'ustream'
3345 def _real_extract(self, url):
3346 m = re.match(self._VALID_URL, url)
3347 video_id = m.group('videoID')
# Media URL is derived directly from the id; the page only supplies metadata.
3348 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3349 webpage = self._download_webpage(url, video_id)
3350 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3351 title = m.group('title')
3352 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3353 uploader = m.group('uploader')
3359 'uploader': uploader
3363 class WorldStarHipHopIE(InfoExtractor):
3363 """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
3364 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3365 IE_NAME = u'WorldStarHipHop'
3367 def _real_extract(self, url):
# Flash player's addVariable call carries the direct media URL.
3368 _src_url = r'so\.addVariable\("file","(.*?)"\)'
3370 m = re.match(self._VALID_URL, url)
3371 video_id = m.group('id')
3373 webpage_src = self._download_webpage(url, video_id)
3375 mobj = re.search(_src_url, webpage_src)
3377 if mobj is not None:
3378 video_url = mobj.group(1)
3379 if 'mp4' in video_url:
3384 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3386 mobj = re.search(r"<title>(.*)</title>", webpage_src)
3389 raise ExtractorError(u'Cannot determine title')
3390 title = mobj.group(1)
3392 mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
3393 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3394 if mobj is not None:
3395 thumbnail = mobj.group(1)
3397 _title = r"""candytitles.*>(.*)</span>"""
3398 mobj = re.search(_title, webpage_src)
3399 if mobj is not None:
3400 title = mobj.group(1)
3407 'thumbnail' : thumbnail,
3412 class RBMARadioIE(InfoExtractor):
3412 """Information extractor for rbmaradio.com shows."""
3413 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3415 def _real_extract(self, url):
3416 m = re.match(self._VALID_URL, url)
3417 video_id = m.group('videoID')
3419 webpage = self._download_webpage(url, video_id)
# Show metadata lives in an inline JSON assignment (gon.show=...).
3420 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3422 raise ExtractorError(u'Cannot find metadata')
3423 json_data = m.group(1)
3426 data = json.loads(json_data)
3427 except ValueError as e:
3428 raise ExtractorError(u'Invalid JSON: ' + str(e))
# cbr=256 requests the 256 kbps stream from the CDN.
3430 video_url = data['akamai_url'] + '&cbr=256'
3431 url_parts = compat_urllib_parse_urlparse(video_url)
3432 video_ext = url_parts.path.rpartition('.')[2]
3437 'title': data['title'],
3438 'description': data.get('teaser_text'),
3439 'location': data.get('country_of_origin'),
3440 'uploader': data.get('host', {}).get('name'),
3441 'uploader_id': data.get('host', {}).get('slug'),
3442 'thumbnail': data.get('image', {}).get('large_url_2x'),
3443 'duration': data.get('duration'),
3448 class YouPornIE(InfoExtractor):
3449 """Information extractor for youporn.com."""
3450 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3452 def _print_formats(self, formats):
3453 """Print all available formats"""
3454 print(u'Available formats:')
3455 print(u'ext\t\tformat')
3456 print(u'---------------------------------')
3457 for format in formats:
3458 print(u'%s\t\t%s' % (format['ext'], format['format']))
3460 def _specific(self, req_format, formats):
3460 """Return the format entry matching req_format, if any."""
3462 if(x["format"]==req_format):
3466 def _real_extract(self, url):
3467 mobj = re.match(self._VALID_URL, url)
3469 raise ExtractorError(u'Invalid URL: %s' % url)
3471 video_id = mobj.group('videoid')
# Pretend the age gate was passed via cookie.
3473 req = compat_urllib_request.Request(url)
3474 req.add_header('Cookie', 'age_verified=1')
3475 webpage = self._download_webpage(req, video_id)
3477 # Get the video title
3478 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3480 raise ExtractorError(u'Unable to extract video title')
3481 video_title = result.group('title').strip()
3483 # Get the video date
3484 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3486 self._downloader.report_warning(u'unable to extract video date')
3489 upload_date = unified_strdate(result.group('date').strip())
3491 # Get the video uploader
3492 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3494 self._downloader.report_warning(u'unable to extract uploader')
3495 video_uploader = None
3497 video_uploader = result.group('uploader').strip()
3498 video_uploader = clean_html( video_uploader )
3500 # Get all of the formats available
3501 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3502 result = re.search(DOWNLOAD_LIST_RE, webpage)
3504 raise ExtractorError(u'Unable to extract download list')
3505 download_list_html = result.group('download_list').strip()
3507 # Get all of the links from the page
3508 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3509 links = re.findall(LINK_RE, download_list_html)
3510 if(len(links) == 0):
3511 raise ExtractorError(u'ERROR: no known formats available for video')
3513 self.to_screen(u'Links found: %d' % len(links))
3518 # A link looks like this:
3519 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3520 # A path looks like this:
3521 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3522 video_url = unescapeHTML( link )
3523 path = compat_urllib_parse_urlparse( video_url ).path
3524 extension = os.path.splitext( path )[1][1:]
# Path segment 4 encodes "<size>_<bitrate>_<id>"; keep size and bitrate.
3525 format = path.split('/')[4].split('_')[:2]
3528 format = "-".join( format )
3529 title = u'%s-%s-%s' % (video_title, size, bitrate)
3534 'uploader': video_uploader,
3535 'upload_date': upload_date,
3540 'description': None,
3544 if self._downloader.params.get('listformats', None):
3545 self._print_formats(formats)
# Format selection: first entry is best, last is worst (page order).
3548 req_format = self._downloader.params.get('format', None)
3549 self.to_screen(u'Format: %s' % req_format)
3551 if req_format is None or req_format == 'best':
3553 elif req_format == 'worst':
3554 return [formats[-1]]
3555 elif req_format in ('-1', 'all'):
3558 format = self._specific( req_format, formats )
3560 raise ExtractorError(u'Requested format not available')
3565 class PornotubeIE(InfoExtractor):
3566 """Information extractor for pornotube.com."""
3567 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3569 def _real_extract(self, url):
3570 mobj = re.match(self._VALID_URL, url)
3572 raise ExtractorError(u'Invalid URL: %s' % url)
3574 video_id = mobj.group('videoid')
3575 video_title = mobj.group('title')
3577 # Get webpage content
3578 webpage = self._download_webpage(url, video_id)
3581 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3582 result = re.search(VIDEO_URL_RE, webpage)
3584 raise ExtractorError(u'Unable to extract video url')
3585 video_url = compat_urllib_parse.unquote(result.group('url'))
3587 #Get the uploaded date
3588 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3589 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): this failure path reports 'Unable to extract video title'
# although it is the upload date that failed to match — message looks wrong.
3591 raise ExtractorError(u'Unable to extract video title')
3592 upload_date = unified_strdate(result.group('date'))
3594 info = {'id': video_id,
3597 'upload_date': upload_date,
3598 'title': video_title,
3604 class YouJizzIE(InfoExtractor):
3605 """Information extractor for youjizz.com."""
3606 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3608 def _real_extract(self, url):
3609 mobj = re.match(self._VALID_URL, url)
3611 raise ExtractorError(u'Invalid URL: %s' % url)
3613 video_id = mobj.group('videoid')
3615 # Get webpage content
3616 webpage = self._download_webpage(url, video_id)
3618 # Get the video title
3619 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3621 raise ExtractorError(u'ERROR: unable to extract video title')
3622 video_title = result.group('title').strip()
3624 # Get the embed page
3625 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3627 raise ExtractorError(u'ERROR: unable to extract embed page')
# The numeric embed-page id replaces the slug id from the watch URL.
3629 embed_page_url = result.group(0).strip()
3630 video_id = result.group('videoid')
3632 webpage = self._download_webpage(embed_page_url, video_id)
3635 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3637 raise ExtractorError(u'ERROR: unable to extract video url')
3638 video_url = result.group('source')
3640 info = {'id': video_id,
3642 'title': video_title,
3645 'player_url': embed_page_url}
3649 class EightTracksIE(InfoExtractor):
3649 """Information extractor for 8tracks.com mixes (playlists of songs)."""
3651 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3653 def _real_extract(self, url):
3654 mobj = re.match(self._VALID_URL, url)
3656 raise ExtractorError(u'Invalid URL: %s' % url)
3657 playlist_id = mobj.group('id')
3659 webpage = self._download_webpage(url, playlist_id)
# Mix metadata is embedded as a JSON assignment (PAGE.mix = {...};).
3661 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3663 raise ExtractorError(u'Cannot find trax information')
3664 json_like = m.group(1)
3665 data = json.loads(json_like)
# The play API requires a random session token per client.
3667 session = str(random.randint(0, 1000000000))
3669 track_count = data['tracks_count']
3670 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3671 next_url = first_url
3673 for i in itertools.count():
3674 api_json = self._download_webpage(next_url, playlist_id,
3675 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3676 errnote=u'Failed to download song information')
3677 api_data = json.loads(api_json)
3678 track_data = api_data[u'set']['track']
3680 'id': track_data['id'],
3681 'url': track_data['track_file_stream_url'],
3682 'title': track_data['performer'] + u' - ' + track_data['name'],
3683 'raw_title': track_data['name'],
3684 'uploader_id': data['user']['login'],
# Walk the mix one track at a time until the API flags the last track.
3688 if api_data['set']['at_last_track']:
3690 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3693 class KeekIE(InfoExtractor):
3693 """Information extractor for keek.com."""
3694 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3697 def _real_extract(self, url):
3698 m = re.match(self._VALID_URL, url)
3699 video_id = m.group('videoID')
# Media and thumbnail URLs are derived from the id on the CDN.
3700 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3701 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3702 webpage = self._download_webpage(url, video_id)
3703 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3704 title = unescapeHTML(m.group('title'))
3705 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3706 uploader = clean_html(m.group('uploader'))
3712 'thumbnail': thumbnail,
3713 'uploader': uploader
3717 class TEDIE(InfoExtractor):
3717 """Information extractor for ted.com talks and playlists."""
3718 _VALID_URL=r'''http://www\.ted\.com/
3720 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3722 ((?P<type_talk>talks)) # We have a simple talk
3724 (/lang/(.*?))? # The url may contain the language
3725 /(?P<name>\w+) # Here goes the name and then ".html"
3729 def suitable(cls, url):
3730 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose-mode pattern.
3731 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3733 def _real_extract(self, url):
3734 m=re.match(self._VALID_URL, url, re.VERBOSE)
3735 if m.group('type_talk'):
3736 return [self._talk_info(url)]
3738 playlist_id=m.group('playlist_id')
3739 name=m.group('name')
3740 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3741 return [self._playlist_videos_info(url,name,playlist_id)]
3743 def _talk_video_link(self,mediaSlug):
3744 '''Returns the video link for that mediaSlug'''
3745 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3747 def _playlist_videos_info(self,url,name,playlist_id=0):
3748 '''Returns the videos of the playlist'''
3750 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3751 ([.\s]*?)data-playlist_item_id="(\d+)"
3752 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3754 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3755 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3756 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3757 m_names=re.finditer(video_name_RE,webpage)
3759 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3760 m_playlist = re.search(playlist_RE, webpage)
3761 playlist_title = m_playlist.group('playlist_title')
# Each playlist entry is delegated back to this IE via a talk URL.
3763 playlist_entries = []
3764 for m_video, m_name in zip(m_videos,m_names):
3765 video_id=m_video.group('video_id')
3766 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3767 playlist_entries.append(self.url_result(talk_url, 'TED'))
3768 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3770 def _talk_info(self, url, video_id=0):
3771 """Return the video for the talk in the url"""
3772 m=re.match(self._VALID_URL, url,re.VERBOSE)
3773 videoName=m.group('name')
3774 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3775 # If the url includes the language we get the title translated
3776 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3777 title=re.search(title_RE, webpage).group('title')
3778 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3779 "id":(?P<videoID>[\d]+).*?
3780 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3781 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3782 thumb_match=re.search(thumb_RE,webpage)
3783 info_match=re.search(info_RE,webpage,re.VERBOSE)
3784 video_id=info_match.group('videoID')
3785 mediaSlug=info_match.group('mediaSlug')
3786 video_url=self._talk_video_link(mediaSlug)
3792 'thumbnail': thumb_match.group('thumbnail')
3796 class MySpassIE(InfoExtractor):
3796 """Information extractor for myspass.de (metadata via XML API)."""
3797 _VALID_URL = r'http://www.myspass.de/.*'
3799 def _real_extract(self, url):
3800 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3802 # video id is the last path element of the URL
3803 # usually there is a trailing slash, so also try the second but last
3804 url_path = compat_urllib_parse_urlparse(url).path
3805 url_parent_path, video_id = os.path.split(url_path)
3807 _, video_id = os.path.split(url_parent_path)
3810 metadata_url = META_DATA_URL_TEMPLATE % video_id
3811 metadata_text = self._download_webpage(metadata_url, video_id)
3812 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3814 # extract values from metadata
3815 url_flv_el = metadata.find('url_flv')
3816 if url_flv_el is None:
3817 raise ExtractorError(u'Unable to extract download url')
3818 video_url = url_flv_el.text
3819 extension = os.path.splitext(video_url)[1][1:]
3820 title_el = metadata.find('title')
3821 if title_el is None:
3822 raise ExtractorError(u'Unable to extract title')
3823 title = title_el.text
3824 format_id_el = metadata.find('format_id')
3825 if format_id_el is None:
3828 format = format_id_el.text
# Description and thumbnail are optional in the metadata XML.
3829 description_el = metadata.find('description')
3830 if description_el is not None:
3831 description = description_el.text
3834 imagePreview_el = metadata.find('imagePreview')
3835 if imagePreview_el is not None:
3836 thumbnail = imagePreview_el.text
3845 'thumbnail': thumbnail,
3846 'description': description
3850 class SpiegelIE(InfoExtractor):
3850 """Information extractor for spiegel.de videos."""
3851 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3853 def _real_extract(self, url):
3854 m = re.match(self._VALID_URL, url)
3855 video_id = m.group('videoID')
3857 webpage = self._download_webpage(url, video_id)
3858 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
3860 raise ExtractorError(u'Cannot find title')
3861 video_title = unescapeHTML(m.group(1))
# Per-video XML document lists the available encodings; use the last one.
3863 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3864 xml_code = self._download_webpage(xml_url, video_id,
3865 note=u'Downloading XML', errnote=u'Failed to download XML')
3867 idoc = xml.etree.ElementTree.fromstring(xml_code)
3868 last_type = idoc[-1]
3869 filename = last_type.findall('./filename')[0].text
3870 duration = float(last_type.findall('./duration')[0].text)
3872 video_url = 'http://video2.spiegel.de/flash/' + filename
3873 video_ext = filename.rpartition('.')[2]
3878 'title': video_title,
3879 'duration': duration,
3883 class LiveLeakIE(InfoExtractor):
3883 """Information extractor for liveleak.com."""
3885 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3886 IE_NAME = u'liveleak'
3888 def _real_extract(self, url):
3889 mobj = re.match(self._VALID_URL, url)
3891 raise ExtractorError(u'Invalid URL: %s' % url)
3893 video_id = mobj.group('video_id')
3895 webpage = self._download_webpage(url, video_id)
3897 m = re.search(r'file: "(.*?)",', webpage)
3899 raise ExtractorError(u'Unable to find video url')
3900 video_url = m.group(1)
3902 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3904 raise ExtractorError(u'Cannot find video title')
# Strip the site prefix that LiveLeak puts in og:title.
3905 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
3907 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3909 desc = unescapeHTML(m.group('desc'))
3913 m = re.search(r'By:.*?(\w+)</a>', webpage)
3915 uploader = clean_html(m.group(1))
3924 'description': desc,
3925 'uploader': uploader
3930 class ARDIE(InfoExtractor):
3930 """Information extractor for the ARD Mediathek."""
3931 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3932 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3933 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3935 def _real_extract(self, url):
3936 # determine video id from url
3937 m = re.match(self._VALID_URL, url)
3939 numid = re.search(r'documentId=([0-9]+)', url)
3941 video_id = numid.group(1)
3943 video_id = m.group('video_id')
3945 # determine title and media streams from webpage
3946 html = self._download_webpage(url, video_id)
3947 title = re.search(self._TITLE, html).group('title')
3948 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# FSK-restricted videos expose no streams before 8 pm German time.
3950 assert '"fsk"' in html
3951 raise ExtractorError(u'This video is only available after 8:00 pm')
3953 # choose default media type and highest quality for now
3954 stream = max([s for s in streams if int(s["media_type"]) == 0],
3955 key=lambda s: int(s["quality"]))
3957 # there's two possibilities: RTMP stream or HTTP download
3958 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
3959 if stream['rtmp_url']:
3960 self.to_screen(u'RTMP download detected')
3961 assert stream['video_url'].startswith('mp4:')
3962 info["url"] = stream["rtmp_url"]
3963 info["play_path"] = stream['video_url']
3965 assert stream["video_url"].endswith('.mp4')
3966 info["url"] = stream["video_url"]
3969 class TumblrIE(InfoExtractor):
3969 """Information extractor for tumblr.com video posts."""
3970 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
3972 def _real_extract(self, url):
3973 m_url = re.match(self._VALID_URL, url)
3974 video_id = m_url.group('id')
3975 blog = m_url.group('blog_name')
# Normalize to the canonical post URL before downloading.
3977 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
3978 webpage = self._download_webpage(url, video_id)
# Video tag is embedded inside escaped JS (\x22 quoting), hence the \\x22.
3980 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
3981 video = re.search(re_video, webpage)
3983 self.to_screen("No video founded")
3985 video_url = video.group('video_url')
3986 ext = video.group('ext')
3988 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
3989 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
3991 # The only place where you can get a title, it's not complete,
3992 # but searching in other places doesn't work for all videos
3993 re_title = r'<title>(?P<title>.*?)</title>'
3994 title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
3996 return [{'id': video_id,
4003 class BandcampIE(InfoExtractor):
4003 """Information extractor for bandcamp.com tracks (free downloads only)."""
4004 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4006 def _real_extract(self, url):
4007 mobj = re.match(self._VALID_URL, url)
4008 title = mobj.group('title')
4009 webpage = self._download_webpage(url, title)
4010 # We get the link to the free download page
4011 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4012 if m_download is None:
4013 raise ExtractorError(u'No free songs founded')
4015 download_link = m_download.group(1)
# NOTE(review): local name 'id' shadows the builtin.
4016 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4017 webpage, re.MULTILINE|re.DOTALL).group('id')
4019 download_webpage = self._download_webpage(download_link, id,
4020 'Downloading free downloads page')
4021 # We get the dictionary of the track from some javascrip code
4022 info = re.search(r'items: (.*?),$',
4023 download_webpage, re.MULTILINE).group(1)
4024 info = json.loads(info)[0]
4025 # We pick mp3-320 for now, until format selection can be easily implemented.
4026 mp3_info = info[u'downloads'][u'mp3-320']
4027 # If we try to use this url it says the link has expired
4028 initial_url = mp3_info[u'url']
4029 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4030 m_url = re.match(re_url, initial_url)
4031 #We build the url we will use to get the final track url
4032 # This url is build in Bandcamp in the script download_bunde_*.js
4033 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4034 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4035 # If we could correctly generate the .rand field the url would be
4036 #in the "download_url" key
4037 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4039 track_info = {'id':id,
4040 'title' : info[u'title'],
4043 'thumbnail' : info[u'thumb_url'],
4044 'uploader' : info[u'artist']
4049 class RedTubeIE(InfoExtractor):
4050 """Information Extractor for redtube"""
4051 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4053 def _real_extract(self,url):
4054 mobj = re.match(self._VALID_URL, url)
4056 raise ExtractorError(u'Invalid URL: %s' % url)
4058 video_id = mobj.group('id')
4059 video_extension = 'mp4'
4060 webpage = self._download_webpage(url, video_id)
4061 self.report_extraction(video_id)
# Direct mp4 URL is exposed in a <source> tag.
4062 mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">',webpage)
4065 raise ExtractorError(u'Unable to extract media URL')
4067 video_url = mobj.group(1)
4068 mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>',webpage)
4070 raise ExtractorError(u'Unable to extract title')
4071 video_title = mobj.group(1)
4076 'ext': video_extension,
4077 'title': video_title,
4080 class InaIE(InfoExtractor):
4081 """Information Extractor for Ina.fr"""
4082 _VALID_URL = r'(?:http://)?(?:www.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4084 def _real_extract(self,url):
4085 mobj = re.match(self._VALID_URL, url)
4087 video_id = mobj.group('id')
# Metadata comes from the player's MRSS feed, not the HTML page.
4088 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4089 video_extension = 'mp4'
4090 webpage = self._download_webpage(mrss_url, video_id)
4092 mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
4094 raise ExtractorError(u'Unable to extract media URL')
4095 video_url = mobj.group(1)
4097 mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
4099 raise ExtractorError(u'Unable to extract title')
4100 video_title = mobj.group(1)
4105 'ext': video_extension,
4106 'title': video_title,
4109 def gen_extractors():
4110 """ Return a list of an instance of every supported extractor.
4111 The order does matter; the first extractor matched is the one handling the URL.
# Keep the most specific extractors before the generic ones.
4114 YoutubePlaylistIE(),
4139 StanfordOpenClassroomIE(),
4149 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Return the info extractor class for *ie_name*.

    The class is looked up among this module's globals under the
    conventional name ``<ie_name>IE`` (e.g. ``'Youtube'`` -> ``YoutubeIE``).
    Raises KeyError if no such extractor class exists.
    """
    class_name = ie_name + 'IE'
    return globals()[class_name]