2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
# NOTE(review): this listing is a sampled/garbled excerpt -- indentation and
# many lines are missing, including the closing quotes of this class
# docstring. Code left byte-identical; review notes only.
23 class InfoExtractor(object):
24 """Information Extractor class.
26 Information extractors are the classes that, given a URL, extract
27 information about the video (or videos) the URL refers to. This
28 information includes the real video URL, the video title, author and
29 others. The information is stored in a dictionary which is then
30 passed to the FileDownloader. The FileDownloader processes this
31 information possibly downloading the video to the file system, among
32 other possible outcomes.
34 The dictionaries must include the following fields:
38 title: Video title, unescaped.
39 ext: Video filename extension.
41 The following fields are optional:
43 format: The video format, defaults to ext (used for --get-format)
44 thumbnail: Full URL to a video thumbnail image.
45 description: One-line video description.
46 uploader: Full name of the video uploader.
47 upload_date: Video upload date (YYYYMMDD).
48 uploader_id: Nickname or id of the video uploader.
49 location: Physical location of the video.
50 player_url: SWF Player URL (used for rtmpdump).
51 subtitles: The subtitle file contents.
52 urlhandle: [internal] The urlHandle to be used to download the file,
53 like returned by urllib.request.urlopen
55 The fields should all be Unicode strings.
57 Subclasses of this one should re-define the _real_initialize() and
58 _real_extract() methods and define a _VALID_URL regexp.
59 Probably, they should also be added to the list of extractors.
61 _real_extract() must return a *list* of information dictionaries as
64 Finally, the _WORKING attribute should be set to False for broken IEs
65 in order to warn the users and skip the tests.
def __init__(self, downloader=None):
    """Create the extractor and (optionally) wire up a downloader.

    The downloader can also be attached later via set_downloader().
    """
    self.set_downloader(downloader)
def suitable(cls, url):
    """Return True when *url* matches this IE's _VALID_URL pattern."""
    match = re.match(cls._VALID_URL, url)
    return match is not None
# NOTE(review): the 'def working(self):' and 'def initialize(self):' headers
# that belong to the next two docstrings were elided from this listing --
# TODO confirm against upstream.
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
# extract() is the public entry point; subclasses override _real_extract().
93 def extract(self, url):
94 """Extracts URL information and returns it in list of dicts."""
# NOTE(review): presumably a self.initialize() call preceded this return in
# the original -- dropped by the sampling. TODO confirm.
96 return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach *downloader* (the FileDownloader instance) to this IE."""
    self._downloader = downloader
102 def _real_initialize(self):
103 """Real initialization process. Redefine in subclasses."""
106 def _real_extract(self, url):
107 """Real extraction process. Redefine in subclasses."""
# NOTE(review): the next line belongs to the elided IE_NAME property (it
# strips the trailing 'IE' from the class name), not to _real_extract --
# the '@property def IE_NAME' header was dropped by the sampling.
112 return type(self).__name__[:-2]
# Opens url_or_request and returns the live HTTP response handle.
114 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115 """ Returns the response handle """
# NOTE(review): an 'if note is None:' guard and a 'try:' around urlopen
# appear to have been elided from this listing -- TODO confirm upstream.
117 self.report_download_webpage(video_id)
118 elif note is not False:
119 self.to_screen(u'%s: %s' % (video_id, note))
121 return compat_urllib_request.urlopen(url_or_request)
122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Any network failure is normalized into ExtractorError, keeping the
# original traceback via sys.exc_info()[2].
124 errnote = u'Unable to download webpage'
125 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
# Downloads a page and returns (decoded content, response handle).
127 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
128 """ Returns a tuple (page content as string, URL handle) """
129 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
# Charset is sniffed from the Content-Type header when present.
130 content_type = urlh.headers.get('Content-Type', '')
131 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
# NOTE(review): the 'if m:' / fallback-encoding branch and the 'try:'
# around get_full_url() were elided from this listing -- TODO confirm.
133 encoding = m.group(1)
136 webpage_bytes = urlh.read()
# Optional debugging aid: dump the raw page as base64 to the screen.
137 if self._downloader.params.get('dump_intermediate_pages', False):
139 url = url_or_request.get_full_url()
140 except AttributeError:
142 self.to_screen(u'Dumping request to ' + url)
143 dump = base64.b64encode(webpage_bytes).decode('ascii')
144 self._downloader.to_screen(dump)
# 'replace' keeps extraction going even on mis-declared charsets.
145 content = webpage_bytes.decode(encoding, 'replace')
146 return (content, urlh)
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
    """Download a page and return its decoded content as a string."""
    content, _ = self._download_webpage_handle(url_or_request, video_id, note, errnote)
    return content
def to_screen(self, msg):
    """Print *msg* to screen, prefixed with '[ie_name]'."""
    prefixed = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Log that information extraction has started."""
    message = u'%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Log that the webpage for *video_id* is being downloaded."""
    message = u'%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Log that an age-confirmation attempt is being made."""
    self.to_screen(u'Confirming age')
# Helpers that tag result dictionaries with the proper '_type' value.
168 #Methods for following #608
169 #They set the correct value of the '_type' key
170 def video_result(self, video_info):
171 """Returns a video"""
# NOTE(review): the 'return video_info' line appears to have been elided.
172 video_info['_type'] = 'video'
174 def url_result(self, url, ie=None):
175 """Returns a url that points to a page that should be processed"""
176 #TODO: ie should be the class used for getting the info
# NOTE(review): the dict continuation ('url'/'ie' entries) and the return
# were elided from this listing -- TODO confirm against upstream.
177 video_info = {'_type': 'url',
181 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
182 """Returns a playlist"""
# NOTE(review): the 'entries' key, the two 'if ... is not None:' guards and
# the return were elided from this listing.
183 video_info = {'_type': 'playlist',
186 video_info['id'] = playlist_id
188 video_info['title'] = playlist_title
192 class YoutubeIE(InfoExtractor):
193 """Information extractor for youtube.com."""
# NOTE(review): the "_VALID_URL = r'''^(" opener (and its closing quotes)
# for this verbose regex were elided from this listing -- TODO confirm.
197 (?:https?://)? # http(s):// (optional)
198 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
199 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
200 (?:.*?\#/)? # handle anchor (#/) redirect urls
201 (?: # the various things that can precede the ID:
202 (?:(?:v|embed|e)/) # v/ or embed/ or e/
203 |(?: # or the v= param in all its forms
204 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
205 (?:\?|\#!?) # the params delimiter ? or # or #!
206 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
209 )? # optional -> youtube.com/xxxx is OK
210 )? # all until now is optional -> you can pass the naked ID
211 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
212 (?(1).+)? # if we found the ID, everything can follow
214 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
215 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
216 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
217 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
218 _NETRC_MACHINE = 'youtube'
219 # Listed in order of quality
220 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
221 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# NOTE(review): most entries of the two itag-lookup dicts below (and their
# closing braces) were elided from this listing.
222 _video_extensions = {
228 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
234 _video_dimensions = {
def suitable(cls, url):
    """Return True for YouTube video URLs (never for playlist URLs)."""
    # Playlist URLs are claimed by YoutubePlaylistIE, not by this IE.
    if YoutubePlaylistIE.suitable(url):
        return False
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    return match is not None
def report_lang(self):
    """Log that the interface language is about to be set."""
    self.to_screen(u'Setting language')
def report_login(self):
    """Log that a login attempt is starting."""
    self.to_screen(u'Logging in')
def report_video_webpage_download(self, video_id):
    """Log that the video webpage for *video_id* is being downloaded."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Log that the video-info webpage for *video_id* is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
# Fixed docstrings below: both were copy-pasted from the info-webpage
# helper and did not describe what these methods actually print.
274 def report_video_subtitles_download(self, video_id):
275 """Report that available subtitles are being checked."""
276 self.to_screen(u'%s: Checking available subtitles' % video_id)
278 def report_video_subtitles_request(self, video_id, sub_lang, format):
279 """Report attempt to download subtitles for one language/format."""
280 self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
def report_video_subtitles_available(self, video_id, sub_lang_list):
    """List the subtitle languages available for *video_id*."""
    # Iterating the dict directly yields the same keys as list(.keys()).
    joined = ",".join(sub_lang_list)
    self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, joined))
def report_information_extraction(self, video_id):
    """Log that metadata extraction for *video_id* has started."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Log that the requested *format* is not available for *video_id*.

    (The original docstring claimed this reported an extracted URL;
    the message below shows it reports an unavailable format.)
    """
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
def report_rtmp_download(self):
    """Log that the download will go over the RTMP protocol."""
    self.to_screen(u'RTMP download detected')
# Queries the Google timedtext list endpoint for *video_id*.
# Returns a dict {lang_code: track name} on success, or a tuple
# (error message, None) on failure.
299 def _get_available_subtitles(self, video_id):
300 self.report_video_subtitles_download(video_id)
301 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
# NOTE(review): the 'try:' before urlopen and the final
# 'return sub_lang_list' were elided from this listing -- TODO confirm.
303 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
304 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
305 return (u'unable to download video subtitles: %s' % compat_str(err), None)
306 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
307 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
308 if not sub_lang_list:
309 return (u'video doesn\'t have subtitles', None)
def _list_available_subtitles(self, video_id):
    """Fetch and print the subtitle languages available for *video_id*."""
    available = self._get_available_subtitles(video_id)
    self.report_video_subtitles_available(video_id, available)
# Downloads one subtitle track; returns (None, sub_lang, sub) on success
# or (error message, None, None) on failure.
316 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
# NOTE(review): the docstring quotes around the next line, the urlencode
# dict body, and the 'try:'/'if not sub:' lines were elided -- TODO confirm.
319 (error_message, sub_lang, sub)
321 self.report_video_subtitles_request(video_id, sub_lang, format)
322 params = compat_urllib_parse.urlencode({
328 url = 'http://www.youtube.com/api/timedtext?' + params
330 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
331 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
332 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
334 return (u'Did not fetch video subtitles', None, None)
335 return (None, sub_lang, sub)
# Picks a single subtitle language (user choice > 'en' > first available)
# and downloads it.
337 def _extract_subtitle(self, video_id):
# NOTE(review): the docstring quotes, the "sub_lang = 'en'" branch body,
# and the final 'return [subtitle]' were elided -- TODO confirm upstream.
339 Return a list with a tuple:
340 [(error_message, sub_lang, sub)]
342 sub_lang_list = self._get_available_subtitles(video_id)
343 sub_format = self._downloader.params.get('subtitlesformat')
# A tuple (instead of a dict) signals a lookup failure upstream.
344 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
345 return [(sub_lang_list[0], None, None)]
346 if self._downloader.params.get('subtitleslang', False):
347 sub_lang = self._downloader.params.get('subtitleslang')
348 elif 'en' in sub_lang_list:
351 sub_lang = list(sub_lang_list.keys())[0]
352 if not sub_lang in sub_lang_list:
353 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
355 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
# Downloads every available subtitle track for *video_id*.
358 def _extract_all_subtitles(self, video_id):
359 sub_lang_list = self._get_available_subtitles(video_id)
360 sub_format = self._downloader.params.get('subtitlesformat')
# A tuple (instead of a dict) signals a lookup failure upstream.
361 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
362 return [(sub_lang_list[0], None, None)]
# NOTE(review): the 'subtitles = []' initializer and the final
# 'return subtitles' were elided from this listing -- TODO confirm.
364 for sub_lang in sub_lang_list:
365 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
366 subtitles.append(subtitle)
# Prints each itag with its extension and dimensions for --list-formats.
369 def _print_formats(self, formats):
370 print('Available formats:')
# NOTE(review): the 'for x in formats:' line was elided from this listing.
372 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# Sets the interface language, then logs in (credentials or .netrc) and
# confirms age. Heavily elided in this listing: several 'try:'/'if'/'else'
# lines and parts of the login form dict are missing -- left byte-identical.
374 def _real_initialize(self):
375 if self._downloader is None:
380 downloader_params = self._downloader.params
382 # Attempt to use provided username and password or .netrc data
383 if downloader_params.get('username', None) is not None:
384 username = downloader_params['username']
385 password = downloader_params['password']
386 elif downloader_params.get('usenetrc', False):
388 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
393 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
394 except (IOError, netrc.NetrcParseError) as err:
# .netrc problems are a warning, not fatal: extraction can proceed
# without authentication.
395 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
399 request = compat_urllib_request.Request(self._LANG_URL)
402 compat_urllib_request.urlopen(request).read()
403 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
404 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
407 # No authentication to be performed
411 request = compat_urllib_request.Request(self._LOGIN_URL)
413 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
# GALX/dsh are hidden anti-forgery tokens scraped from the login form.
420 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
422 galx = match.group(1)
424 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
430 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
434 u'PersistentCookie': u'yes',
436 u'bgresponse': u'js_disabled',
437 u'checkConnection': u'',
438 u'checkedDomains': u'youtube',
444 u'signIn': u'Sign in',
446 u'service': u'youtube',
450 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
452 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
453 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
454 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
457 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# If the login form is still present in the response, the login failed.
458 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
459 self._downloader.report_warning(u'unable to log in: bad username or password')
461 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
462 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
468 'action_confirm': 'Confirm',
470 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
472 self.report_age_confirmation()
473 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
474 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Unlike the language/login steps, failing to confirm age is fatal.
475 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
# Extracts the 11-character video ID from any supported YouTube URL form.
477 def _extract_id(self, url):
478 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
# NOTE(review): the 'if mobj is None:' guard and the final
# 'return video_id' were elided from this listing -- TODO confirm.
480 raise ExtractorError(u'Invalid URL: %s' % url)
481 video_id = mobj.group(2)
# Main YouTube extraction: resolves redirects, downloads the watch page
# and get_video_info, then builds one info dict per selected format.
# Heavily elided in this listing (many 'try:'/'if'/'else' lines missing);
# code left byte-identical with review notes only.
484 def _real_extract(self, url):
485 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
486 mobj = re.search(self._NEXT_URL_RE, url)
488 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
489 video_id = self._extract_id(url)
492 self.report_video_webpage_download(video_id)
493 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
494 request = compat_urllib_request.Request(url)
496 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
497 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
498 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
500 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
502 # Attempt to extract SWF player URL
503 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
505 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# get_video_info is tried with several 'el' values until one returns a
# usable token.
510 self.report_video_info_webpage_download(video_id)
511 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
512 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
513 % (video_id, el_type))
514 video_info_webpage = self._download_webpage(video_info_url, video_id,
516 errnote='unable to download video info webpage')
517 video_info = compat_parse_qs(video_info_webpage)
518 if 'token' in video_info:
520 if 'token' not in video_info:
521 if 'reason' in video_info:
522 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
524 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
526 # Check for "rental" videos
527 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
528 raise ExtractorError(u'"rental" videos not supported')
530 # Start extracting information
531 self.report_information_extraction(video_id)
# uploader (mandatory)
534 if 'author' not in video_info:
535 raise ExtractorError(u'Unable to extract uploader name')
536 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
# uploader_id (optional, scraped from the watch page)
539 video_uploader_id = None
540 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
542 video_uploader_id = mobj.group(1)
544 self._downloader.report_warning(u'unable to extract uploader nickname')
# title (mandatory)
547 if 'title' not in video_info:
548 raise ExtractorError(u'Unable to extract video title')
549 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
# thumbnail (optional)
552 if 'thumbnail_url' not in video_info:
553 self._downloader.report_warning(u'unable to extract video thumbnail')
555 else: # don't panic if we can't find it
556 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# upload date, normalized to YYYYMMDD via unified_strdate
560 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
562 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
563 upload_date = unified_strdate(upload_date)
# description: page element first, meta tag as fallback
566 video_description = get_element_by_id("eow-description", video_webpage)
567 if video_description:
568 video_description = clean_html(video_description)
570 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
572 video_description = unescapeHTML(fd_mobj.group(1))
574 video_description = u''
# subtitles, driven by the writesubtitles/allsubtitles/listsubtitles opts
577 video_subtitles = None
579 if self._downloader.params.get('writesubtitles', False):
580 video_subtitles = self._extract_subtitle(video_id)
582 (sub_error, sub_lang, sub) = video_subtitles[0]
584 self._downloader.report_error(sub_error)
586 if self._downloader.params.get('allsubtitles', False):
587 video_subtitles = self._extract_all_subtitles(video_id)
588 for video_subtitle in video_subtitles:
589 (sub_error, sub_lang, sub) = video_subtitle
591 self._downloader.report_error(sub_error)
593 if self._downloader.params.get('listsubtitles', False):
594 sub_lang_list = self._list_available_subtitles(video_id)
597 if 'length_seconds' not in video_info:
598 self._downloader.report_warning(u'unable to extract video duration')
601 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
604 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
606 # Decide which formats to download
607 req_format = self._downloader.params.get('format', None)
609 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
610 self.report_rtmp_download()
611 video_url_list = [(None, video_info['conn'][0])]
612 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
614 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
615 url_data = compat_parse_qs(url_data_str)
616 if 'itag' in url_data and 'url' in url_data:
617 url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
618 if not 'ratebypass' in url: url += '&ratebypass=yes'
619 url_map[url_data['itag'][0]] = url
621 format_limit = self._downloader.params.get('format_limit', None)
622 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
623 if format_limit is not None and format_limit in available_formats:
624 format_list = available_formats[available_formats.index(format_limit):]
626 format_list = available_formats
627 existing_formats = [x for x in format_list if x in url_map]
628 if len(existing_formats) == 0:
629 raise ExtractorError(u'no known formats available for video')
630 if self._downloader.params.get('listformats', None):
631 self._print_formats(existing_formats)
633 if req_format is None or req_format == 'best':
634 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
635 elif req_format == 'worst':
636 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
637 elif req_format in ('-1', 'all'):
638 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
640 # Specific formats. We pick the first in a slash-delimeted sequence.
641 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
642 req_formats = req_format.split('/')
643 video_url_list = None
644 for rf in req_formats:
646 video_url_list = [(rf, url_map[rf])]
648 if video_url_list is None:
649 raise ExtractorError(u'requested format not available')
651 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
# One info dict per selected (format, url) pair.
654 for format_param, video_real_url in video_url_list:
656 video_extension = self._video_extensions.get(format_param, 'flv')
658 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
659 self._video_dimensions.get(format_param, '???'))
663 'url': video_real_url,
664 'uploader': video_uploader,
665 'uploader_id': video_uploader_id,
666 'upload_date': upload_date,
667 'title': video_title,
668 'ext': video_extension,
669 'format': video_format,
670 'thumbnail': video_thumbnail,
671 'description': video_description,
672 'player_url': player_url,
673 'subtitles': video_subtitles,
674 'duration': video_duration
679 class MetacafeIE(InfoExtractor):
680 """Information Extractor for metacafe.com."""
# URLs used by _real_initialize to bypass the family filter.
682 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
683 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
684 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
685 IE_NAME = u'metacafe'
def report_disclaimer(self):
    """Log that the family-filter disclaimer page is being fetched."""
    self.to_screen(u'Retrieving disclaimer')
# Fetches the disclaimer page and posts the over-18 confirmation so the
# family filter does not hide videos.
691 def _real_initialize(self):
692 # Retrieve disclaimer
693 request = compat_urllib_request.Request(self._DISCLAIMER)
# NOTE(review): both 'try:' lines and the disclaimer_form dict opener were
# elided from this listing -- TODO confirm against upstream.
695 self.report_disclaimer()
696 disclaimer = compat_urllib_request.urlopen(request).read()
697 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
698 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
703 'submit': "Continue - I'm over 18",
705 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
707 self.report_age_confirmation()
708 disclaimer = compat_urllib_request.urlopen(request).read()
709 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
710 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
# Metacafe extraction: delegates yt- prefixed IDs to YouTube, otherwise
# scrapes mediaURL/gdaKey (or the flashvars JSON) from the watch page.
# Elided lines ('if mobj is None:' guards, try/else branches, the final
# return) are noted below; code left byte-identical.
712 def _real_extract(self, url):
713 # Extract id and simplified title from URL
714 mobj = re.match(self._VALID_URL, url)
716 raise ExtractorError(u'Invalid URL: %s' % url)
718 video_id = mobj.group(1)
720 # Check if video comes from YouTube
721 mobj2 = re.match(r'^yt-(.*)$', video_id)
722 if mobj2 is not None:
723 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
725 # Retrieve video webpage to extract further information
726 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
728 # Extract URL, uploader and title from webpage
729 self.report_extraction(video_id)
730 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
732 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
733 video_extension = mediaURL[-3:]
735 # Extract gdaKey if available
736 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
740 gdaKey = mobj.group(1)
741 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars blob for mediaURL/key.
743 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
745 raise ExtractorError(u'Unable to extract media URL')
746 vardict = compat_parse_qs(mobj.group(1))
747 if 'mediaData' not in vardict:
748 raise ExtractorError(u'Unable to extract media URL')
749 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
751 raise ExtractorError(u'Unable to extract media URL')
752 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
753 video_extension = mediaURL[-3:]
754 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
756 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
758 raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode() on these str values is Python-2-only behavior.
759 video_title = mobj.group(1).decode('utf-8')
761 mobj = re.search(r'submitter=(.*?);', webpage)
763 raise ExtractorError(u'Unable to extract uploader nickname')
764 video_uploader = mobj.group(1)
767 'id': video_id.decode('utf-8'),
768 'url': video_url.decode('utf-8'),
769 'uploader': video_uploader.decode('utf-8'),
771 'title': video_title,
772 'ext': video_extension.decode('utf-8'),
775 class DailymotionIE(InfoExtractor):
776 """Information Extractor for Dailymotion"""
# Case-insensitive match for any dailymotion TLD video URL.
778 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
779 IE_NAME = u'dailymotion'
# Dailymotion extraction: disables the family filter via cookie, then
# scrapes flashvars for the best-quality stream URL. Elided lines
# ('if mobj is None:' guards, quality-loop body, the final return dict)
# are noted below; code left byte-identical.
781 def _real_extract(self, url):
782 # Extract id and simplified title from URL
783 mobj = re.match(self._VALID_URL, url)
785 raise ExtractorError(u'Invalid URL: %s' % url)
787 video_id = mobj.group(1).split('_')[0].split('?')[0]
789 video_extension = 'mp4'
791 # Retrieve video webpage to extract further information
792 request = compat_urllib_request.Request(url)
# The cookie disables Dailymotion's family filter for this request.
793 request.add_header('Cookie', 'family_filter=off')
794 webpage = self._download_webpage(request, video_id)
796 # Extract URL, uploader and title from webpage
797 self.report_extraction(video_id)
798 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
800 raise ExtractorError(u'Unable to extract media URL')
801 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Qualities are probed best-first; first key present in flashvars wins.
803 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
806 self.to_screen(u'Using %s' % key)
809 raise ExtractorError(u'Unable to extract video URL')
811 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
813 raise ExtractorError(u'Unable to extract video URL')
815 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
817 # TODO: support choosing qualities
819 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
821 raise ExtractorError(u'Unable to extract title')
822 video_title = unescapeHTML(mobj.group('title'))
824 video_uploader = None
825 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
827 # lookin for official user
828 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
# Missing uploader is only a warning; extraction continues.
829 if mobj_official is None:
830 self._downloader.report_warning(u'unable to extract uploader nickname')
832 video_uploader = mobj_official.group(1)
834 video_uploader = mobj.group(1)
836 video_upload_date = None
837 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
# Page shows DD-MM-YYYY; reassembled here as YYYYMMDD.
839 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
844 'uploader': video_uploader,
845 'upload_date': video_upload_date,
846 'title': video_title,
847 'ext': video_extension,
852 class PhotobucketIE(InfoExtractor):
853 """Information extractor for photobucket.com."""
855 # TODO: the original _VALID_URL was:
856 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
857 # Check if it's necessary to keep the old extracion process
858 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
859 IE_NAME = u'photobucket'
# Photobucket extraction: tries the embedded JSON first, then falls back
# to the video_src link tag. Elided lines ('if mobj is None:' guards, the
# 'id' entry of the JSON result dict, returns) are noted; byte-identical.
861 def _real_extract(self, url):
862 # Extract id from URL
863 mobj = re.match(self._VALID_URL, url)
865 raise ExtractorError(u'Invalid URL: %s' % url)
867 video_id = mobj.group('id')
869 video_extension = mobj.group('ext')
871 # Retrieve video webpage to extract further information
872 webpage = self._download_webpage(url, video_id)
874 # Extract URL, uploader, and title from webpage
875 self.report_extraction(video_id)
876 # We try first by looking the javascript code:
877 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
879 info = json.loads(mobj.group('json'))
# NOTE(review): these lines are the body of a returned info dict whose
# opener/closer were elided from this listing -- TODO confirm upstream.
882 'url': info[u'downloadUrl'],
883 'uploader': info[u'username'],
884 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
885 'title': info[u'title'],
886 'ext': video_extension,
887 'thumbnail': info[u'thumbUrl'],
890 # We try looking in other parts of the webpage
891 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
893 raise ExtractorError(u'Unable to extract media URL')
894 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
898 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
900 raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode() on these str values is Python-2-only behavior.
901 video_title = mobj.group(1).decode('utf-8')
903 video_uploader = mobj.group(2).decode('utf-8')
906 'id': video_id.decode('utf-8'),
907 'url': video_url.decode('utf-8'),
908 'uploader': video_uploader,
910 'title': video_title,
911 'ext': video_extension.decode('utf-8'),
914 class YahooIE(InfoExtractor):
915 """Information extractor for screen.yahoo.com."""
# Video ID is the trailing digits before '.html' in the page URL.
916 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
# Yahoo Screen extraction: queries the bcst.yahoo.com REST service twice,
# first for metadata (title/description/thumb/date), then for the stream
# URL/path. Elided lines ('if ... is None:' guards, the info_re closing
# quotes, the final result dict) are noted; code left byte-identical.
918 def _real_extract(self, url):
919 mobj = re.match(self._VALID_URL, url)
921 raise ExtractorError(u'Invalid URL: %s' % url)
922 video_id = mobj.group('id')
924 # TODO: Check which url parameters are required
925 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
926 webpage = self._download_webpage(info_url, video_id, "Downloading info webpage")
# NOTE(review): the closing ''' of this verbose regex was elided from the
# listing -- no comments inserted below to avoid landing inside it.
927 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
928 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
929 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
930 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
932 self.report_extraction(video_id)
933 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
935 raise ExtractorError(u'Unable to extract video info')
936 video_title = m_info.group('title')
937 video_description = m_info.group('description')
938 video_thumb = m_info.group('thumb')
939 video_date = m_info.group('date')
940 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
942 # TODO: Find a way to get mp4 videos
943 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
944 webpage = self._download_webpage(rest_url, video_id, 'Downloading video url webpage')
945 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
947 raise ExtractorError(u'Unable to extract video url')
951 'url':m_rest.group('url'),
952 'play_path': m_rest.group('path'),
954 'description': video_description,
955 'thumbnail': video_thumb,
956 'upload_date': video_date,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    # Named groups: proto (optional scheme), direct_link (player-redirect
    # form), id (the numeric video id).
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def _real_extract(self, url, new_video=True):
        """Download the Vimeo page, parse its embedded config JSON and
        collect the video information.

        NOTE(review): this view of the file has extraction gaps; several
        guard/glue lines appear elided and are flagged inline below.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            # No scheme in the input URL: force https.
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Player redirect URL: rebuild the canonical watch URL.
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page markup.
        # NOTE(review): an enclosing try/except appears elided around these lines.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
            raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
        raise ExtractorError(u'Unable to extract info section')

        # Extract title from the parsed config.
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # NOTE(review): an `else:` opener appears elided before this line.
                files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first populated quality bucket, best first.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        # NOTE(review): a `break` and a for-`else:` appear elided; this raise is
        # presumably the no-codec-found fallback — confirm against upstream.
        raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # NOTE(review): the `return [{` opener and the id/url entries appear elided.
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    # NOTE(review): many glue lines (try:, returns, argument separators) are
    # missing from this view; apparent elisions are flagged inline below.
    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in an index-N.html path.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw page contents."""
        request = compat_urllib_request.Request(url)
        # NOTE(review): a `try:` opener appears elided before the download below.
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # NOTE(review): `return webpage` appears elided here.

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* with *regexFlags*, and collect the
        groups named in *matchTuples* — (group index, key, error message)
        triples — into a dict keyed by `key`."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        # NOTE(review): `info = {}` and an `if mobj is None:` guard appear elided here.
        raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            # NOTE(review): an `else:` opener appears elided before this line.
            info[key] = mobj.group(i)
        # NOTE(review): `return info` appears elided here.

    def extractLiveStream(self, url):
        """Resolve the stream parameters for an arte.tv live page."""
        # Language is encoded in the URL path (fr/de).
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            # NOTE(review): the url/flags/tuple-list arguments of this call
            # are partially elided in this view.
            r'src="(.*?/videothek_js.*?\.js)',
            (1, 'url', u'Invalid URL: %s' % url)
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            # NOTE(review): call arguments partially elided here as well.
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
            '(http://.*?\.swf).*?' +
            (1, 'path', u'could not extract video path: %s' % url),
            (2, 'player', u'could not extract video player: %s' % url),
            (3, 'url', u'could not extract video url: %s' % url)
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Extract video info for an arte.tv +7 (catch-up) page."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            # NOTE(review): call arguments partially elided.
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'Invalid URL: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'Could not find <video> tag: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            r'<video id="(.*?)".*?>.*?' +
            '<name>(.*?)</name>.*?' +
            '<dateVideo>(.*?)</dateVideo>.*?' +
            '<url quality="hd">(.*?)</url>',
            (1, 'id', u'could not extract video id: %s' % url),
            (2, 'title', u'could not extract video title: %s' % url),
            (3, 'date', u'could not extract video date: %s' % url),
            (4, 'url', u'could not extract video url: %s' % url)
        # NOTE(review): a `return {` opener appears elided before these entries.
        'id': info.get('id'),
        'url': compat_urllib_parse.unquote(info.get('url')),
        'uploader': u'arte.tv',
        'upload_date': unified_strdate(info.get('date')),
        'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages are recognised by their trailing index-N.html segment.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            # NOTE(review): a `return` / `else:` pair appears elided here.
        info = self.extractPlus7Stream(url)
        # NOTE(review): `return [info]` appears elided here.
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn that we're falling back, except in test mode.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that issues HEAD so only headers are fetched.
            def get_method(self):
                # NOTE(review): `return "HEAD"` appears elided here.

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                # NOTE(review): remaining keyword args and the else branch appear elided.
                raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # NOTE(review): fp drain/close lines appear elided here.
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        origin_req_host=req.get_origin_req_host(),

        # Build a bare opener wired with the HEAD-aware handlers above.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()
        # NOTE(review): the same-URL short-circuit appears elided here.
        self.report_following_redirect(new_url)
        # NOTE(review): `return new_url` appears elided here.

    def _real_extract(self, url):
        # Follow URL-shortener style redirects first.
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        # NOTE(review): a `try:` opener appears elided before the download below.
        webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # NOTE(review): `if mobj is None:` guards appear elided between the
        # progressively broader searches below.
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit: JWPlayer JS loader
        mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        # NOTE(review): the `return [{` opener and id/url entries appear elided.
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension,
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    # Matches ytsearch:, ytsearchN:, ytsearchall: pseudo-URLs.
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # GData v2 search endpoint, 50 results per page.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch to _get_n_results."""
        mobj = re.match(self._VALID_URL, query)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # NOTE(review): the prefix-stripping line appears elided here.
        query = query.encode('utf-8')
        # NOTE(review): an `if prefix == '':` opener appears elided before this return.
        return self._get_n_results(query, 1)
        elif prefix == 'all':
            self._get_n_results(query, self._max_youtube_results)
        # NOTE(review): `else:` / `try:` / `n = int(prefix)` / `if n <= 0:`
        # appear elided before this raise.
            raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): video_ids/pagenum/limit initialisation appears elided.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            # NOTE(review): a `try:` opener appears elided here.
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Cap the target at what the API says actually exists.
            limit = min(n, api_response['totalItems'])
            # NOTE(review): `pagenum += 1` appears elided here.

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        # NOTE(review): `return videos` appears elided here.
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    # Matches gvsearch:, gvsearchN:, gvsearchall: pseudo-URLs.
    _VALID_URL = r'gvsearch(?P<prefix>|\d+|all):(?P<query>[\s\S]+)'
    # Presence of the "next" pager link means more result pages exist.
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch to _get_n_results."""
        mobj = re.match(self._VALID_URL, query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # NOTE(review): an `if prefix == '':` opener appears elided before this return.
        return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._max_google_results)
        # NOTE(review): `else:` / `n = int(prefix)` / `if n <= 0:` appear
        # elided before this raise.
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
                return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): the `res = {` playlist-dict opener appears elided.
        '_type': 'playlist',

        for pagenum in itertools.count(1):
            result_url = u'http://video.google.com/videosearch?q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            # Each organic result link becomes one playlist entry.
            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                # NOTE(review): the `e = {` entry-dict opener appears elided.
                'url': mobj.group(1)
                res['entries'].append(e)

            # Stop once enough results were seen or no further pages exist.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                # NOTE(review): `return res` appears elided here.
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # Matches yvsearch:, yvsearchN:, yvsearchall: pseudo-URLs.
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # NOTE(review): the prefix-stripping line appears elided here.
        query = query.encode('utf-8')
        # NOTE(review): an `if prefix == '':` opener appears elided before this call.
        self._download_n_results(query, 1)
        # NOTE(review): `return` appears elided here.
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        # NOTE(review): `return` / `else:` / `try:` / `n = int(prefix)` /
        # `if n <= 0:` appear elided before this raise.
            raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
                self._download_n_results(query, n)
            # NOTE(review): `return` appears elided here.
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                # NOTE(review): `return` appears elided here.

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Track ids already queued to skip duplicates across pages.
        already_seen = set()
        # NOTE(review): video_ids/pagenum initialisation and the page-loop
        # header appear elided here.
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
        request = compat_urllib_request.Request(result_url)
        # NOTE(review): a `try:` opener appears elided here.
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                    # NOTE(review): `return` appears elided here.

        # No further pages: flush everything collected so far.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
            # NOTE(review): `return` appears elided here.

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose (re.VERBOSE) pattern; NOTE(review): several alternation lines
    # and the closing triple-quote appear elided from this view.
    _VALID_URL = r"""(?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    # GData playlist feed, paginated via max-results/start-index.
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    IE_NAME = u'youtube:playlist'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE because _VALID_URL is written as a verbose pattern.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        # NOTE(review): videos-list initialisation and the page-loop header
        # appear elided here.
        url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
        page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

        # NOTE(review): a `try:` opener appears elided before json.loads.
        response = json.loads(page)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

        if 'feed' not in response:
            raise ExtractorError(u'Got a malformed response from YouTube API')
        playlist_title = response['feed']['title']['$t']
        if 'entry' not in response['feed']:
            # Number of videos is a multiple of self._MAX_RESULTS
            # NOTE(review): `break` appears elided here.

        # Keep (position, url) pairs so entries can be sorted later.
        videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                    for entry in response['feed']['entry']
                    if 'content' in entry ]

        # A short page means this was the last one.
        if len(response['feed']['entry']) < self._MAX_RESULTS:
            # NOTE(review): `break` appears elided here.

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # First page is plain HTML; subsequent pages come from the ajax endpoint.
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Collect the distinct watch-link video ids found in *page*, in order."""
        # NOTE(review): `ids_in_page = []` appears elided here.
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        # NOTE(review): `return ids_in_page` appears elided here.

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        # NOTE(review): video_ids/pagenum initialisation appears elided here.
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            # NOTE(review): a `while True:` loop header appears elided here.
            pagenum = pagenum + 1

            url = self._MORE_PAGES_URL % (pagenum, channel_id)
            page = self._download_webpage(url, channel_id,
                                          u'Downloading page #%s' % pagenum)

            # Ajax endpoint returns JSON with the rendered HTML inside.
            page = json.loads(page)

            ids_in_page = self.extract_videos_from_page(page['content_html'])
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                # NOTE(review): `break` appears elided here.

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    # Matches user pages and the ytuser: shorthand.
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        # NOTE(review): video_ids/pagenum initialisation and the loop header
        # appear elided here.
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1

        gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
        page = self._download_webpage(gdata_url, username,
                                      u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

        # Extract video identifiers
        # NOTE(review): `ids_in_page = []` appears elided here.
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:
            # NOTE(review): `break` (and a `pagenum += 1` after the if)
            # appear elided here.

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # NOTE(review): a `_PAGE_SIZE` class attribute appears elided here; it is
    # referenced below.
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Episode-list ajax endpoint; %s is filled with the numeric user id.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        # Numeric user id is embedded in the user page markup.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        # NOTE(review): video_ids/pagenum initialisation and the loop header
        # appear elided here.
        url = page_base + "&page=" + str(pagenum)
        page = self._download_webpage(url, username,
                                      u'Downloading video ids from page %d' % pagenum)

        # Extract video identifiers
        # NOTE(review): `ids_in_page = []` appears elided here.
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._PAGE_SIZE:
            # NOTE(review): `break` (and a `pagenum += 1` after the if)
            # appear elided here.

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        # NOTE(review): a `try:` opener appears elided here.
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction notice.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            # NOTE(review): an `else:` opener appears elided before this raise.
            raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # NOTE(review): the `return [{` opener appears elided before these entries.
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in with credentials from params or .netrc, when available."""
        if self._downloader is None:
            # NOTE(review): `return` appears elided here.

        # NOTE(review): useremail/password default initialisation appears elided.
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): a `try:` opener appears elided here.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # NOTE(review): the credential assignments and an `else:`
                # opener appear elided before this raise.
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # No credentials: skip login entirely.
        if useremail is None:
            # NOTE(review): `return` and the login_form construction appear
            # elided here.

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # NOTE(review): a `try:` opener and the report_login() call appear elided.
        login_results = compat_urllib_request.urlopen(request).read()
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
            # NOTE(review): `return` appears elided here.
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            # NOTE(review): `return` appears elided here.

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flashvars JSON sits between these two JS fragments in the page.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        # NOTE(review): an `if not m:` guard appears elided here.
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD.
        video_url = video_data.get('hd_src')
        # NOTE(review): an `if not video_url:` guard appears elided here.
        video_url = video_data['sd_src']
        # NOTE(review): an `if not video_url:` guard appears elided here.
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        # NOTE(review): an `if not m:` guard appears elided here.
        raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        # NOTE(review): the `info = {` opener and the id/url entries appear elided.
        'title': video_title,
        'duration': video_duration,
        'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Invalid URL: %s' % url)

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page whose URL fragment carries the
            # file id; resolve it and re-run extraction on the canonical URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # NOTE(review): the cchar ('?' vs '&') selection appears elided here.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # Spoofed UA: the JSON skin endpoint expects an iTunes client.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        # NOTE(review): `info = None` and a `try:` opener appear elided here.
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # NOTE(review): the `info = {` opener and several entries appear elided.
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            # NOTE(review): a `try:` opener appears elided here.
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            # NOTE(review): a `try:` opener appears elided here.
            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']
            # NOTE(review): an `else:` branch appears elided here.

            # NOTE(review): looks like the site timestamp is '%m-%d-%y %H:%M%p'
            # — confirm; '%H' (24h) combined with '%p' is unusual.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # NOTE(review): an `if umobj is None:` guard appears elided here.
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # NOTE(review): the `info = {` opener appears elided before these entries.
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
        # NOTE(review): `return [info]` appears elided here.
2001 class MyVideoIE(InfoExtractor):
2002 """Information Extractor for myvideo.de."""
2004 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2005 IE_NAME = u'myvideo'
2007 def _real_extract(self,url):
# Fetch the watch page, read the media host from the image_src <link>,
# and build the .flv URL next to the thumbnails directory.
2008 mobj = re.match(self._VALID_URL, url)
2010 raise ExtractorError(u'Invalid URL: %s' % url)
2012 video_id = mobj.group(1)
2015 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2016 webpage = self._download_webpage(webpage_url, video_id)
2018 self.report_extraction(video_id)
# The captured group is the media base URL; the video file is '<base>/<id>.flv'.
2019 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2022 raise ExtractorError(u'Unable to extract media URL')
2023 video_url = mobj.group(1) + ('/%s.flv' % video_id)
# Title comes straight from the page's <title> element.
2025 mobj = re.search('<title>([^<]+)</title>', webpage)
2027 raise ExtractorError(u'Unable to extract title')
2029 video_title = mobj.group(1)
2035 'upload_date': None,
2036 'title': video_title,
2040 class ComedyCentralIE(InfoExtractor):
2041 """Information extractor for The Daily Show and Colbert Report """
2043 # urls can be abbreviations like :thedailyshow or :colbert
2044 # urls for episodes like:
2045 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2046 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2047 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2048 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2049 |(https?://)?(www\.)?
2050 (?P<showname>thedailyshow|colbertnation)\.com/
2051 (full-episodes/(?P<episode>.*)|
2053 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2054 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Known bitrates, lowest to highest; the last entry is the preferred default.
2057 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2059 _video_extensions = {
2067 _video_dimensions = {
2077 def suitable(cls, url):
2078 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is written with re.VERBOSE whitespace.
2079 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2081 def _print_formats(self, formats):
# List each format id with its extension and dimensions for --list-formats.
2082 print('Available formats:')
2084 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2087 def _real_extract(self, url):
# Resolve shortcuts (:tds / :colbert), follow redirects to a specific
# episode, locate the mtvnservices Flash URI, then walk the MRSS index to
# download per-part configuration and pick an RTMP rendition.
2088 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2090 raise ExtractorError(u'Invalid URL: %s' % url)
2092 if mobj.group('shortname'):
# Map the ':name' abbreviations onto the shows' full-episodes pages.
2093 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2094 url = u'http://www.thedailyshow.com/full-episodes/'
2096 url = u'http://www.colbertnation.com/full-episodes/'
2097 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2098 assert mobj is not None
2100 if mobj.group('clip'):
2101 if mobj.group('showname') == 'thedailyshow':
2102 epTitle = mobj.group('tdstitle')
2104 epTitle = mobj.group('cntitle')
2107 dlNewest = not mobj.group('episode')
2109 epTitle = mobj.group('showname')
2111 epTitle = mobj.group('episode')
2113 self.report_extraction(epTitle)
2114 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# The landing page may redirect to a concrete episode; re-match on the
# final URL so the 'episode' group reflects where we actually landed.
2116 url = htmlHandle.geturl()
2117 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2119 raise ExtractorError(u'Invalid redirected URL: ' + url)
2120 if mobj.group('episode') == '':
2121 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2122 epTitle = mobj.group('episode')
2124 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2126 if len(mMovieParams) == 0:
2127 # The Colbert Report embeds the information in a without
2128 # a URL prefix; so extract the alternate reference
2129 # and then add the URL prefix manually.
2131 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2132 if len(altMovieParams) == 0:
2133 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2135 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2137 uri = mMovieParams[0][1]
2138 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2139 indexXml = self._download_webpage(indexUrl, epTitle,
2140 u'Downloading show index',
2141 u'unable to download episode index')
2145 idoc = xml.etree.ElementTree.fromstring(indexXml)
# One <item> per part of the episode; each carries its own media id.
2146 itemEls = idoc.findall('.//item')
2147 for partNum,itemEl in enumerate(itemEls):
2148 mediaId = itemEl.findall('./guid')[0].text
2149 shortMediaId = mediaId.split(':')[-1]
2150 showId = mediaId.split(':')[-2].replace('.com', '')
2151 officialTitle = itemEl.findall('./title')[0].text
2152 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2154 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2155 compat_urllib_parse.urlencode({'uri': mediaId}))
2156 configXml = self._download_webpage(configUrl, epTitle,
2157 u'Downloading configuration for %s' % shortMediaId)
2159 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, rtmp-url) tuples from each <rendition>.
2161 for rendition in cdoc.findall('.//rendition'):
2162 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2166 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2169 if self._downloader.params.get('listformats', None):
2170 self._print_formats([i[0] for i in turls])
2173 # For now, just pick the highest bitrate
2174 format,rtmp_video_url = turls[-1]
2176 # Get the format arg from the arg stream
2177 req_format = self._downloader.params.get('format', None)
2179 # Select format if we can find one
2182 format, rtmp_video_url = f, v
# rtmpdump-hostile: rewrite the RTMP URL onto the known HTTP mirror
# (the 'gsp.comedystor/...' path is reused on the llnwd.net host).
2185 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2187 raise ExtractorError(u'Cannot transform RTMP url')
2188 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2189 video_url = base + m.group('finalid')
2191 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2196 'upload_date': officialDate,
2201 'description': officialTitle,
# Each part becomes one info dict; the method returns them all.
2203 results.append(info)
2208 class EscapistIE(InfoExtractor):
2209 """Information extractor for The Escapist """
2211 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2212 IE_NAME = u'escapist'
2214 def _real_extract(self, url):
# Scrape OpenGraph/meta tags for description, thumbnail and player URL,
# then pull the player's config (JS object) and read the media URL from it.
2215 mobj = re.match(self._VALID_URL, url)
2217 raise ExtractorError(u'Invalid URL: %s' % url)
2218 showName = mobj.group('showname')
2219 videoId = mobj.group('episode')
2221 self.report_extraction(showName)
2222 webPage = self._download_webpage(url, showName)
2224 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2225 description = unescapeHTML(descMatch.group(1))
2226 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2227 imgUrl = unescapeHTML(imgMatch.group(1))
2228 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2229 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The config URL is passed to the player as a percent-encoded query param.
2230 configUrlMatch = re.search('config=(.*)$', playerUrl)
2231 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2233 configJSON = self._download_webpage(configUrl, showName,
2234 u'Downloading configuration',
2235 u'unable to download configuration')
2237 # Technically, it's JavaScript, not JSON
2238 configJSON = configJSON.replace("'", '"')
2241 config = json.loads(configJSON)
2242 except (ValueError,) as err:
2243 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2245 playlist = config['playlist']
# Index 1 holds the actual video entry (index 0 is presumably an intro/ad —
# TODO confirm against a live config).
2246 videoUrl = playlist[1]['url']
2251 'uploader': showName,
2252 'upload_date': None,
2255 'thumbnail': imgUrl,
2256 'description': description,
2257 'player_url': playerUrl,
2262 class CollegeHumorIE(InfoExtractor):
2263 """Information extractor for collegehumor.com"""
2266 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2267 IE_NAME = u'collegehumor'
2269 def report_manifest(self, video_id):
"""Report information extraction."""
2270 """Report information extraction."""
2271 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2273 def _real_extract(self, url):
# Two-stage extraction: the moogaloop XML gives metadata plus an f4m
# manifest URL; the Adobe f4m manifest then yields the actual segment URL.
2274 mobj = re.match(self._VALID_URL, url)
2276 raise ExtractorError(u'Invalid URL: %s' % url)
2277 video_id = mobj.group('videoid')
2282 'upload_date': None,
2285 self.report_extraction(video_id)
2286 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2288 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2289 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2290 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2292 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2294 videoNode = mdoc.findall('./video')[0]
2295 info['description'] = videoNode.findall('./description')[0].text
2296 info['title'] = videoNode.findall('./caption')[0].text
2297 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2298 manifest_url = videoNode.findall('./file')[0].text
2300 raise ExtractorError(u'Invalid metadata XML file')
# hdcore parameter is required for the HDS manifest to be served.
2302 manifest_url += '?hdcore=2.10.3'
2303 self.report_manifest(video_id)
2305 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2306 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2307 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2309 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m elements live in the Adobe namespace; grab media url + id.
2311 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2312 node_id = media_node.attrib['url']
2313 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2314 except IndexError as err:
2315 raise ExtractorError(u'Invalid manifest file')
2317 url_pr = compat_urllib_parse_urlparse(manifest_url)
# Compose the HDS fragment URL ('/z<id>/<node>Seg1-Frag1') by hand.
2318 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2325 class XVideosIE(InfoExtractor):
2326 """Information extractor for xvideos.com"""
2328 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2329 IE_NAME = u'xvideos'
2331 def _real_extract(self, url):
# Scrape the page for the percent-encoded flv_url, the <title>, and the
# thumbnail URL; no API involved.
2332 mobj = re.match(self._VALID_URL, url)
2334 raise ExtractorError(u'Invalid URL: %s' % url)
2335 video_id = mobj.group(1)
2337 webpage = self._download_webpage(url, video_id)
2339 self.report_extraction(video_id)
2343 mobj = re.search(r'flv_url=(.+?)&', webpage)
2345 raise ExtractorError(u'Unable to extract video url')
# flv_url value is URL-encoded in the page source.
2346 video_url = compat_urllib_parse.unquote(mobj.group(1))
2350 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2352 raise ExtractorError(u'Unable to extract video title')
2353 video_title = mobj.group(1)
2356 # Extract video thumbnail
2357 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2359 raise ExtractorError(u'Unable to extract video thumbnail')
# group(0) = the whole matched thumbnail URL, not just the filename group.
2360 video_thumbnail = mobj.group(0)
2366 'upload_date': None,
2367 'title': video_title,
2369 'thumbnail': video_thumbnail,
2370 'description': None,
2376 class SoundcloudIE(InfoExtractor):
2377 """Information extractor for soundcloud.com
2378 To access the media, the uid of the song and a stream token
2379 must be extracted from the page source and the script must make
2380 a request to media.soundcloud.com/crossdomain.xml. Then
2381 the media can be grabbed by requesting from an url composed
2382 of the stream token and uid
2385 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2386 IE_NAME = u'soundcloud'
2388 def report_resolve(self, video_id):
2389 """Report information extraction."""
2390 self.to_screen(u'%s: Resolving id' % video_id)
2392 def _real_extract(self, url):
# Resolve '<uploader>/<slug>' to a track id via resolve.json, then fetch
# the per-track stream definitions and take the 128kbps MP3 stream.
2393 mobj = re.match(self._VALID_URL, url)
2395 raise ExtractorError(u'Invalid URL: %s' % url)
2397 # extract uploader (which is in the url)
2398 uploader = mobj.group(1)
2399 # extract simple title (uploader + slug of song title)
2400 slug_title = mobj.group(2)
2401 simple_title = uploader + u'-' + slug_title
2402 full_title = '%s/%s' % (uploader, slug_title)
2404 self.report_resolve(full_title)
2406 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
# client_id is a fixed API key embedded in the extractor.
2407 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2408 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2410 info = json.loads(info_json)
2411 video_id = info['id']
2412 self.report_extraction(full_title)
2414 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2415 stream_json = self._download_webpage(streams_url, full_title,
2416 u'Downloading stream definitions',
2417 u'unable to download stream definitions')
2419 streams = json.loads(stream_json)
2420 mediaURL = streams['http_mp3_128_url']
2421 upload_date = unified_strdate(info['created_at'])
2426 'uploader': info['user']['username'],
2427 'upload_date': upload_date,
2428 'title': info['title'],
2430 'description': info['description'],
2433 class SoundcloudSetIE(InfoExtractor):
2434 """Information extractor for soundcloud.com sets
2435 To access the media, the uid of the song and a stream token
2436 must be extracted from the page source and the script must make
2437 a request to media.soundcloud.com/crossdomain.xml. Then
2438 the media can be grabbed by requesting from an url composed
2439 of the stream token and uid
2442 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2443 IE_NAME = u'soundcloud:set'
2445 def report_resolve(self, video_id):
2446 """Report information extraction."""
2447 self.to_screen(u'%s: Resolving id' % video_id)
2449 def _real_extract(self, url):
# Like SoundcloudIE but for '/sets/' playlists: resolve the set once,
# then fetch stream definitions for every track in it.
2450 mobj = re.match(self._VALID_URL, url)
2452 raise ExtractorError(u'Invalid URL: %s' % url)
2454 # extract uploader (which is in the url)
2455 uploader = mobj.group(1)
2456 # extract simple title (uploader + slug of song title)
2457 slug_title = mobj.group(2)
2458 simple_title = uploader + u'-' + slug_title
2459 full_title = '%s/sets/%s' % (uploader, slug_title)
2461 self.report_resolve(full_title)
2463 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
# Same fixed client_id API key as the single-track extractor.
2464 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2465 info_json = self._download_webpage(resolv_url, full_title)
2468 info = json.loads(info_json)
# The resolver reports errors inline rather than via HTTP status.
2469 if 'errors' in info:
2470 for err in info['errors']:
2471 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2474 self.report_extraction(full_title)
2475 for track in info['tracks']:
2476 video_id = track['id']
2478 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2479 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2481 self.report_extraction(video_id)
2482 streams = json.loads(stream_json)
2483 mediaURL = streams['http_mp3_128_url']
2488 'uploader': track['user']['username'],
2489 'upload_date': unified_strdate(track['created_at']),
2490 'title': track['title'],
2492 'description': track['description'],
2497 class InfoQIE(InfoExtractor):
2498 """Information extractor for infoq.com"""
2499 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2501 def _real_extract(self, url):
# The media path is base64-encoded in the page's 'jsclassref' JS variable;
# decode it and prepend the site's RTMPE base to get the stream URL.
2502 mobj = re.match(self._VALID_URL, url)
2504 raise ExtractorError(u'Invalid URL: %s' % url)
2506 webpage = self._download_webpage(url, video_id=url)
2507 self.report_extraction(url)
2510 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2512 raise ExtractorError(u'Unable to extract video url')
2513 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2514 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2517 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2519 raise ExtractorError(u'Unable to extract video title')
2520 video_title = mobj.group(1)
2522 # Extract description
2523 video_description = u'No description available.'
2524 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2525 if mobj is not None:
2526 video_description = mobj.group(1)
# Derive id/extension from the decoded media filename.
2528 video_filename = video_url.split('/')[-1]
2529 video_id, extension = video_filename.split('.')
2535 'upload_date': None,
2536 'title': video_title,
2537 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2539 'description': video_description,
2544 class MixcloudIE(InfoExtractor):
2545 """Information extractor for www.mixcloud.com"""
2547 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2548 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2549 IE_NAME = u'mixcloud'
2551 def report_download_json(self, file_id):
2552 """Report JSON download."""
2553 self.to_screen(u'Downloading json')
2555 def get_urls(self, jsonData, fmt, bitrate='best'):
2556 """Get urls from 'audio_formats' section in json"""
# Formats may map bitrate->urls or be a flat url list; handle both shapes.
2559 bitrate_list = jsonData[fmt]
2560 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2561 bitrate = max(bitrate_list) # select highest
2563 url_list = jsonData[fmt][bitrate]
2564 except TypeError: # we have no bitrate info.
2565 url_list = jsonData[fmt]
2568 def check_urls(self, url_list):
2569 """Returns 1st active url from list"""
# Probe each candidate URL; the first that opens without error wins.
2570 for url in url_list:
2572 compat_urllib_request.urlopen(url)
2574 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2579 def _print_formats(self, formats):
# List formats for --list-formats; tolerate missing bitrate level.
2580 print('Available formats:')
2581 for fmt in formats.keys():
2582 for b in formats[fmt]:
2584 ext = formats[fmt][b][0]
2585 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2586 except TypeError: # we have no bitrate info
2587 ext = formats[fmt][0]
2588 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2591 def _real_extract(self, url):
# Query the cloudcast JSON API, then choose a format (user-requested or
# best) and the first reachable URL for it.
2592 mobj = re.match(self._VALID_URL, url)
2594 raise ExtractorError(u'Invalid URL: %s' % url)
2595 # extract uploader & filename from url
# NOTE(review): .decode on a str here is Python-2-only; breaks on Python 3.
2596 uploader = mobj.group(1).decode('utf-8')
2597 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2599 # construct API request
2600 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2601 # retrieve .json file with links to files
2602 request = compat_urllib_request.Request(file_url)
2604 self.report_download_json(file_url)
2605 jsonData = compat_urllib_request.urlopen(request).read()
2606 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2607 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2610 json_data = json.loads(jsonData)
2611 player_url = json_data['player_swf_url']
2612 formats = dict(json_data['audio_formats'])
2614 req_format = self._downloader.params.get('format', None)
2617 if self._downloader.params.get('listformats', None):
2618 self._print_formats(formats)
2621 if req_format is None or req_format == 'best':
# No explicit format: take the first format with a live URL.
2622 for format_param in formats.keys():
2623 url_list = self.get_urls(formats, format_param)
2625 file_url = self.check_urls(url_list)
2626 if file_url is not None:
2629 if req_format not in formats:
2630 raise ExtractorError(u'Format is not available')
2632 url_list = self.get_urls(formats, req_format)
2633 file_url = self.check_urls(url_list)
2634 format_param = req_format
2637 'id': file_id.decode('utf-8'),
2638 'url': file_url.decode('utf-8'),
2639 'uploader': uploader.decode('utf-8'),
2640 'upload_date': None,
2641 'title': json_data['name'],
2642 'ext': file_url.split('.')[-1].decode('utf-8'),
2643 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2644 'thumbnail': json_data['thumbnail_url'],
2645 'description': json_data['description'],
2646 'player_url': player_url.decode('utf-8'),
2649 class StanfordOpenClassroomIE(InfoExtractor):
2650 """Information extractor for Stanford's Open ClassRoom"""
2652 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2653 IE_NAME = u'stanfordoc'
2655 def _real_extract(self, url):
# Three-level extractor: a specific video (course+video), a course page
# (list of VideoPage links), or the site root (list of CoursePage links).
# The list cases recurse via self.extract on each discovered reference.
2656 mobj = re.match(self._VALID_URL, url)
2658 raise ExtractorError(u'Invalid URL: %s' % url)
2660 if mobj.group('course') and mobj.group('video'): # A specific video
2661 course = mobj.group('course')
2662 video = mobj.group('video')
2664 'id': course + '_' + video,
2666 'upload_date': None,
2669 self.report_extraction(info['id'])
2670 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2671 xmlUrl = baseUrl + video + '.xml'
2673 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2674 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2675 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2676 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2678 info['title'] = mdoc.findall('./title')[0].text
2679 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2681 raise ExtractorError(u'Invalid metadata XML file')
2682 info['ext'] = info['url'].rpartition('.')[2]
2684 elif mobj.group('course'): # A course page
2685 course = mobj.group('course')
2690 'upload_date': None,
2693 coursepage = self._download_webpage(url, info['id'],
2694 note='Downloading course info page',
2695 errnote='Unable to download course info page')
# Course title falls back to the id when no <h1> is present.
2697 m = re.search('<h1>([^<]+)</h1>', coursepage)
2699 info['title'] = unescapeHTML(m.group(1))
2701 info['title'] = info['id']
2703 m = re.search('<description>([^<]+)</description>', coursepage)
2705 info['description'] = unescapeHTML(m.group(1))
# Collect unique VideoPage links in page order.
2707 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2710 'type': 'reference',
2711 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2715 for entry in info['list']:
2716 assert entry['type'] == 'reference'
2717 results += self.extract(entry['url'])
2721 'id': 'Stanford OpenClassroom',
2724 'upload_date': None,
2727 self.report_download_webpage(info['id'])
2728 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2730 rootpage = compat_urllib_request.urlopen(rootURL).read()
2731 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2732 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2734 info['title'] = info['id']
# Root page: recurse into every CoursePage link.
2736 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2739 'type': 'reference',
2740 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2745 for entry in info['list']:
2746 assert entry['type'] == 'reference'
2747 results += self.extract(entry['url'])
2750 class MTVIE(InfoExtractor):
2751 """Information extractor for MTV.com"""
2753 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2756 def _real_extract(self, url):
# Read song/performer/uri/content-id from the page's mtv_* meta tags,
# then fetch the mediaGen XML and pick the last (highest-quality) rendition.
2757 mobj = re.match(self._VALID_URL, url)
2759 raise ExtractorError(u'Invalid URL: %s' % url)
2760 if not mobj.group('proto'):
# Scheme-less input: normalize so urlopen accepts it.
2761 url = 'http://' + url
2762 video_id = mobj.group('videoid')
2764 webpage = self._download_webpage(url, video_id)
2766 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2768 raise ExtractorError(u'Unable to extract song name')
# NOTE(review): .decode on a str is Python-2-only; breaks on Python 3.
2769 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2770 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2772 raise ExtractorError(u'Unable to extract performer')
2773 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2774 video_title = performer + ' - ' + song_name
2776 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2778 raise ExtractorError(u'Unable to mtvn_uri')
2779 mtvn_uri = mobj.group(1)
2781 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2783 raise ExtractorError(u'Unable to extract content id')
2784 content_id = mobj.group(1)
2786 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2787 self.report_extraction(video_id)
2788 request = compat_urllib_request.Request(videogen_url)
2790 metadataXml = compat_urllib_request.urlopen(request).read()
2791 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2792 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2794 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2795 renditions = mdoc.findall('.//rendition')
2797 # For now, always pick the highest quality.
2798 rendition = renditions[-1]
# Format label: '<ext>-<width>x<height>_<bitrate>' from rendition attrs.
2801 _,_,ext = rendition.attrib['type'].partition('/')
2802 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2803 video_url = rendition.find('./src').text
2805 raise ExtractorError('Invalid rendition field.')
2810 'uploader': performer,
2811 'upload_date': None,
2812 'title': video_title,
2820 class YoukuIE(InfoExtractor):
2821 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp plus two bounded random components.
2824 nowTime = int(time.time() * 1000)
2825 random1 = random.randint(1000,1998)
2826 random2 = random.randint(1000,9999)
2828 return "%d%d%d" %(nowTime,random1,random2)
2830 def _get_file_ID_mix_string(self, seed):
# Deterministic shuffle of the alphabet driven by the server-supplied seed
# (linear-congruential step: seed = (seed*211 + 30031) % 65536).
2832 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2834 for i in range(len(source)):
2835 seed = (seed * 211 + 30031 ) % 65536
2836 index = math.floor(seed / 65536 * len(source) )
2837 mixed.append(source[int(index)])
2838 source.remove(source[int(index)])
2839 #return ''.join(mixed)
2842 def _get_file_id(self, fileId, seed):
# Decode the '*'-separated index list through the seeded mix string.
2843 mixed = self._get_file_ID_mix_string(seed)
2844 ids = fileId.split('*')
2848 realId.append(mixed[int(ch)])
2849 return ''.join(realId)
2851 def _real_extract(self, url):
# Fetch the playlist JSON, decode the obfuscated file id, then build one
# download URL per segment (segment number is hex-encoded into the id).
2852 mobj = re.match(self._VALID_URL, url)
2854 raise ExtractorError(u'Invalid URL: %s' % url)
2855 video_id = mobj.group('ID')
2857 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2859 jsondata = self._download_webpage(info_url, video_id)
2861 self.report_extraction(video_id)
2863 config = json.loads(jsondata)
2865 video_title = config['data'][0]['title']
2866 seed = config['data'][0]['seed']
2868 format = self._downloader.params.get('format', None)
2869 supported_format = list(config['data'][0]['streamfileids'].keys())
2871 if format is None or format == 'best':
2872 if 'hd2' in supported_format:
2877 elif format == 'worst':
2885 fileid = config['data'][0]['streamfileids'][format]
# One key per segment; each key authorizes that segment's download URL.
2886 keys = [s['k'] for s in config['data'][0]['segs'][format]]
2887 except (UnicodeDecodeError, ValueError, KeyError):
2888 raise ExtractorError(u'Unable to extract info section')
2891 sid = self._gen_sid()
2892 fileid = self._get_file_id(fileid, seed)
2894 #column 8,9 of fileid represent the segment number
2895 #fileid[7:9] should be changed
2896 for index, key in enumerate(keys):
2898 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
2899 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
2902 'id': '%s_part%02d' % (video_id, index),
2903 'url': download_url,
2905 'upload_date': None,
2906 'title': video_title,
2909 files_info.append(info)
2914 class XNXXIE(InfoExtractor):
2915 """Information extractor for xnxx.com"""
2917 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns for the media URL, title, and thumbnail.
2919 VIDEO_URL_RE = r'flv_url=(.*?)&'
2920 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
2921 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
2923 def _real_extract(self, url):
# Pure scraping: apply the three class-level regexes to the watch page.
2924 mobj = re.match(self._VALID_URL, url)
2926 raise ExtractorError(u'Invalid URL: %s' % url)
2927 video_id = mobj.group(1)
2929 # Get webpage content
2930 webpage = self._download_webpage(url, video_id)
2932 result = re.search(self.VIDEO_URL_RE, webpage)
2934 raise ExtractorError(u'Unable to extract video url')
# flv_url is percent-encoded in the page source.
2935 video_url = compat_urllib_parse.unquote(result.group(1))
2937 result = re.search(self.VIDEO_TITLE_RE, webpage)
2939 raise ExtractorError(u'Unable to extract video title')
2940 video_title = result.group(1)
2942 result = re.search(self.VIDEO_THUMB_RE, webpage)
2944 raise ExtractorError(u'Unable to extract video thumbnail')
2945 video_thumbnail = result.group(1)
2951 'upload_date': None,
2952 'title': video_title,
2954 'thumbnail': video_thumbnail,
2955 'description': None,
2959 class GooglePlusIE(InfoExtractor):
2960 """Information extractor for plus.google.com."""
2962 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
2963 IE_NAME = u'plus.google'
2965 def report_extract_entry(self, url):
2966 """Report downloading extry"""
2967 self.to_screen(u'Downloading entry: %s' % url)
2969 def report_date(self, upload_date):
2970 """Report downloading extry"""
2971 self.to_screen(u'Entry date: %s' % upload_date)
2973 def report_uploader(self, uploader):
2974 """Report downloading extry"""
2975 self.to_screen(u'Uploader: %s' % uploader)
2977 def report_title(self, video_title):
2978 """Report downloading extry"""
2979 self.to_screen(u'Title: %s' % video_title)
2981 def report_extract_vid_page(self, video_page):
2982 """Report information extraction."""
2983 self.to_screen(u'Extracting video page: %s' % video_page)
2985 def _real_extract(self, url):
# Two-step scrape: the post page yields date/uploader/title plus the photo
# viewer URL; the viewer page lists redirector.googlevideo.com links per
# resolution, of which the highest is chosen.
2986 # Extract id from URL
2987 mobj = re.match(self._VALID_URL, url)
2989 raise ExtractorError(u'Invalid URL: %s' % url)
2991 post_url = mobj.group(0)
2992 video_id = mobj.group(1)
2994 video_extension = 'flv'
2996 # Step 1, Retrieve post webpage to extract further information
2997 self.report_extract_entry(post_url)
2998 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3000 # Extract update date
3002 pattern = 'title="Timestamp">(.*?)</a>'
3003 mobj = re.search(pattern, webpage)
3005 upload_date = mobj.group(1)
3006 # Convert timestring to a format suitable for filename
3007 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3008 upload_date = upload_date.strftime('%Y%m%d')
3009 self.report_date(upload_date)
3013 pattern = r'rel\="author".*?>(.*?)</a>'
3014 mobj = re.search(pattern, webpage)
3016 uploader = mobj.group(1)
3017 self.report_uploader(uploader)
3020 # Get the first line for title
3022 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3023 mobj = re.search(pattern, webpage)
3025 video_title = mobj.group(1)
3026 self.report_title(video_title)
3028 # Step 2, Stimulate clicking the image box to launch video
3029 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3030 mobj = re.search(pattern, webpage)
3032 raise ExtractorError(u'Unable to extract video page URL')
3034 video_page = mobj.group(1)
3035 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3036 self.report_extract_vid_page(video_page)
3039 # Extract video links on video page
3040 """Extract video links of all sizes"""
3041 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3042 mobj = re.findall(pattern, webpage)
3044 raise ExtractorError(u'Unable to extract video links')
3046 # Sort in resolution
3047 links = sorted(mobj)
3049 # Choose the lowest of the sort, i.e. highest resolution
3050 video_url = links[-1]
3051 # Only get the url. The resolution part in the tuple has no use anymore
3052 video_url = video_url[-1]
3053 # Treat escaped \u0026 style hex
# Python-2 str path uses unicode_escape directly; Python 3 round-trips
# through ASCII bytes to apply the same escape decoding.
3055 video_url = video_url.decode("unicode_escape")
3056 except AttributeError: # Python 3
3057 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3063 'uploader': uploader,
3064 'upload_date': upload_date,
3065 'title': video_title,
3066 'ext': video_extension,
# Information extractor for nba.com videos.
# NOTE(review): fragmentary listing — the `if mobj is None:` guard, the
# `_findProp` None-return branch, and parts of the returned info dict
# ('url', 'ext', 'title') are elided (embedded numbering jumps).
3069 class NBAIE(InfoExtractor):
3070 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3073 def _real_extract(self, url):
3074 mobj = re.match(self._VALID_URL, url)
3076 raise ExtractorError(u'Invalid URL: %s' % url)
3078 video_id = mobj.group(1)
3079 if video_id.endswith('/index.html'):
# Strip the trailing '/index.html' so the id maps onto the CDN path.
3080 video_id = video_id[:-len('/index.html')]
3082 webpage = self._download_webpage(url, video_id)
# Direct MP4 URL is built from the path id; no per-video lookup needed.
3084 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Helper: first regex capture from the page, HTML-unescaped, else default.
3085 def _findProp(rexp, default=None):
3086 m = re.search(rexp, webpage)
3088 return unescapeHTML(m.group(1))
3092 shortened_video_id = video_id.rpartition('/')[2]
3093 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3095 'id': shortened_video_id,
3099 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3100 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# NOTE(review): fragmentary listing — guards, the per-clip info dict opener,
# the paging loop header, and several returns are elided (numbering jumps).
3104 class JustinTVIE(InfoExtractor):
3105 """Information extractor for justin.tv and twitch.tv"""
3106 # TODO: One broadcast may be split into multiple videos. The key
3107 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3108 # starts at 1 and increases. Can we treat all parts as one video?
3110 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3112 (?P<channelid>[^/]+)|
3113 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3114 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
# Page size used when walking a channel's archive via the API.
3118 _JUSTIN_PAGE_LIMIT = 100
3119 IE_NAME = u'justin.tv'
3121 def report_download_page(self, channel, offset):
3122 """Report attempt to download a single page of videos."""
3123 self.to_screen(u'%s: Downloading video information from %d to %d' %
3124 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3126 # Return count of items, list of *valid* items
3127 def _parse_page(self, url, video_id):
3128 webpage = self._download_webpage(url, video_id,
3129 u'Downloading video info JSON',
3130 u'unable to download video info JSON')
3132 response = json.loads(webpage)
# A non-list response is the API's error envelope (a dict with 'error').
3133 if type(response) != list:
3134 error_text = response.get('error', 'unknown error')
3135 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3137 for clip in response:
3138 video_url = clip['video_file_url']
3140 video_extension = os.path.splitext(video_url)[1][1:]
# 'start_time' begins with YYYY-MM-DD; strip dashes to get YYYYMMDD.
3141 video_date = re.sub('-', '', clip['start_time'][:10])
3142 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3143 video_id = clip['id']
3144 video_title = clip.get('title', video_id)
3148 'title': video_title,
3149 'uploader': clip.get('channel_name', video_uploader_id),
3150 'uploader_id': video_uploader_id,
3151 'upload_date': video_date,
3152 'ext': video_extension,
# Count of raw items plus the accumulated (elided) info list.
3154 return (len(response), info)
3156 def _real_extract(self, url):
3157 mobj = re.match(self._VALID_URL, url)
3159 raise ExtractorError(u'invalid URL: %s' % url)
3161 api_base = 'http://api.justin.tv'
# Three URL shapes: whole channel, single broadcast (/b/), chapter (/c/).
3163 if mobj.group('channelid'):
3165 video_id = mobj.group('channelid')
3166 api = api_base + '/channel/archives/%s.json' % video_id
3167 elif mobj.group('chapterid'):
3168 chapter_id = mobj.group('chapterid')
3170 webpage = self._download_webpage(url, chapter_id)
3171 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3173 raise ExtractorError(u'Cannot find archive of a chapter')
3174 archive_id = m.group(1)
3176 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3177 chapter_info_xml = self._download_webpage(api, chapter_id,
3178 note=u'Downloading chapter information',
3179 errnote=u'Chapter information download failed')
3180 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
# Locate the <archive> element matching the page's archive id; the loop
# body (presumably a break) is elided here.
3181 for a in doc.findall('.//archive'):
3182 if archive_id == a.find('./id').text:
3185 raise ExtractorError(u'Could not find chapter in chapter information')
3187 video_url = a.find('./video_file_url').text
3188 video_ext = video_url.rpartition('.')[2] or u'flv'
3190 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3191 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3192 note='Downloading chapter metadata',
3193 errnote='Download of chapter metadata failed')
3194 chapter_info = json.loads(chapter_info_json)
3196 bracket_start = int(doc.find('.//bracket_start').text)
3197 bracket_end = int(doc.find('.//bracket_end').text)
3199 # TODO determine start (and probably fix up file)
3200 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3201 #video_url += u'?start=' + TODO:start_timestamp
3202 # bracket_start is 13290, but we want 51670615
3203 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3204 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3207 'id': u'c' + chapter_id,
3210 'title': chapter_info['title'],
3211 'thumbnail': chapter_info['preview'],
3212 'description': chapter_info['description'],
3213 'uploader': chapter_info['channel']['display_name'],
3214 'uploader_id': chapter_info['channel']['name'],
3218 video_id = mobj.group('videoid')
3219 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3221 self.report_extraction(video_id)
3225 limit = self._JUSTIN_PAGE_LIMIT
# Paged fetch loop (header elided): stop once a short page comes back.
3228 self.report_download_page(video_id, offset)
3229 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3230 page_count, page_info = self._parse_page(page_url, video_id)
3231 info.extend(page_info)
3232 if not paged or page_count != limit:
# Information extractor for funnyordie.com videos.
# NOTE(review): fragmentary listing — the `if m is None:` guards, the
# description fallback branch, and the returned info dict are elided.
3237 class FunnyOrDieIE(InfoExtractor):
3238 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3240 def _real_extract(self, url):
3241 mobj = re.match(self._VALID_URL, url)
3243 raise ExtractorError(u'invalid URL: %s' % url)
3245 video_id = mobj.group('id')
3246 webpage = self._download_webpage(url, video_id)
# The second <source> inside <video> carries the usable media URL.
3248 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3250 raise ExtractorError(u'Unable to find video information')
3251 video_url = unescapeHTML(m.group('url'))
# Title: prefer the player page h1, fall back to the <title> tag.
3253 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3255 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3257 raise ExtractorError(u'Cannot find video title')
3258 title = clean_html(m.group('title'))
3260 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3262 desc = unescapeHTML(m.group('desc'))
3271 'description': desc,
# Information extractor for store.steampowered.com game trailer pages.
# NOTE(review): fragmentary listing — parts of the verbose _VALID_URL
# (including the 'gameID' group used below), the per-video info dict
# opener, and the videos-list accumulation are elided.
3275 class SteamIE(InfoExtractor):
3276 _VALID_URL = r"""http://store\.steampowered\.com/
3278 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3280 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# Override needed because _VALID_URL is written with (?x)-style comments
# and must be matched with re.VERBOSE.
3284 def suitable(cls, url):
3285 """Receives a URL and returns True if suitable for this IE."""
3286 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3288 def _real_extract(self, url):
3289 m = re.match(self._VALID_URL, url, re.VERBOSE)
3290 gameID = m.group('gameID')
# Agecheck URL with a fixed fake birthdate bypasses the age gate.
3291 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3292 self.report_age_confirmation()
3293 webpage = self._download_webpage(videourl, gameID)
3294 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3296 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3297 mweb = re.finditer(urlRE, webpage)
3298 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3299 titles = re.finditer(namesRE, webpage)
3300 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3301 thumbs = re.finditer(thumbsRE, webpage)
# Movie entries, titles and thumbnails are three parallel scans zipped
# together; they are assumed to appear in the same page order.
3303 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3304 video_id = vid.group('videoID')
3305 title = vtitle.group('videoName')
3306 video_url = vid.group('videoURL')
3307 video_thumb = thumb.group('thumbnail')
3309 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3314 'title': unescapeHTML(title),
3315 'thumbnail': video_thumb
3318 return [self.playlist_result(videos, gameID, game_title)]
# Information extractor for ustream.tv recorded videos.
# NOTE(review): fragmentary listing — the returned info dict opener and
# its 'id'/'url'/'ext'/'title' entries are elided (numbering jumps 3332->3338).
3320 class UstreamIE(InfoExtractor):
3321 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3322 IE_NAME = u'ustream'
3324 def _real_extract(self, url):
3325 m = re.match(self._VALID_URL, url)
3326 video_id = m.group('videoID')
# Media URL is derived directly from the numeric id on the CDN.
3327 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3328 webpage = self._download_webpage(url, video_id)
3329 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3330 title = m.group('title')
3331 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3332 uploader = m.group('uploader')
3338 'uploader': uploader
# NOTE(review): fragmentary listing — the mp4/flv extension selection,
# the `else` of the url check, several `if ... is None:` guards, and the
# returned results list are elided.
3342 class WorldStarHipHopIE(InfoExtractor):
3343 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3344 IE_NAME = u'WorldStarHipHop'
3346 def _real_extract(self, url):
# Flash player embeds the media URL via so.addVariable("file", ...).
3347 _src_url = r'so\.addVariable\("file","(.*?)"\)'
3349 m = re.match(self._VALID_URL, url)
3350 video_id = m.group('id')
3352 webpage_src = self._download_webpage(url, video_id)
3354 mobj = re.search(_src_url, webpage_src)
3356 if mobj is not None:
3357 video_url = mobj.group(1)
# Extension branch (mp4 vs. other) — body elided in this listing.
3358 if 'mp4' in video_url:
3363 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3365 mobj = re.search(r"<title>(.*)</title>", webpage_src)
3368 raise ExtractorError(u'Cannot determine title')
3369 title = mobj.group(1)
3371 mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
3372 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3373 if mobj is not None:
3374 thumbnail = mobj.group(1)
# Candy sites lack image_src; re-derive the title from candytitles markup.
3376 _title = r"""candytitles.*>(.*)</span>"""
3377 mobj = re.search(_title, webpage_src)
3378 if mobj is not None:
3379 title = mobj.group(1)
3386 'thumbnail' : thumbnail,
# Information extractor for rbmaradio.com shows.
# NOTE(review): fragmentary listing — the `if m is None:` guard, the `try:`
# opener before json.loads, and the info dict opener/'url'/'ext' lines are
# elided.
3391 class RBMARadioIE(InfoExtractor):
3392 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3394 def _real_extract(self, url):
3395 m = re.match(self._VALID_URL, url)
3396 video_id = m.group('videoID')
3398 webpage = self._download_webpage(url, video_id)
# Show metadata is embedded as a JSON blob assigned to gon.show.
3399 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3401 raise ExtractorError(u'Cannot find metadata')
3402 json_data = m.group(1)
3405 data = json.loads(json_data)
3406 except ValueError as e:
3407 raise ExtractorError(u'Invalid JSON: ' + str(e))
# '&cbr=256' requests the 256 kbit/s stream from the Akamai URL.
3409 video_url = data['akamai_url'] + '&cbr=256'
3410 url_parts = compat_urllib_parse_urlparse(video_url)
3411 video_ext = url_parts.path.rpartition('.')[2]
3416 'title': data['title'],
3417 'description': data.get('teaser_text'),
3418 'location': data.get('country_of_origin'),
3419 'uploader': data.get('host', {}).get('name'),
3420 'uploader_id': data.get('host', {}).get('slug'),
3421 'thumbnail': data.get('image', {}).get('large_url_2x'),
3422 'duration': data.get('duration'),
# NOTE(review): fragmentary listing — the `for x in formats` loop of
# _specific, several `if result is None:` guards, the per-link loop header,
# the size/bitrate unpacking, and the format-selection returns are elided.
3427 class YouPornIE(InfoExtractor):
3428 """Information extractor for youporn.com."""
3429 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3431 def _print_formats(self, formats):
3432 """Print all available formats"""
3433 print(u'Available formats:')
3434 print(u'ext\t\tformat')
3435 print(u'---------------------------------')
3436 for format in formats:
3437 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Select the entry whose 'format' equals req_format (loop header elided).
3439 def _specific(self, req_format, formats):
3441 if(x["format"]==req_format):
3445 def _real_extract(self, url):
3446 mobj = re.match(self._VALID_URL, url)
3448 raise ExtractorError(u'Invalid URL: %s' % url)
3450 video_id = mobj.group('videoid')
# The age_verified cookie bypasses the age-confirmation interstitial.
3452 req = compat_urllib_request.Request(url)
3453 req.add_header('Cookie', 'age_verified=1')
3454 webpage = self._download_webpage(req, video_id)
3456 # Get the video title
3457 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3459 raise ExtractorError(u'Unable to extract video title')
3460 video_title = result.group('title').strip()
3462 # Get the video date
3463 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
# Date is optional: warn and continue rather than abort.
3465 self._downloader.report_warning(u'unable to extract video date')
3468 upload_date = unified_strdate(result.group('date').strip())
3470 # Get the video uploader
3471 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3473 self._downloader.report_warning(u'unable to extract uploader')
3474 video_uploader = None
3476 video_uploader = result.group('uploader').strip()
3477 video_uploader = clean_html( video_uploader )
3479 # Get all of the formats available
3480 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3481 result = re.search(DOWNLOAD_LIST_RE, webpage)
3483 raise ExtractorError(u'Unable to extract download list')
3484 download_list_html = result.group('download_list').strip()
3486 # Get all of the links from the page
3487 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3488 links = re.findall(LINK_RE, download_list_html)
3489 if(len(links) == 0):
3490 raise ExtractorError(u'ERROR: no known formats available for video')
3492 self.to_screen(u'Links found: %d' % len(links))
3497 # A link looks like this:
3498 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3499 # A path looks like this:
3500 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Per-link body (loop header elided): derive extension and format
# descriptor from the URL path components.
3501 video_url = unescapeHTML( link )
3502 path = compat_urllib_parse_urlparse( video_url ).path
3503 extension = os.path.splitext( path )[1][1:]
# Path segment 4 looks like '480p_370k_<id>'; first two parts are
# resolution and bitrate.
3504 format = path.split('/')[4].split('_')[:2]
3507 format = "-".join( format )
3508 title = u'%s-%s-%s' % (video_title, size, bitrate)
3513 'uploader': video_uploader,
3514 'upload_date': upload_date,
3519 'description': None,
3523 if self._downloader.params.get('listformats', None):
3524 self._print_formats(formats)
3527 req_format = self._downloader.params.get('format', None)
3528 self.to_screen(u'Format: %s' % req_format)
# Format selection: best (default), worst, all, or a specific format id.
3530 if req_format is None or req_format == 'best':
3532 elif req_format == 'worst':
3533 return [formats[-1]]
3534 elif req_format in ('-1', 'all'):
3537 format = self._specific( req_format, formats )
3539 raise ExtractorError(u'Requested format not available')
# NOTE(review): fragmentary listing — `if result is None:` guards, the
# 'url'/'ext' entries of the info dict, and the final return are elided.
3545 class PornotubeIE(InfoExtractor):
3546 """Information extractor for pornotube.com."""
3547 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3548 def _real_extract(self, url):
3549 mobj = re.match(self._VALID_URL, url)
3551 raise ExtractorError(u'Invalid URL: %s' % url)
3553 video_id = mobj.group('videoid')
# Title is taken straight from the URL slug, not from the page.
3554 video_title = mobj.group('title')
3556 # Get webpage content
3557 webpage = self._download_webpage(url, video_id)
3560 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3561 result = re.search(VIDEO_URL_RE, webpage)
3563 raise ExtractorError(u'Unable to extract video url')
3564 video_url = compat_urllib_parse.unquote(result.group('url'))
3566 #Get the uploaded date
3567 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3568 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): error text says 'title' but this guard is for the date.
3570 raise ExtractorError(u'Unable to extract video title')
3571 upload_date = unified_strdate(result.group('date'))
3573 info = {'id': video_id,
3576 'upload_date': upload_date,
3577 'title': video_title,
# NOTE(review): fragmentary listing — `if result is None:` guards and the
# info dict's 'url'/'ext' lines are elided; comments describe visible code.
3584 class YouJizzIE(InfoExtractor):
3585 """Information extractor for youjizz.com."""
3586 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3587 def _real_extract(self, url):
3588 mobj = re.match(self._VALID_URL, url)
3590 raise ExtractorError(u'Invalid URL: %s' % url)
3592 video_id = mobj.group('videoid')
3594 # Get webpage content
3595 webpage = self._download_webpage(url, video_id)
3597 # Get the video title
3598 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3600 raise ExtractorError(u'ERROR: unable to extract video title')
3601 video_title = result.group('title').strip()
3603 # Get the embed page
# The watch page only links an embed page; the media URL lives there.
3604 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3606 raise ExtractorError(u'ERROR: unable to extract embed page')
3608 embed_page_url = result.group(0).strip()
# video_id is rebound to the numeric embed id from this point on.
3609 video_id = result.group('videoid')
3611 webpage = self._download_webpage(embed_page_url, video_id)
3614 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3616 raise ExtractorError(u'ERROR: unable to extract video url')
3617 video_url = result.group('source')
3619 info = {'id': video_id,
3621 'title': video_title,
3624 'player_url': embed_page_url}
# Information extractor for 8tracks.com mixes (playlists of tracks).
# NOTE(review): fragmentary listing — the mix_id assignment, the info-dict
# opener inside the loop, the res.append/return, and the loop exit are
# elided (numbering jumps).
3628 class EightTracksIE(InfoExtractor):
3630 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3632 def _real_extract(self, url):
3633 mobj = re.match(self._VALID_URL, url)
3635 raise ExtractorError(u'Invalid URL: %s' % url)
3636 playlist_id = mobj.group('id')
3638 webpage = self._download_webpage(url, playlist_id)
# Mix metadata is embedded as a JS assignment to PAGE.mix.
3640 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3642 raise ExtractorError(u'Cannot find trax information')
3643 json_like = m.group(1)
3644 data = json.loads(json_like)
# A random session token is required by the play API.
3646 session = str(random.randint(0, 1000000000))
3648 track_count = data['tracks_count']
3649 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3650 next_url = first_url
# Walk the play API one track at a time until at_last_track is set.
3652 for i in itertools.count():
3653 api_json = self._download_webpage(next_url, playlist_id,
3654 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3655 errnote=u'Failed to download song information')
3656 api_data = json.loads(api_json)
3657 track_data = api_data[u'set']['track']
3659 'id': track_data['id'],
3660 'url': track_data['track_file_stream_url'],
3661 'title': track_data['performer'] + u' - ' + track_data['name'],
3662 'raw_title': track_data['name'],
3663 'uploader_id': data['user']['login'],
3667 if api_data['set']['at_last_track']:
3669 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Information extractor for keek.com clips.
# NOTE(review): fragmentary listing — IE_NAME, the info dict opener and its
# 'id'/'url'/'ext'/'title' lines, and the return are elided.
3672 class KeekIE(InfoExtractor):
3673 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3676 def _real_extract(self, url):
3677 m = re.match(self._VALID_URL, url)
3678 video_id = m.group('videoID')
# Media and thumbnail URLs are derived directly from the id on the CDN.
3679 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3680 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3681 webpage = self._download_webpage(url, video_id)
3682 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3683 title = unescapeHTML(m.group('title'))
3684 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3685 uploader = clean_html(m.group('uploader'))
3691 'thumbnail': thumbnail,
3692 'uploader': uploader
# Information extractor for ted.com talks and playlists.
# NOTE(review): fragmentary listing — parts of the verbose _VALID_URL
# (including a 'gameID'-style alternation and closing), the video_RE
# raw-string opener, and the returned talk info dict are elided.
3696 class TEDIE(InfoExtractor):
3697 _VALID_URL=r'''http://www\.ted\.com/
3699 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3701 ((?P<type_talk>talks)) # We have a simple talk
3703 (/lang/(.*?))? # The url may contain the language
3704 /(?P<name>\w+) # Here goes the name and then ".html"
# Override needed because _VALID_URL uses verbose-mode comments.
3708 def suitable(cls, url):
3709 """Receives a URL and returns True if suitable for this IE."""
3710 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3712 def _real_extract(self, url):
3713 m=re.match(self._VALID_URL, url, re.VERBOSE)
# Dispatch: single talk vs. playlist of talks.
3714 if m.group('type_talk'):
3715 return [self._talk_info(url)]
3717 playlist_id=m.group('playlist_id')
3718 name=m.group('name')
3719 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3720 return [self._playlist_videos_info(url,name,playlist_id)]
3722 def _talk_video_link(self,mediaSlug):
3723 '''Returns the video link for that mediaSlug'''
3724 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3726 def _playlist_videos_info(self,url,name,playlist_id=0):
3727 '''Returns the videos of the playlist'''
3729 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3730 ([.\s]*?)data-playlist_item_id="(\d+)"
3731 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3733 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3734 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3735 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3736 m_names=re.finditer(video_name_RE,webpage)
3738 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3739 m_playlist = re.search(playlist_RE, webpage)
3740 playlist_title = m_playlist.group('playlist_title')
3742 playlist_entries = []
# Each playlist entry is delegated back to this extractor via url_result.
3743 for m_video, m_name in zip(m_videos,m_names):
3744 video_id=m_video.group('video_id')
3745 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3746 playlist_entries.append(self.url_result(talk_url, 'TED'))
3747 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3749 def _talk_info(self, url, video_id=0):
3750 """Return the video for the talk in the url"""
3751 m=re.match(self._VALID_URL, url,re.VERBOSE)
3752 videoName=m.group('name')
3753 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3754 # If the url includes the language we get the title translated
3755 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3756 title=re.search(title_RE, webpage).group('title')
# talkDetails JS blob carries the numeric id and the mediaSlug used to
# build the download URL.
3757 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3758 "id":(?P<videoID>[\d]+).*?
3759 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3760 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3761 thumb_match=re.search(thumb_RE,webpage)
3762 info_match=re.search(info_RE,webpage,re.VERBOSE)
3763 video_id=info_match.group('videoID')
3764 mediaSlug=info_match.group('mediaSlug')
3765 video_url=self._talk_video_link(mediaSlug)
3771 'thumbnail': thumb_match.group('thumbnail')
# Information extractor for myspass.de, driven by the site's XML metadata
# endpoint.
# NOTE(review): fragmentary listing — the empty-video_id fallback branch,
# the format/description/thumbnail default branches, and the info dict
# opener are elided.
3775 class MySpassIE(InfoExtractor):
3776 _VALID_URL = r'http://www.myspass.de/.*'
3778 def _real_extract(self, url):
3779 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3781 # video id is the last path element of the URL
3782 # usually there is a trailing slash, so also try the second but last
3783 url_path = compat_urllib_parse_urlparse(url).path
3784 url_parent_path, video_id = os.path.split(url_path)
3786 _, video_id = os.path.split(url_parent_path)
3789 metadata_url = META_DATA_URL_TEMPLATE % video_id
3790 metadata_text = self._download_webpage(metadata_url, video_id)
# Encode back to bytes before XML parsing to avoid unicode issues.
3791 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3793 # extract values from metadata
3794 url_flv_el = metadata.find('url_flv')
3795 if url_flv_el is None:
3796 raise ExtractorError(u'Unable to extract download url')
3797 video_url = url_flv_el.text
3798 extension = os.path.splitext(video_url)[1][1:]
3799 title_el = metadata.find('title')
3800 if title_el is None:
3801 raise ExtractorError(u'Unable to extract title')
3802 title = title_el.text
# Remaining fields are optional: missing elements fall back to defaults
# (fallback branches elided in this listing).
3803 format_id_el = metadata.find('format_id')
3804 if format_id_el is None:
3807 format = format_id_el.text
3808 description_el = metadata.find('description')
3809 if description_el is not None:
3810 description = description_el.text
3813 imagePreview_el = metadata.find('imagePreview')
3814 if imagePreview_el is not None:
3815 thumbnail = imagePreview_el.text
3824 'thumbnail': thumbnail,
3825 'description': description
# Information extractor for spiegel.de videos, using the site's per-video
# XML format listing.
# NOTE(review): fragmentary listing — the `if m is None:` guard and the
# returned info dict opener/'url'/'ext' lines are elided.
3829 class SpiegelIE(InfoExtractor):
3830 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3832 def _real_extract(self, url):
3833 m = re.match(self._VALID_URL, url)
3834 video_id = m.group('videoID')
3836 webpage = self._download_webpage(url, video_id)
3837 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
3839 raise ExtractorError(u'Cannot find title')
3840 video_title = unescapeHTML(m.group(1))
3842 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3843 xml_code = self._download_webpage(xml_url, video_id,
3844 note=u'Downloading XML', errnote=u'Failed to download XML')
3846 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last <type> entry in the XML is taken as the preferred variant.
3847 last_type = idoc[-1]
3848 filename = last_type.findall('./filename')[0].text
3849 duration = float(last_type.findall('./duration')[0].text)
3851 video_url = 'http://video2.spiegel.de/flash/' + filename
3852 video_ext = filename.rpartition('.')[2]
3857 'title': video_title,
3858 'duration': duration,
# NOTE(review): fragmentary listing — `if m is None:` guards, the
# description/uploader None fallbacks, and the info dict opener are elided.
3864 class LiveLeakIE(InfoExtractor):
3865 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3866 IE_NAME = u'liveleak'
3867 def _real_extract(self, url):
3868 mobj = re.match(self._VALID_URL, url)
3870 raise ExtractorError(u'Invalid URL: %s' % url)
3872 video_id = mobj.group('video_id')
3874 webpage = self._download_webpage(url, video_id)
# Media URL comes from the player config's `file:` entry.
3876 m = re.search(r'file: "(.*?)",', webpage)
3878 raise ExtractorError(u'Unable to find video url')
3879 video_url = m.group(1)
3881 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3883 raise ExtractorError(u'Cannot find video title')
# Site prefix 'LiveLeak.com -' is stripped from the og:title.
3884 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
3886 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3888 desc = unescapeHTML(m.group('desc'))
3892 m = re.search(r'By:.*?(\w+)</a>', webpage)
3894 uploader = clean_html(m.group(1))
3903 'description': desc,
3904 'uploader': uploader
# Information extractor for the ARD Mediathek / daserste.de.
# NOTE(review): fragmentary listing — the numid/else branching lines, the
# `if not streams:` guard before the fsk assert, and the final return are
# elided (numbering jumps).
3909 class ARDIE(InfoExtractor):
3910 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3911 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3912 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3914 def _real_extract(self, url):
3915 # determine video id from url
3916 m = re.match(self._VALID_URL, url)
# Prefer an explicit documentId= query parameter over the path segment.
3918 numid = re.search(r'documentId=([0-9]+)', url)
3920 video_id = numid.group(1)
3922 video_id = m.group('video_id')
3924 # determine title and media streams from webpage
3925 html = self._download_webpage(url, video_id)
3926 title = re.search(self._TITLE, html).group('title')
3927 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams + "fsk" marker means the video is age-restricted until 20:00.
3929 assert '"fsk"' in html
3930 raise ExtractorError(u'This video is only available after 8:00 pm')
3932 # choose default media type and highest quality for now
3933 stream = max([s for s in streams if int(s["media_type"]) == 0],
3934 key=lambda s: int(s["quality"]))
3936 # there's two possibilities: RTMP stream or HTTP download
3937 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
3938 if stream['rtmp_url']:
3939 self.to_screen(u'RTMP download detected')
3940 assert stream['video_url'].startswith('mp4:')
3941 info["url"] = stream["rtmp_url"]
3942 info["play_path"] = stream['video_url']
3944 assert stream["video_url"].endswith('.mp4')
3945 info["url"] = stream["video_url"]
# Information extractor for tumblr.com video posts.
# NOTE(review): fragmentary listing — the `if video is None: return`
# sequence and the tail of the returned dict ('url'/'title'/'thumbnail'/
# 'ext') are elided.
3948 class TumblrIE(InfoExtractor):
3949 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
3951 def _real_extract(self, url):
3952 m_url = re.match(self._VALID_URL, url)
3953 video_id = m_url.group('id')
3954 blog = m_url.group('blog_name')
# Canonicalize to the /post/ URL regardless of which form matched.
3956 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
3957 webpage = self._download_webpage(url, video_id)
# The embedded player markup is JS-escaped, hence the \x22 quote escapes.
3959 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
3960 video = re.search(re_video, webpage)
3962 self.to_screen("No video founded")
3964 video_url = video.group('video_url')
3965 ext = video.group('ext')
3967 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
3968 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
3970 # The only place where you can get a title, it's not complete,
3971 # but searching in other places doesn't work for all videos
3972 re_title = r'<title>(?P<title>.*?)</title>'
3973 title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
3975 return [{'id': video_id,
# Information extractor for free Bandcamp tracks.
# NOTE(review): fragmentary listing — the 'url'/'ext' entries of the track
# info dict and the final return are elided.
3982 class BandcampIE(InfoExtractor):
3983 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
3985 def _real_extract(self, url):
3986 mobj = re.match(self._VALID_URL, url)
3987 title = mobj.group('title')
3988 webpage = self._download_webpage(url, title)
3989 # We get the link to the free download page
3990 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
# Only tracks offered as free downloads are supported.
3991 if m_download is None:
3992 raise ExtractorError(u'No free songs founded')
3994 download_link = m_download.group(1)
# Track id is scraped from the TralbumData JS blob.
3995 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
3996 webpage, re.MULTILINE|re.DOTALL).group('id')
3998 download_webpage = self._download_webpage(download_link, id,
3999 'Downloading free downloads page')
4000 # We get the dictionary of the track from some javascrip code
4001 info = re.search(r'items: (.*?),$',
4002 download_webpage, re.MULTILINE).group(1)
4003 info = json.loads(info)[0]
4004 # We pick mp3-320 for now, until format selection can be easily implemented.
4005 mp3_info = info[u'downloads'][u'mp3-320']
4006 # If we try to use this url it says the link has expired
4007 initial_url = mp3_info[u'url']
4008 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4009 m_url = re.match(re_url, initial_url)
4010 #We build the url we will use to get the final track url
4011 # This url is build in Bandcamp in the script download_bunde_*.js
4012 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4013 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4014 # If we could correctly generate the .rand field the url would be
4015 #in the "download_url" key
4016 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4018 track_info = {'id':id,
4019 'title' : info[u'title'],
4022 'thumbnail' : info[u'thumb_url'],
4023 'uploader' : info[u'artist']
# NOTE(review): fragmentary listing — `if mobj is None:` guards, the info
# list/dict opener with 'url', and the return are elided.
4029 class RedTubeIE(InfoExtractor):
4030 """Information Extractor for redtube"""
4031 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4032 def _real_extract(self,url):
4033 mobj = re.match(self._VALID_URL, url)
4035 raise ExtractorError(u'Invalid URL: %s' % url)
4037 video_id = mobj.group('id')
4038 video_extension = 'mp4'
4039 webpage = self._download_webpage(url, video_id)
4040 self.report_extraction(video_id)
# Media URL is taken from the HTML5 <source type="video/mp4"> tag.
4041 mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">',webpage)
4044 raise ExtractorError(u'Unable to extract media URL')
4046 video_url = mobj.group(1)
4047 mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>',webpage)
4049 raise ExtractorError(u'Unable to extract title')
4050 video_title = mobj.group(1)
4055 'ext': video_extension,
4056 'title': video_title,
# NOTE(review): fragmentary listing — `if mobj is None:` guards, the info
# list/dict opener with 'url', and the return are elided.
4060 class InaIE(InfoExtractor):
4061 """Information Extractor for Ina.fr"""
4062 _VALID_URL = r'(?:http://)?(?:www.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4063 def _real_extract(self,url):
4064 mobj = re.match(self._VALID_URL, url)
4066 video_id = mobj.group('id')
# Metadata comes from the player's MRSS feed, not the HTML page.
4067 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4068 video_extension = 'mp4'
4069 webpage = self._download_webpage(mrss_url, video_id)
4071 mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
4073 raise ExtractorError(u'Unable to extract media URL')
4074 video_url = mobj.group(1)
4076 mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
4078 raise ExtractorError(u'Unable to extract title')
4079 video_title = mobj.group(1)
4084 'ext': video_extension,
4085 'title': video_title,
# NOTE(review): fragmentary listing — the returned list is almost entirely
# elided here (numbering jumps 4093 -> 4118 -> 4128 -> 4148); only a few of
# the instantiated extractors are visible.
4088 def gen_extractors():
4089 """ Return a list of an instance of every supported extractor.
4090 The order does matter; the first extractor matched is the one handling the URL.
4093 YoutubePlaylistIE(),
4118 StanfordOpenClassroomIE(),
4128 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention, so the
    # class object can be looked up directly in this module's namespace.
    class_name = ie_name + 'IE'
    return globals()[class_name]