2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
23 class InfoExtractor(object):
# NOTE(review): this file is a line-numbered, lossy dump -- gaps in the
# embedded numbers (e.g. 24 -> 26) mark source lines missing from this view.
# Code bytes below are left untouched; only review comments are added.
24 """Information Extractor class.
26 Information extractors are the classes that, given a URL, extract
27 information about the video (or videos) the URL refers to. This
28 information includes the real video URL, the video title, author and
29 others. The information is stored in a dictionary which is then
30 passed to the FileDownloader. The FileDownloader processes this
31 information possibly downloading the video to the file system, among
32 other possible outcomes.
34 The dictionaries must include the following fields:
38 title: Video title, unescaped.
39 ext: Video filename extension.
41 The following fields are optional:
43 format: The video format, defaults to ext (used for --get-format)
44 thumbnail: Full URL to a video thumbnail image.
45 description: One-line video description.
46 uploader: Full name of the video uploader.
47 upload_date: Video upload date (YYYYMMDD).
48 uploader_id: Nickname or id of the video uploader.
49 location: Physical location of the video.
50 player_url: SWF Player URL (used for rtmpdump).
51 subtitles: The subtitle file contents.
52 urlhandle: [internal] The urlHandle to be used to download the file,
53 like returned by urllib.request.urlopen
55 The fields should all be Unicode strings.
57 Subclasses of this one should re-define the _real_initialize() and
58 _real_extract() methods and define a _VALID_URL regexp.
59 Probably, they should also be added to the list of extractors.
61 _real_extract() must return a *list* of information dictionaries as
64 Finally, the _WORKING attribute should be set to False for broken IEs
65 in order to warn the users and skip the tests.
# Constructor only records the (optional) FileDownloader via set_downloader().
72 def __init__(self, downloader=None):
73 """Constructor. Receives an optional downloader."""
75 self.set_downloader(downloader)
# URL dispatch check against the subclass's _VALID_URL regexp.
# NOTE(review): a @classmethod decorator (orig line 77) is presumably missing
# from this view -- the 'cls' first parameter implies it; confirm upstream.
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
# NOTE(review): the defs for the next two docstrings (working/initialize,
# orig lines ~83 and ~87) are missing from this view.
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
# Template method: public entry point; subclasses implement _real_extract().
93 def extract(self, url):
94 """Extracts URL information and returns it in list of dicts."""
96 return self._real_extract(url)
98 def set_downloader(self, downloader):
99 """Sets the downloader for this IE."""
100 self._downloader = downloader
# Default no-op hooks, overridden by concrete extractors.
102 def _real_initialize(self):
103 """Real initialization process. Redefine in subclasses."""
106 def _real_extract(self, url):
107 """Real extraction process. Redefine in subclasses."""
# IE_NAME: derived from the class name by dropping the trailing "IE"
# (the surrounding def/property, orig lines ~109-111, is not visible here).
112 return type(self).__name__[:-2]
114 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115 """ Returns the response handle """
# Progress reporting: default message when note is None (the 'if note is
# None:' line is among those missing here); note=False means stay silent.
117 self.report_download_webpage(video_id)
118 elif note is not False:
119 self.to_screen(u'%s: %s' % (video_id, note))
# Network failures are converted to ExtractorError; sys.exc_info()[2]
# forwards the original traceback to the error (project-specific signature).
121 return compat_urllib_request.urlopen(url_or_request)
122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
124 errnote = u'Unable to download webpage'
125 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
127 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
128 """ Returns a tuple (page content as string, URL handle) """
129 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
# Charset sniffed from the Content-Type header; the fallback branch for a
# missing charset (orig lines 132/134-135) is not visible in this view.
130 content_type = urlh.headers.get('Content-Type', '')
131 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
133 encoding = m.group(1)
136 webpage_bytes = urlh.read()
# Debug aid: base64-dump the raw page when --dump-intermediate-pages is set.
137 if self._downloader.params.get('dump_intermediate_pages', False):
139 url = url_or_request.get_full_url()
140 except AttributeError:
142 self.to_screen(u'Dumping request to ' + url)
143 dump = base64.b64encode(webpage_bytes).decode('ascii')
144 self._downloader.to_screen(dump)
# 'replace' keeps extraction alive even for badly encoded pages.
145 content = webpage_bytes.decode(encoding, 'replace')
146 return (content, urlh)
148 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
149 """ Returns the data of the page as a string """
150 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
152 def to_screen(self, msg):
153 """Print msg to screen, prefixing it with '[ie_name]'"""
154 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
156 def report_extraction(self, id_or_name):
157 """Report information extraction."""
158 self.to_screen(u'%s: Extracting information' % id_or_name)
160 def report_download_webpage(self, video_id):
161 """Report webpage download."""
162 self.to_screen(u'%s: Downloading webpage' % video_id)
164 def report_age_confirmation(self):
165 """Report attempt to confirm age."""
166 self.to_screen(u'Confirming age')
168 #Methods for following #608
169 #They set the correct value of the '_type' key
170 def video_result(self, video_info):
171 """Returns a video"""
172 video_info['_type'] = 'video'
174 def url_result(self, url, ie=None):
175 """Returns a url that points to a page that should be processed"""
176 #TODO: ie should be the class used for getting the info
177 video_info = {'_type': 'url',
# (dict continuation and return, orig lines 178-180, missing from this view)
181 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
182 """Returns a playlist"""
183 video_info = {'_type': 'playlist',
# id/title keys are only attached when the caller supplied them; the
# guarding 'if' lines (orig 185/187) are not visible here.
186 video_info['id'] = playlist_id
188 video_info['title'] = playlist_title
192 class YoutubeIE(InfoExtractor):
# NOTE(review): lossy line-numbered dump -- gaps in the embedded numbers mark
# lines missing from this view (e.g. the '_VALID_URL = r"""...' opener before
# orig line 197). Code bytes are untouched; only review comments are added.
193 """Information extractor for youtube.com."""
# Verbose (re.VERBOSE) regex body matching the many YouTube URL shapes and
# capturing the 11-char-style video ID; compiled with re.VERBOSE in suitable()
# and _extract_id() below.
197 (?:https?://)? # http(s):// (optional)
198 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
199 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
200 (?:.*?\#/)? # handle anchor (#/) redirect urls
201 (?: # the various things that can precede the ID:
202 (?:(?:v|embed|e)/) # v/ or embed/ or e/
203 |(?: # or the v= param in all its forms
204 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
205 (?:\?|\#!?) # the params delimiter ? or # or #!
206 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
209 )? # optional -> youtube.com/xxxx is OK
210 )? # all until now is optional -> you can pass the naked ID
211 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
212 (?(1).+)? # if we found the ID, everything can follow
# Endpoints used by _real_initialize (language, login, age gate).
214 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
215 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
216 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
217 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
218 _NETRC_MACHINE = 'youtube'
219 # Listed in order of quality
220 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
221 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> container extension map (body mostly missing from this view).
222 _video_extensions = {
228 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" display map (body missing from this view).
234 _video_dimensions = {
# Overrides InfoExtractor.suitable: defers playlist URLs to YoutubePlaylistIE
# and compiles _VALID_URL with re.VERBOSE (the regex above uses comments).
# NOTE(review): @classmethod decorator presumably missing from this view.
253 def suitable(cls, url):
254 """Receives a URL and returns True if suitable for this IE."""
255 if YoutubePlaylistIE.suitable(url): return False
256 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
258 def report_lang(self):
259 """Report attempt to set language."""
260 self.to_screen(u'Setting language')
262 def report_login(self):
263 """Report attempt to log in."""
264 self.to_screen(u'Logging in')
266 def report_video_webpage_download(self, video_id):
267 """Report attempt to download video webpage."""
268 self.to_screen(u'%s: Downloading video webpage' % video_id)
270 def report_video_info_webpage_download(self, video_id):
271 """Report attempt to download video info webpage."""
272 self.to_screen(u'%s: Downloading video info webpage' % video_id)
274 def report_video_subtitles_download(self, video_id):
275 """Report attempt to download video info webpage."""
276 self.to_screen(u'%s: Checking available subtitles' % video_id)
278 def report_video_subtitles_request(self, video_id, sub_lang, format):
279 """Report attempt to download video info webpage."""
280 self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
282 def report_video_subtitles_available(self, video_id, sub_lang_list):
283 """Report available subtitles."""
284 sub_lang = ",".join(list(sub_lang_list.keys()))
285 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
287 def report_information_extraction(self, video_id):
288 """Report attempt to extract video information."""
289 self.to_screen(u'%s: Extracting video information' % video_id)
291 def report_unavailable_format(self, video_id, format):
292 """Report extracted video URL."""
293 self.to_screen(u'%s: Format %s not available' % (video_id, format))
295 def report_rtmp_download(self):
296 """Indicate the download will use the RTMP protocol."""
297 self.to_screen(u'RTMP download detected')
# Fetches the subtitle track list. On failure returns a (message, None)
# tuple; on success it evidently returns the lang_code->name dict built
# below (the success 'return' line, orig ~310, is missing from this view).
299 def _get_available_subtitles(self, video_id):
300 self.report_video_subtitles_download(video_id)
301 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
# (the 'try:' line, orig 302, is missing from this view)
303 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
304 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
305 return (u'unable to download video subtitles: %s' % compat_str(err), None)
306 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
307 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
308 if not sub_lang_list:
309 return (u'video doesn\'t have subtitles', None)
312 def _list_available_subtitles(self, video_id):
313 sub_lang_list = self._get_available_subtitles(video_id)
314 self.report_video_subtitles_available(video_id, sub_lang_list)
# Downloads one subtitle track; returns (error_message, sub_lang, sub)
# with error_message None on success.
316 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
319 (error_message, sub_lang, sub)
321 self.report_video_subtitles_request(video_id, sub_lang, format)
# urlencode params (dict body, orig lines 323-327, missing from this view).
322 params = compat_urllib_parse.urlencode({
328 url = 'http://www.youtube.com/api/timedtext?' + params
330 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
331 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
332 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
334 return (u'Did not fetch video subtitles', None, None)
335 return (None, sub_lang, sub)
# Picks one language (requested > 'en' > first available) and fetches it.
337 def _extract_subtitle(self, video_id):
339 Return a list with a tuple:
340 [(error_message, sub_lang, sub)]
342 sub_lang_list = self._get_available_subtitles(video_id)
343 sub_format = self._downloader.params.get('subtitlesformat')
# A tuple return from _get_available_subtitles signals an error (see above).
344 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
345 return [(sub_lang_list[0], None, None)]
346 if self._downloader.params.get('subtitleslang', False):
347 sub_lang = self._downloader.params.get('subtitleslang')
348 elif 'en' in sub_lang_list:
351 sub_lang = list(sub_lang_list.keys())[0]
352 if not sub_lang in sub_lang_list:
353 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
355 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
# Same as _extract_subtitle but fetches every available language.
358 def _extract_all_subtitles(self, video_id):
359 sub_lang_list = self._get_available_subtitles(video_id)
360 sub_format = self._downloader.params.get('subtitlesformat')
361 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
362 return [(sub_lang_list[0], None, None)]
# ('subtitles = []' initializer, orig line 363, missing from this view)
364 for sub_lang in sub_lang_list:
365 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
366 subtitles.append(subtitle)
369 def _print_formats(self, formats):
370 print('Available formats:')
# (the 'for x in formats:' line, orig 371, is missing from this view)
372 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# Session setup: optional login (username/password or .netrc), set the
# interface language, then pass the age gate.
374 def _real_initialize(self):
375 if self._downloader is None:
380 downloader_params = self._downloader.params
382 # Attempt to use provided username and password or .netrc data
383 if downloader_params.get('username', None) is not None:
384 username = downloader_params['username']
385 password = downloader_params['password']
386 elif downloader_params.get('usenetrc', False):
388 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
# (netrc success branch, orig lines 389-392, missing from this view)
393 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
394 except (IOError, netrc.NetrcParseError) as err:
395 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# Set language: best-effort, failure only warns.
399 request = compat_urllib_request.Request(self._LANG_URL)
402 compat_urllib_request.urlopen(request).read()
403 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
404 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
407 # No authentication to be performed
# Login flow: fetch the login page, scrape the GALX and dsh hidden fields,
# then POST the Google ServiceLogin form.
411 request = compat_urllib_request.Request(self._LOGIN_URL)
413 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
420 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
422 galx = match.group(1)
424 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
# Login form fields (several entries missing from this view).
430 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
434 u'PersistentCookie': u'yes',
436 u'bgresponse': u'js_disabled',
437 u'checkConnection': u'',
438 u'checkedDomains': u'youtube',
444 u'signIn': u'Sign in',
446 u'service': u'youtube',
450 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
452 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
453 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
454 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
457 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# Seeing the login form again in the response means authentication failed.
458 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
459 self._downloader.report_warning(u'unable to log in: bad username or password')
461 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
462 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Age gate: POST a confirmation form; unlike login, failure here is fatal.
468 'action_confirm': 'Confirm',
470 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
472 self.report_age_confirmation()
473 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
474 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
475 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
# Extracts the video ID (regex group 2) from a URL; raises on no match.
477 def _extract_id(self, url):
478 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
480 raise ExtractorError(u'Invalid URL: %s' % url)
481 video_id = mobj.group(2)
# Main pipeline: normalize redirect URLs, download the watch page and
# get_video_info, then assemble one info dict per selected format.
484 def _real_extract(self, url):
485 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
486 mobj = re.search(self._NEXT_URL_RE, url)
488 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
489 video_id = self._extract_id(url)
492 self.report_video_webpage_download(video_id)
493 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
494 request = compat_urllib_request.Request(url)
496 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
497 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
498 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
500 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
502 # Attempt to extract SWF player URL
503 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Un-escape the JS-escaped URL ("http:\/\/..." -> "http://...").
505 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several 'el' values until get_video_info yields a 'token'.
510 self.report_video_info_webpage_download(video_id)
511 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
512 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
513 % (video_id, el_type))
514 video_info_webpage = self._download_webpage(video_info_url, video_id,
516 errnote='unable to download video info webpage')
517 video_info = compat_parse_qs(video_info_webpage)
518 if 'token' in video_info:
520 if 'token' not in video_info:
521 if 'reason' in video_info:
522 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
524 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
526 # Check for "rental" videos
527 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
528 raise ExtractorError(u'"rental" videos not supported')
530 # Start extracting information
531 self.report_information_extraction(video_id)
# uploader (required) / uploader id (optional, warn only)
534 if 'author' not in video_info:
535 raise ExtractorError(u'Unable to extract uploader name')
536 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
539 video_uploader_id = None
540 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
542 video_uploader_id = mobj.group(1)
544 self._downloader.report_warning(u'unable to extract uploader nickname')
547 if 'title' not in video_info:
548 raise ExtractorError(u'Unable to extract video title')
549 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
# thumbnail is optional: warn and continue without it.
552 if 'thumbnail_url' not in video_info:
553 self._downloader.report_warning(u'unable to extract video thumbnail')
555 else: # don't panic if we can't find it
556 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scraped from the page, normalized to YYYYMMDD.
560 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
562 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
563 upload_date = unified_strdate(upload_date)
# description: element scrape first, <meta> fallback, else empty string.
566 video_description = get_element_by_id("eow-description", video_webpage)
567 if video_description:
568 video_description = clean_html(video_description)
570 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
572 video_description = unescapeHTML(fd_mobj.group(1))
574 video_description = u''
# subtitles, driven by --write-sub / --all-subs / --list-subs options.
577 video_subtitles = None
579 if self._downloader.params.get('writesubtitles', False):
580 video_subtitles = self._extract_subtitle(video_id)
582 (sub_error, sub_lang, sub) = video_subtitles[0]
584 self._downloader.report_error(sub_error)
586 if self._downloader.params.get('allsubtitles', False):
587 video_subtitles = self._extract_all_subtitles(video_id)
588 for video_subtitle in video_subtitles:
589 (sub_error, sub_lang, sub) = video_subtitle
591 self._downloader.report_error(sub_error)
593 if self._downloader.params.get('listsubtitles', False):
594 sub_lang_list = self._list_available_subtitles(video_id)
597 if 'length_seconds' not in video_info:
598 self._downloader.report_warning(u'unable to extract video duration')
601 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
# token
604 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
606 # Decide which formats to download
607 req_format = self._downloader.params.get('format', None)
# RTMP streams carry the URL in 'conn'; otherwise parse the itag->url map
# out of url_encoded_fmt_stream_map.
609 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
610 self.report_rtmp_download()
611 video_url_list = [(None, video_info['conn'][0])]
612 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
613 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
614 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
615 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
# NOTE(review): the filter above checks 'itag' and 'url' but not 'sig' --
# ud['sig'][0] below raises KeyError for entries without a signature; verify
# whether 'sig' is always present or add it to the filter.
616 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
618 format_limit = self._downloader.params.get('format_limit', None)
619 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
620 if format_limit is not None and format_limit in available_formats:
621 format_list = available_formats[available_formats.index(format_limit):]
623 format_list = available_formats
# available_formats is ordered best-first, so [0] is best and [-1] worst.
624 existing_formats = [x for x in format_list if x in url_map]
625 if len(existing_formats) == 0:
626 raise ExtractorError(u'no known formats available for video')
627 if self._downloader.params.get('listformats', None):
628 self._print_formats(existing_formats)
630 if req_format is None or req_format == 'best':
631 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
632 elif req_format == 'worst':
633 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
634 elif req_format in ('-1', 'all'):
635 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
637 # Specific formats. We pick the first in a slash-delimeted sequence.
638 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
639 req_formats = req_format.split('/')
640 video_url_list = None
641 for rf in req_formats:
643 video_url_list = [(rf, url_map[rf])]
645 if video_url_list is None:
646 raise ExtractorError(u'requested format not available')
648 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
# Build one result dict per (format, url) pair.
651 for format_param, video_real_url in video_url_list:
653 video_extension = self._video_extensions.get(format_param, 'flv')
655 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
656 self._video_dimensions.get(format_param, '???'))
660 'url': video_real_url,
661 'uploader': video_uploader,
662 'uploader_id': video_uploader_id,
663 'upload_date': upload_date,
664 'title': video_title,
665 'ext': video_extension,
666 'format': video_format,
667 'thumbnail': video_thumbnail,
668 'description': video_description,
669 'player_url': player_url,
670 'subtitles': video_subtitles,
671 'duration': video_duration
676 class MetacafeIE(InfoExtractor):
# NOTE(review): lossy line-numbered dump -- gaps in the embedded numbers
# mark lines missing from this view. Code bytes are untouched.
677 """Information Extractor for metacafe.com."""
679 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
680 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
681 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
682 IE_NAME = u'metacafe'
684 def report_disclaimer(self):
685 """Report disclaimer retrieval."""
686 self.to_screen(u'Retrieving disclaimer')
# Session setup: fetch the family-filter disclaimer, then POST the
# age-confirmation form; both failures are fatal.
688 def _real_initialize(self):
689 # Retrieve disclaimer
690 request = compat_urllib_request.Request(self._DISCLAIMER)
692 self.report_disclaimer()
693 disclaimer = compat_urllib_request.urlopen(request).read()
694 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
695 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
# (form dict opener, orig lines 697-699, missing from this view)
700 'submit': "Continue - I'm over 18",
702 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
704 self.report_age_confirmation()
705 disclaimer = compat_urllib_request.urlopen(request).read()
706 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
707 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
709 def _real_extract(self, url):
710 # Extract id and simplified title from URL
711 mobj = re.match(self._VALID_URL, url)
713 raise ExtractorError(u'Invalid URL: %s' % url)
715 video_id = mobj.group(1)
717 # Check if video comes from YouTube
# "yt-<id>" video ids are hosted on YouTube; delegate via url_result.
718 mobj2 = re.match(r'^yt-(.*)$', video_id)
719 if mobj2 is not None:
720 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
722 # Retrieve video webpage to extract further information
723 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
725 # Extract URL, uploader and title from webpage
726 self.report_extraction(video_id)
# Primary path: mediaURL/gdaKey query parameters embedded in the page.
727 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
729 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
730 video_extension = mediaURL[-3:]
732 # Extract gdaKey if available
733 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
737 gdaKey = mobj.group(1)
738 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars querystring for 'mediaData' JSON-ish
# blob carrying mediaURL and key.
740 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
742 raise ExtractorError(u'Unable to extract media URL')
743 vardict = compat_parse_qs(mobj.group(1))
744 if 'mediaData' not in vardict:
745 raise ExtractorError(u'Unable to extract media URL')
746 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
748 raise ExtractorError(u'Unable to extract media URL')
749 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
750 video_extension = mediaURL[-3:]
751 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
753 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
755 raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode('utf-8') on a str raises AttributeError under
# Python 3 (here and in the result dict below) -- this path appears to
# assume Python 2 bytes; confirm against the project's compat layer.
756 video_title = mobj.group(1).decode('utf-8')
758 mobj = re.search(r'submitter=(.*?);', webpage)
760 raise ExtractorError(u'Unable to extract uploader nickname')
761 video_uploader = mobj.group(1)
764 'id': video_id.decode('utf-8'),
765 'url': video_url.decode('utf-8'),
766 'uploader': video_uploader.decode('utf-8'),
768 'title': video_title,
769 'ext': video_extension.decode('utf-8'),
772 class DailymotionIE(InfoExtractor):
# NOTE(review): lossy line-numbered dump -- gaps in the embedded numbers
# mark lines missing from this view. Code bytes are untouched.
773 """Information Extractor for Dailymotion"""
775 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
776 IE_NAME = u'dailymotion'
778 def _real_extract(self, url):
779 # Extract id and simplified title from URL
780 mobj = re.match(self._VALID_URL, url)
782 raise ExtractorError(u'Invalid URL: %s' % url)
# Strip the slug ("_title") and query string from the captured id.
784 video_id = mobj.group(1).split('_')[0].split('?')[0]
786 video_extension = 'mp4'
788 # Retrieve video webpage to extract further information
789 request = compat_urllib_request.Request(url)
# Cookie disables Dailymotion's family filter so all videos are reachable.
790 request.add_header('Cookie', 'family_filter=off')
791 webpage = self._download_webpage(request, video_id)
793 # Extract URL, uploader and title from webpage
794 self.report_extraction(video_id)
795 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
797 raise ExtractorError(u'Unable to extract media URL')
798 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Probe qualities best-first; the loop body that records max_quality
# (orig lines 801-805) is only partially visible here.
800 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
803 self.to_screen(u'Using %s' % key)
806 raise ExtractorError(u'Unable to extract video URL')
808 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
810 raise ExtractorError(u'Unable to extract video URL')
812 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
814 # TODO: support choosing qualities
816 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
818 raise ExtractorError(u'Unable to extract title')
819 video_title = unescapeHTML(mobj.group('title'))
# uploader: try the owner span first, then the rel="author" span; missing
# uploader is only a warning, not fatal.
821 video_uploader = None
822 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
824 # lookin for official user
825 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
826 if mobj_official is None:
827 self._downloader.report_warning(u'unable to extract uploader nickname')
829 video_uploader = mobj_official.group(1)
831 video_uploader = mobj.group(1)
# upload date: page shows DD-MM-YYYY; reassembled as YYYYMMDD.
833 video_upload_date = None
834 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
836 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
# (result dict opener, orig lines 838-840, missing from this view)
841 'uploader': video_uploader,
842 'upload_date': video_upload_date,
843 'title': video_title,
844 'ext': video_extension,
848 class PhotobucketIE(InfoExtractor):
# NOTE(review): lossy line-numbered dump -- gaps in the embedded numbers
# mark lines missing from this view. Code bytes are untouched.
849 """Information extractor for photobucket.com."""
851 # TODO: the original _VALID_URL was:
852 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
853 # Check if it's necessary to keep the old extracion process
854 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
855 IE_NAME = u'photobucket'
857 def _real_extract(self, url):
858 # Extract id from URL
859 mobj = re.match(self._VALID_URL, url)
861 raise ExtractorError(u'Invalid URL: %s' % url)
863 video_id = mobj.group('id')
865 video_extension = mobj.group('ext')
867 # Retrieve video webpage to extract further information
868 webpage = self._download_webpage(url, video_id)
870 # Extract URL, uploader, and title from webpage
871 self.report_extraction(video_id)
872 # We try first by looking the javascript code:
# Primary path: JSON blob in Pb.Data.Shared.put(...) carries all metadata.
873 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
875 info = json.loads(mobj.group('json'))
878 'url': info[u'downloadUrl'],
879 'uploader': info[u'username'],
880 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
881 'title': info[u'title'],
882 'ext': video_extension,
883 'thumbnail': info[u'thumbUrl'],
886 # We try looking in other parts of the webpage
# Fallback path: scrape the video_src link and the <title> element.
887 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
889 raise ExtractorError(u'Unable to extract media URL')
890 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
894 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
896 raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode('utf-8') on a str raises AttributeError under
# Python 3 (here and below) -- appears to assume Python 2 bytes; confirm.
897 video_title = mobj.group(1).decode('utf-8')
899 video_uploader = mobj.group(2).decode('utf-8')
902 'id': video_id.decode('utf-8'),
903 'url': video_url.decode('utf-8'),
904 'uploader': video_uploader,
906 'title': video_title,
907 'ext': video_extension.decode('utf-8'),
911 class YahooIE(InfoExtractor):
# NOTE(review): lossy line-numbered dump -- gaps in the embedded numbers mark
# missing lines, and _real_extract continues beyond the end of this excerpt.
# Code bytes are untouched; only review comments are added.
912 """Information extractor for video.yahoo.com."""
915 # _VALID_URL matches all Yahoo! Video URLs
916 # _VPAGE_URL matches only the extractable '/watch/' URLs
917 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
918 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
919 IE_NAME = u'video.yahoo'
# new_video=False on the recursive second pass, after the URL has been
# rewritten to a canonical /watch/ form.
921 def _real_extract(self, url, new_video=True):
922 # Extract ID from URL
923 mobj = re.match(self._VALID_URL, url)
925 raise ExtractorError(u'Invalid URL: %s' % url)
927 video_id = mobj.group(2)
928 video_extension = 'flv'
930 # Rewrite valid but non-extractable URLs as
931 # extractable English language /watch/ URLs
932 if re.match(self._VPAGE_URL, url) is None:
933 request = compat_urllib_request.Request(url)
935 webpage = compat_urllib_request.urlopen(request).read()
936 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
937 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
939 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
941 raise ExtractorError(u'Unable to extract id field')
942 yahoo_id = mobj.group(1)
944 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
946 raise ExtractorError(u'Unable to extract vid field')
947 yahoo_vid = mobj.group(1)
# Recurse once on the canonical watch URL.
949 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
950 return self._real_extract(url, new_video=False)
952 # Retrieve video webpage to extract further information
953 request = compat_urllib_request.Request(url)
955 self.report_download_webpage(video_id)
956 webpage = compat_urllib_request.urlopen(request).read()
957 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
958 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
960 # Extract uploader and title from webpage
961 self.report_extraction(video_id)
962 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
964 raise ExtractorError(u'Unable to extract video title')
# NOTE(review): .decode('utf-8') on a str raises AttributeError under
# Python 3 (throughout this method) -- appears to assume Python 2 bytes.
965 video_title = mobj.group(1).decode('utf-8')
967 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
969 raise ExtractorError(u'Unable to extract video uploader')
# NOTE(review): group(1) here is the 'people|profile' alternation, not the
# uploader name captured in group(2) -- looks like an off-by-one; verify.
970 video_uploader = mobj.group(1).decode('utf-8')
972 # Extract video thumbnail
973 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
975 raise ExtractorError(u'Unable to extract video thumbnail')
976 video_thumbnail = mobj.group(1).decode('utf-8')
978 # Extract video description
979 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
981 raise ExtractorError(u'Unable to extract video description')
982 video_description = mobj.group(1).decode('utf-8')
983 if not video_description:
984 video_description = 'No description available.'
986 # Extract video height and width
987 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
989 raise ExtractorError(u'Unable to extract video height')
990 yv_video_height = mobj.group(1)
992 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
994 raise ExtractorError(u'Unable to extract video width')
995 yv_video_width = mobj.group(1)
997 # Retrieve video playlist to extract media URL
998 # I'm not completely sure what all these options are, but we
999 # seem to need most of them, otherwise the server sends a 401.
1000 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1001 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1002 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1003 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1004 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1006 self.report_download_webpage(video_id)
1007 webpage = compat_urllib_request.urlopen(request).read()
1008 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1009 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
# (method continues past the end of this excerpt)
1011 # Extract media URL from playlist XML
1012 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1014 raise ExtractorError(u'Unable to extract media URL')
1015 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1016 video_url = unescapeHTML(video_url)
1019 'id': video_id.decode('utf-8'),
1021 'uploader': video_uploader,
1022 'upload_date': None,
1023 'title': video_title,
1024 'ext': video_extension.decode('utf-8'),
1025 'thumbnail': video_thumbnail.decode('utf-8'),
1026 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract video info from a vimeo.com URL via its embedded config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # except Exception (not bare except) so Ctrl-C still propagates
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            # for/else: no codec bucket had any file
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download url and return the page body as a bytestring."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex and map the given groups into a dict.

        matchTuples is a list of (group_index, key, error_message).
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # NOTE(review): this builds video_url but never returns it;
        # _real_extract likewise discards the call (live streams unsupported).
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of XML references for an arte+7 video and build its info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Not a redirect if the final URL is unchanged
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'ytsearch[N|all]:terms' query and fetch the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # return the results (previously they were computed and dropped)
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # never request more pages than the API reports available
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'gvsearch[N|all]:terms' query and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download webpage: %s' % compat_str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # no next-page link: download what we collected
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'yvsearch[N|all]:terms' query and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download webpage: %s' % compat_str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # no next-page link: download what we collected
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose regex: matches playlist/course/artist/watch URLs carrying a
    # p=/a=/list= parameter, /p/ paths, and bare PL/EC/UU playlist ids.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE needed because _VALID_URL is a verbose regex
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # sort by the yt$position key, keep only the URLs
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the list of unique video ids found in the page HTML."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook, using --username/--password or .netrc if provided.

        Login failures only emit warnings; extraction proceeds anonymously.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                # typo fix: "exceded" -> "exceeded"
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The video parameters are URL-encoded JSON wedged between these two markers
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the lowercase alphanumeric extension of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # /play/ URLs redirect to a player whose URL fragment carries the
        # real file id; resolve it and re-run extraction on the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # Request the JSON skin of the page; the iTunes User-Agent is set
        # deliberately here (presumably required by the endpoint — confirm).
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
            urlh = compat_urllib_request.urlopen(request)
            # A video/* Content-Type means the URL is the media file itself.
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                    'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']

                # Timestamps look like '05-31-12 19:40PM'; normalise to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                    'id': data['item_id'],
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Fetch the watch page for the numeric video id.
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link reveals the media-server base URL; the flv
        # itself lives under it as <video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
            raise ExtractorError(u'Unable to extract media URL')
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
            raise ExtractorError(u'Unable to extract title')

        video_title = mobj.group(1)

            'upload_date': None,
            'title': video_title,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))

    # Known bitrates, in order of increasing quality.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose regex, so re.VERBOSE is required here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print every available format id with its extension and dimensions."""
        print('Available formats:')
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('shortname'):
            # ':tds'-style shortcuts map to the show's full-episodes page.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                epTitle = mobj.group('cntitle')
            # No explicit episode in the URL means "download the newest".
            dlNewest = not mobj.group('episode')
                epTitle = mobj.group('showname')
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
            # The newest-episode page redirects to the concrete episode URL.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # Episodes are served in several parts; each <item> is one part.
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, url) pairs for each advertised rendition.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
                    format, rtmp_video_url = f, v

            # Rewrite the RTMP URL into a direct HTTP download location.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)

                'upload_date': officialDate,
                'description': officialTitle,

            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # Scrape description, thumbnail and player URL from the page's
        # <meta> tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the (URL-encoded) address of its config file.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
            u'Downloading configuration',
            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # The media URL sits in the second playlist entry.
        videoUrl = playlist[1]['url']

            'uploader': showName,
            'upload_date': None,
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

            'upload_date': None,

        self.report_extraction(video_id)
        # Step 1: fetch the video metadata XML ...
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
            raise ExtractorError(u'Invalid metadata XML file')

        # Step 2: ... then the Adobe HDS (f4m) manifest it points to.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
            # f4m elements live in the Adobe f4m XML namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        # Compose the direct fragment URL from the manifest's location.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The media URL is URL-encoded in the page's flv_url variable.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Title comes from the <title> tag, minus the site-name suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
            raise ExtractorError(u'Unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = mobj.group(0)

            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink into the API's track resource (JSON).
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Ask the CDN for the track's stream variants.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'description': info['description'],
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink into the API's playlist resource (JSON).
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        # API errors are reported per-entry and extraction is aborted.
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))

        self.report_extraction(full_title)
        # Build one info dict per track in the set.
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'description': track['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The RTMP path is base64-encoded in the page's jsclassref variable.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
            raise ExtractorError(u'Unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive the id and extension from the media file's basename.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
            bitrate_list = jsonData[fmt]
            # 'best' (or an unknown bitrate) selects the highest available.
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
                # Probe the URL; any network/HTTP failure rejects it.
                compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:

    def _print_formats(self, formats):
        """Print each format/bitrate pair with its file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # Parse the API response.
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        # No/best format requested: take the first format with a live URL.
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches three URL shapes: a specific video ('course' + 'video' groups),
    # a course page ('course' only), or the site root.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
                'id': course + '_' + video,
                'upload_date': None,

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            # Per-video metadata is served as <video>.xml next to the media.
            xmlUrl = baseUrl + video + '.xml'
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
                'upload_date': None,

            coursepage = self._download_webpage(url, info['id'],
                                   note='Downloading course info page',
                                   errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
                info['title'] = unescapeHTML(m.group(1))
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
                info['description'] = unescapeHTML(m.group(1))

            # Each VideoPage link on the course page becomes a reference
            # entry, later re-extracted individually.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
                'id': 'Stanford OpenClassroom',
                'upload_date': None,

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Each CoursePage link on the root page becomes a reference
            # entry, later re-extracted individually.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        # The scheme is optional in _VALID_URL; default to http.
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # Song metadata is exposed through mtv_* meta tags, latin-1 encoded.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
            raise ExtractorError(u'Unable to extract song name')
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
            raise ExtractorError(u'Unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
            raise ExtractorError(u'Unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
            raise ExtractorError(u'Unable to extract content id')
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing the available renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
            raise ExtractorError('Invalid rendition field.')

            'uploader': performer,
            'upload_date': None,
            'title': video_title,
class YoukuIE(InfoExtractor):
    # Matches v.youku.com/v_show/id_<alphanumeric id>.html.
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

        # Session id: millisecond timestamp followed by two random blocks.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Shuffle the alphabet with a linear-congruential generator keyed
        # by 'seed' — presumably mirrors Youku's player algorithm (confirm).
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        # Translate the obfuscated '*'-separated id through the mixed alphabet.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
            realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Default/best prefers hd2 when available; 'worst' picks lowest.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
            elif format == 'worst':

            fileid = config['data'][0]['streamfileids'][format]
            # One key per segment, needed to authorise each download URL.
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'upload_date': None,
                'title': video_title,

            files_info.append(info)
3010 class XNXXIE(InfoExtractor):
3011 """Information extractor for xnxx.com"""
3013 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Regexes for pulling the flv url, page title, and thumbnail out of the
# watch-page HTML.
3015 VIDEO_URL_RE = r'flv_url=(.*?)&'
3016 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3017 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3019 def _real_extract(self, url):
3020 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if ... is None:` guards before each raise below are
# missing from the visible lines of this listing.
3022 raise ExtractorError(u'Invalid URL: %s' % url)
3023 video_id = mobj.group(1)
3025 # Get webpage content
3026 webpage = self._download_webpage(url, video_id)
3028 result = re.search(self.VIDEO_URL_RE, webpage)
3030 raise ExtractorError(u'Unable to extract video url')
# The flv url is percent-encoded in the page; decode it.
3031 video_url = compat_urllib_parse.unquote(result.group(1))
3033 result = re.search(self.VIDEO_TITLE_RE, webpage)
3035 raise ExtractorError(u'Unable to extract video title')
3036 video_title = result.group(1)
3038 result = re.search(self.VIDEO_THUMB_RE, webpage)
3040 raise ExtractorError(u'Unable to extract video thumbnail')
3041 video_thumbnail = result.group(1)
# NOTE(review): the opener of the returned info dict (id/url/ext lines)
# and the final return are missing from the visible lines.
3047 'upload_date': None,
3048 'title': video_title,
3050 'thumbnail': video_thumbnail,
3051 'description': None,
3055 class GooglePlusIE(InfoExtractor):
3056 """Information extractor for plus.google.com."""
3058 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3059 IE_NAME = u'plus.google'
# Progress-reporting helpers; each just echoes a step to the screen.
3061 def report_extract_entry(self, url):
3062 """Report downloading extry"""
3063 self.to_screen(u'Downloading entry: %s' % url)
3065 def report_date(self, upload_date):
3066 """Report downloading extry"""
3067 self.to_screen(u'Entry date: %s' % upload_date)
3069 def report_uploader(self, uploader):
3070 """Report downloading extry"""
3071 self.to_screen(u'Uploader: %s' % uploader)
3073 def report_title(self, video_title):
3074 """Report downloading extry"""
3075 self.to_screen(u'Title: %s' % video_title)
3077 def report_extract_vid_page(self, video_page):
3078 """Report information extraction."""
3079 self.to_screen(u'Extracting video page: %s' % video_page)
3081 def _real_extract(self, url):
3082 # Extract id from URL
3083 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard is missing from the visible
# lines; the raise below is its body.
3085 raise ExtractorError(u'Invalid URL: %s' % url)
3087 post_url = mobj.group(0)
3088 video_id = mobj.group(1)
3090 video_extension = 'flv'
3092 # Step 1, Retrieve post webpage to extract further information
3093 self.report_extract_entry(post_url)
3094 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3096 # Extract update date
3098 pattern = 'title="Timestamp">(.*?)</a>'
3099 mobj = re.search(pattern, webpage)
# NOTE(review): the None-check branch between search and use is missing
# from the visible lines (here and for the uploader/title searches).
3101 upload_date = mobj.group(1)
3102 # Convert timestring to a format suitable for filename
3103 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3104 upload_date = upload_date.strftime('%Y%m%d')
3105 self.report_date(upload_date)
3109 pattern = r'rel\="author".*?>(.*?)</a>'
3110 mobj = re.search(pattern, webpage)
3112 uploader = mobj.group(1)
3113 self.report_uploader(uploader)
3116 # Get the first line for title
3118 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3119 mobj = re.search(pattern, webpage)
3121 video_title = mobj.group(1)
3122 self.report_title(video_title)
3124 # Step 2, Stimulate clicking the image box to launch video
3125 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3126 mobj = re.search(pattern, webpage)
3128 self._downloader.report_error(u'unable to extract video page URL')
3130 video_page = mobj.group(1)
3131 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3132 self.report_extract_vid_page(video_page)
3135 # Extract video links on video page
3136 """Extract video links of all sizes"""
3137 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3138 mobj = re.findall(pattern, webpage)
3140 self._downloader.report_error(u'unable to extract video links')
3142 # Sort in resolution
3143 links = sorted(mobj)
3145 # Choose the lowest of the sort, i.e. highest resolution
3146 video_url = links[-1]
3147 # Only get the url. The resolution part in the tuple has no use anymore
3148 video_url = video_url[-1]
3149 # Treat escaped \u0026 style hex
# Python 2 strs have .decode; on Python 3 the AttributeError path
# round-trips through bytes to apply unicode-escape.
3151 video_url = video_url.decode("unicode_escape")
3152 except AttributeError: # Python 3
3153 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
# NOTE(review): the returned info dict's opener (id/url lines) and the
# final return are missing from the visible lines.
3159 'uploader': uploader,
3160 'upload_date': upload_date,
3161 'title': video_title,
3162 'ext': video_extension,
3165 class NBAIE(InfoExtractor):
# Extractor for nba.com video pages; the mp4 URL is derived directly
# from the path component of the page URL.
3166 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3169 def _real_extract(self, url):
3170 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard is missing from the visible
# lines; the raise below is its body.
3172 raise ExtractorError(u'Invalid URL: %s' % url)
3174 video_id = mobj.group(1)
3175 if video_id.endswith('/index.html'):
3176 video_id = video_id[:-len('/index.html')]
3178 webpage = self._download_webpage(url, video_id)
# The CDN URL is built from the path; no per-page URL extraction needed.
3180 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Helper: first regex group from the page, HTML-unescaped, or `default`.
3181 def _findProp(rexp, default=None):
3182 m = re.search(rexp, webpage)
# NOTE(review): the `if m:` / else-default lines are missing from the
# visible lines of this helper.
3184 return unescapeHTML(m.group(1))
3188 shortened_video_id = video_id.rpartition('/')[2]
3189 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
# NOTE(review): the info dict opener and the return are missing from the
# visible lines below.
3191 'id': shortened_video_id,
3195 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3196 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3200 class JustinTVIE(InfoExtractor):
3201 """Information extractor for justin.tv and twitch.tv"""
3202 # TODO: One broadcast may be split into multiple videos. The key
3203 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3204 # starts at 1 and increases. Can we treat all parts as one video?
# URL forms: a bare channel, a /b/ broadcast video, or a /c/ chapter.
3206 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3208 (?P<channelid>[^/]+)|
3209 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3210 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
# Justin.tv API pages return at most this many clips per request.
3214 _JUSTIN_PAGE_LIMIT = 100
3215 IE_NAME = u'justin.tv'
3217 def report_download_page(self, channel, offset):
3218 """Report attempt to download a single page of videos."""
3219 self.to_screen(u'%s: Downloading video information from %d to %d' %
3220 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3222 # Return count of items, list of *valid* items
3223 def _parse_page(self, url, video_id):
3224 webpage = self._download_webpage(url, video_id,
3225 u'Downloading video info JSON',
3226 u'unable to download video info JSON')
3228 response = json.loads(webpage)
# A non-list response is an API error payload with an 'error' field.
3229 if type(response) != list:
3230 error_text = response.get('error', 'unknown error')
3231 raise ExtractorError(u'Justin.tv API: %s' % error_text)
# NOTE(review): the `info = []` initialization is missing from the
# visible lines before this loop.
3233 for clip in response:
3234 video_url = clip['video_file_url']
3236 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-ish; strip dashes from the date part -> YYYYMMDD.
3237 video_date = re.sub('-', '', clip['start_time'][:10])
3238 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3239 video_id = clip['id']
3240 video_title = clip.get('title', video_id)
# NOTE(review): the info.append dict opener (id/url lines) is missing
# from the visible lines below.
3244 'title': video_title,
3245 'uploader': clip.get('channel_name', video_uploader_id),
3246 'uploader_id': video_uploader_id,
3247 'upload_date': video_date,
3248 'ext': video_extension,
3250 return (len(response), info)
3252 def _real_extract(self, url):
3253 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard is missing from the visible
# lines; the raise below is its body.
3255 raise ExtractorError(u'invalid URL: %s' % url)
3257 api_base = 'http://api.justin.tv'
3259 if mobj.group('channelid'):
3261 video_id = mobj.group('channelid')
3262 api = api_base + '/channel/archives/%s.json' % video_id
3263 elif mobj.group('chapterid'):
3264 chapter_id = mobj.group('chapterid')
3266 webpage = self._download_webpage(url, chapter_id)
3267 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3269 raise ExtractorError(u'Cannot find archive of a chapter')
3270 archive_id = m.group(1)
3272 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3273 chapter_info_xml = self._download_webpage(api, chapter_id,
3274 note=u'Downloading chapter information',
3275 errnote=u'Chapter information download failed')
3276 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
# Find the <archive> whose id matches the page's archive_id; `a` is
# reused after the loop (for/else idiom; the else raise is below).
3277 for a in doc.findall('.//archive'):
3278 if archive_id == a.find('./id').text:
3281 raise ExtractorError(u'Could not find chapter in chapter information')
3283 video_url = a.find('./video_file_url').text
3284 video_ext = video_url.rpartition('.')[2] or u'flv'
3286 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3287 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3288 note='Downloading chapter metadata',
3289 errnote='Download of chapter metadata failed')
3290 chapter_info = json.loads(chapter_info_json)
3292 bracket_start = int(doc.find('.//bracket_start').text)
3293 bracket_end = int(doc.find('.//bracket_end').text)
3295 # TODO determine start (and probably fix up file)
3296 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3297 #video_url += u'?start=' + TODO:start_timestamp
3298 # bracket_start is 13290, but we want 51670615
3299 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3300 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
# NOTE(review): the chapter info dict's opener/url lines and its return
# are missing from the visible lines below.
3303 'id': u'c' + chapter_id,
3306 'title': chapter_info['title'],
3307 'thumbnail': chapter_info['preview'],
3308 'description': chapter_info['description'],
3309 'uploader': chapter_info['channel']['display_name'],
3310 'uploader_id': chapter_info['channel']['name'],
# NOTE(review): the `else:` introducing the plain-video branch is
# missing from the visible lines.
3314 video_id = mobj.group('videoid')
3315 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3317 self.report_extraction(video_id)
# Page through the API until a short page signals the end.
# NOTE(review): `info = []`, `offset = 0`, `paged` setup, and the loop
# header are missing from the visible lines.
3321 limit = self._JUSTIN_PAGE_LIMIT
3324 self.report_download_page(video_id, offset)
3325 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3326 page_count, page_info = self._parse_page(page_url, video_id)
3327 info.extend(page_info)
3328 if not paged or page_count != limit:
3333 class FunnyOrDieIE(InfoExtractor):
# Extractor for funnyordie.com video pages.
3334 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3336 def _real_extract(self, url):
3337 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard is missing from the visible
# lines; the raise below is its body.
3339 raise ExtractorError(u'invalid URL: %s' % url)
3341 video_id = mobj.group('id')
3342 webpage = self._download_webpage(url, video_id)
# The second <source> tag inside <video> carries the file URL.
3344 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3346 self._downloader.report_error(u'unable to find video information')
3347 video_url = unescapeHTML(m.group('url'))
# Prefer the player headline; fall back to the <title> tag.
3349 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3351 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3353 self._downloader.report_error(u'Cannot find video title')
3354 title = clean_html(m.group('title'))
3356 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3358 desc = unescapeHTML(m.group('desc'))
# NOTE(review): the info dict opener (id/url/title lines) and the return
# are missing from the visible lines below.
3367 'description': desc,
3371 class SteamIE(InfoExtractor):
# Extractor for store.steampowered.com video/app pages; returns a
# playlist of all movies found on the (age-gate-bypassed) page.
3372 _VALID_URL = r"""http://store\.steampowered\.com/
3374 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3376 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# NOTE(review): the @classmethod decorator line for suitable() is
# missing from the visible lines.
3380 def suitable(cls, url):
3381 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose-mode pattern.
3382 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3384 def _real_extract(self, url):
3385 m = re.match(self._VALID_URL, url, re.VERBOSE)
3386 gameID = m.group('gameID')
# Fixed birthdate query string bypasses Steam's age check.
3387 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3388 self.report_age_confirmation()
3389 webpage = self._download_webpage(videourl, gameID)
3390 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3392 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3393 mweb = re.finditer(urlRE, webpage)
3394 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3395 titles = re.finditer(namesRE, webpage)
3396 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3397 thumbs = re.finditer(thumbsRE, webpage)
# NOTE(review): the `videos = []` initialization is missing from the
# visible lines before this loop.
3399 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3400 video_id = vid.group('videoID')
3401 title = vtitle.group('videoName')
3402 video_url = vid.group('videoURL')
3403 video_thumb = thumb.group('thumbnail')
# NOTE(review): the `if not video_url:` guard and the videos.append dict
# opener are missing from the visible lines around here.
3405 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3410 'title': unescapeHTML(title),
3411 'thumbnail': video_thumb
3414 return [self.playlist_result(videos, gameID, game_title)]
3416 class UstreamIE(InfoExtractor):
# Extractor for recorded ustream.tv videos.
3417 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3418 IE_NAME = u'ustream'
3420 def _real_extract(self, url):
3421 m = re.match(self._VALID_URL, url)
3422 video_id = m.group('videoID')
# The flv lives at a fixed CDN path derived from the video id.
3423 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3424 webpage = self._download_webpage(url, video_id)
3425 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3426 title = m.group('title')
3427 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3428 uploader = m.group('uploader')
# NOTE(review): the info dict opener (id/url/title/ext lines) and the
# return are missing from the visible lines below.
3434 'uploader': uploader
3438 class WorldStarHipHopIE(InfoExtractor):
# Extractor for worldstarhiphop.com / worldstarcandy.com video pages.
3439 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3440 IE_NAME = u'WorldStarHipHop'
3442 def _real_extract(self, url):
# Flash player variable that carries the media URL.
3443 _src_url = r'so\.addVariable\("file","(.*?)"\)'
3445 m = re.match(self._VALID_URL, url)
3446 video_id = m.group('id')
3448 webpage_src = self._download_webpage(url, video_id)
3450 mobj = re.search(_src_url, webpage_src)
3452 if mobj is not None:
3453 video_url = mobj.group(1)
# Extension is inferred from the URL ('mp4' branch visible; the branch
# bodies and the else/flv case are missing from this listing).
3454 if 'mp4' in video_url:
# NOTE(review): intervening lines are missing; the raise below belongs
# to the no-match else branch.
3459 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3461 mobj = re.search(r"<title>(.*)</title>", webpage_src)
3464 raise ExtractorError(u'Cannot determine title')
3465 title = mobj.group(1)
3467 mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
3468 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3469 if mobj is not None:
3470 thumbnail = mobj.group(1)
# Candy pages hide the real title in a candytitles span.
3472 _title = r"""candytitles.*>(.*)</span>"""
3473 mobj = re.search(_title, webpage_src)
3474 if mobj is not None:
3475 title = mobj.group(1)
# NOTE(review): the results dict opener (id/url/title/ext) and the
# return are missing from the visible lines below.
3482 'thumbnail' : thumbnail,
3487 class RBMARadioIE(InfoExtractor):
# Extractor for rbmaradio.com shows: metadata is embedded as JSON in an
# inline <script> assigning gon.show.
3488 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3490 def _real_extract(self, url):
3491 m = re.match(self._VALID_URL, url)
3492 video_id = m.group('videoID')
3494 webpage = self._download_webpage(url, video_id)
3495 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
# NOTE(review): the `if m is None:` guard is missing from the visible
# lines; the raise below is its body.
3497 raise ExtractorError(u'Cannot find metadata')
3498 json_data = m.group(1)
# NOTE(review): the `try:` opener for this parse is missing from the
# visible lines.
3501 data = json.loads(json_data)
3502 except ValueError as e:
3503 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force the 256kbps stream variant.
3505 video_url = data['akamai_url'] + '&cbr=256'
3506 url_parts = compat_urllib_parse_urlparse(video_url)
3507 video_ext = url_parts.path.rpartition('.')[2]
# NOTE(review): the info dict opener (id/url/ext) and the return are
# missing from the visible lines below.
3512 'title': data['title'],
3513 'description': data.get('teaser_text'),
3514 'location': data.get('country_of_origin'),
3515 'uploader': data.get('host', {}).get('name'),
3516 'uploader_id': data.get('host', {}).get('slug'),
3517 'thumbnail': data.get('image', {}).get('large_url_2x'),
3518 'duration': data.get('duration'),
3523 class YouPornIE(InfoExtractor):
3524 """Information extractor for youporn.com."""
3525 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3527 def _print_formats(self, formats):
3528 """Print all available formats"""
3529 print(u'Available formats:')
3530 print(u'ext\t\tformat')
3531 print(u'---------------------------------')
3532 for format in formats:
3533 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Return the entry matching req_format.
# NOTE(review): the loop header over `formats` and the return lines are
# missing from the visible lines of this helper.
3535 def _specific(self, req_format, formats):
3537 if(x["format"]==req_format):
3541 def _real_extract(self, url):
3542 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if ... is None:` guards are missing throughout this
# listing; each bare raise/warning below is such a branch body.
3544 raise ExtractorError(u'Invalid URL: %s' % url)
3546 video_id = mobj.group('videoid')
# Bypass the age gate with a pre-set cookie.
3548 req = compat_urllib_request.Request(url)
3549 req.add_header('Cookie', 'age_verified=1')
3550 webpage = self._download_webpage(req, video_id)
3552 # Get the video title
3553 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3555 raise ExtractorError(u'Unable to extract video title')
3556 video_title = result.group('title').strip()
3558 # Get the video date
3559 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3561 self._downloader.report_warning(u'unable to extract video date')
3564 upload_date = unified_strdate(result.group('date').strip())
3566 # Get the video uploader
3567 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3569 self._downloader.report_warning(u'unable to extract uploader')
3570 video_uploader = None
3572 video_uploader = result.group('uploader').strip()
3573 video_uploader = clean_html( video_uploader )
3575 # Get all of the formats available
3576 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3577 result = re.search(DOWNLOAD_LIST_RE, webpage)
3579 raise ExtractorError(u'Unable to extract download list')
3580 download_list_html = result.group('download_list').strip()
3582 # Get all of the links from the page
3583 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3584 links = re.findall(LINK_RE, download_list_html)
3585 if(len(links) == 0):
3586 raise ExtractorError(u'ERROR: no known formats available for video')
3588 self.to_screen(u'Links found: %d' % len(links))
# NOTE(review): the `formats = []` initialization and the loop header
# over `links` are missing from the visible lines.
3593 # A link looks like this:
3594 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3595 # A path looks like this:
3596 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3597 video_url = unescapeHTML( link )
3598 path = compat_urllib_parse_urlparse( video_url ).path
3599 extension = os.path.splitext( path )[1][1:]
# Path segment 4 encodes "<size>_<bitrate>_<id>"; keep size and bitrate.
3600 format = path.split('/')[4].split('_')[:2]
# NOTE(review): the size/bitrate unpacking lines are missing from the
# visible lines before the join below.
3603 format = "-".join( format )
3604 title = u'%s-%s-%s' % (video_title, size, bitrate)
# NOTE(review): the formats.append dict opener (id/url lines) is missing
# from the visible lines below.
3609 'uploader': video_uploader,
3610 'upload_date': upload_date,
3615 'description': None,
3619 if self._downloader.params.get('listformats', None):
3620 self._print_formats(formats)
3623 req_format = self._downloader.params.get('format', None)
3624 self.to_screen(u'Format: %s' % req_format)
# Format selection: best = first entry, worst = last, all = everything,
# otherwise look up the exact requested format.
3626 if req_format is None or req_format == 'best':
3628 elif req_format == 'worst':
3629 return [formats[-1]]
3630 elif req_format in ('-1', 'all'):
3633 format = self._specific( req_format, formats )
3635 raise ExtractorError(u'Requested format not available')
3640 class PornotubeIE(InfoExtractor):
3641 """Information extractor for pornotube.com."""
3642 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3644 def _real_extract(self, url):
3645 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard is missing from the visible
# lines; the raise below is its body.
3647 raise ExtractorError(u'Invalid URL: %s' % url)
3649 video_id = mobj.group('videoid')
# Title comes straight from the URL path, not the page.
3650 video_title = mobj.group('title')
3652 # Get webpage content
3653 webpage = self._download_webpage(url, video_id)
3656 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3657 result = re.search(VIDEO_URL_RE, webpage)
3659 raise ExtractorError(u'Unable to extract video url')
3660 video_url = compat_urllib_parse.unquote(result.group('url'))
3662 #Get the uploaded date
3663 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3664 result = re.search(VIDEO_UPLOADED_RE, webpage)
3666 raise ExtractorError(u'Unable to extract video title')
3667 upload_date = unified_strdate(result.group('date'))
3669 info = {'id': video_id,
# NOTE(review): intermediate dict entries (url/uploader/ext) and the
# return are missing from the visible lines below.
3672 'upload_date': upload_date,
3673 'title': video_title,
3679 class YouJizzIE(InfoExtractor):
3680 """Information extractor for youjizz.com."""
3681 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3683 def _real_extract(self, url):
3684 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if ... is None:` guards are missing from the
# visible lines; each raise below is such a branch body.
3686 raise ExtractorError(u'Invalid URL: %s' % url)
3688 video_id = mobj.group('videoid')
3690 # Get webpage content
3691 webpage = self._download_webpage(url, video_id)
3693 # Get the video title
3694 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3696 raise ExtractorError(u'ERROR: unable to extract video title')
3697 video_title = result.group('title').strip()
3699 # Get the embed page
3700 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3702 raise ExtractorError(u'ERROR: unable to extract embed page')
3704 embed_page_url = result.group(0).strip()
3705 video_id = result.group('videoid')
# The real media URL lives on the embed page, not the watch page.
3707 webpage = self._download_webpage(embed_page_url, video_id)
3710 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3712 raise ExtractorError(u'ERROR: unable to extract video url')
3713 video_url = result.group('source')
3715 info = {'id': video_id,
# NOTE(review): intermediate dict entries (url/ext) and the return are
# missing from the visible lines below.
3717 'title': video_title,
3720 'player_url': embed_page_url}
3724 class EightTracksIE(InfoExtractor):
# Extractor for 8tracks.com mixes: walks the play/next API one track at
# a time until the mix reports its last track.
3726 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3728 def _real_extract(self, url):
3729 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard is missing from the visible
# lines; the raise below is its body.
3731 raise ExtractorError(u'Invalid URL: %s' % url)
3732 playlist_id = mobj.group('id')
3734 webpage = self._download_webpage(url, playlist_id)
# Mix metadata is embedded as a JS assignment on the page.
3736 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3738 raise ExtractorError(u'Cannot find trax information')
3739 json_like = m.group(1)
3740 data = json.loads(json_like)
# Random session token for the play API.
3742 session = str(random.randint(0, 1000000000))
# NOTE(review): the `mix_id = data['id']` assignment appears to be
# among the lines missing from this listing — mix_id is used below.
3744 track_count = data['tracks_count']
3745 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3746 next_url = first_url
# NOTE(review): the `res = []` accumulator initialization is missing
# from the visible lines before this loop.
3748 for i in itertools.count():
3749 api_json = self._download_webpage(next_url, playlist_id,
3750 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3751 errnote=u'Failed to download song information')
3752 api_data = json.loads(api_json)
3753 track_data = api_data[u'set']['track']
# NOTE(review): the info dict opener is missing from the visible lines.
3755 'id': track_data['id'],
3756 'url': track_data['track_file_stream_url'],
3757 'title': track_data['performer'] + u' - ' + track_data['name'],
3758 'raw_title': track_data['name'],
3759 'uploader_id': data['user']['login'],
# Stop once the API flags the last track; otherwise advance the cursor.
3763 if api_data['set']['at_last_track']:
3765 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3768 class KeekIE(InfoExtractor):
# Extractor for keek.com; media and thumbnail URLs are fixed CDN paths
# derived from the video id.
3769 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3772 def _real_extract(self, url):
3773 m = re.match(self._VALID_URL, url)
3774 video_id = m.group('videoID')
3775 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3776 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3777 webpage = self._download_webpage(url, video_id)
3778 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3779 title = unescapeHTML(m.group('title'))
3780 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3781 uploader = clean_html(m.group('uploader'))
# NOTE(review): the info dict opener (id/url/ext/title) and the return
# are missing from the visible lines below.
3787 'thumbnail': thumbnail,
3788 'uploader': uploader
3792 class TEDIE(InfoExtractor):
# Extractor for ted.com talks and playlists (verbose-mode URL pattern).
3793 _VALID_URL=r'''http://www\.ted\.com/
3795 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3797 ((?P<type_talk>talks)) # We have a simple talk
3799 (/lang/(.*?))? # The url may contain the language
3800 /(?P<name>\w+) # Here goes the name and then ".html"
# NOTE(review): the @classmethod decorator line for suitable() is
# missing from the visible lines.
3804 def suitable(cls, url):
3805 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose-mode pattern.
3806 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3808 def _real_extract(self, url):
3809 m=re.match(self._VALID_URL, url, re.VERBOSE)
3810 if m.group('type_talk'):
3811 return [self._talk_info(url)]
# NOTE(review): the `else:` introducing the playlist branch is missing
# from the visible lines.
3813 playlist_id=m.group('playlist_id')
3814 name=m.group('name')
3815 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3816 return [self._playlist_videos_info(url,name,playlist_id)]
3818 def _talk_video_link(self,mediaSlug):
3819 '''Returns the video link for that mediaSlug'''
3820 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3822 def _playlist_videos_info(self,url,name,playlist_id=0):
3823 '''Returns the videos of the playlist'''
# NOTE(review): the `video_RE=r'''` opener for this verbose pattern is
# missing from the visible lines.
3825 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3826 ([.\s]*?)data-playlist_item_id="(\d+)"
3827 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3829 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3830 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3831 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3832 m_names=re.finditer(video_name_RE,webpage)
3834 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3835 m_playlist = re.search(playlist_RE, webpage)
3836 playlist_title = m_playlist.group('playlist_title')
# Each playlist entry is delegated back to this extractor as a url
# result for the individual talk.
3838 playlist_entries = []
3839 for m_video, m_name in zip(m_videos,m_names):
3840 video_id=m_video.group('video_id')
3841 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3842 playlist_entries.append(self.url_result(talk_url, 'TED'))
3843 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3845 def _talk_info(self, url, video_id=0):
3846 """Return the video for the talk in the url"""
3847 m=re.match(self._VALID_URL, url,re.VERBOSE)
3848 videoName=m.group('name')
3849 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3850 # If the url includes the language we get the title translated
3851 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3852 title=re.search(title_RE, webpage).group('title')
# The talkDetails JS blob carries the numeric id and media slug.
3853 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3854 "id":(?P<videoID>[\d]+).*?
3855 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3856 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3857 thumb_match=re.search(thumb_RE,webpage)
3858 info_match=re.search(info_RE,webpage,re.VERBOSE)
3859 video_id=info_match.group('videoID')
3860 mediaSlug=info_match.group('mediaSlug')
3861 video_url=self._talk_video_link(mediaSlug)
# NOTE(review): the info dict opener (id/url/ext/title) and the return
# are missing from the visible lines below.
3867 'thumbnail': thumb_match.group('thumbnail')
3871 class MySpassIE(InfoExtractor):
# Extractor for myspass.de: video id is taken from the URL path and the
# rest of the metadata from an XML endpoint.
3872 _VALID_URL = r'http://www.myspass.de/.*'
3874 def _real_extract(self, url):
3875 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3877 # video id is the last path element of the URL
3878 # usually there is a trailing slash, so also try the second but last
3879 url_path = compat_urllib_parse_urlparse(url).path
3880 url_parent_path, video_id = os.path.split(url_path)
# NOTE(review): the `if not video_id:` guard before this fallback split
# is missing from the visible lines.
3882 _, video_id = os.path.split(url_parent_path)
3885 metadata_url = META_DATA_URL_TEMPLATE % video_id
3886 metadata_text = self._download_webpage(metadata_url, video_id)
3887 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3889 # extract values from metadata
3890 url_flv_el = metadata.find('url_flv')
3891 if url_flv_el is None:
3892 raise ExtractorError(u'Unable to extract download url')
3893 video_url = url_flv_el.text
3894 extension = os.path.splitext(video_url)[1][1:]
3895 title_el = metadata.find('title')
3896 if title_el is None:
3897 raise ExtractorError(u'Unable to extract title')
3898 title = title_el.text
3899 format_id_el = metadata.find('format_id')
# NOTE(review): the `format = None`-style else-branch lines are missing
# from the visible lines here and for description/thumbnail below.
3900 if format_id_el is None:
3903 format = format_id_el.text
3904 description_el = metadata.find('description')
3905 if description_el is not None:
3906 description = description_el.text
3909 imagePreview_el = metadata.find('imagePreview')
3910 if imagePreview_el is not None:
3911 thumbnail = imagePreview_el.text
# NOTE(review): the info dict opener (id/url/title/ext/format) and the
# return are missing from the visible lines below.
3920 'thumbnail': thumbnail,
3921 'description': description
3925 class SpiegelIE(InfoExtractor):
# Extractor for spiegel.de videos; stream variants come from a per-video
# XML document and the last entry is used.
3926 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3928 def _real_extract(self, url):
3929 m = re.match(self._VALID_URL, url)
3930 video_id = m.group('videoID')
3932 webpage = self._download_webpage(url, video_id)
3933 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
# NOTE(review): the `if m is None:` guard is missing from the visible
# lines; the raise below is its body.
3935 raise ExtractorError(u'Cannot find title')
3936 video_title = unescapeHTML(m.group(1))
3938 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3939 xml_code = self._download_webpage(xml_url, video_id,
3940 note=u'Downloading XML', errnote=u'Failed to download XML')
3942 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last child element of the XML holds the chosen stream variant.
3943 last_type = idoc[-1]
3944 filename = last_type.findall('./filename')[0].text
3945 duration = float(last_type.findall('./duration')[0].text)
3947 video_url = 'http://video2.spiegel.de/flash/' + filename
3948 video_ext = filename.rpartition('.')[2]
# NOTE(review): the info dict opener (id/url/ext) and the return are
# missing from the visible lines below.
3953 'title': video_title,
3954 'duration': duration,
3958 class LiveLeakIE(InfoExtractor):
3960 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3961 IE_NAME = u'liveleak'
3963 def _real_extract(self, url):
3964 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if ... is None:` guards are missing from the
# visible lines; each raise/report below is such a branch body.
3966 raise ExtractorError(u'Invalid URL: %s' % url)
3968 video_id = mobj.group('video_id')
3970 webpage = self._download_webpage(url, video_id)
3972 m = re.search(r'file: "(.*?)",', webpage)
3974 raise ExtractorError(u'Unable to find video url')
3975 video_url = m.group(1)
3977 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3979 self._downloader.report_error(u'Cannot find video title')
# Strip the site branding prefix from the og:title.
3980 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
3982 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3984 desc = unescapeHTML(m.group('desc'))
3988 m = re.search(r'By:.*?(\w+)</a>', webpage)
3990 uploader = clean_html(m.group(1))
# NOTE(review): the info dict opener (id/url/ext/title) and the return
# are missing from the visible lines below.
3999 'description': desc,
4000 'uploader': uploader
4005 class ARDIE(InfoExtractor):
# Extractor for ARD Mediathek / daserste.de; streams are registered via
# mediaCollection.addMediaStream calls in the page.
4006 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4007 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
4008 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4010 def _real_extract(self, url):
4011 # determine video id from url
4012 m = re.match(self._VALID_URL, url)
# Prefer the numeric documentId query parameter when present.
4014 numid = re.search(r'documentId=([0-9]+)', url)
# NOTE(review): the if/else lines around this numid fallback are missing
# from the visible lines.
4016 video_id = numid.group(1)
4018 video_id = m.group('video_id')
4020 # determine title and media streams from webpage
4021 html = self._download_webpage(url, video_id)
4022 title = re.search(self._TITLE, html).group('title')
4023 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# NOTE(review): the `if not streams:` guard is missing from the visible
# lines; the assert/raise below are its body (fsk = German age rating).
4025 assert '"fsk"' in html
4026 raise ExtractorError(u'This video is only available after 8:00 pm')
4028 # choose default media type and highest quality for now
4029 stream = max([s for s in streams if int(s["media_type"]) == 0],
4030 key=lambda s: int(s["quality"]))
4032 # there's two possibilities: RTMP stream or HTTP download
4033 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4034 if stream['rtmp_url']:
4035 self.to_screen(u'RTMP download detected')
4036 assert stream['video_url'].startswith('mp4:')
4037 info["url"] = stream["rtmp_url"]
4038 info["play_path"] = stream['video_url']
# NOTE(review): the `else:` for the HTTP branch and the final return are
# missing from the visible lines.
4040 assert stream["video_url"].endswith('.mp4')
4041 info["url"] = stream["video_url"]
4044 class TumblrIE(InfoExtractor):
# Extractor for tumblr.com video posts.
4045 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4047 def _real_extract(self, url):
4048 m_url = re.match(self._VALID_URL, url)
4049 video_id = m_url.group('id')
4050 blog = m_url.group('blog_name')
# Normalize to the canonical post URL before downloading.
4052 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4053 webpage = self._download_webpage(url, video_id)
# The embedded player markup is JS-escaped (\x22 = double quote).
4055 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4056 video = re.search(re_video, webpage)
# NOTE(review): the `if video is None:` guard is missing from the
# visible lines; the message below is its branch body.
4058 self.to_screen("No video founded")
4060 video_url = video.group('video_url')
4061 ext = video.group('ext')
4063 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
4064 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
4066 # The only place where you can get a title, it's not complete,
4067 # but searching in other places doesn't work for all videos
4068 re_title = r'<title>(?P<title>.*?)</title>'
4069 title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
# NOTE(review): the remaining dict entries (url/title/thumbnail/ext) of
# this return are missing from the visible lines.
4071 return [{'id': video_id,
class BandcampIE(InfoExtractor):
    """Information Extractor for free Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # renamed from `id` to avoid shadowing the builtin
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        mobj = re.search(r'<source src="' + '(.+)' + '" type="video/mp4">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = mobj.group(1)

        mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE(review): the extractor list is elided in this chunk — only a few of
    # the entries (and not the enclosing `return [...]`) are visible here.
    # The ordering of the full list is significant: do not reorder entries,
    # since the first matching IE wins.
    YoutubePlaylistIE(),
    StanfordOpenClassroomIE(),
    WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Resolve the class by naming convention: "<name>IE" in this module.
    class_name = '%sIE' % ie_name
    return globals()[class_name]