2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

        """Getter method for _WORKING."""

        """Initializes an instance (authentication, etc)."""
            self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

        # IE_NAME: class name minus the trailing "IE" suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle.

        Raises ExtractorError (with the original traceback attached) when
        the HTTP/network layer fails.
        """
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        # Pick the charset out of a header like "text/html; charset=utf-8".
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
        webpage_bytes = urlh.read()
        # 'replace' so a bad byte never aborts the whole download.
        return webpage_bytes.decode(encoding, 'replace')

    # Methods for following #608.
    # They set the correct value of the '_type' key.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',

    def playlist_result(self, entries):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

                         (?:https?://)? # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)? # handle anchor (#/) redirect urls
                         (?: # the various things that can precede the ID:
                             (?:(?:v|embed|e)/) # v/ or embed/ or e/
                             |(?: # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?) # the params delimiter ? or # or #!
                                 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
                             )? # optional -> youtube.com/xxxx is OK
                         )? # all until now is optional -> you can pass the naked ID
                         ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
                         (?(1).+)? # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> filename extension
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> human-readable resolution (for --list-formats output)
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; defer them to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check for available video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download video subtitles in a given language/format."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        # Returns a dict mapping language code -> track name; on failure a
        # (warning-message, None) tuple is produced instead.
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'WARNING: video doesn\'t have subtitles', None)

    def _list_available_subtitles(self, video_id):
        # Print (not return) the subtitle languages for --list-subs.
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        # Fetch one subtitle track; returns (error-or-None, lang, data).
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        # Language preference: explicit --sub-lang, then English, then
        # whatever language the video offers first.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return (u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None)
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)

    def _extract_all_subtitles(self, video_id):
        # Download every available subtitle track (for --all-subs).
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)

    def _print_formats(self, formats):
        print('Available formats:')
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        # Best-effort session setup: set language, log in (credentials or
        # .netrc), then confirm age.  Failures only emit warnings.
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed

        # Log in: fetch the login page first to scrape the GALX/dsh tokens
        # the Google form requires.
        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form comes back, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # Confirm age
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        # Pull the 11-char video id out of any supported URL shape.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Un-escape the JS-escaped URL ("http:\/\/..." -> "http://...").
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try each known text format.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # closed captions
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
                (sub_error, sub_lang, sub) = video_subtitles[0]
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): only 'itag'/'url' are filtered for above; this
            # line assumes every stream also carries 'sig' — KeyError if not.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age: POST the family-filter opt-out form.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube: "yt-<id>" ids are mirrored
        # videos, so hand them to the YouTube extractor via the downloader.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback: media URL/key live inside the flashvars query string.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # URL path looks like "<id>_<slug>"; keep only the id part.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages still load.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst; first hit wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # Both title and uploader come from the one <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a Yahoo! Video URL.

        Non-'/watch/' URLs are first rewritten to the canonical /watch/
        form and re-extracted (new_video=False guards the recursion).
        Returns a one-element list of info dictionaries, or None on error.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # Group 1 is the literal 'people'/'profile' path component; the
        # uploader name is the anchor text captured by group 2.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width; both are required as playlist
        # query parameters below.
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':          video_id.decode('utf-8'),
            'url':         video_url,
            'uploader':    video_uploader,
            'upload_date': None,
            'title':       video_title,
            'ext':         video_extension.decode('utf-8'),
            'thumbnail':   video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a Vimeo URL.

        Parses the embedded player config JSON for metadata and picks the
        best available codec/quality combination. Returns a one-element
        list of info dictionaries, or None on error.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # HLS redirect links carry no metadata; fetch the normal page.
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.
        # IndexError: the ' = {config:' marker is missing;
        # ValueError: the carved-out fragment is not valid JSON.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (ValueError, IndexError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (YYYYMMDD) from the dateCreated meta tag
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title':       video_title,
            'ext':         video_extension,
            'thumbnail':   video_thumbnail,
            'description': video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download and return the raw page body for url (None on error)."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex and collect the listed groups.

        matchTuples is a list of (group_index, key, error_message); each
        matched group is stored under key in the returned dict. Reports
        the corresponding error and returns None when a group is missing.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Extract the RTMP URL of a live stream page (not downloaded)."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the arte+7 reference chain and return an info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':          info.get('id'),
            'url':         compat_urllib_parse.unquote(info.get('url')),
            'uploader':    u'arte.tv',
            'upload_date': info.get('date'),
            'title':       info.get('title').decode('utf-8'),
            'ext':         u'mp4',
            'format':      u'NA',
            'player_url':  None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live streams are only probed, not returned as downloads.
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so only headers are transferred.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        headers=newheaders,
                                        origin_req_host=req.get_origin_req_host(),
                                        unverifiable=True))

        # Build our opener from scratch so only the handlers above run.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            # Not a redirect.
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Last-resort extraction: scrape a direct media URL from the page."""
        new_url = self._test_redirect(url)
        if new_url:
            return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    video_uploader,
            'upload_date': None,
            'title':       video_title,
            'ext':         video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse 'ytsearch[N|all]:terms' and queue the matching downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split only on the first colon so search terms may contain ':'.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n  # refined below once the API reports totalItems

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more than the service says exists.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""

    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse 'gvsearch[N|all]:terms' and queue the matching downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split only on the first colon so search terms may contain ':'.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further result pages; download what we have.
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse 'yvsearch[N|all]:terms' and queue the matching downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split only on the first colon so search terms may contain ':'.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()  # result pages may repeat entries
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further result pages; download what we have.
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50  # GData API page size
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with whitespace/comments -> needs VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video URLs of a playlist via the GData API."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))
                return

            if not 'feed' in response or not 'entry' in response['feed']:
                self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API')
                return
            # Keep (position, url) pairs so the playlist order survives
            # the sort below even if the API pages arrive unordered.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                # A short page means this was the last one.
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        # Honor --playlist-start / --playlist-end (1-based, -1 == no end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        url_results = [self.url_result(url) for url in videos]
        return [self.playlist_result(url_results)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Pagination marker in the channel HTML ('Next »').
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect all video URLs of a channel and return them as a playlist."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_entries)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all upload URLs of a user and return them as a playlist."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Honor --playlist-start / --playlist-end (1-based, -1 == no end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_results)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Scrapes the numeric users_id from the user's HTML page, then pages
    through blip.tv's mobile AJAX episode-list endpoint collecting video
    ids, and returns a single playlist of per-video URLs.

    NOTE(review): this excerpt is elided — `if mobj is None:` guards,
    `try:` headers, loop headers and `return`s are missing from the
    visible text; gaps are marked with `# [elided: ...]` below.
    """

    # Matches a blip.tv user-page URL or the internal "bliptvuser:NAME"
    # shorthand; group(1) is the username.
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'

    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard and `return` around this error report]
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # AJAX endpoint listing a user's full episode list; %s is the
        # numeric users_id scraped from the HTML page just below.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)
        # [elided: `try:` header for the download below]
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # [elided: paging loop header and pagenum bookkeeping]
        self.report_download_page(username, pagenum)
        url = page_base + "&page=" + str(pagenum)
        request = compat_urllib_request.Request( url )
        # [elided: `try:` header]
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # NOTE(review): uses str(err) while the rest of the file uses
            # compat_str(err) — inconsistent, and lossy on Python 2.
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._PAGE_SIZE:
            # [elided: `break` out of the paging loop]

        all_ids_count = len(video_ids)
        # --playliststart is 1-based; convert to a 0-based slice index.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # [elided: `else:` branch header]
        video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_entries)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    Simulates pressing the 'Free download' button, then scrapes the real
    fileshare URL and the title out of the resulting page.

    NOTE(review): excerpt is elided — `try:` headers, guards and the
    surrounding `return [{...}]` are missing; gaps marked below.
    """

    # (?#locale) is a regex comment: the two-char path segment is an
    # optional locale prefix, e.g. /en/, /de/.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        # POSTing this form field is what triggers the free-download flow.
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        # [elided: `try:` header]
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            # [elided: `else:` branch header]
            self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        # Drop the leading dot from the extension returned by splitext.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        # [elided: `if mobj is None:` guard]
        self._downloader.trouble(u'ERROR: unable to extract title')
        # NOTE(review): .decode('utf-8') on these values is Python-2-only;
        # under Python 3 re returns str and .decode does not exist.
        file_title = mobj.group(1).decode('utf-8')

        # [elided: surrounding `return [{ ... }]` of the info dictionary]
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    Optionally logs in (credentials from --username/--password or .netrc)
    during initialization, then extracts the video URL from the inline
    swf parameter blob on the video page.

    NOTE(review): excerpt is elided — guards, `try:` headers and
    `return`s are missing; gaps marked below.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name used to look up credentials in ~/.netrc.
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        # No downloader attached yet — nothing to configure.
        if self._downloader is None:
            # [elided: `return`]

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # [elided: `try:` header]
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # [elided: unpacking of useremail/password from info]
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Without credentials, skip login entirely (anonymous access).
        if useremail is None:
            # [elided: `return` and construction of login_form]

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # [elided: `try:` header and self.report_login()]
        login_results = compat_urllib_request.urlopen(request).read()
        # The login form being present in the response means login failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
            # [elided: `return`]
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard and `return`]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('ID')

        # Canonicalize to the desktop video page before downloading.
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The JSON array of swf variables sits between these two exact
        # JavaScript fragments on the page.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        # [elided: `if not m:` guard]
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream, fall back to SD.
        video_url = params['hd_src']
        # [elided: `if not video_url:` fallback guard]
        video_url = params['sd_src']
        # [elided: `if not video_url:` guard]
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        # [elided: `if not m:` guard]
        raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        # [elided: surrounding info-dict literal and return]
        'title': video_title,
        'duration': video_duration,
        'thumbnail': params['thumbnail_src'],
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Handles /play/ redirect URLs by resolving them to a direct file id,
    otherwise requests the page with skin=json to obtain metadata. A
    direct video/* response is treated as a direct download.

    NOTE(review): excerpt is elided — `try:` headers, guards and parts
    of the info-dict construction are missing; gaps marked below.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Used to pull the filename extension off the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard and `return`]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # /play/ URLs redirect to a player whose fragment carries the
        # real file reference; resolve it and recurse once.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # [elided: choice of cchar ('?' or '&') based on existing query]
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # Spoofing iTunes avoids an HTML interstitial from blip.tv.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        # [elided: `info = None` and `try:` header]
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            # NOTE(review): .decode('UTF-8') is Python-2-only here.
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # [elided: construction of the direct-download info dict]
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if info is None: # Regular URL
            # [elided: `try:` header]
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))

            # [elided: `try:` header]
            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']
            # [elided: `else: data = json_data`]

            # blip.tv timestamps look like '11-28-11 03:14PM'.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # [elided: `if umobj is None:` guard]
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # [elided: `info = {` opening of the info dict]
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            # Downloader must keep spoofing iTunes for the media fetch.
            'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Downloads the watch page, scrapes the media server base URL from the
    image_src <link> tag, and derives the .flv media URL from it.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was `self._download.trouble(...)` — the attribute
            # is `_downloader`; the typo raised AttributeError instead of
            # reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The image_src link points at the media server directory; the
        # actual media file is <base>/<video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        # _real_extract returns a *list* of info dictionaries.
        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    None,
            'upload_date': None,
            'title':       video_title,
            'ext':         u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report

    Resolves shorthand (:tds, :colbert) and episode/clip URLs, finds the
    mtvnservices media URI in the page, downloads an MRSS index, then a
    per-item mediagen config to pick an RTMP rendition, and rewrites the
    RTMP URL to a plain HTTP one.

    NOTE(review): excerpt is elided — try: headers, guards, returns and
    the format-table bodies are missing; gaps marked below.
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
    # [elided: closing triple-quote of the verbose regex]

    # Bitrates offered by the mediagen config, lowest to highest.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # [elided: table bodies mapping bitrate -> container extension and
    # bitrate -> display dimensions]
    _video_extensions = {
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the base class
        # suitable() (plain re.match) must be overridden here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        # [elided: `for x in formats:` loop header]
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # [elided: `if mobj is None:` guard and `return`]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Shorthand like :tds redirects to the show's full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            # [elided: `else:` branch header]
            url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            # [elided: `else:` branch header]
            epTitle = mobj.group('cntitle')
        # [elided: `else:` branch header]
            # With no explicit episode, download the newest one.
            dlNewest = not mobj.group('episode')
            # [elided: `if dlNewest:` branch]
            epTitle = mobj.group('showname')
            # [elided: `else:` branch header]
            epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        # [elided: `try:` header]
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # [elided: `if dlNewest:` — re-match against the redirect target]
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # [elided: `if mobj is None:` guard]
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a tag without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            # [elided: `else:` branch header]
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        # [elided: `try:` header]
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))

        # [elided: `results = []` initialization]
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like '...:<show>.com:<mediaId>'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            # [elided: `try:` header]
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # [elided: `turls = []` initialization]
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                # [elided: `turls.append(finfo)`]

            # [elided: `if len(turls) == 0:` guard]
            self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                # [elided: `return`]

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            # [elided: `for f,v in turls:` / `if f == req_format:`]
            format, rtmp_video_url = f, v

            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            # [elided: `if not m:` guard]
            raise ExtractorError(u'Cannot transform RTMP url')
            # Same asset is mirrored over plain HTTP at this CDN base.
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            # [elided: `info = {` opening with id/url/title/... entries]
            'upload_date': officialDate,
            'description': officialTitle,

            results.append(info)
        # [elided: `return results`]
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist

    Reads og: meta tags for description/thumbnail/player, extracts the
    player's config= URL, downloads the (JavaScript-flavored) JSON
    configuration and takes the media URL from its playlist.

    NOTE(review): excerpt is elided — `try:` headers, guards and parts
    of the returned info dict are missing; gaps marked below.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard and `return`]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        # [elided: `try:` header]
        webPage = compat_urllib_request.urlopen(url)
        webPageBytes = webPage.read()
        # Decode with the charset advertised in Content-Type, falling
        # back to UTF-8 when the header does not name one.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config file location as a query arg.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        # [elided: `try:` header]
        configJSON = compat_urllib_request.urlopen(configUrl)
        m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
        configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        # Naive quote swap to make it parseable as JSON.
        configJSON = configJSON.replace("'", '"')

        # [elided: `try:` header]
        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Entry 0 is an ad/intro; entry 1 is the actual video.
        videoUrl = playlist[1]['url']

        # [elided: `info = {` opening with id/url/title/ext entries]
        'uploader': showName,
        'upload_date': None,
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com

    Fetches the moogaloop metadata XML for the video, then the Adobe
    HDS (f4m) manifest it points at, and reconstructs an HTTP segment
    URL from the manifest's media/id entries.

    NOTE(review): excerpt is elided — `try:` headers, guards and the
    final return are missing; gaps marked below.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard and `return`]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # [elided: `info = {` opening — id/uploader entries]
        'upload_date': None,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        # [elided: `try:` header]
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # [elided: `try:` header — IndexError below means bad XML]
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        # [elided: `except IndexError:` handler header]
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        # hdcore parameter is required for the HDS manifest request.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        # [elided: `try:` header]
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        # [elided: `try:` header]
        # Elements live in the f4m XML namespace.
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        # Rebuild the first-segment HTTP URL from manifest host + ids.
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
        # [elided: filling info['url']/info['ext'] and `return [info]`]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com

    Scrapes the flv_url parameter, the page <title> and the thumbnail
    URL straight out of the watch-page HTML.

    NOTE(review): excerpt is elided — guards, `return`s and the info
    dict's surrounding braces are missing; gaps marked below.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard and `return`]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the flv_url parameter).
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        # [elided: `if mobj is None:` guard]
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title from the page <title>, dropping the site suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        # [elided: `if mobj is None:` guard]
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        # [elided: `if mobj is None:` guard]
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        # group(0): the whole matched URL is the thumbnail address.
        video_thumbnail = mobj.group(0)

        # [elided: surrounding `return [{ ... }]` with id/url/ext entries]
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid

    NOTE(review): excerpt is elided — `try:` headers and the info dict's
    surrounding braces are missing; gaps marked below.
    """

    # group(1) = uploader slug, group(2) = track slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard and `return`]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # resolve.json maps a public page URL to the API track object.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        # [elided: `try:` header]
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        # [elided: `try:` header]
        stream_json_bytes = compat_urllib_request.urlopen(request).read()
        stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))

        streams = json.loads(stream_json)
        # 128 kbps MP3 over plain HTTP is the stream this IE downloads.
        mediaURL = streams['http_mp3_128_url']

        # [elided: surrounding `return [{ ... }]` with id/url/ext entries]
        'uploader': info['user']['username'],
        'upload_date': info['created_at'],
        'title': info['title'],
        'description': info['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com

    Decodes the base64 'jsclassref' page attribute into the real media
    path and serves it from InfoQ's RTMP endpoint.

    NOTE(review): excerpt is elided — guards and the surrounding return
    of the info dict are missing; gaps marked below.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard and `return`]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # No separate id in the URL; the full URL doubles as the id here.
        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: jsclassref is base64 of a percent-encoded path.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        # [elided: `if mobj is None:` guard]
        self._downloader.trouble(u'ERROR: unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        # [elided: `if mobj is None:` guard]
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # [elided: surrounding `return [{ ... }]` with id/url entries]
        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com

    Fetches the cloudcast JSON and picks the first reachable URL from
    the requested (or best) entry in its 'audio_formats' section.

    NOTE(review): excerpt is elided — `try:` headers, guards, returns
    and loop bodies are missing; gaps marked below. Also note several
    Python-2-only `.decode('utf-8')` calls on str values.
    """

    # Disabled: the site moved to a new API.
    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # [elided: `try:` header]
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        # [elided: `return url_list`]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # [elided: `try:` header]
            compat_urllib_request.urlopen(url)
            # [elided: `return url` on success]
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # [elided: `url = None` / fall through to next candidate]

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                # [elided: `try:` header]
                ext = formats[fmt][b][0]
                print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard and `return`]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader & filename from url
        # NOTE(review): .decode on a Python 3 str raises AttributeError —
        # this IE is Python-2-only code (and _WORKING is False anyway).
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        # [elided: `try:` header]
        self.report_download_json(file_url)
        jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            # [elided: `return`]

        if req_format is None or req_format == 'best':
            # Probe each format, first reachable URL wins.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    # [elided: `break` out of the probe loop]
        # [elided: `else:` branch header — specific format requested]
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                # [elided: `return`]

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # [elided: `return [{` opening of the info dict]
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': None,
        'title': json_data['name'],
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
# Extractor for openclassroom.stanford.edu.  Handles three URL shapes:
# a single video page, a course page (expanded into per-video references),
# and the site root (expanded into per-course references).
# NOTE(review): this excerpt elides many original lines (try:, returns,
# dict/loop delimiters); comments describe only what is visible.
2994 class StanfordOpenClassroomIE(InfoExtractor):
2995 """Information extractor for Stanford's Open ClassRoom"""
2997 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2998 IE_NAME = u'stanfordoc'
3000 def report_download_webpage(self, objid):
3001 """Report information extraction."""
3002 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3004 def report_extraction(self, video_id):
3005 """Report information extraction."""
3006 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3008 def _real_extract(self, url):
3009 mobj = re.match(self._VALID_URL, url)
3011 raise ExtractorError(u'Invalid URL: %s' % url)
# Case 1: a specific video — fetch its per-video XML metadata file.
3013 if mobj.group('course') and mobj.group('video'): # A specific video
3014 course = mobj.group('course')
3015 video = mobj.group('video')
3017 'id': course + '_' + video,
3019 'upload_date': None,
3022 self.report_extraction(info['id'])
3023 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3024 xmlUrl = baseUrl + video + '.xml'
3026 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3027 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3028 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# Title and relative file path come from the <title>/<videoFile> elements.
3030 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3032 info['title'] = mdoc.findall('./title')[0].text
3033 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3035 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3037 info['ext'] = info['url'].rpartition('.')[2]
# Case 2: a course page — scrape it for VideoPage links and recurse.
3039 elif mobj.group('course'): # A course page
3040 course = mobj.group('course')
3045 'upload_date': None,
3048 coursepage = self._download_webpage(url, info['id'],
3049 note='Downloading course info page',
3050 errnote='Unable to download course info page')
3052 m = re.search('<h1>([^<]+)</h1>', coursepage)
3054 info['title'] = unescapeHTML(m.group(1))
# Fall back to the id when no <h1> title is found.
3056 info['title'] = info['id']
3058 m = re.search('<description>([^<]+)</description>', coursepage)
3060 info['description'] = unescapeHTML(m.group(1))
3062 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3065 'type': 'reference',
3066 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3070 for entry in info['list']:
3071 assert entry['type'] == 'reference'
3072 results += self.extract(entry['url'])
# Case 3: the site root — scrape it for CoursePage links and recurse.
3076 'id': 'Stanford OpenClassroom',
3079 'upload_date': None,
3082 self.report_download_webpage(info['id'])
3083 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3085 rootpage = compat_urllib_request.urlopen(rootURL).read()
3086 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3087 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3090 info['title'] = info['id']
3092 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3095 'type': 'reference',
3096 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3101 for entry in info['list']:
3102 assert entry['type'] == 'reference'
3103 results += self.extract(entry['url'])
# Extractor for mtv.com video pages: scrapes song/performer meta tags,
# asks the mediaGen service for renditions, and picks the last (highest
# quality) one.  NOTE(review): several original lines (guards, try:,
# returns, dict delimiters) are elided from this excerpt.
3106 class MTVIE(InfoExtractor):
3107 """Information extractor for MTV.com"""
3109 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3112 def report_extraction(self, video_id):
3113 """Report information extraction."""
3114 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3116 def _real_extract(self, url):
3117 mobj = re.match(self._VALID_URL, url)
3119 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Normalise protocol-less URLs before downloading the page.
3121 if not mobj.group('proto'):
3122 url = 'http://' + url
3123 video_id = mobj.group('videoid')
3125 webpage = self._download_webpage(url, video_id)
# Pull song name, performer and the mtvn URI from <meta> tags.
3127 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3129 self._downloader.trouble(u'ERROR: unable to extract song name')
3131 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3132 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3134 self._downloader.trouble(u'ERROR: unable to extract performer')
3136 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3137 video_title = performer + ' - ' + song_name
3139 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3141 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3143 mtvn_uri = mobj.group(1)
3145 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3147 self._downloader.trouble(u'ERROR: unable to extract content id')
3149 content_id = mobj.group(1)
# Query the mediaGen service for the rendition list of this video.
3151 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3152 self.report_extraction(video_id)
3153 request = compat_urllib_request.Request(videogen_url)
3155 metadataXml = compat_urllib_request.urlopen(request).read()
3156 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3157 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3160 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3161 renditions = mdoc.findall('.//rendition')
3163 # For now, always pick the highest quality.
3164 rendition = renditions[-1]
# Build the format string as "<ext>-<width>x<height>_<bitrate>".
3167 _,_,ext = rendition.attrib['type'].partition('/')
3168 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3169 video_url = rendition.find('./src').text
3171 self._downloader.trouble('Invalid rendition field.')
# Result info dict (surrounding literal braces elided in this excerpt).
3177 'uploader': performer,
3178 'upload_date': None,
3179 'title': video_title,
# Extractor for v.youku.com.  Uses Youku's getPlayList JSON API, then
# descrambles the file id with the seed-driven mix-string algorithm and
# builds one download URL per video segment.
# NOTE(review): this excerpt elides several original lines (try:, loop
# headers, returns); comments describe only visible statements.
3187 class YoukuIE(InfoExtractor):
3188 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3190 def report_download_webpage(self, file_id):
3191 """Report webpage download."""
3192 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3194 def report_extraction(self, file_id):
3195 """Report information extraction."""
3196 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Session id: millisecond timestamp followed by two random numbers.
# NOTE(review): the `def _gen_sid(self)` line is elided from this excerpt.
3199 nowTime = int(time.time() * 1000)
3200 random1 = random.randint(1000,1998)
3201 random2 = random.randint(1000,9999)
3203 return "%d%d%d" %(nowTime,random1,random2)
# Deterministically shuffle the alphabet using the server-provided seed;
# this ordering is the key for decoding the scrambled file id.
3205 def _get_file_ID_mix_string(self, seed):
3207 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3209 for i in range(len(source)):
3210 seed = (seed * 211 + 30031 ) % 65536
3211 index = math.floor(seed / 65536 * len(source) )
3212 mixed.append(source[int(index)])
3213 source.remove(source[int(index)])
3214 #return ''.join(mixed)
# Map each '*'-separated index of the scrambled id through the mix string.
3217 def _get_file_id(self, fileId, seed):
3218 mixed = self._get_file_ID_mix_string(seed)
3219 ids = fileId.split('*')
3223 realId.append(mixed[int(ch)])
3224 return ''.join(realId)
3226 def _real_extract(self, url):
3227 mobj = re.match(self._VALID_URL, url)
3229 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3231 video_id = mobj.group('ID')
3233 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3235 request = compat_urllib_request.Request(info_url, None, std_headers)
3237 self.report_download_webpage(video_id)
3238 jsondata = compat_urllib_request.urlopen(request).read()
3239 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3240 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3243 self.report_extraction(video_id)
3245 jsonstr = jsondata.decode('utf-8')
3246 config = json.loads(jsonstr)
3248 video_title = config['data'][0]['title']
3249 seed = config['data'][0]['seed']
# Choose a stream format: 'best' prefers hd2 when offered.
3251 format = self._downloader.params.get('format', None)
3252 supported_format = list(config['data'][0]['streamfileids'].keys())
3254 if format is None or format == 'best':
3255 if 'hd2' in supported_format:
3260 elif format == 'worst':
3268 fileid = config['data'][0]['streamfileids'][format]
3269 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3270 except (UnicodeDecodeError, ValueError, KeyError):
3271 self._downloader.trouble(u'ERROR: unable to extract info section')
3275 sid = self._gen_sid()
3276 fileid = self._get_file_id(fileid, seed)
3278 #column 8,9 of fileid represent the segment number
3279 #fileid[7:9] should be changed
# One download URL (and info dict) per segment, keyed by its token.
3280 for index, key in enumerate(keys):
3282 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3283 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3286 'id': '%s_part%02d' % (video_id, index),
3287 'url': download_url,
3289 'upload_date': None,
3290 'title': video_title,
3293 files_info.append(info)
# Extractor for video.xnxx.com: downloads the page and pulls the flv URL,
# title and thumbnail with three class-level regexes.
# NOTE(review): guards (`if mobj is None:`), try:, and the result dict's
# delimiters are elided from this excerpt.
3298 class XNXXIE(InfoExtractor):
3299 """Information extractor for xnxx.com"""
3301 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3303 VIDEO_URL_RE = r'flv_url=(.*?)&'
3304 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3305 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3307 def report_webpage(self, video_id):
3308 """Report information extraction"""
3309 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3311 def report_extraction(self, video_id):
3312 """Report information extraction"""
3313 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3315 def _real_extract(self, url):
3316 mobj = re.match(self._VALID_URL, url)
3318 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3320 video_id = mobj.group(1)
3322 self.report_webpage(video_id)
3324 # Get webpage content
3326 webpage_bytes = compat_urllib_request.urlopen(url).read()
3327 webpage = webpage_bytes.decode('utf-8')
3328 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3329 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
# The flv URL is percent-encoded inside the page; unquote it.
3332 result = re.search(self.VIDEO_URL_RE, webpage)
3334 self._downloader.trouble(u'ERROR: unable to extract video url')
3336 video_url = compat_urllib_parse.unquote(result.group(1))
3338 result = re.search(self.VIDEO_TITLE_RE, webpage)
3340 self._downloader.trouble(u'ERROR: unable to extract video title')
3342 video_title = result.group(1)
3344 result = re.search(self.VIDEO_THUMB_RE, webpage)
3346 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3348 video_thumbnail = result.group(1)
# Result info dict (surrounding literal braces elided in this excerpt).
3354 'upload_date': None,
3355 'title': video_title,
3357 'thumbnail': video_thumbnail,
3358 'description': None,
# Extractor for Google+ video posts: scrapes the post page for metadata,
# follows the photo/video page, and picks the highest-resolution link.
# NOTE(review): guards, try:, and several other original lines are elided
# from this excerpt (line-number gaps).
3362 class GooglePlusIE(InfoExtractor):
3363 """Information extractor for plus.google.com."""
3365 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3366 IE_NAME = u'plus.google'
3368 def __init__(self, downloader=None):
3369 InfoExtractor.__init__(self, downloader)
3371 def report_extract_entry(self, url):
3372 """Report downloading extry"""
3373 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3375 def report_date(self, upload_date):
3376 """Report downloading extry"""
3377 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3379 def report_uploader(self, uploader):
3380 """Report downloading extry"""
3381 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3383 def report_title(self, video_title):
3384 """Report downloading extry"""
3385 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3387 def report_extract_vid_page(self, video_page):
3388 """Report information extraction."""
3389 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3391 def _real_extract(self, url):
3392 # Extract id from URL
3393 mobj = re.match(self._VALID_URL, url)
3395 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3398 post_url = mobj.group(0)
3399 video_id = mobj.group(1)
3401 video_extension = 'flv'
3403 # Step 1, Retrieve post webpage to extract further information
3404 self.report_extract_entry(post_url)
3405 request = compat_urllib_request.Request(post_url)
3407 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3408 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3409 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3412 # Extract update date
3414 pattern = 'title="Timestamp">(.*?)</a>'
3415 mobj = re.search(pattern, webpage)
3417 upload_date = mobj.group(1)
3418 # Convert timestring to a format suitable for filename
3419 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3420 upload_date = upload_date.strftime('%Y%m%d')
3421 self.report_date(upload_date)
# Extract the uploader name from the rel="author" anchor.
3425 pattern = r'rel\="author".*?>(.*?)</a>'
3426 mobj = re.search(pattern, webpage)
3428 uploader = mobj.group(1)
3429 self.report_uploader(uploader)
3432 # Get the first line for title
3434 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3435 mobj = re.search(pattern, webpage)
3437 video_title = mobj.group(1)
3438 self.report_title(video_title)
3440 # Step 2, Stimulate clicking the image box to launch video
3441 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3442 mobj = re.search(pattern, webpage)
3444 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3446 video_page = mobj.group(1)
3447 request = compat_urllib_request.Request(video_page)
3449 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3450 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3451 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3453 self.report_extract_vid_page(video_page)
3456 # Extract video links on video page
3457 """Extract video links of all sizes"""
3458 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3459 mobj = re.findall(pattern, webpage)
3461 self._downloader.trouble(u'ERROR: unable to extract video links')
3463 # Sort in resolution
3464 links = sorted(mobj)
3466 # Choose the lowest of the sort, i.e. highest resolution
3467 video_url = links[-1]
3468 # Only get the url. The resolution part in the tuple has no use anymore
3469 video_url = video_url[-1]
3470 # Treat escaped \u0026 style hex
3472 video_url = video_url.decode("unicode_escape")
3473 except AttributeError: # Python 3
3474 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
# Result info dict (surrounding literal braces elided in this excerpt).
3480 'uploader': uploader,
3481 'upload_date': upload_date,
3482 'title': video_title,
3483 'ext': video_extension,
# Extractor for nba.com video pages: the media URL is derived directly
# from the path, and metadata is scraped from the page with small regexes.
# NOTE(review): guards, the _findProp fallback branch, and the result
# dict's delimiters are elided from this excerpt.
3486 class NBAIE(InfoExtractor):
3487 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3490 def _real_extract(self, url):
3491 mobj = re.match(self._VALID_URL, url)
3493 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3496 video_id = mobj.group(1)
3497 if video_id.endswith('/index.html'):
3498 video_id = video_id[:-len('/index.html')]
3500 webpage = self._download_webpage(url, video_id)
# The CDN URL is constructed from the path, not scraped from the page.
3502 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Small closure: first regex group from the page, unescaped, or a default.
3503 def _findProp(rexp, default=None):
3504 m = re.search(rexp, webpage)
3506 return unescapeHTML(m.group(1))
3510 shortened_video_id = video_id.rpartition('/')[2]
3511 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3513 'id': shortened_video_id,
3517 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3518 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# Extractor for justin.tv / twitch.tv.  Channel URLs are paged through the
# Justin.tv REST API (100 clips per request); single-broadcast URLs fetch
# one page.  NOTE(review): several original lines (try:, returns, loop
# setup such as `info = []` / `offset = 0`) are elided from this excerpt.
3522 class JustinTVIE(InfoExtractor):
3523 """Information extractor for justin.tv and twitch.tv"""
3524 # TODO: One broadcast may be split into multiple videos. The key
3525 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3526 # starts at 1 and increases. Can we treat all parts as one video?
3528 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3529 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3530 _JUSTIN_PAGE_LIMIT = 100
3531 IE_NAME = u'justin.tv'
3533 def report_extraction(self, file_id):
3534 """Report information extraction."""
3535 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3537 def report_download_page(self, channel, offset):
3538 """Report attempt to download a single page of videos."""
3539 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3540 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3542 # Return count of items, list of *valid* items
3543 def _parse_page(self, url):
3545 urlh = compat_urllib_request.urlopen(url)
3546 webpage_bytes = urlh.read()
3547 webpage = webpage_bytes.decode('utf-8', 'ignore')
3548 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3549 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
# A non-list response is an API error payload with an 'error' field.
3552 response = json.loads(webpage)
3553 if type(response) != list:
3554 error_text = response.get('error', 'unknown error')
3555 self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3558 for clip in response:
3559 video_url = clip['video_file_url']
3561 video_extension = os.path.splitext(video_url)[1][1:]
# start_time looks like YYYY-MM-DD...; strip dashes to get YYYYMMDD.
3562 video_date = re.sub('-', '', clip['start_time'][:10])
3563 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3564 video_id = clip['id']
3565 video_title = clip.get('title', video_id)
3569 'title': video_title,
3570 'uploader': clip.get('channel_name', video_uploader_id),
3571 'uploader_id': video_uploader_id,
3572 'upload_date': video_date,
3573 'ext': video_extension,
3575 return (len(response), info)
3577 def _real_extract(self, url):
3578 mobj = re.match(self._VALID_URL, url)
3580 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Choose the channel-archives or single-broadcast API endpoint depending
# on which regex group matched.
3583 api = 'http://api.justin.tv'
3584 video_id = mobj.group(mobj.lastindex)
3586 if mobj.lastindex == 1:
3588 api += '/channel/archives/%s.json'
3590 api += '/broadcast/by_archive/%s.json'
3591 api = api % (video_id,)
3593 self.report_extraction(video_id)
3597 limit = self._JUSTIN_PAGE_LIMIT
3600 self.report_download_page(video_id, offset)
3601 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3602 page_count, page_info = self._parse_page(page_url)
3603 info.extend(page_info)
# A short page means we reached the end of the channel archive.
3604 if not paged or page_count != limit:
# Extractor for funnyordie.com: video URL, title and description are all
# scraped from the page HTML.  NOTE(review): guards and the result dict
# are partly elided from this excerpt.
3609 class FunnyOrDieIE(InfoExtractor):
3610 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3612 def _real_extract(self, url):
3613 mobj = re.match(self._VALID_URL, url)
3615 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3618 video_id = mobj.group('id')
3619 webpage = self._download_webpage(url, video_id)
# The second <source> inside <video> carries the direct media URL.
3621 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3623 self._downloader.trouble(u'ERROR: unable to find video information')
3624 video_url = unescapeHTML(m.group('url'))
3626 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3628 self._downloader.trouble(u'Cannot find video title')
3629 title = unescapeHTML(m.group('title'))
3631 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3633 desc = unescapeHTML(m.group('desc'))
3642 'description': desc,
# Extractor for store.steampowered.com game trailer pages; scrapes the
# per-game video list (movie filename, title, thumbnail) from inline JS.
# NOTE(review): the @classmethod decorator for suitable(), the gameID
# regex group lines, and the per-video info dict are partly elided.
3647 class SteamIE(InfoExtractor):
3648 _VALID_URL = r"""http://store.steampowered.com/
3649 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3650 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3654 def suitable(cls, url):
3655 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is written in verbose (re.VERBOSE) syntax.
3656 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3658 def _real_extract(self, url):
3659 m = re.match(self._VALID_URL, url, re.VERBOSE)
3660 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3661 gameID = m.group('gameID')
3662 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3663 webpage = self._download_webpage(videourl, gameID)
# Movie entries, titles and thumbnails are zipped positionally — the
# three finditer streams are assumed to run in the same page order.
3664 mweb = re.finditer(urlRE, webpage)
3665 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3666 titles = re.finditer(namesRE, webpage)
3667 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3668 thumbs = re.finditer(thumbsRE, webpage)
3670 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3671 video_id = vid.group('videoID')
3672 title = vtitle.group('videoName')
3673 video_url = vid.group('videoURL')
3674 video_thumb = thumb.group('thumbnail')
3676 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3681 'title': unescapeHTML(title),
3682 'thumbnail': video_thumb
# Extractor for ustream.tv recorded videos: the media URL is derived from
# the video id; title and uploader come from data-* attributes on the page.
# NOTE(review): the result info dict lines are partly elided here.
3688 class UstreamIE(InfoExtractor):
3689 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3690 IE_NAME = u'ustream'
3691 def _real_extract(self, url):
3692 m = re.match(self._VALID_URL, url)
3693 video_id = m.group('videoID')
# CDN URL follows directly from the numeric id — no page scraping needed.
3694 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3695 webpage = self._download_webpage(url, video_id)
3696 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3697 title = m.group('title')
3698 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3699 uploader = m.group('uploader')
3705 'uploader': uploader
# Extractor for rbmaradio.com shows: show metadata is embedded as JSON in
# an inline <script>; the stream URL is the akamai_url plus a fixed
# 256 kbps constant-bitrate parameter.
# NOTE(review): guards and the opening of the result dict are elided here.
3710 class RBMARadioIE(InfoExtractor):
3711 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3712 def _real_extract(self, url):
3713 m = re.match(self._VALID_URL, url)
3714 video_id = m.group('videoID')
3716 webpage = self._download_webpage(url, video_id)
3717 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3719 raise ExtractorError(u'Cannot find metadata')
3720 json_data = m.group(1)
3723 data = json.loads(json_data)
3724 except ValueError as e:
3725 raise ExtractorError(u'Invalid JSON: ' + str(e))
3727 video_url = data['akamai_url'] + '&cbr=256'
# Extension is whatever follows the last '.' in the URL path.
3728 url_parts = compat_urllib_parse_urlparse(video_url)
3729 video_ext = url_parts.path.rpartition('.')[2]
3734 'title': data['title'],
3735 'description': data.get('teaser_text'),
3736 'location': data.get('country_of_origin'),
3737 'uploader': data.get('host', {}).get('name'),
3738 'uploader_id': data.get('host', {}).get('slug'),
3739 'thumbnail': data.get('image', {}).get('large_url_2x'),
3740 'duration': data.get('duration'),
# Extractor for youporn.com: scrapes title/date/uploader, then parses the
# download list into one info dict per format, honouring --format.
# NOTE(review): many original lines (guards, loop headers like
# `for x in formats:` and `for link in links:`, returns, dict delimiters)
# are elided from this excerpt.
3746 class YouPornIE(InfoExtractor):
3747 """Information extractor for youporn.com."""
3748 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3749 def _print_formats(self, formats):
3750 """Print all available formats"""
3751 print(u'Available formats:')
3752 print(u'ext\t\tformat')
3753 print(u'---------------------------------')
3754 for format in formats:
3755 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Return the single info dict whose 'format' equals req_format (the loop
# header over formats is elided in this excerpt).
3757 def _specific(self, req_format, formats):
3759 if(x["format"]==req_format):
3763 def _real_extract(self, url):
3764 mobj = re.match(self._VALID_URL, url)
3766 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3769 video_id = mobj.group('videoid')
# The age gate is bypassed by presetting the age_verified cookie.
3771 req = compat_urllib_request.Request(url)
3772 req.add_header('Cookie', 'age_verified=1')
3773 webpage = self._download_webpage(req, video_id)
3775 # Get the video title
3776 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3778 raise ExtractorError(u'Unable to extract video title')
3779 video_title = result.group('title').strip()
3781 # Get the video date
3782 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3784 self._downloader.report_warning(u'unable to extract video date')
3787 upload_date = result.group('date').strip()
3789 # Get the video uploader
3790 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3792 self._downloader.report_warning(u'unable to extract uploader')
3793 video_uploader = None
3795 video_uploader = result.group('uploader').strip()
3796 video_uploader = clean_html( video_uploader )
3798 # Get all of the formats available
3799 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3800 result = re.search(DOWNLOAD_LIST_RE, webpage)
3802 raise ExtractorError(u'Unable to extract download list')
3803 download_list_html = result.group('download_list').strip()
3805 # Get all of the links from the page
3806 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3807 links = re.findall(LINK_RE, download_list_html)
3808 if(len(links) == 0):
3809 raise ExtractorError(u'ERROR: no known formats available for video')
3811 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3816 # A link looks like this:
3817 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3818 # A path looks like this:
3819 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Format id (e.g. "480p-370k") is parsed from the URL's path components.
3820 video_url = unescapeHTML( link )
3821 path = compat_urllib_parse_urlparse( video_url ).path
3822 extension = os.path.splitext( path )[1][1:]
3823 format = path.split('/')[4].split('_')[:2]
3826 format = "-".join( format )
3827 title = u'%s-%s-%s' % (video_title, size, bitrate)
3832 'uploader': video_uploader,
3833 'upload_date': upload_date,
3838 'description': None,
# Apply the user's format selection to the collected format list.
3842 if self._downloader.params.get('listformats', None):
3843 self._print_formats(formats)
3846 req_format = self._downloader.params.get('format', None)
3847 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3849 if req_format is None or req_format == 'best':
3851 elif req_format == 'worst':
3852 return [formats[-1]]
3853 elif req_format in ('-1', 'all'):
3856 format = self._specific( req_format, formats )
3858 self._downloader.trouble(u'ERROR: requested format not available')
# Extractor for pornotube.com: the video id and title are in the URL; the
# flv URL and upload date are scraped from the page.
# NOTE(review): guard lines and the remainder of the info dict are elided.
3864 class PornotubeIE(InfoExtractor):
3865 """Information extractor for pornotube.com."""
3866 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3868 def _real_extract(self, url):
3869 mobj = re.match(self._VALID_URL, url)
3871 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3874 video_id = mobj.group('videoid')
3875 video_title = mobj.group('title')
3877 # Get webpage content
3878 webpage = self._download_webpage(url, video_id)
# The flv URL sits inside a JS player config; unquote its percent-encoding.
3881 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3882 result = re.search(VIDEO_URL_RE, webpage)
3884 self._downloader.trouble(u'ERROR: unable to extract video url')
3886 video_url = compat_urllib_parse.unquote(result.group('url'))
3888 #Get the uploaded date
3889 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3890 result = re.search(VIDEO_UPLOADED_RE, webpage)
3892 self._downloader.trouble(u'ERROR: unable to extract video title')
3894 upload_date = result.group('date')
3896 info = {'id': video_id,
3899 'upload_date': upload_date,
3900 'title': video_title,
# Extractor for youjizz.com: finds the embed page from the watch page,
# then pulls the direct media URL out of the embed page's flashvars.
# NOTE(review): `if result is None:` guards and parts of the info dict
# are elided from this excerpt.
3906 class YouJizzIE(InfoExtractor):
3907 """Information extractor for youjizz.com."""
3908 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3910 def _real_extract(self, url):
3911 mobj = re.match(self._VALID_URL, url)
3913 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3916 video_id = mobj.group('videoid')
3918 # Get webpage content
3919 webpage = self._download_webpage(url, video_id)
3921 # Get the video title
3922 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3924 raise ExtractorError(u'ERROR: unable to extract video title')
3925 video_title = result.group('title').strip()
3927 # Get the embed page
3928 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3930 raise ExtractorError(u'ERROR: unable to extract embed page')
# Note: video_id is replaced by the numeric id from the embed URL.
3932 embed_page_url = result.group(0).strip()
3933 video_id = result.group('videoid')
3935 webpage = self._download_webpage(embed_page_url, video_id)
3938 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3940 raise ExtractorError(u'ERROR: unable to extract video url')
3941 video_url = result.group('source')
3943 info = {'id': video_id,
3945 'title': video_title,
3948 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: reads the embedded PAGE.mix JSON, then
# walks the play/next API until at_last_track, yielding one entry per song.
# NOTE(review): some lines (e.g. mix_id assignment, res list setup, the
# break, and the final return) are elided from this excerpt.
3952 class EightTracksIE(InfoExtractor):
3954 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3956 def _real_extract(self, url):
3957 mobj = re.match(self._VALID_URL, url)
3959 raise ExtractorError(u'Invalid URL: %s' % url)
3960 playlist_id = mobj.group('id')
3962 webpage = self._download_webpage(url, playlist_id)
3964 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3966 raise ExtractorError(u'Cannot find trax information')
3967 json_like = m.group(1)
3968 data = json.loads(json_like)
# Random client-side session id, as the web player would generate.
3970 session = str(random.randint(0, 1000000000))
3972 track_count = data['tracks_count']
3973 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3974 next_url = first_url
# Each API call returns the next track; stop when at_last_track is set.
3976 for i in itertools.count():
3977 api_json = self._download_webpage(next_url, playlist_id,
3978 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3979 errnote=u'Failed to download song information')
3980 api_data = json.loads(api_json)
3981 track_data = api_data[u'set']['track']
3983 'id': track_data['id'],
3984 'url': track_data['track_file_stream_url'],
3985 'title': track_data['performer'] + u' - ' + track_data['name'],
3986 'raw_title': track_data['name'],
3987 'uploader_id': data['user']['login'],
3991 if api_data['set']['at_last_track']:
3993 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: media and thumbnail URLs are derived from the
# video id; title and uploader are scraped from the page.
# NOTE(review): the middle of the result info dict is elided here.
3996 class KeekIE(InfoExtractor):
3997 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
4000 def _real_extract(self, url):
4001 m = re.match(self._VALID_URL, url)
4002 video_id = m.group('videoID')
# CDN URLs follow a fixed pattern keyed only by the video id.
4003 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
4004 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
4005 webpage = self._download_webpage(url, video_id)
4006 m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
4007 title = unescapeHTML(m.group('title'))
4008 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
4009 uploader = clean_html(m.group('uploader'))
4015 'thumbnail': thumbnail,
4016 'uploader': uploader
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Verbose regex: matches playlist pages (.../playlists/<id>/<name>)
    # and single-talk pages (.../talks/<name>.html). Dots in the host
    # are escaped so e.g. 'wwwXted.com' no longer matches.
    _VALID_URL = r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the base-class
        # implementation (which matches without that flag) cannot be used.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            # A single talk yields exactly one info dictionary.
            return [self._talk_info(url)]
        else:
            playlist_id = m.group('playlist_id')
            name = m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME, playlist_id, name))
            return self._playlist_videos_info(url, name, playlist_id)

    def _talk_video_link(self, mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)
        info = []
        # NOTE(review): assumes both regexes hit the page entries in the
        # same order, so the i-th video pairs with the i-th title.
        for m_video, m_name in zip(m_videos, m_names):
            video_id = m_video.group('video_id')
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url, video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        videoName = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE = r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
        title = re.search(title_RE, webpage).group('title')
        # Pull the numeric id and the media slug out of the inline
        # talkDetails JavaScript object.
        info_RE = r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE = r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, webpage)
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        video_id = info_match.group('videoID')
        mediaSlug = info_match.group('mediaSlug')
        video_url = self._talk_video_link(mediaSlug)
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail')
        }
        return info
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos.

    The site exposes an XML metadata endpoint keyed by the numeric
    video id, which is the last (or, with a trailing slash, the
    second-to-last) path element of the page URL.
    """
    # Dots escaped so the pattern matches only the real host.
    _VALID_URL = r'http://www\.myspass\.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # fetch and parse the XML metadata document
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            # Raise instead of reporting and falling through with an
            # undefined video_url (consistent with the sibling IEs).
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # Optional fields fall back to sensible defaults when absent.
        # Local renamed from 'format' to avoid shadowing the builtin.
        format_id_el = metadata.find('format_id')
        video_format = format_id_el.text if format_id_el is not None else 'mp4'
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for video pages on spiegel.de."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?$'

    def _real_extract(self, url):
        # The numeric video id is embedded in the page URL itself.
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # Stream details are served as a separate per-video XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # NOTE(review): takes the last child element of the document;
        # presumably that is the variant to download — confirm.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
4183 def gen_extractors():
4184 """ Return a list of an instance of every supported extractor.
4185 The order does matter; the first extractor matched is the one handling the URL.
4188 YoutubePlaylistIE(),
4212 StanfordOpenClassroomIE(),