2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): the source extract for this class was mangled (stray line
    # numbers, elided control-flow lines). Guards, try/except, returns and the
    # @classmethod/@property decorators were restored — verify against upstream.

    _ready = False       # one-time-init flag consumed by initialize()
    _downloader = None   # FileDownloader instance, set via set_downloader()
    _WORKING = True      # flip to False in broken subclasses

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Run the subclass hook only once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the name by dropping the last two characters of the class
        # name (assumes subclasses are named like "YoutubeIE").
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        # note=False suppresses the progress line entirely.
        if note is not False:
            self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Preserve the original traceback for debugging.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour a charset declared in the Content-Type header; fall back to UTF-8.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request was a plain string, not a Request object.
                url = url_or_request
            self._downloader.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        return webpage_bytes.decode(encoding, 'replace')

    # Methods for following #608
    # They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
# NOTE(review): mangled extract — every line carries a stray leading number
# and many original lines are elided (the _VALID_URL opening, the tail of the
# v= parameter group, and the bodies of the extension/dimension dicts).
# Kept byte-for-byte below; verify against the upstream source file.
170 class YoutubeIE(InfoExtractor):
171 """Information extractor for youtube.com."""
# Verbose (re.VERBOSE) URL pattern; suitable() compiles it with re.VERBOSE
# and _extract_id() reads the video id from group(2).
175 (?:https?://)? # http(s):// (optional)
176 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
177 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
178 (?:.*?\#/)? # handle anchor (#/) redirect urls
179 (?: # the various things that can precede the ID:
180 (?:(?:v|embed|e)/) # v/ or embed/ or e/
181 |(?: # or the v= param in all its forms
182 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
183 (?:\?|\#!?) # the params delimiter ? or # or #!
184 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
187 )? # optional -> youtube.com/xxxx is OK
188 )? # all until now is optional -> you can pass the naked ID
189 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
190 (?(1).+)? # if we found the ID, everything can follow
# Endpoints used by _real_initialize() (language, login, age gate) and the
# regex used to unwrap next_url redirect URLs in _real_extract().
192 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
193 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
194 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
195 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
196 _NETRC_MACHINE = 'youtube'
197 # Listed in order of quality
198 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
199 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> file extension map (body elided in this extract).
200 _video_extensions = {
206 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" display map used by _print_formats (body elided here).
212 _video_dimensions = {
@classmethod
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE.

    Playlist URLs are delegated to YoutubePlaylistIE first, since they
    would otherwise also match this extractor's pattern.
    """
    # NOTE(review): the @classmethod decorator line was elided in this
    # extract and has been restored — cls is clearly a class argument here.
    if YoutubePlaylistIE.suitable(url): return False
    return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
236 def report_lang(self):
237 """Report attempt to set language."""
238 self._downloader.to_screen(u'[youtube] Setting language')
240 def report_login(self):
241 """Report attempt to log in."""
242 self._downloader.to_screen(u'[youtube] Logging in')
244 def report_age_confirmation(self):
245 """Report attempt to confirm age."""
246 self._downloader.to_screen(u'[youtube] Confirming age')
248 def report_video_webpage_download(self, video_id):
249 """Report attempt to download video webpage."""
250 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
252 def report_video_info_webpage_download(self, video_id):
253 """Report attempt to download video info webpage."""
254 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
256 def report_video_subtitles_download(self, video_id):
257 """Report attempt to download video info webpage."""
258 self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)
260 def report_video_subtitles_request(self, video_id, sub_lang, format):
261 """Report attempt to download video info webpage."""
262 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
264 def report_video_subtitles_available(self, video_id, sub_lang_list):
265 """Report available subtitles."""
266 sub_lang = ",".join(list(sub_lang_list.keys()))
267 self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))
269 def report_information_extraction(self, video_id):
270 """Report attempt to extract video information."""
271 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
273 def report_unavailable_format(self, video_id, format):
274 """Report extracted video URL."""
275 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
277 def report_rtmp_download(self):
278 """Indicate the download will use the RTMP protocol."""
279 self._downloader.to_screen(u'[youtube] RTMP download detected')
# NOTE(review): mangled extract — stray leading numbers; the try: opening the
# network call and the final success-path return are elided here.
# Queries the timedtext list endpoint and builds {lang_code: name}; errors
# are signalled as a (message, None) tuple rather than an exception.
281 def _get_available_subtitles(self, video_id):
282 self.report_video_subtitles_download(video_id)
283 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
285 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
286 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
287 return (u'unable to download video subtitles: %s' % compat_str(err), None)
# Scrape (name, lang_code) pairs from the XML and invert to lang_code -> name.
288 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
289 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
290 if not sub_lang_list:
291 return (u'video doesn\'t have subtitles', None)
def _list_available_subtitles(self, video_id):
    """Look up the subtitle languages for *video_id* and print them."""
    available_subs = self._get_available_subtitles(video_id)
    self.report_video_subtitles_available(video_id, available_subs)
# NOTE(review): mangled extract — the docstring opening and the urlencode()
# dict body (original lines ~305-309) are elided; the try: before the
# network call is also missing from this view.
# Fetches one subtitle track; always returns a 3-tuple
# (error_message, sub_lang, sub) with error_message None on success.
298 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
301 (error_message, sub_lang, sub)
303 self.report_video_subtitles_request(video_id, sub_lang, format)
304 params = compat_urllib_parse.urlencode({
310 url = 'http://www.youtube.com/api/timedtext?' + params
312 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
313 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
314 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
# Empty body means the endpoint had nothing for this language/format.
316 return (u'Did not fetch video subtitles', None, None)
317 return (None, sub_lang, sub)
# NOTE(review): mangled extract — the docstring delimiters, the 'en'
# preference branch body (original ~331-332) and the final return of
# [subtitle] are elided from this view.
319 def _extract_subtitle(self, video_id):
321 Return a list with a tuple:
322 [(error_message, sub_lang, sub)]
324 sub_lang_list = self._get_available_subtitles(video_id)
325 sub_format = self._downloader.params.get('subtitlesformat')
# A tuple (not a dict) from _get_available_subtitles is its error sentinel.
326 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
327 return [(sub_lang_list[0], None, None)]
# Language preference: explicit 'subtitleslang' param, then (elided branch),
# then the first listed language.
328 if self._downloader.params.get('subtitleslang', False):
329 sub_lang = self._downloader.params.get('subtitleslang')
330 elif 'en' in sub_lang_list:
333 sub_lang = list(sub_lang_list.keys())[0]
334 if not sub_lang in sub_lang_list:
335 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
337 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
def _extract_all_subtitles(self, video_id):
    """Download every available subtitle track for *video_id*.

    Returns a list of (error_message, sub_lang, sub) tuples, one per
    language, or a single-element error list if the language lookup failed.
    """
    # NOTE(review): the accumulator initialization and the final return were
    # elided in this extract and have been restored (the append on the last
    # loop line requires them) — verify against upstream.
    sub_lang_list = self._get_available_subtitles(video_id)
    sub_format = self._downloader.params.get('subtitlesformat')
    # A tuple (not a dict) is the error sentinel from _get_available_subtitles.
    if isinstance(sub_lang_list, tuple):
        return [(sub_lang_list[0], None, None)]
    subtitles = []
    for sub_lang in sub_lang_list:
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        subtitles.append(subtitle)
    return subtitles
def _print_formats(self, formats):
    """Print one line per itag in *formats*: itag, extension, dimensions."""
    # NOTE(review): the loop header was elided in this extract and has been
    # restored — the per-format print line clearly iterates over *formats*.
    print('Available formats:')
    for x in formats:
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# NOTE(review): mangled extract — stray leading numbers; many lines are
# elided (try: openings, returns, the full login form and age form bodies).
# Flow: pick credentials (params or .netrc) -> force English UI -> log in to
# accounts.google.com -> confirm the age gate. Kept byte-for-byte below.
356 def _real_initialize(self):
357 if self._downloader is None:
362 downloader_params = self._downloader.params
364 # Attempt to use provided username and password or .netrc data
365 if downloader_params.get('username', None) is not None:
366 username = downloader_params['username']
367 password = downloader_params['password']
368 elif downloader_params.get('usenetrc', False):
370 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
375 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
376 except (IOError, netrc.NetrcParseError) as err:
377 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# Set language: failures are only warnings, extraction can proceed.
381 request = compat_urllib_request.Request(self._LANG_URL)
384 compat_urllib_request.urlopen(request).read()
385 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
386 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
389 # No authentication to be performed
# Log in: fetch the login page, scrape the GALX and dsh hidden inputs, then
# POST the (partially elided) form below.
393 request = compat_urllib_request.Request(self._LOGIN_URL)
395 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
396 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
397 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
402 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
404 galx = match.group(1)
406 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
412 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
416 u'PersistentCookie': u'yes',
418 u'bgresponse': u'js_disabled',
419 u'checkConnection': u'',
420 u'checkedDomains': u'youtube',
426 u'signIn': u'Sign in',
428 u'service': u'youtube',
432 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
434 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
435 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
436 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
439 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# Still seeing the login form in the response means the credentials failed.
440 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
441 self._downloader.report_warning(u'unable to log in: bad username or password')
443 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
444 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Confirm age gate (form body partially elided above this POST).
450 'action_confirm': 'Confirm',
452 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
454 self.report_age_confirmation()
455 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
456 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
457 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
def _extract_id(self, url):
    """Return the YouTube video id (group 2 of _VALID_URL) for *url*.

    On a non-matching URL the failure is reported through the downloader
    and None is returned.
    """
    # NOTE(review): the `if mobj is None:` guard and the returns were elided
    # in this extract and have been restored — verify against upstream.
    mobj = re.match(self._VALID_URL, url, re.VERBOSE)
    if mobj is None:
        self._downloader.report_error(u'invalid URL: %s' % url)
        return
    video_id = mobj.group(2)
    return video_id
# NOTE(review): mangled extract — stray leading numbers and many elided lines
# (try: openings, `if mobj is None:` guards, returns, the results-list
# assembly). Flow: unwrap redirect URL -> download watch page -> fetch
# get_video_info -> pull metadata -> choose formats -> build info dicts.
# Kept byte-for-byte below; verify against the upstream source.
468 def _real_extract(self, url):
469 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
470 mobj = re.search(self._NEXT_URL_RE, url)
472 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
473 video_id = self._extract_id(url)
476 self.report_video_webpage_download(video_id)
477 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
478 request = compat_urllib_request.Request(url)
480 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
481 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
482 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
485 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
487 # Attempt to extract SWF player URL
488 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Unescape the JS backslash-escaping in the matched SWF URL.
490 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several el= variants until one response contains a 'token'.
495 self.report_video_info_webpage_download(video_id)
496 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
497 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
498 % (video_id, el_type))
499 video_info_webpage = self._download_webpage(video_info_url, video_id,
501 errnote='unable to download video info webpage')
502 video_info = compat_parse_qs(video_info_webpage)
503 if 'token' in video_info:
505 if 'token' not in video_info:
506 if 'reason' in video_info:
507 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
509 self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
512 # Check for "rental" videos
513 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
514 self._downloader.report_error(u'"rental" videos not supported')
517 # Start extracting information
518 self.report_information_extraction(video_id)
521 if 'author' not in video_info:
522 self._downloader.report_error(u'unable to extract uploader name')
524 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
527 video_uploader_id = None
528 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
530 video_uploader_id = mobj.group(1)
532 self._downloader.report_warning(u'unable to extract uploader nickname')
535 if 'title' not in video_info:
536 self._downloader.report_error(u'unable to extract video title')
538 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
541 if 'thumbnail_url' not in video_info:
542 self._downloader.report_warning(u'unable to extract video thumbnail')
544 else: # don't panic if we can't find it
545 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# Upload date: scraped from the watch page and normalized to YYYYMMDD.
549 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
551 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
552 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
553 for expression in format_expressions:
555 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
560 video_description = get_element_by_id("eow-description", video_webpage)
561 if video_description:
562 video_description = clean_html(video_description)
564 video_description = ''
567 video_subtitles = None
569 if self._downloader.params.get('writesubtitles', False):
570 video_subtitles = self._extract_subtitle(video_id)
572 (sub_error, sub_lang, sub) = video_subtitles[0]
574 self._downloader.report_error(sub_error)
576 if self._downloader.params.get('allsubtitles', False):
577 video_subtitles = self._extract_all_subtitles(video_id)
578 for video_subtitle in video_subtitles:
579 (sub_error, sub_lang, sub) = video_subtitle
581 self._downloader.report_error(sub_error)
583 if self._downloader.params.get('listsubtitles', False):
584 sub_lang_list = self._list_available_subtitles(video_id)
587 if 'length_seconds' not in video_info:
588 self._downloader.report_warning(u'unable to extract video duration')
591 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
594 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
596 # Decide which formats to download
597 req_format = self._downloader.params.get('format', None)
599 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
600 self.report_rtmp_download()
601 video_url_list = [(None, video_info['conn'][0])]
602 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
# Build itag -> signed URL map from the comma-separated stream map.
603 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
604 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
605 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
606 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
608 format_limit = self._downloader.params.get('format_limit', None)
609 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
610 if format_limit is not None and format_limit in available_formats:
611 format_list = available_formats[available_formats.index(format_limit):]
613 format_list = available_formats
614 existing_formats = [x for x in format_list if x in url_map]
615 if len(existing_formats) == 0:
616 self._downloader.report_error(u'no known formats available for video')
618 if self._downloader.params.get('listformats', None):
619 self._print_formats(existing_formats)
621 if req_format is None or req_format == 'best':
622 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
623 elif req_format == 'worst':
624 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
625 elif req_format in ('-1', 'all'):
626 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
628 # Specific formats. We pick the first in a slash-delimeted sequence.
629 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
630 req_formats = req_format.split('/')
631 video_url_list = None
632 for rf in req_formats:
634 video_url_list = [(rf, url_map[rf])]
636 if video_url_list is None:
637 self._downloader.report_error(u'requested format not available')
640 self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
# Emit one info dict per selected (itag, url) pair; the surrounding results
# list/append/return lines are elided from this extract.
644 for format_param, video_real_url in video_url_list:
646 video_extension = self._video_extensions.get(format_param, 'flv')
648 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
649 self._video_dimensions.get(format_param, '???'))
653 'url': video_real_url,
654 'uploader': video_uploader,
655 'uploader_id': video_uploader_id,
656 'upload_date': upload_date,
657 'title': video_title,
658 'ext': video_extension,
659 'format': video_format,
660 'thumbnail': video_thumbnail,
661 'description': video_description,
662 'player_url': player_url,
663 'subtitles': video_subtitles,
664 'duration': video_duration
# NOTE(review): mangled extract — stray leading numbers; try: openings,
# `if mobj is None:` guards, returns and the final dict delimiters are
# elided. Kept byte-for-byte; verify against the upstream source.
669 class MetacafeIE(InfoExtractor):
670 """Information Extractor for metacafe.com."""
672 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
673 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
674 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
675 IE_NAME = u'metacafe'
677 def __init__(self, downloader=None):
678 InfoExtractor.__init__(self, downloader)
680 def report_disclaimer(self):
681 """Report disclaimer retrieval."""
682 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
684 def report_age_confirmation(self):
685 """Report attempt to confirm age."""
686 self._downloader.to_screen(u'[metacafe] Confirming age')
688 def report_download_webpage(self, video_id):
689 """Report webpage download."""
690 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
692 def report_extraction(self, video_id):
693 """Report information extraction."""
694 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# One-time setup: fetch the disclaimer page, then POST the family-filter
# form (body partially elided) to disable filtering for the session.
696 def _real_initialize(self):
697 # Retrieve disclaimer
698 request = compat_urllib_request.Request(self._DISCLAIMER)
700 self.report_disclaimer()
701 disclaimer = compat_urllib_request.urlopen(request).read()
702 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
703 self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
709 'submit': "Continue - I'm over 18",
711 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
713 self.report_age_confirmation()
714 disclaimer = compat_urllib_request.urlopen(request).read()
715 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
716 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
719 def _real_extract(self, url):
720 # Extract id and simplified title from URL
721 mobj = re.match(self._VALID_URL, url)
723 self._downloader.report_error(u'invalid URL: %s' % url)
726 video_id = mobj.group(1)
728 # Check if video comes from YouTube
729 mobj2 = re.match(r'^yt-(.*)$', video_id)
730 if mobj2 is not None:
731 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1))]
733 # Retrieve video webpage to extract further information
734 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
736 self.report_download_webpage(video_id)
737 webpage = compat_urllib_request.urlopen(request).read()
738 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
739 self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))
742 # Extract URL, uploader and title from webpage
743 self.report_extraction(video_id)
744 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
746 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
747 video_extension = mediaURL[-3:]
749 # Extract gdaKey if available
750 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
754 gdaKey = mobj.group(1)
755 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars blob when &mediaURL= is absent.
757 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
759 self._downloader.report_error(u'unable to extract media URL')
761 vardict = compat_parse_qs(mobj.group(1))
762 if 'mediaData' not in vardict:
763 self._downloader.report_error(u'unable to extract media URL')
765 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
767 self._downloader.report_error(u'unable to extract media URL')
769 mediaURL = mobj.group(1).replace('\\/', '/')
770 video_extension = mediaURL[-3:]
771 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
773 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
775 self._downloader.report_error(u'unable to extract title')
777 video_title = mobj.group(1).decode('utf-8')
779 mobj = re.search(r'submitter=(.*?);', webpage)
781 self._downloader.report_error(u'unable to extract uploader nickname')
783 video_uploader = mobj.group(1)
# Result dict (delimiters elided in this extract).
786 'id': video_id.decode('utf-8'),
787 'url': video_url.decode('utf-8'),
788 'uploader': video_uploader.decode('utf-8'),
790 'title': video_title,
791 'ext': video_extension.decode('utf-8'),
# NOTE(review): mangled extract — stray leading numbers; guards, returns,
# the quality-selection loop body and the result-dict delimiters are
# elided. Kept byte-for-byte; verify against the upstream source.
795 class DailymotionIE(InfoExtractor):
796 """Information Extractor for Dailymotion"""
798 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
799 IE_NAME = u'dailymotion'
802 def __init__(self, downloader=None):
803 InfoExtractor.__init__(self, downloader)
805 def report_extraction(self, video_id):
806 """Report information extraction."""
807 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
809 def _real_extract(self, url):
810 # Extract id and simplified title from URL
811 mobj = re.match(self._VALID_URL, url)
813 self._downloader.report_error(u'invalid URL: %s' % url)
# Strip the slug and query string from the id segment.
816 video_id = mobj.group(1).split('_')[0].split('?')[0]
818 video_extension = 'mp4'
820 # Retrieve video webpage to extract further information
821 request = compat_urllib_request.Request(url)
# Cookie disables Dailymotion's family filter for this request.
822 request.add_header('Cookie', 'family_filter=off')
823 webpage = self._download_webpage(request, video_id)
825 # Extract URL, uploader and title from webpage
826 self.report_extraction(video_id)
827 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
829 self._downloader.report_error(u'unable to extract media URL')
831 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Probe qualities from best to worst; loop body (key test) elided here.
833 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
836 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
839 self._downloader.report_error(u'unable to extract video URL')
842 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
844 self._downloader.report_error(u'unable to extract video URL')
847 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
849 # TODO: support choosing qualities
851 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
853 self._downloader.report_error(u'unable to extract title')
855 video_title = unescapeHTML(mobj.group('title'))
857 video_uploader = None
858 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
860 # lookin for official user
861 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
862 if mobj_official is None:
863 self._downloader.report_warning(u'unable to extract uploader nickname')
865 video_uploader = mobj_official.group(1)
867 video_uploader = mobj.group(1)
869 video_upload_date = None
# Upload date is DD-MM-YYYY on the page; recomposed as YYYYMMDD.
870 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
872 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
# Result dict (delimiters and id/url entries elided in this extract).
877 'uploader': video_uploader,
878 'upload_date': video_upload_date,
879 'title': video_title,
880 'ext': video_extension,
# NOTE(review): mangled extract — stray leading numbers; try: openings,
# guards, returns and the result-dict delimiters are elided. Kept
# byte-for-byte; verify against the upstream source.
884 class PhotobucketIE(InfoExtractor):
885 """Information extractor for photobucket.com."""
# _VALID_URL's single group captures the *.flv filename from ?current=.
887 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
888 IE_NAME = u'photobucket'
890 def __init__(self, downloader=None):
891 InfoExtractor.__init__(self, downloader)
893 def report_download_webpage(self, video_id):
894 """Report webpage download."""
895 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
897 def report_extraction(self, video_id):
898 """Report information extraction."""
899 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
901 def _real_extract(self, url):
902 # Extract id from URL
903 mobj = re.match(self._VALID_URL, url)
905 self._downloader.report_error(u'Invalid URL: %s' % url)
908 video_id = mobj.group(1)
910 video_extension = 'flv'
912 # Retrieve video webpage to extract further information
913 request = compat_urllib_request.Request(url)
915 self.report_download_webpage(video_id)
916 webpage = compat_urllib_request.urlopen(request).read()
917 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
918 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
921 # Extract URL, uploader, and title from webpage
922 self.report_extraction(video_id)
923 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
925 self._downloader.report_error(u'unable to extract media URL')
927 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
# video_url assignment (original ~line 929) elided from this extract.
931 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
933 self._downloader.report_error(u'unable to extract title')
935 video_title = mobj.group(1).decode('utf-8')
937 video_uploader = mobj.group(2).decode('utf-8')
# Result dict (delimiters elided in this extract).
940 'id': video_id.decode('utf-8'),
941 'url': video_url.decode('utf-8'),
942 'uploader': video_uploader,
944 'title': video_title,
945 'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information; may recurse once after rewriting a
        non-/watch/ URL to its canonical /watch/ form."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed below for the playlist request)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':          video_id.decode('utf-8'),
            'url':         video_url,
            'uploader':    video_uploader,
            'upload_date': None,
            'title':       video_title,
            'ext':         video_extension.decode('utf-8'),
            'thumbnail':   video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # play_redirect_hls links resolve to the plain video page
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.
        # split() can raise IndexError if the markers are absent,
        # json.loads() raises ValueError on malformed JSON.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = u''

        # Extract upload date (YYYYMMDD) from the dateCreated meta tag
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title':       video_title,
            'ext':         video_extension,
            'thumbnail':   video_thumbnail,
            'description': video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download and return the page at url, reporting errors via the downloader."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, search it with regex and collect the groups listed in
        matchTuples as a dict {key: group}; each tuple is (group_index, key,
        error message to emit when that group is missing)."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Extract the rtmp URL of a live stream (result is not returned upstream)."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of reference documents for an arte+7 video and
        return its info dictionary."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':          info.get('id'),
            'url':         compat_urllib_parse.unquote(info.get('url')),
            'uploader':    u'arte.tv',
            'upload_date': info.get('date'),
            'title':       info.get('title').decode('utf-8'),
            'ext':         u'mp4',
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live streams are detected but not supported for download
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        headers=newheaders,
                                        origin_req_host=req.get_origin_req_host(),
                                        unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    video_uploader,
            'upload_date': None,
            'title':       video_title,
            'ext':         video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'ytsearch[N|all]:' prefix and hand off to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # the API reports the total match count; never ask for more pages than that
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'gvsearch[N|all]:' prefix and hand off to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No more result pages; download what we have
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'yvsearch[N|all]:' prefix and hand off to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No more result pages; download what we have
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose regex: matched with re.VERBOSE (see suitable/_real_extract).
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                           |    p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then keep only the URLs
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url) for url in videos]
        return [self.playlist_result(url_results, playlist_id)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Accepts a user page URL (http://blip.tv/<user>) or the
    "bliptvuser:<user>" shorthand, scrapes the numeric users_id from the
    page, then pages through blip.tv's mobile episode-list AJAX endpoint
    collecting video ids, and returns one playlist_result of url_results.

    NOTE(review): this excerpt elides some original lines (try/if/loop
    headers); the code lines below are unchanged.
    """

    # Group 1 captures the username from either URL form.
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'

    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username from the URL.
        mobj = re.match(self._VALID_URL, url)
        # (error path — reached when url does not match _VALID_URL)
        self._downloader.report_error(u'invalid url: %s' % url)

        username = mobj.group(1)

        # %s is filled in below with the numeric users_id scraped from the page.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        self.report_download_page(username, pagenum)
        url = page_base + "&page=" + str(pagenum)
        request = compat_urllib_request.Request( url )

        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._PAGE_SIZE:

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        # Wrap each collected id as a url_result and return one playlist.
        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    Rebuilds the URL in the English locale, fetches the page with the
    'Free download' gateway flag set, then scrapes the real file URL and
    the title out of the returned HTML.

    NOTE(review): this excerpt elides some original lines (try/if/return
    statements and the return-dict wrapper); code lines below are unchanged.
    """

    # (?#locale) is a regex comment — the optional ../ segment is a locale prefix.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Surface the site's own restriction message, whitespace-collapsed.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            self._downloader.report_error(u'unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        # Extension without the leading dot.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        # (error path — reached when the title pattern is not found)
        self._downloader.report_error(u'unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Fields of the info dictionary handed to the FileDownloader.
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    Logs in (optional, via --username/--password or .netrc) during
    _real_initialize, then parses the SWF parameter blob embedded in the
    video page JavaScript to find hd_src/sd_src media URLs.

    NOTE(review): this excerpt elides some original lines (try/if/return
    statements and the login form construction); code lines below are unchanged.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name looked up in the user's .netrc file.
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        # Nothing to do when no downloader is attached.
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # (error path — reached when no .netrc entry exists)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Without credentials we skip logging in entirely.
        if useremail is None:

        # login_form is built from useremail/password in elided lines above.
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        login_results = compat_urllib_request.urlopen(request).read()
        # Facebook serves the login form again when authentication failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (error path — reached when url does not match _VALID_URL)
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Canonicalize to the video.php form before downloading.
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The JSON parameter array sits between these two JavaScript anchors.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        # (error path — reached when the anchors are not found)
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer HD, fall back to SD.
        video_url = video_data.get('hd_src')
        video_url = video_data['sd_src']
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        # (error path — reached when the title header is not found)
        raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        # Fields of the returned info dictionary (wrapper lines elided).
        'title': video_title,
        'duration': video_duration,
        'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Resolves /play/ redirect URLs to the canonical /a/a-<id> form, then
    queries the page with skin=json to obtain the media metadata; a
    Content-Type of video/* signals a direct download instead.

    NOTE(review): this excerpt elides some original lines (try blocks,
    cchar selection, info-dict wrappers); code lines below are unchanged.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension of the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that a direct (non-JSON) download was detected."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (error path — reached when url does not match _VALID_URL)
        self._downloader.report_error(u'invalid URL: %s' % url)

        # /play/ URLs redirect; the real file id is in the redirect fragment.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            # Re-run extraction against the canonical URL.
            return self._real_extract(url)

        # cchar ('?' or '&') is chosen in elided lines above.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves richer JSON to the iTunes user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
        # (fragment of the direct-download info dict; wrapper lines elided)
        'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))

        json_data = json.loads(json_code)
        # Some responses wrap the payload in a 'Post' envelope.
        if 'Post' in json_data:
            data = json_data['Post']
        # blip.tv dates look like '11-28-12 12:00AM' -> normalize to YYYYMMDD.
        upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
        video_url = data['media']['url']
        umobj = re.match(self._URL_EXT, video_url)
        # (error path — reached when the media URL has no recognizable extension)
        raise ValueError('Can not determine filename extension')
        ext = umobj.group(1)

        # Fields of the returned info dictionary (wrapper lines elided).
        'id': data['item_id'],
        'uploader': data['display_name'],
        'upload_date': upload_date,
        'title': data['title'],
        'format': data['media']['mimeType'],
        'thumbnail': data['thumbnailUrl'],
        'description': data['description'],
        'player_url': data['embedUrl'],
        'user_agent': 'iTunes/10.6.1',
        except (ValueError,KeyError) as err:
            self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Extracts the flv media URL (derived from the thumbnail link's movie
    base path) and the page <title> for
    http://www.myvideo.de/watch/<id>/<slug> pages.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was `self._download.report_error(...)` — the attribute
            # set by set_downloader() is `_downloader`, so the old spelling
            # raised AttributeError instead of reporting the invalid URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The image_src link embeds the movie base URL; the flv sits beside it.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        # _real_extract() must return a *list* of info dictionaries.
        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report

    Resolves shortname/clip/episode URLs, walks the MRSS show index, and
    transforms each rendition's RTMP URL into a downloadable HTTP one.

    NOTE(review): this excerpt elides some original lines (try blocks,
    else branches, dict literals, format-selection loop); code lines
    below are unchanged.
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
    """

    # Known bitrates, lowest priority first.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Mapping tables (entries elided in this excerpt).
    _video_extensions = {
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        # Dump each format id with its extension and dimensions.
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # (error path — reached when url does not match _VALID_URL)
        self._downloader.report_error(u'invalid URL: %s' % url)

        # Expand :tds / :colbert shortnames to the full-episodes landing page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Choose the episode title source depending on URL flavor.
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                epTitle = mobj.group('cntitle')
            dlNewest = not mobj.group('episode')
            epTitle = mobj.group('showname')
            epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # The landing page redirects to the newest episode; re-parse its URL.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.report_error(u'Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One <item> per episode part; each becomes an info dict.
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like mgid:...:<showId>.com:<shortMediaId>.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, rtmp-url) pairs from each rendition.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

            # (error path — reached when no renditions were collected)
            self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            format, rtmp_video_url = f, v

            # Rewrite the RTMP URL into a plain HTTP one on the llnwd CDN.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            # (fragments of the per-part info dict; wrapper lines elided)
            'upload_date': officialDate,
            'description': officialTitle,
            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist

    Reads the og:video player URL from the page's meta tags, follows its
    config= parameter to a JavaScript-flavored JSON config, and takes the
    second playlist entry as the media URL.

    NOTE(review): this excerpt elides some original lines (try blocks and
    the return-dict wrapper); code lines below are unchanged.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (error path — reached when url does not match _VALID_URL)
        self._downloader.report_error(u'invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = compat_urllib_request.urlopen(url)
        webPageBytes = webPage.read()
        # Decode using the charset advertised in Content-Type, utf-8 otherwise.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))

        # Pull description, thumbnail and player URL out of the meta tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the (percent-encoded) config file location.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = compat_urllib_request.urlopen(configUrl)
        m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
        configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Entry [1] holds the actual media; [0] is an advert/intro slot.
        videoUrl = playlist[1]['url']

        # Fields of the returned info dictionary (wrapper lines elided).
        'uploader': showName,
        'upload_date': None,
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com

    Fetches the moogaloop metadata XML, then the Adobe f4m manifest, and
    rebuilds the segment URL from the manifest's media/id nodes.

    NOTE(review): this excerpt elides some original lines (try blocks,
    info-dict construction, return); code lines below are unchanged.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report XML manifest download."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (error path — reached when url does not match _VALID_URL)
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # (fragment of the initial info dict; wrapper lines elided)
        'upload_date': None,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        # (error path — reached when expected nodes are missing)
        self._downloader.report_error(u'Invalid metadata XML file')

        # hdcore parameter is required by the f4m manifest endpoint.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        # f4m manifest nodes live in the Adobe namespace.
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')

        # Rebuild the first segment URL from the manifest host and ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com

    Scrapes the percent-encoded flv_url, the page title, and the thumbnail
    URL directly from the video page HTML.

    NOTE(review): this excerpt elides some original lines (if/return
    statements and the info-dict wrapper); code lines below are unchanged.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (error path — reached when url does not match _VALID_URL)
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded inside the flashvars).
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.report_error(u'unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title from the <title> tag, dropping the site suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.report_error(u'unable to extract video thumbnail')
        # group(0): the whole matched URL is the thumbnail.
        video_thumbnail = mobj.group(0)

        # Fields of the returned info dictionary (wrapper lines elided).
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid

    NOTE(review): this excerpt elides some original lines (try blocks,
    returns and the info-dict wrapper); code lines below are unchanged.
    """

    # Group 1: uploader slug, group 2: track slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the track id is being resolved via the API."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (error path — reached when url does not match _VALID_URL)
        self._downloader.report_error(u'invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # resolve.json maps the public page URL to the API track record.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # The streams endpoint yields the direct mp3 URL for the track id.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        stream_json_bytes = compat_urllib_request.urlopen(request).read()
        stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # Fields of the returned info dictionary (wrapper lines elided).
        'uploader': info['user']['username'],
        'upload_date': info['created_at'],
        'title': info['title'],
        'description': info['description'],
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    # Group 1: uploader slug, group 2: set slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the set id is being resolved via the API."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # CONSISTENCY FIX: this class used the legacy
            # self._downloader.trouble(u'ERROR: ...') spelling while every
            # other extractor in this file calls report_error(u'...');
            # switched to report_error throughout.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # resolve.json maps the public set URL to the API set record.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # The API reports per-set errors as a list; surface each one.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        # One info dict per track in the set.
        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            # The streams endpoint yields the direct mp3 URL for the track id.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com

    Decodes the base64 jsclassref attribute into the real media id and
    builds an rtmpe:// URL from it; title and description come from the
    page HTML.

    NOTE(review): this excerpt elides some original lines (if/return
    statements and the info-dict wrapper); code lines below are unchanged.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (error path — reached when url does not match _VALID_URL)
        self._downloader.report_error(u'invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: jsclassref holds the base64-encoded media id.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.report_error(u'unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive id and extension from the media filename.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # Fields of the returned info dictionary (wrapper lines elided).
        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
2950 class MixcloudIE(InfoExtractor):
2951 """Information extractor for www.mixcloud.com"""
2953 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2954 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2955 IE_NAME = u'mixcloud'
    def __init__(self, downloader=None):
        """Constructor; delegates to the base class to attach the optional downloader."""
        InfoExtractor.__init__(self, downloader)
2960 def report_download_json(self, file_id):
2961 """Report JSON download."""
2962 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2964 def report_extraction(self, file_id):
2965 """Report information extraction."""
2966 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2968 def get_urls(self, jsonData, fmt, bitrate='best'):
2969 """Get urls from 'audio_formats' section in json"""
2972 bitrate_list = jsonData[fmt]
2973 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2974 bitrate = max(bitrate_list) # select highest
2976 url_list = jsonData[fmt][bitrate]
2977 except TypeError: # we have no bitrate info.
2978 url_list = jsonData[fmt]
2981 def check_urls(self, url_list):
2982 """Returns 1st active url from list"""
2983 for url in url_list:
2985 compat_urllib_request.urlopen(url)
2987 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2992 def _print_formats(self, formats):
2993 print('Available formats:')
2994 for fmt in formats.keys():
2995 for b in formats[fmt]:
2997 ext = formats[fmt][b][0]
2998 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2999 except TypeError: # we have no bitrate info
3000 ext = formats[fmt][0]
3001 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
3004 def _real_extract(self, url):
3005 mobj = re.match(self._VALID_URL, url)
3007 self._downloader.report_error(u'invalid URL: %s' % url)
3009 # extract uploader & filename from url
3010 uploader = mobj.group(1).decode('utf-8')
3011 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3013 # construct API request
3014 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3015 # retrieve .json file with links to files
3016 request = compat_urllib_request.Request(file_url)
3018 self.report_download_json(file_url)
3019 jsonData = compat_urllib_request.urlopen(request).read()
3020 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3021 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
3025 json_data = json.loads(jsonData)
3026 player_url = json_data['player_swf_url']
3027 formats = dict(json_data['audio_formats'])
3029 req_format = self._downloader.params.get('format', None)
3032 if self._downloader.params.get('listformats', None):
3033 self._print_formats(formats)
3036 if req_format is None or req_format == 'best':
3037 for format_param in formats.keys():
3038 url_list = self.get_urls(formats, format_param)
3040 file_url = self.check_urls(url_list)
3041 if file_url is not None:
3044 if req_format not in formats:
3045 self._downloader.report_error(u'format is not available')
3048 url_list = self.get_urls(formats, req_format)
3049 file_url = self.check_urls(url_list)
3050 format_param = req_format
3053 'id': file_id.decode('utf-8'),
3054 'url': file_url.decode('utf-8'),
3055 'uploader': uploader.decode('utf-8'),
3056 'upload_date': None,
3057 'title': json_data['name'],
3058 'ext': file_url.split('.')[-1].decode('utf-8'),
3059 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3060 'thumbnail': json_data['thumbnail_url'],
3061 'description': json_data['description'],
3062 'player_url': player_url.decode('utf-8'),
# StanfordOpenClassroomIE: three-level extractor for openclassroom.stanford.edu.
# Depending on which named groups of _VALID_URL matched, it handles a single
# video (course+video), a course page (course only, recursing into its
# VideoPage links), or the site root (recursing into CoursePage links).
# NOTE(review): sampled excerpt — 'if mobj is None:'/'try:' lines, info-dict
# openers and several 'return results' lines are missing between the visible
# lines; comments describe only what is shown.
3065 class StanfordOpenClassroomIE(InfoExtractor):
3066 """Information extractor for Stanford's Open ClassRoom"""
3068 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3069 IE_NAME = u'stanfordoc'
3071 def report_download_webpage(self, objid):
3072 """Report information extraction."""
3073 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3075 def report_extraction(self, video_id):
3076 """Report information extraction."""
3077 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3079 def _real_extract(self, url):
3080 mobj = re.match(self._VALID_URL, url)
3082 raise ExtractorError(u'Invalid URL: %s' % url)
3084 if mobj.group('course') and mobj.group('video'): # A specific video
3085 course = mobj.group('course')
3086 video = mobj.group('video')
3088 'id': course + '_' + video,
3090 'upload_date': None,
3093 self.report_extraction(info['id'])
# Per-video metadata lives in an XML file alongside the course's videos dir.
3094 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3095 xmlUrl = baseUrl + video + '.xml'
3097 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3098 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3099 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
3101 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3103 info['title'] = mdoc.findall('./title')[0].text
3104 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3106 self._downloader.report_error(u'Invalid metadata XML file')
3108 info['ext'] = info['url'].rpartition('.')[2]
3110 elif mobj.group('course'): # A course page
3111 course = mobj.group('course')
3116 'upload_date': None,
3119 coursepage = self._download_webpage(url, info['id'],
3120 note='Downloading course info page',
3121 errnote='Unable to download course info page')
3123 m = re.search('<h1>([^<]+)</h1>', coursepage)
3125 info['title'] = unescapeHTML(m.group(1))
# Fall back to the course id when no <h1> title is found.
3127 info['title'] = info['id']
3129 m = re.search('<description>([^<]+)</description>', coursepage)
3131 info['description'] = unescapeHTML(m.group(1))
# Collect unique VideoPage links and recurse into each via self.extract().
3133 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3136 'type': 'reference',
3137 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3141 for entry in info['list']:
3142 assert entry['type'] == 'reference'
3143 results += self.extract(entry['url'])
# else-branch: the site root — enumerate every CoursePage and recurse.
3147 'id': 'Stanford OpenClassroom',
3150 'upload_date': None,
3153 self.report_download_webpage(info['id'])
3154 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3156 rootpage = compat_urllib_request.urlopen(rootURL).read()
3157 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3158 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3161 info['title'] = info['id']
3163 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3166 'type': 'reference',
3167 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3172 for entry in info['list']:
3173 assert entry['type'] == 'reference'
3174 results += self.extract(entry['url'])
# MTVIE: scrapes mtv.com video pages for the song/performer meta tags, then
# queries the mediaGen XML service for rendition (quality) info.
# NOTE(review): sampled excerpt — the 'if mobj is None:'/'return' guards,
# 'try:' headers and the info-dict opener are missing between visible lines.
3177 class MTVIE(InfoExtractor):
3178 """Information extractor for MTV.com"""
3180 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3183 def report_extraction(self, video_id):
3184 """Report information extraction."""
3185 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3187 def _real_extract(self, url):
3188 mobj = re.match(self._VALID_URL, url)
3190 self._downloader.report_error(u'invalid URL: %s' % url)
# Normalize scheme-less URLs so the download request is well-formed.
3192 if not mobj.group('proto'):
3193 url = 'http://' + url
3194 video_id = mobj.group('videoid')
3196 webpage = self._download_webpage(url, video_id)
3198 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3200 self._downloader.report_error(u'unable to extract song name')
# NOTE(review): .decode('iso-8859-1') on a match group is Python-2-only.
3202 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3203 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3205 self._downloader.report_error(u'unable to extract performer')
3207 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3208 video_title = performer + ' - ' + song_name
3210 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message reads "unable to mtvn_uri" — missing the word
# "extract" (runtime string; left unchanged in this doc-only pass).
3212 self._downloader.report_error(u'unable to mtvn_uri')
3214 mtvn_uri = mobj.group(1)
3216 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3218 self._downloader.report_error(u'unable to extract content id')
3220 content_id = mobj.group(1)
# mediaGen returns an XML playlist of renditions for this uri/id/vid.
3222 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3223 self.report_extraction(video_id)
3224 request = compat_urllib_request.Request(videogen_url)
3226 metadataXml = compat_urllib_request.urlopen(request).read()
3227 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3228 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3231 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3232 renditions = mdoc.findall('.//rendition')
3234 # For now, always pick the highest quality.
3235 rendition = renditions[-1]
# Build a format label like "mp4-640x360_800" from the rendition attrs.
3238 _,_,ext = rendition.attrib['type'].partition('/')
3239 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3240 video_url = rendition.find('./src').text
3242 self._downloader.trouble('Invalid rendition field.')
3248 'uploader': performer,
3249 'upload_date': None,
3250 'title': video_title,
# YoukuIE: fetches the getPlayList JSON for a v.youku.com video, descrambles
# the segment file id with a seeded pseudo-random shuffle, and emits one
# info dict per flv segment.
# NOTE(review): sampled excerpt — notably the 'def _gen_sid(self):' header
# before line 3270 and several guards/openers are missing from this view.
3258 class YoukuIE(InfoExtractor):
3259 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3261 def report_download_webpage(self, file_id):
3262 """Report webpage download."""
3263 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3265 def report_extraction(self, file_id):
3266 """Report information extraction."""
3267 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# (body of _gen_sid — builds a session id from the ms timestamp plus two
# random components; the def line itself is missing from this excerpt)
3270 nowTime = int(time.time() * 1000)
3271 random1 = random.randint(1000,1998)
3272 random2 = random.randint(1000,9999)
3274 return "%d%d%d" %(nowTime,random1,random2)
3276 def _get_file_ID_mix_string(self, seed):
# Linear-congruential shuffle of the alphabet, deterministic for a seed.
3278 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3280 for i in range(len(source)):
3281 seed = (seed * 211 + 30031 ) % 65536
3282 index = math.floor(seed / 65536 * len(source) )
3283 mixed.append(source[int(index)])
3284 source.remove(source[int(index)])
3285 #return ''.join(mixed)
3288 def _get_file_id(self, fileId, seed):
# Map each '*'-separated index in fileId through the shuffled alphabet.
3289 mixed = self._get_file_ID_mix_string(seed)
3290 ids = fileId.split('*')
3294 realId.append(mixed[int(ch)])
3295 return ''.join(realId)
3297 def _real_extract(self, url):
3298 mobj = re.match(self._VALID_URL, url)
3300 self._downloader.report_error(u'invalid URL: %s' % url)
3302 video_id = mobj.group('ID')
3304 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3306 request = compat_urllib_request.Request(info_url, None, std_headers)
3308 self.report_download_webpage(video_id)
3309 jsondata = compat_urllib_request.urlopen(request).read()
3310 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3311 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3314 self.report_extraction(video_id)
3316 jsonstr = jsondata.decode('utf-8')
3317 config = json.loads(jsonstr)
3319 video_title = config['data'][0]['title']
3320 seed = config['data'][0]['seed']
3322 format = self._downloader.params.get('format', None)
3323 supported_format = list(config['data'][0]['streamfileids'].keys())
# Default format selection prefers HD ('hd2') when available.
3325 if format is None or format == 'best':
3326 if 'hd2' in supported_format:
3331 elif format == 'worst':
3339 fileid = config['data'][0]['streamfileids'][format]
3340 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3341 except (UnicodeDecodeError, ValueError, KeyError):
3342 self._downloader.report_error(u'unable to extract info section')
3346 sid = self._gen_sid()
3347 fileid = self._get_file_id(fileid, seed)
3349 #column 8,9 of fileid represent the segment number
3350 #fileid[7:9] should be changed
3351 for index, key in enumerate(keys):
# Splice the hex segment index into positions 8-9 of the descrambled id.
3353 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3354 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3357 'id': '%s_part%02d' % (video_id, index),
3358 'url': download_url,
3360 'upload_date': None,
3361 'title': video_title,
3364 files_info.append(info)
# XNXXIE: scrapes the flv URL, title, and thumbnail out of a video.xnxx.com
# page using three class-level regexes.
# NOTE(review): sampled excerpt — the 'if ... is None:'/'return' guards and
# the info-dict opener are missing between visible lines.
3369 class XNXXIE(InfoExtractor):
3370 """Information extractor for xnxx.com"""
3372 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: URL-encoded flv url, <title> text, big thumbnail.
3374 VIDEO_URL_RE = r'flv_url=(.*?)&'
3375 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3376 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3378 def report_webpage(self, video_id):
3379 """Report information extraction"""
3380 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3382 def report_extraction(self, video_id):
3383 """Report information extraction"""
3384 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3386 def _real_extract(self, url):
3387 mobj = re.match(self._VALID_URL, url)
3389 self._downloader.report_error(u'invalid URL: %s' % url)
3391 video_id = mobj.group(1)
3393 self.report_webpage(video_id)
3395 # Get webpage content
3397 webpage_bytes = compat_urllib_request.urlopen(url).read()
3398 webpage = webpage_bytes.decode('utf-8')
3399 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3400 self._downloader.report_error(u'unable to download video webpage: %s' % err)
3403 result = re.search(self.VIDEO_URL_RE, webpage)
3405 self._downloader.report_error(u'unable to extract video url')
# flv_url is percent-encoded in the page; unquote to get the real URL.
3407 video_url = compat_urllib_parse.unquote(result.group(1))
3409 result = re.search(self.VIDEO_TITLE_RE, webpage)
3411 self._downloader.report_error(u'unable to extract video title')
3413 video_title = result.group(1)
3415 result = re.search(self.VIDEO_THUMB_RE, webpage)
3417 self._downloader.report_error(u'unable to extract video thumbnail')
3419 video_thumbnail = result.group(1)
3425 'upload_date': None,
3426 'title': video_title,
3428 'thumbnail': video_thumbnail,
3429 'description': None,
# GooglePlusIE: two-step extraction for Google+ posts — scrape the post page
# for date/uploader/title and the photo-viewer page URL, then scrape that
# page for googlevideo redirector links and take the highest resolution.
# NOTE(review): sampled excerpt — 'try:' headers and 'if mobj is None:'
# guards are missing between visible lines; comments describe only what is
# shown.
3433 class GooglePlusIE(InfoExtractor):
3434 """Information extractor for plus.google.com."""
3436 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3437 IE_NAME = u'plus.google'
3439 def __init__(self, downloader=None):
3440 InfoExtractor.__init__(self, downloader)
3442 def report_extract_entry(self, url):
3443 """Report that the post entry is being downloaded."""
3444 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3446 def report_date(self, upload_date):
3447 """Report the entry's upload date."""
3448 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3450 def report_uploader(self, uploader):
3451 """Report the entry's uploader."""
3452 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3454 def report_title(self, video_title):
3455 """Report the entry's title."""
3456 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3458 def report_extract_vid_page(self, video_page):
3459 """Report information extraction."""
3460 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3462 def _real_extract(self, url):
3463 # Extract id from URL
3464 mobj = re.match(self._VALID_URL, url)
3466 self._downloader.report_error(u'Invalid URL: %s' % url)
3469 post_url = mobj.group(0)
3470 video_id = mobj.group(1)
3472 video_extension = 'flv'
3474 # Step 1, Retrieve post webpage to extract further information
3475 self.report_extract_entry(post_url)
3476 request = compat_urllib_request.Request(post_url)
3478 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3479 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3480 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3483 # Extract update date
3485 pattern = 'title="Timestamp">(.*?)</a>'
3486 mobj = re.search(pattern, webpage)
3488 upload_date = mobj.group(1)
3489 # Convert timestring to a format suitable for filename
3490 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3491 upload_date = upload_date.strftime('%Y%m%d')
3492 self.report_date(upload_date)
3496 pattern = r'rel\="author".*?>(.*?)</a>'
3497 mobj = re.search(pattern, webpage)
3499 uploader = mobj.group(1)
3500 self.report_uploader(uploader)
3503 # Get the first line for title
3505 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3506 mobj = re.search(pattern, webpage)
3508 video_title = mobj.group(1)
3509 self.report_title(video_title)
3511 # Step 2, Stimulate clicking the image box to launch video
3512 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3513 mobj = re.search(pattern, webpage)
3515 self._downloader.report_error(u'unable to extract video page URL')
3517 video_page = mobj.group(1)
3518 request = compat_urllib_request.Request(video_page)
3520 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3521 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3522 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3524 self.report_extract_vid_page(video_page)
3527 # Extract video links on video page
3528 """Extract video links of all sizes"""
3529 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3530 mobj = re.findall(pattern, webpage)
3532 self._downloader.report_error(u'unable to extract video links')
# findall yields (resolution, url) tuples, so sorting orders by resolution.
3534 # Sort in resolution
3535 links = sorted(mobj)
3537 # Choose the lowest of the sort, i.e. highest resolution
3538 video_url = links[-1]
3539 # Only get the url. The resolution part in the tuple has no use anymore
3540 video_url = video_url[-1]
3541 # Treat escaped \u0026 style hex
3543 video_url = video_url.decode("unicode_escape")
3544 except AttributeError: # Python 3
3545 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3551 'uploader': uploader,
3552 'upload_date': upload_date,
3553 'title': video_title,
3554 'ext': video_extension,
# NBAIE: derives the CDN mp4 URL directly from the nba.com page path and
# scrapes title/date/description from page meta tags.
# NOTE(review): sampled excerpt — guards, the _findProp fallback branch and
# the info-dict opener/closer are missing between visible lines.
3557 class NBAIE(InfoExtractor):
3558 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3561 def _real_extract(self, url):
3562 mobj = re.match(self._VALID_URL, url)
3564 self._downloader.report_error(u'invalid URL: %s' % url)
3567 video_id = mobj.group(1)
# Strip trailing /index.html so the id maps onto the CDN path.
3568 if video_id.endswith('/index.html'):
3569 video_id = video_id[:-len('/index.html')]
3571 webpage = self._download_webpage(url, video_id)
3573 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Small helper: regex-scrape the page, HTML-unescape group 1, else default.
3574 def _findProp(rexp, default=None):
3575 m = re.search(rexp, webpage)
3577 return unescapeHTML(m.group(1))
3581 shortened_video_id = video_id.rpartition('/')[2]
3582 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3584 'id': shortened_video_id,
# NOTE(review): key 'uploader_date' looks like a typo for 'upload_date'
# (cf. the optional-fields list on InfoExtractor) — confirm before fixing.
3588 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3589 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# JustinTVIE: extractor for justin.tv / twitch.tv using the api.justin.tv
# JSON API, paging through channel archives 100 entries at a time.
# NOTE(review): sampled excerpt — 'try:' headers, guards, the per-clip
# info-dict opener, the paging-loop header and 'return info' are missing
# between visible lines.
3593 class JustinTVIE(InfoExtractor):
3594 """Information extractor for justin.tv and twitch.tv"""
3595 # TODO: One broadcast may be split into multiple videos. The key
3596 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3597 # starts at 1 and increases. Can we treat all parts as one video?
3599 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3600 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3601 _JUSTIN_PAGE_LIMIT = 100
3602 IE_NAME = u'justin.tv'
3604 def report_extraction(self, file_id):
3605 """Report information extraction."""
3606 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3608 def report_download_page(self, channel, offset):
3609 """Report attempt to download a single page of videos."""
3610 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3611 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3613 # Return count of items, list of *valid* items
3614 def _parse_page(self, url):
3616 urlh = compat_urllib_request.urlopen(url)
3617 webpage_bytes = urlh.read()
3618 webpage = webpage_bytes.decode('utf-8', 'ignore')
3619 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3620 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
3623 response = json.loads(webpage)
# A non-list response is the API's error envelope.
3624 if type(response) != list:
3625 error_text = response.get('error', 'unknown error')
3626 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3629 for clip in response:
3630 video_url = clip['video_file_url']
3632 video_extension = os.path.splitext(video_url)[1][1:]
# start_time begins 'YYYY-MM-DD...'; strip dashes to get YYYYMMDD.
3633 video_date = re.sub('-', '', clip['start_time'][:10])
3634 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3635 video_id = clip['id']
3636 video_title = clip.get('title', video_id)
3640 'title': video_title,
3641 'uploader': clip.get('channel_name', video_uploader_id),
3642 'uploader_id': video_uploader_id,
3643 'upload_date': video_date,
3644 'ext': video_extension,
3646 return (len(response), info)
3648 def _real_extract(self, url):
3649 mobj = re.match(self._VALID_URL, url)
3651 self._downloader.report_error(u'invalid URL: %s' % url)
3654 api = 'http://api.justin.tv'
# lastindex distinguishes a channel URL (group 1) from a /b/ broadcast id.
3655 video_id = mobj.group(mobj.lastindex)
3657 if mobj.lastindex == 1:
3659 api += '/channel/archives/%s.json'
3661 api += '/broadcast/by_archive/%s.json'
3662 api = api % (video_id,)
3664 self.report_extraction(video_id)
3668 limit = self._JUSTIN_PAGE_LIMIT
3671 self.report_download_page(video_id, offset)
3672 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3673 page_count, page_info = self._parse_page(page_url)
3674 info.extend(page_info)
# A short page means the archive is exhausted; stop paging.
3675 if not paged or page_count != limit:
# FunnyOrDieIE: scrapes the second <source> element for the stream URL and
# falls back from the player <h1> to the page <title> for the video title.
# NOTE(review): sampled excerpt — 'if m is None:' guards and the info-dict
# opener are missing between visible lines.
3680 class FunnyOrDieIE(InfoExtractor):
3681 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3683 def _real_extract(self, url):
3684 mobj = re.match(self._VALID_URL, url)
3686 self._downloader.report_error(u'invalid URL: %s' % url)
3689 video_id = mobj.group('id')
3690 webpage = self._download_webpage(url, video_id)
# The second <source> inside <video> carries the actual stream URL.
3692 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3694 self._downloader.report_error(u'unable to find video information')
3695 video_url = unescapeHTML(m.group('url'))
3697 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
# Fallback title source: the document <title>.
3699 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3701 self._downloader.trouble(u'Cannot find video title')
3702 title = clean_html(m.group('title'))
3704 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3706 desc = unescapeHTML(m.group('desc'))
3715 'description': desc,
# SteamIE: lists all trailers on a store.steampowered.com game page by
# zipping three parallel regex scans (movie urls, titles, thumbnails).
# NOTE(review): sampled excerpt — the _VALID_URL verbose-regex body is
# partially missing (the gameID group referenced at 3734 is defined on an
# unseen line), as are list/dict openers and the return statement.
3719 class SteamIE(InfoExtractor):
3720 _VALID_URL = r"""http://store.steampowered.com/
3721 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3723 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# suitable() is overridden because _VALID_URL needs the re.VERBOSE flag.
3727 def suitable(cls, url):
3728 """Receives a URL and returns True if suitable for this IE."""
3729 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3731 def _real_extract(self, url):
3732 m = re.match(self._VALID_URL, url, re.VERBOSE)
# Matches the JS movie table entries embedded in the page.
3733 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3734 gameID = m.group('gameID')
3735 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3736 webpage = self._download_webpage(videourl, gameID)
3737 mweb = re.finditer(urlRE, webpage)
3738 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3739 titles = re.finditer(namesRE, webpage)
3740 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3741 thumbs = re.finditer(thumbsRE, webpage)
# The three scans are assumed to align positionally — zip pairs them up.
3743 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3744 video_id = vid.group('videoID')
3745 title = vtitle.group('videoName')
3746 video_url = vid.group('videoURL')
3747 video_thumb = thumb.group('thumbnail')
3749 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3754 'title': unescapeHTML(title),
3755 'thumbnail': video_thumb
# UstreamIE: builds the CDN URL directly from the recorded-video id and
# scrapes title/uploader from page attributes.
# NOTE(review): sampled excerpt — the info-dict opener/closer and return are
# missing after line 3772; m is also used unguarded (would raise on no match).
3760 class UstreamIE(InfoExtractor):
3761 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3762 IE_NAME = u'ustream'
3764 def _real_extract(self, url):
3765 m = re.match(self._VALID_URL, url)
3766 video_id = m.group('videoID')
# Recorded videos map 1:1 onto this CDN path; no page parsing needed for it.
3767 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3768 webpage = self._download_webpage(url, video_id)
3769 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3770 title = m.group('title')
3771 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3772 uploader = m.group('uploader')
3778 'uploader': uploader
# WorldStarHipHopIE: scrapes the hw-videos CDN URL straight out of the page
# source and picks a title from <title> or, for "candy" pages, a
# candytitles span.
# NOTE(review): sampled excerpt — ext assignment branches (mp4/flv), the
# info-dict opener and return are missing between visible lines.
3782 class WorldStarHipHopIE(InfoExtractor):
3783 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3784 IE_NAME = u'WorldStarHipHop'
3786 def _real_extract(self, url):
# Matches either an .mp4 or .flv URL on the hw-videos CDN.
3787 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3789 webpage_src = compat_urllib_request.urlopen(url).read()
3790 webpage_src = webpage_src.decode('utf-8')
3792 mobj = re.search(_src_url, webpage_src)
3794 m = re.match(self._VALID_URL, url)
3795 video_id = m.group('id')
3797 if mobj is not None:
3798 video_url = mobj.group()
3799 if 'mp4' in video_url:
3804 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3807 _title = r"""<title>(.*)</title>"""
3809 mobj = re.search(_title, webpage_src)
3811 if mobj is not None:
3812 title = mobj.group(1)
# NOTE(review): "World Start Hip Hop" looks like a typo for "World Star",
# but it is a runtime string, so it is left untouched in this doc-only pass.
3814 title = 'World Start Hip Hop - %s' % time.ctime()
3816 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3817 mobj = re.search(_thumbnail, webpage_src)
3819 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3820 if mobj is not None:
3821 thumbnail = mobj.group(1)
# Candy pages keep the real title in a candytitles span instead of <title>.
3823 _title = r"""candytitles.*>(.*)</span>"""
3824 mobj = re.search(_title, webpage_src)
3825 if mobj is not None:
3826 title = mobj.group(1)
3833 'thumbnail' : thumbnail,
# RBMARadioIE: pulls the show metadata JSON embedded in a window.gon script
# tag and builds the Akamai stream URL from it.
# NOTE(review): sampled excerpt — the 'if m is None:' guard, 'try:' header
# and info-dict opener/closer are missing between visible lines.
3838 class RBMARadioIE(InfoExtractor):
3839 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3841 def _real_extract(self, url):
3842 m = re.match(self._VALID_URL, url)
3843 video_id = m.group('videoID')
3845 webpage = self._download_webpage(url, video_id)
# The show metadata is inlined as JSON in a window.gon assignment.
3846 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3848 raise ExtractorError(u'Cannot find metadata')
3849 json_data = m.group(1)
3852 data = json.loads(json_data)
3853 except ValueError as e:
3854 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Append a constant bitrate parameter to the Akamai stream URL.
3856 video_url = data['akamai_url'] + '&cbr=256'
3857 url_parts = compat_urllib_parse_urlparse(video_url)
3858 video_ext = url_parts.path.rpartition('.')[2]
3863 'title': data['title'],
3864 'description': data.get('teaser_text'),
3865 'location': data.get('country_of_origin'),
3866 'uploader': data.get('host', {}).get('name'),
3867 'uploader_id': data.get('host', {}).get('slug'),
3868 'thumbnail': data.get('image', {}).get('large_url_2x'),
3869 'duration': data.get('duration'),
# YouPornIE: scrapes the download-list on a youporn.com watch page into a
# list of per-quality format dicts, then applies the user's requested
# format selection (best / worst / all / specific).
# NOTE(review): sampled excerpt — guards, the per-link loop header, the
# format-dict opener and several return statements are missing between
# visible lines.
3874 class YouPornIE(InfoExtractor):
3875 """Information extractor for youporn.com."""
3876 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3878 def _print_formats(self, formats):
3879 """Print all available formats"""
3880 print(u'Available formats:')
3881 print(u'ext\t\tformat')
3882 print(u'---------------------------------')
3883 for format in formats:
3884 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Returns the format dict whose 'format' label equals req_format (visible
# body is partial — the loop header and return lines are not shown).
3886 def _specific(self, req_format, formats):
3888 if(x["format"]==req_format):
3892 def _real_extract(self, url):
3893 mobj = re.match(self._VALID_URL, url)
3895 self._downloader.report_error(u'invalid URL: %s' % url)
3898 video_id = mobj.group('videoid')
# Pretend the age gate was already passed.
3900 req = compat_urllib_request.Request(url)
3901 req.add_header('Cookie', 'age_verified=1')
3902 webpage = self._download_webpage(req, video_id)
3904 # Get the video title
3905 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3907 raise ExtractorError(u'Unable to extract video title')
3908 video_title = result.group('title').strip()
3910 # Get the video date
3911 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3913 self._downloader.report_warning(u'unable to extract video date')
3916 upload_date = result.group('date').strip()
3918 # Get the video uploader
3919 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3921 self._downloader.report_warning(u'unable to extract uploader')
3922 video_uploader = None
3924 video_uploader = result.group('uploader').strip()
3925 video_uploader = clean_html( video_uploader )
3927 # Get all of the formats available
3928 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3929 result = re.search(DOWNLOAD_LIST_RE, webpage)
3931 raise ExtractorError(u'Unable to extract download list')
3932 download_list_html = result.group('download_list').strip()
3934 # Get all of the links from the page
3935 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3936 links = re.findall(LINK_RE, download_list_html)
3937 if(len(links) == 0):
3938 raise ExtractorError(u'ERROR: no known formats available for video')
3940 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3945 # A link looks like this:
3946 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3947 # A path looks like this:
3948 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3949 video_url = unescapeHTML( link )
3950 path = compat_urllib_parse_urlparse( video_url ).path
3951 extension = os.path.splitext( path )[1][1:]
# Path segment 4 encodes "<size>_<bitrate>_..."; keep the first two parts.
3952 format = path.split('/')[4].split('_')[:2]
3955 format = "-".join( format )
3956 title = u'%s-%s-%s' % (video_title, size, bitrate)
3961 'uploader': video_uploader,
3962 'upload_date': upload_date,
3967 'description': None,
3971 if self._downloader.params.get('listformats', None):
3972 self._print_formats(formats)
3975 req_format = self._downloader.params.get('format', None)
3976 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
# Formats are assumed ordered best-first: [0] is best, [-1] is worst.
3978 if req_format is None or req_format == 'best':
3980 elif req_format == 'worst':
3981 return [formats[-1]]
3982 elif req_format in ('-1', 'all'):
3985 format = self._specific( req_format, formats )
3987 self._downloader.report_error(u'requested format not available')
# PornotubeIE: takes the video id and title from the URL itself, then
# scrapes the flv URL and upload date from the page.
# NOTE(review): sampled excerpt — 'if result is None:'/'return' guards and
# the tail of the info dict are missing between visible lines.
3993 class PornotubeIE(InfoExtractor):
3994 """Information extractor for pornotube.com."""
3995 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3997 def _real_extract(self, url):
3998 mobj = re.match(self._VALID_URL, url)
4000 self._downloader.report_error(u'invalid URL: %s' % url)
4003 video_id = mobj.group('videoid')
4004 video_title = mobj.group('title')
4006 # Get webpage content
4007 webpage = self._download_webpage(url, video_id)
4010 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
4011 result = re.search(VIDEO_URL_RE, webpage)
4013 self._downloader.report_error(u'unable to extract video url')
4015 video_url = compat_urllib_parse.unquote(result.group('url'))
4017 #Get the uploaded date
4018 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
4019 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): this error message says "video title" but the failed
# extraction is the upload date (runtime string; left unchanged here).
4021 self._downloader.report_error(u'unable to extract video title')
4023 upload_date = result.group('date')
4025 info = {'id': video_id,
4028 'upload_date': upload_date,
4029 'title': video_title,
# YouJizzIE: two-page scrape — the watch page yields the title and the
# embed-page URL, the embed page yields the actual stream URL from a
# flash-player addVariable call.
# NOTE(review): sampled excerpt — 'if result is None:' guards and part of
# the info dict are missing between visible lines.
4035 class YouJizzIE(InfoExtractor):
4036 """Information extractor for youjizz.com."""
4037 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
4039 def _real_extract(self, url):
4040 mobj = re.match(self._VALID_URL, url)
4042 self._downloader.report_error(u'invalid URL: %s' % url)
4045 video_id = mobj.group('videoid')
4047 # Get webpage content
4048 webpage = self._download_webpage(url, video_id)
4050 # Get the video title
4051 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
4053 raise ExtractorError(u'ERROR: unable to extract video title')
4054 video_title = result.group('title').strip()
4056 # Get the embed page
4057 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
4059 raise ExtractorError(u'ERROR: unable to extract embed page')
4061 embed_page_url = result.group(0).strip()
# The numeric id from the embed URL replaces the slug id from the watch URL.
4062 video_id = result.group('videoid')
4064 webpage = self._download_webpage(embed_page_url, video_id)
# The stream URL is passed to the flash player via so.addVariable("file", ...).
4067 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
4069 raise ExtractorError(u'ERROR: unable to extract video url')
4070 video_url = result.group('source')
4072 info = {'id': video_id,
4074 'title': video_title,
4077 'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded as a JSON blob in the page
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        # A random session id is required by the play API
        session = str(random.randint(0, 1000000000))
        # Fixed: mix_id was referenced below but never assigned (NameError).
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        # Tracks are served one at a time; keep asking for the next one
        # until the API flags the last track.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            # Fixed: results were previously dropped instead of collected.
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Video and thumbnail URLs are derived directly from the video id
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if m is None:
            # Fail cleanly instead of crashing with AttributeError on m.group
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        # The uploader block is not always present; treat it as optional.
        uploader = clean_html(m.group('uploader')) if m is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # NOTE(review): interior of this verbose pattern reconstructed from the
    # surviving fragments -- confirm the alternation against upstream.
    _VALID_URL=r'''http://www.ted.com/
                   (
                   ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                   |
                   ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # The pattern above relies on re.VERBOSE, so the base-class
        # suitable() (which matches without that flag) cannot be used.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            # Single talk: one info dict
            return [self._talk_info(url)]
        else:
            # Playlist: extract every talk it references
            playlist_id = m.group('playlist_id')
            name = m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME, playlist_id, name))
            return self._playlist_videos_info(url, name, playlist_id)

    def _talk_video_link(self, mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # Ids and names come from two separate regexes, zipped positionally
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)
        info = []
        for m_video, m_name in zip(m_videos, m_names):
            video_id = m_video.group('video_id')
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url, video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        videoName = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE = r'<span id="altHeadline" >(?P<title>.*)</span>'
        title_match = re.search(title_RE, webpage)
        if title_match is None:
            # Guard added: a failed search previously crashed with AttributeError
            raise ExtractorError(u'unable to extract talk title')
        title = title_match.group('title')
        info_RE = r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE = r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, webpage)
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        if info_match is None or thumb_match is None:
            # Guard added for the same reason as above
            raise ExtractorError(u'unable to extract talk details')
        video_id = info_match.group('videoID')
        mediaSlug = info_match.group('mediaSlug')
        video_url = self._talk_video_link(mediaSlug)
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail')
        }
        return info
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (XML metadata API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        # renamed from `format` to avoid shadowing the builtin
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # NOTE(review): fallback when the element is missing -- confirm
            # the intended default against upstream.
            video_format = 'mp4'
        else:
            video_format = format_id_el.text
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for video pages on spiegel.de."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', page)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # A per-video XML document lists the available media variants
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last variant listed in the XML
        chosen = idoc[-1]
        filename = chosen.findall('./filename')[0].text
        duration = float(chosen.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # was self._downloader.trouble(...) -- deprecated API; use the
            # same report_error style as the other extractors
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # was trouble(), which let execution continue and then crash on
            # m.group below; raise a proper extractor error instead
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc')) if m else None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(m.group(1)) if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
class ARDIE(InfoExtractor):
    """Information extractor for the ARD / Das Erste Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            # prefer the numeric documentId query parameter when present
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # no `assert` for page-content checks: asserts are stripped under
            # -O and otherwise die with a bare AssertionError
            if '"fsk"' in html:
                # age-restricted content is only streamed in the evening
                self._downloader.report_error(u'this video is only available after 8:00 pm')
                return
            raise ExtractorError(u'unable to extract media streams')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
            if not stream['video_url'].startswith('mp4:'):
                raise ExtractorError(u'unexpected RTMP play path: %s' % stream['video_url'])
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            if not stream["video_url"].endswith('.mp4'):
                raise ExtractorError(u'unexpected video URL: %s' % stream["video_url"])
            info["url"] = stream["video_url"]
        return [info]
4401 def gen_extractors():
4402 """ Return a list of an instance of every supported extractor.
4403 The order does matter; the first extractor matched is the one handling the URL.
4406 YoutubePlaylistIE(),
4431 StanfordOpenClassroomIE(),
4441 WorldStarHipHopIE(),