X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=6d8e7be109253306b757d7def8ad142ec7076c01;hb=a11ea50319c5dc5d01098e28122617391c97d555;hp=1d774b91e7d0ea6fb3a409ad04acfb375d35b3ba;hpb=55c05398724f8bbb58db1b5ea55ad22038155c36;p=youtube-dl diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py old mode 100644 new mode 100755 index 1d774b91e..6d8e7be10 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -3,7 +3,9 @@ from __future__ import absolute_import +import base64 import datetime +import itertools import netrc import os import re @@ -13,6 +15,7 @@ import email.utils import xml.etree.ElementTree import random import math +import operator from .utils import * @@ -23,7 +26,7 @@ class InfoExtractor(object): Information extractors are the classes that, given a URL, extract information about the video (or videos) the URL refers to. This information includes the real video URL, the video title, author and - others. The information is stored in a dictionary which is then + others. The information is stored in a dictionary which is then passed to the FileDownloader. The FileDownloader processes this information possibly downloading the video to the file system, among other possible outcomes. @@ -32,8 +35,6 @@ class InfoExtractor(object): id: Video identifier. url: Final video URL. - uploader: Nickname of the video uploader, unescaped. - upload_date: Video upload date (YYYYMMDD). title: Video title, unescaped. ext: Video filename extension. @@ -42,8 +43,12 @@ class InfoExtractor(object): format: The video format, defaults to ext (used for --get-format) thumbnail: Full URL to a video thumbnail image. description: One-line video description. + uploader: Full name of the video uploader. + upload_date: Video upload date (YYYYMMDD). + uploader_id: Nickname or id of the video uploader. + location: Physical location of the video. player_url: SWF Player URL (used for rtmpdump). - subtitles: The .srt file contents. + subtitles: The subtitle file contents. urlhandle: [internal] The urlHandle to be used to download the file, like returned by urllib.request.urlopen @@ -69,13 +74,15 @@ class InfoExtractor(object): self._ready = False self.set_downloader(downloader) - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url) is not None + return re.match(cls._VALID_URL, url) is not None - def working(self): + @classmethod + def working(cls): """Getter method for _WORKING.""" - return self._WORKING + return cls._WORKING def initialize(self): """Initializes an instance (authentication, etc).""" @@ -100,6 +107,82 @@ class InfoExtractor(object): """Real extraction process. Redefine in subclasses.""" pass + @property + def IE_NAME(self): + return type(self).__name__[:-2] + + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None): + """ Returns the response handle """ + if note is None: + self.report_download_webpage(video_id) + elif note is not False: + self.to_screen(u'%s: %s' % (video_id, note)) + try: + return compat_urllib_request.urlopen(url_or_request) + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + if errnote is None: + errnote = u'Unable to download webpage' + raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2]) + + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None): + """ Returns the data of the page as a string """ + urlh = self._request_webpage(url_or_request, video_id, note, errnote) + content_type = urlh.headers.get('Content-Type', '') + m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) + if m: + encoding = m.group(1) + else: + encoding = 'utf-8' + webpage_bytes = urlh.read() + if self._downloader.params.get('dump_intermediate_pages', False): + try: + url = url_or_request.get_full_url() + except AttributeError: + url = url_or_request + self.to_screen(u'Dumping request to ' + url) + dump = base64.b64encode(webpage_bytes).decode('ascii') + self._downloader.to_screen(dump) + return webpage_bytes.decode(encoding, 'replace') + + def to_screen(self, msg): + """Print msg to screen, prefixing it with '[ie_name]'""" + self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg)) + + def report_extraction(self, id_or_name): + """Report information extraction.""" + self.to_screen(u'%s: Extracting information' % id_or_name) + + def report_download_webpage(self, video_id): + """Report webpage download.""" + self.to_screen(u'%s: Downloading webpage' % video_id) + + def report_age_confirmation(self): + """Report attempt to confirm age.""" + self.to_screen(u'Confirming age') + + #Methods for following #608 + #They set the correct value of the '_type' key + def video_result(self, video_info): + """Returns a video""" + video_info['_type'] = 'video' + return video_info + def url_result(self, url, ie=None): + """Returns a url that points to a page that should be processed""" + #TODO: ie should be the class used for getting the info + video_info = {'_type': 'url', + 'url': url, + 'ie_key': ie} + return video_info + def playlist_result(self, entries, playlist_id=None, playlist_title=None): + """Returns a playlist""" + video_info = {'_type': 'playlist', + 'entries': entries} + if playlist_id: + video_info['id'] = playlist_id + if playlist_title: + video_info['title'] = playlist_title + return video_info + class YoutubeIE(InfoExtractor): """Information extractor for youtube.com.""" @@ -110,13 +193,12 @@ class YoutubeIE(InfoExtractor): (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls - (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs (?: # the various things that can precede the ID: (?:(?:v|embed|e)/) # v/ or embed/ or e/ |(?: # or the v= param in all its forms (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! - (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx) + (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx) v= ) )? # optional -> youtube.com/xxxx is OK @@ -125,7 +207,7 @@ class YoutubeIE(InfoExtractor): (?(1).+)? # if we found the ID, everything can follow $""" _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' - _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en' + _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' _NETRC_MACHINE = 'youtube' @@ -159,65 +241,125 @@ class YoutubeIE(InfoExtractor): '44': '480x854', '45': '720x1280', '46': '1080x1920', - } + } IE_NAME = u'youtube' - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url, re.VERBOSE) is not None + if YoutubePlaylistIE.suitable(url): return False + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def report_lang(self): """Report attempt to set language.""" - self._downloader.to_screen(u'[youtube] Setting language') + self.to_screen(u'Setting language') def report_login(self): """Report attempt to log in.""" - self._downloader.to_screen(u'[youtube] Logging in') - - def report_age_confirmation(self): - """Report attempt to confirm age.""" - self._downloader.to_screen(u'[youtube] Confirming age') + self.to_screen(u'Logging in') def report_video_webpage_download(self, video_id): """Report attempt to download video webpage.""" - self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id) + self.to_screen(u'%s: Downloading video webpage' % video_id) def report_video_info_webpage_download(self, video_id): """Report attempt to download video info webpage.""" - self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id) + self.to_screen(u'%s: Downloading video info webpage' % video_id) def report_video_subtitles_download(self, video_id): """Report attempt to download video info webpage.""" - self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id) + self.to_screen(u'%s: Checking available subtitles' % video_id) + + def report_video_subtitles_request(self, video_id, sub_lang, format): + """Report attempt to download video info webpage.""" + self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format)) + + def report_video_subtitles_available(self, video_id, sub_lang_list): + """Report available subtitles.""" + sub_lang = ",".join(list(sub_lang_list.keys())) + self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang)) def report_information_extraction(self, video_id): """Report attempt to extract video information.""" - self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id) + self.to_screen(u'%s: Extracting video information' % video_id) def report_unavailable_format(self, video_id, format): """Report extracted video URL.""" - self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format)) + self.to_screen(u'%s: Format %s not available' % (video_id, format)) def report_rtmp_download(self): """Indicate the download will use the RTMP protocol.""" - self._downloader.to_screen(u'[youtube] RTMP download detected') - - def _closed_captions_xml_to_srt(self, xml_string): - srt = '' - texts = re.findall(r'([^<]+)', xml_string, re.MULTILINE) - # TODO parse xml instead of regex - for n, (start, dur_tag, dur, caption) in enumerate(texts): - if not dur: dur = '4' - start = float(start) - end = start + float(dur) - start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000) - end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000) - caption = unescapeHTML(caption) - caption = unescapeHTML(caption) # double cycle, intentional - srt += str(n+1) + '\n' - srt += start + ' --> ' + end + '\n' - srt += caption + '\n\n' - return srt + self.to_screen(u'RTMP download detected') + + def _get_available_subtitles(self, video_id): + self.report_video_subtitles_download(video_id) + request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) + try: + sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + return (u'unable to download video subtitles: %s' % compat_str(err), None) + sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) + sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list) + if not sub_lang_list: + return (u'video doesn\'t have subtitles', None) + return sub_lang_list + + def _list_available_subtitles(self, video_id): + sub_lang_list = self._get_available_subtitles(video_id) + self.report_video_subtitles_available(video_id, sub_lang_list) + + def _request_subtitle(self, sub_lang, sub_name, video_id, format): + """ + Return tuple: + (error_message, sub_lang, sub) + """ + self.report_video_subtitles_request(video_id, sub_lang, format) + params = compat_urllib_parse.urlencode({ + 'lang': sub_lang, + 'name': sub_name, + 'v': video_id, + 'fmt': format, + }) + url = 'http://www.youtube.com/api/timedtext?' + params + try: + sub = compat_urllib_request.urlopen(url).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + return (u'unable to download video subtitles: %s' % compat_str(err), None, None) + if not sub: + return (u'Did not fetch video subtitles', None, None) + return (None, sub_lang, sub) + + def _extract_subtitle(self, video_id): + """ + Return a list with a tuple: + [(error_message, sub_lang, sub)] + """ + sub_lang_list = self._get_available_subtitles(video_id) + sub_format = self._downloader.params.get('subtitlesformat') + if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles + return [(sub_lang_list[0], None, None)] + if self._downloader.params.get('subtitleslang', False): + sub_lang = self._downloader.params.get('subtitleslang') + elif 'en' in sub_lang_list: + sub_lang = 'en' + else: + sub_lang = list(sub_lang_list.keys())[0] + if not sub_lang in sub_lang_list: + return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)] + + subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) + return [subtitle] + + def _extract_all_subtitles(self, video_id): + sub_lang_list = self._get_available_subtitles(video_id) + sub_format = self._downloader.params.get('subtitlesformat') + if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles + return [(sub_lang_list[0], None, None)] + subtitles = [] + for sub_lang in sub_lang_list: + subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) + subtitles.append(subtitle) + return subtitles def _print_formats(self, formats): print('Available formats:') @@ -245,7 +387,7 @@ class YoutubeIE(InfoExtractor): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError) as err: - self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err)) + self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) return # Set language @@ -254,30 +396,65 @@ class YoutubeIE(InfoExtractor): self.report_lang() compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err)) + self._downloader.report_warning(u'unable to set language: %s' % compat_str(err)) return # No authentication to be performed if username is None: return + request = compat_urllib_request.Request(self._LOGIN_URL) + try: + login_page = compat_urllib_request.urlopen(request).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) + return + + galx = None + dsh = None + match = re.search(re.compile(r']* name="loginForm"', login_results) is not None: - self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password') + login_results = compat_urllib_request.urlopen(request).read().decode('utf-8') + if re.search(r'(?i)]* id="gaia_loginform"', login_results) is not None: + self._downloader.report_warning(u'unable to log in: bad username or password') return except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err)) + self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) return # Confirm age @@ -288,31 +465,34 @@ class YoutubeIE(InfoExtractor): request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) try: self.report_age_confirmation() - age_results = compat_urllib_request.urlopen(request).read() + age_results = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err)) + self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err)) + return + + def _extract_id(self, url): + mobj = re.match(self._VALID_URL, url, re.VERBOSE) + if mobj is None: + self._downloader.report_error(u'invalid URL: %s' % url) return + video_id = mobj.group(2) + return video_id def _real_extract(self, url): # Extract original video URL from URL with redirection, like age verification, using next_url parameter mobj = re.search(self._NEXT_URL_RE, url) if mobj: url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/') - - # Extract video id from URL - mobj = re.match(self._VALID_URL, url, re.VERBOSE) - if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) - return - video_id = mobj.group(2) + video_id = self._extract_id(url) # Get video webpage self.report_video_webpage_download(video_id) - request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id) + url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id + request = compat_urllib_request.Request(url) try: video_webpage_bytes = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err)) return video_webpage = video_webpage_bytes.decode('utf-8', 'ignore') @@ -327,28 +507,24 @@ class YoutubeIE(InfoExtractor): # Get video info self.report_video_info_webpage_download(video_id) for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: - video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' + video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' % (video_id, el_type)) - request = compat_urllib_request.Request(video_info_url) - try: - video_info_webpage_bytes = compat_urllib_request.urlopen(request).read() - video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore') - video_info = compat_parse_qs(video_info_webpage) - if 'token' in video_info: - break - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err)) - return + video_info_webpage = self._download_webpage(video_info_url, video_id, + note=False, + errnote='unable to download video info webpage') + video_info = compat_parse_qs(video_info_webpage) + if 'token' in video_info: + break if 'token' not in video_info: if 'reason' in video_info: - self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0]) + self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0]) else: - self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason') + self._downloader.report_error(u'"token" parameter not in video info for unknown reason') return # Check for "rental" videos if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - self._downloader.trouble(u'ERROR: "rental" videos not supported') + self._downloader.report_error(u'"rental" videos not supported') return # Start extracting information @@ -356,19 +532,27 @@ class YoutubeIE(InfoExtractor): # uploader if 'author' not in video_info: - self._downloader.trouble(u'ERROR: unable to extract uploader nickname') + self._downloader.report_error(u'unable to extract uploader name') return video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0]) + # uploader_id + video_uploader_id = None + mobj = re.search(r'', video_webpage) + if mobj is not None: + video_uploader_id = mobj.group(1) + else: + self._downloader.report_warning(u'unable to extract uploader nickname') + # title if 'title' not in video_info: - self._downloader.trouble(u'ERROR: unable to extract video title') + self._downloader.report_error(u'unable to extract video title') return video_title = compat_urllib_parse.unquote_plus(video_info['title'][0]) # thumbnail image if 'thumbnail_url' not in video_info: - self._downloader.trouble(u'WARNING: unable to extract video thumbnail') + self._downloader.report_warning(u'unable to extract video thumbnail') video_thumbnail = '' else: # don't panic if we can't find it video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0]) @@ -378,55 +562,42 @@ class YoutubeIE(InfoExtractor): mobj = re.search(r'id="eow-date.*?>(.*?)', video_webpage, re.DOTALL) if mobj is not None: upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) - format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y'] - for expression in format_expressions: - try: - upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d') - except: - pass + upload_date = unified_strdate(upload_date) # description video_description = get_element_by_id("eow-description", video_webpage) if video_description: video_description = clean_html(video_description) else: - video_description = '' + fd_mobj = re.search(r']+lang_code="([\w\-]+)"', srt_list) - srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list) - if not srt_lang_list: - raise Trouble(u'WARNING: video has no closed captions') - if self._downloader.params.get('subtitleslang', False): - srt_lang = self._downloader.params.get('subtitleslang') - elif 'en' in srt_lang_list: - srt_lang = 'en' - else: - srt_lang = srt_lang_list.keys()[0] - if not srt_lang in srt_lang_list: - raise Trouble(u'WARNING: no closed captions found in the specified language') - request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id)) - try: - srt_xml = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err)) - if not srt_xml: - raise Trouble(u'WARNING: unable to download video subtitles') - video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8')) - except Trouble as trouble: - self._downloader.trouble(trouble[0]) + video_subtitles = self._extract_subtitle(video_id) + if video_subtitles: + (sub_error, sub_lang, sub) = video_subtitles[0] + if sub_error: + self._downloader.report_error(sub_error) + + if self._downloader.params.get('allsubtitles', False): + video_subtitles = self._extract_all_subtitles(video_id) + for video_subtitle in video_subtitles: + (sub_error, sub_lang, sub) = video_subtitle + if sub_error: + self._downloader.report_error(sub_error) + + if self._downloader.params.get('listsubtitles', False): + sub_lang_list = self._list_available_subtitles(video_id) + return if 'length_seconds' not in video_info: - self._downloader.trouble(u'WARNING: unable to extract video duration') + self._downloader.report_warning(u'unable to extract video duration') video_duration = '' else: video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) @@ -443,7 +614,7 @@ class YoutubeIE(InfoExtractor): elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1: url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',') url_data = [compat_parse_qs(uds) for uds in url_data_strs] - url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data) + url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud] url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data) format_limit = self._downloader.params.get('format_limit', None) @@ -454,8 +625,7 @@ class YoutubeIE(InfoExtractor): format_list = available_formats existing_formats = [x for x in format_list if x in url_map] if len(existing_formats) == 0: - self._downloader.trouble(u'ERROR: no known formats available for video') - return + raise ExtractorError(u'no known formats available for video') if self._downloader.params.get('listformats', None): self._print_formats(existing_formats) return @@ -475,11 +645,9 @@ class YoutubeIE(InfoExtractor): video_url_list = [(rf, url_map[rf])] break if video_url_list is None: - self._downloader.trouble(u'ERROR: requested format not available') - return + raise ExtractorError(u'requested format not available') else: - self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info') - return + raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info') results = [] for format_param, video_real_url in video_url_list: @@ -493,6 +661,7 @@ class YoutubeIE(InfoExtractor): 'id': video_id, 'url': video_real_url, 'uploader': video_uploader, + 'uploader_id': video_uploader_id, 'upload_date': upload_date, 'title': video_title, 'ext': video_extension, @@ -514,24 +683,9 @@ class MetacafeIE(InfoExtractor): _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' IE_NAME = u'metacafe' - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - def report_disclaimer(self): """Report disclaimer retrieval.""" - self._downloader.to_screen(u'[metacafe] Retrieving disclaimer') - - def report_age_confirmation(self): - """Report attempt to confirm age.""" - self._downloader.to_screen(u'[metacafe] Confirming age') - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id) + self.to_screen(u'Retrieving disclaimer') def _real_initialize(self): # Retrieve disclaimer @@ -540,7 +694,7 @@ class MetacafeIE(InfoExtractor): self.report_disclaimer() disclaimer = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err)) + self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err)) return # Confirm age @@ -553,14 +707,14 @@ class MetacafeIE(InfoExtractor): self.report_age_confirmation() disclaimer = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err)) + self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err)) return def _real_extract(self, url): # Extract id and simplified title from URL mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return video_id = mobj.group(1) @@ -568,17 +722,10 @@ class MetacafeIE(InfoExtractor): # Check if video comes from YouTube mobj2 = re.match(r'^yt-(.*)$', video_id) if mobj2 is not None: - self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)]) - return + return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')] # Retrieve video webpage to extract further information - request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id) - try: - self.report_download_webpage(video_id) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err)) - return + webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id) # Extract URL, uploader and title from webpage self.report_extraction(video_id) @@ -597,29 +744,29 @@ class MetacafeIE(InfoExtractor): else: mobj = re.search(r' name="flashvars" value="(.*?)"', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract media URL') + self._downloader.report_error(u'unable to extract media URL') return vardict = compat_parse_qs(mobj.group(1)) if 'mediaData' not in vardict: - self._downloader.trouble(u'ERROR: unable to extract media URL') + self._downloader.report_error(u'unable to extract media URL') return - mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0]) + mobj = re.search(r'"mediaURL":"(?Phttp.*?)",(.*?)"key":"(?P.*?)"', vardict['mediaData'][0]) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract media URL') + self._downloader.report_error(u'unable to extract media URL') return - mediaURL = mobj.group(1).replace('\\/', '/') + mediaURL = mobj.group('mediaURL').replace('\\/', '/') video_extension = mediaURL[-3:] - video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2)) + video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key')) mobj = re.search(r'(?im)(.*) - Video', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract title') + self._downloader.report_error(u'unable to extract title') return video_title = mobj.group(1).decode('utf-8') mobj = re.search(r'submitter=(.*?);', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract uploader nickname') + self._downloader.report_error(u'unable to extract uploader nickname') return video_uploader = mobj.group(1) @@ -639,22 +786,11 @@ class DailymotionIE(InfoExtractor): _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' IE_NAME = u'dailymotion' - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id) - def _real_extract(self, url): # Extract id and simplified title from URL mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return video_id = mobj.group(1).split('_')[0].split('?')[0] @@ -664,33 +800,28 @@ class DailymotionIE(InfoExtractor): # Retrieve video webpage to extract further information request = compat_urllib_request.Request(url) request.add_header('Cookie', 'family_filter=off') - try: - self.report_download_webpage(video_id) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err)) - return + webpage = self._download_webpage(request, video_id) # Extract URL, uploader and title from webpage self.report_extraction(video_id) mobj = re.search(r'\s*var flashvars = (.*)', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract media URL') + self._downloader.report_error(u'unable to extract media URL') return flashvars = compat_urllib_parse.unquote(mobj.group(1)) for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']: if key in flashvars: max_quality = key - self._downloader.to_screen(u'[dailymotion] Using %s' % key) + self.to_screen(u'Using %s' % key) break else: - self._downloader.trouble(u'ERROR: unable to extract video URL') + self._downloader.report_error(u'unable to extract video URL') return mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video URL') + self._downloader.report_error(u'unable to extract video URL') return video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/') @@ -699,9 +830,9 @@ class DailymotionIE(InfoExtractor): mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract title') + self._downloader.report_error(u'unable to extract title') return - video_title = unescapeHTML(mobj.group('title').decode('utf-8')) + video_title = unescapeHTML(mobj.group('title')) video_uploader = None mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) @@ -709,7 +840,7 @@ class DailymotionIE(InfoExtractor): # lookin for official user mobj_official = re.search(r']+?>([^<]+?)', webpage) if mobj_official is None: - self._downloader.trouble(u'WARNING: unable to extract uploader nickname') + self._downloader.report_warning(u'unable to extract uploader nickname') else: video_uploader = mobj_official.group(1) else: @@ -721,105 +852,12 @@ class DailymotionIE(InfoExtractor): video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) return [{ - 'id': video_id.decode('utf-8'), - 'url': video_url.decode('utf-8'), - 'uploader': video_uploader.decode('utf-8'), + 'id': video_id, + 'url': video_url, + 'uploader': video_uploader, 'upload_date': video_upload_date, 'title': video_title, - 'ext': video_extension.decode('utf-8'), - }] - - -class GoogleIE(InfoExtractor): - """Information extractor for video.google.com.""" - - _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*' - IE_NAME = u'video.google' - - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id) - - def _real_extract(self, url): - # Extract id from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) - return - - video_id = mobj.group(1) - - video_extension = 'mp4' - - # Retrieve video webpage to extract further information - request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id) - try: - self.report_download_webpage(video_id) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) - return - - # Extract URL, uploader, and title from webpage - self.report_extraction(video_id) - mobj = re.search(r"download_url:'([^']+)'", webpage) - if mobj is None: - video_extension = 'flv' - mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract media URL') - return - mediaURL = compat_urllib_parse.unquote(mobj.group(1)) - mediaURL = mediaURL.replace('\\x3d', '\x3d') - mediaURL = mediaURL.replace('\\x26', '\x26') - - video_url = mediaURL - - mobj = re.search(r'(.*)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract title') - return - video_title = mobj.group(1).decode('utf-8') - - # Extract video description - mobj = re.search(r'([^<]*)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video description') - return - video_description = mobj.group(1).decode('utf-8') - if not video_description: - video_description = 'No description available.' - - # Extract video thumbnail - if self._downloader.params.get('forcethumbnail', False): - request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id))) - try: - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) - return - mobj = re.search(r'', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video thumbnail') - return - video_thumbnail = mobj.group(1) - else: # we need something to pass to process_info - video_thumbnail = '' - - return [{ - 'id': video_id.decode('utf-8'), - 'url': video_url.decode('utf-8'), - 'uploader': None, - 'upload_date': None, - 'title': video_title, - 'ext': video_extension.decode('utf-8'), + 'ext': video_extension, }] @@ -829,22 +867,11 @@ class PhotobucketIE(InfoExtractor): _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)' IE_NAME = u'photobucket' - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id) - def _real_extract(self, url): # Extract id from URL mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) + self._downloader.report_error(u'Invalid URL: %s' % url) return video_id = mobj.group(1) @@ -857,14 +884,14 @@ class PhotobucketIE(InfoExtractor): self.report_download_webpage(video_id) webpage = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err)) return # Extract URL, uploader, and title from webpage self.report_extraction(video_id) mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract media URL') + self._downloader.report_error(u'unable to extract media URL') return mediaURL = compat_urllib_parse.unquote(mobj.group(1)) @@ -872,7 +899,7 @@ class PhotobucketIE(InfoExtractor): mobj = re.search(r'(.*) video by (.*) - Photobucket', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract title') + self._downloader.report_error(u'unable to extract title') return video_title = mobj.group(1).decode('utf-8') @@ -891,28 +918,18 @@ class PhotobucketIE(InfoExtractor): class YahooIE(InfoExtractor): """Information extractor for video.yahoo.com.""" + _WORKING = False # _VALID_URL matches all Yahoo! Video URLs # _VPAGE_URL matches only the extractable '/watch/' URLs _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?' _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?' IE_NAME = u'video.yahoo' - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id) - def _real_extract(self, url, new_video=True): # Extract ID from URL mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) + self._downloader.report_error(u'Invalid URL: %s' % url) return video_id = mobj.group(2) @@ -925,18 +942,18 @@ class YahooIE(InfoExtractor): try: webpage = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err)) return mobj = re.search(r'\("id", "([0-9]+)"\);', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: Unable to extract id field') + self._downloader.report_error(u'Unable to extract id field') return yahoo_id = mobj.group(1) mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: Unable to extract vid field') + self._downloader.report_error(u'Unable to extract vid field') return yahoo_vid = mobj.group(1) @@ -949,34 +966,34 @@ class YahooIE(InfoExtractor): self.report_download_webpage(video_id) webpage = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err)) return # Extract uploader and title from webpage self.report_extraction(video_id) mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video title') + self._downloader.report_error(u'unable to extract video title') return video_title = mobj.group(1).decode('utf-8') mobj = re.search(r'(.*)', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video uploader') + self._downloader.report_error(u'unable to extract video uploader') return video_uploader = mobj.group(1).decode('utf-8') # Extract video thumbnail mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video thumbnail') + self._downloader.report_error(u'unable to extract video thumbnail') return video_thumbnail = mobj.group(1).decode('utf-8') # Extract video description mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video description') + self._downloader.report_error(u'unable to extract video description') return video_description = mobj.group(1).decode('utf-8') if not video_description: @@ -985,13 +1002,13 @@ class YahooIE(InfoExtractor): # Extract video height and width mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video height') + self._downloader.report_error(u'unable to extract video height') return yv_video_height = mobj.group(1) mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video width') + self._downloader.report_error(u'unable to extract video width') return yv_video_width = mobj.group(1) @@ -1007,13 +1024,13 @@ class YahooIE(InfoExtractor): self.report_download_webpage(video_id) webpage = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err)) return # Extract media URL from playlist XML mobj = re.search(r'https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)' IE_NAME = u'vimeo' - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id) - def _real_extract(self, url, new_video=True): # Extract ID from URL mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) + self._downloader.report_error(u'Invalid URL: %s' % url) return - video_id = mobj.group(1) + video_id = mobj.group('id') + if not mobj.group('proto'): + url = 'https://' + url + if mobj.group('direct_link'): + url = 'https://vimeo.com/' + video_id # Retrieve video webpage to extract further information request = compat_urllib_request.Request(url, None, std_headers) try: self.report_download_webpage(video_id) - webpage = compat_urllib_request.urlopen(request).read() + webpage_bytes = compat_urllib_request.urlopen(request).read() + webpage = webpage_bytes.decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err)) return # Now we begin extracting as much information as we can from what we @@ -1076,28 +1087,32 @@ class VimeoIE(InfoExtractor): config = webpage.split(' = {config:')[1].split(',assets:')[0] config = json.loads(config) except: - self._downloader.trouble(u'ERROR: unable to extract info section') + if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): + self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option') + else: + self._downloader.report_error(u'unable to extract info section') return - + # Extract title video_title = config["video"]["title"] - # Extract uploader + # Extract uploader and uploader_id video_uploader = config["video"]["owner"]["name"] + video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] # Extract video thumbnail video_thumbnail = config["video"]["thumbnail"] # Extract video description - video_description = get_element_by_id("description", webpage.decode('utf8')) + video_description = get_element_by_attribute("itemprop", "description", webpage) if video_description: video_description = clean_html(video_description) - else: video_description = '' + else: video_description = u'' # Extract upload date video_upload_date = None - mobj = re.search(r'[^:]*: (.*?)( \([^\(]*\))?', webpage) + mobj = re.search(r' tag: %s' % url) + (1, 'url', u'Could not find tag: %s' % url) ] ) next_url = compat_urllib_parse.unquote(info.get('url')) @@ -1249,10 +1253,10 @@ class ArteTvIE(InfoExtractor): '(.*?)', re.DOTALL, [ - (1, 'id', u'ERROR: could not extract video id: %s' % url), - (2, 'title', u'ERROR: could not extract video title: %s' % url), - (3, 'date', u'ERROR: could not extract video date: %s' % url), - (4, 'url', u'ERROR: could not extract video url: %s' % url) + (1, 'id', u'could not extract video id: %s' % url), + (2, 'title', u'could not extract video title: %s' % url), + (3, 'date', u'could not extract video date: %s' % url), + (4, 'url', u'could not extract video url: %s' % url) ] ) @@ -1261,7 +1265,7 @@ class ArteTvIE(InfoExtractor): 'url': compat_urllib_parse.unquote(info.get('url')), 'uploader': u'arte.tv', 'upload_date': info.get('date'), - 'title': info.get('title'), + 'title': info.get('title').decode('utf-8'), 'ext': u'mp4', 'format': u'NA', 'player_url': None, @@ -1286,65 +1290,59 @@ class GenericIE(InfoExtractor): _VALID_URL = r'.*' IE_NAME = u'generic' - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - def report_download_webpage(self, video_id): """Report webpage download.""" - self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.') - self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id) + if not self._downloader.params.get('test', False): + self._downloader.report_warning(u'Falling back on generic information extractor.') + super(GenericIE, self).report_download_webpage(video_id) def report_following_redirect(self, new_url): """Report information extraction.""" self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url) - + def _test_redirect(self, url): - """Check if it is a redirect, like url shorteners, in case restart chain.""" + """Check if it is a redirect, like url shorteners, in case return the new url.""" class HeadRequest(compat_urllib_request.Request): def get_method(self): return "HEAD" class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler): """ - Subclass the HTTPRedirectHandler to make it use our + Subclass the HTTPRedirectHandler to make it use our HeadRequest also on the redirected URL """ - def redirect_request(self, req, fp, code, msg, headers, newurl): + def redirect_request(self, req, fp, code, msg, headers, newurl): if code in (301, 302, 303, 307): - newurl = newurl.replace(' ', '%20') + newurl = newurl.replace(' ', '%20') newheaders = dict((k,v) for k,v in req.headers.items() if k.lower() not in ("content-length", "content-type")) - return HeadRequest(newurl, + return HeadRequest(newurl, headers=newheaders, - origin_req_host=req.get_origin_req_host(), - unverifiable=True) - else: - raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp) + origin_req_host=req.get_origin_req_host(), + unverifiable=True) + else: + raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp) class HTTPMethodFallback(compat_urllib_request.BaseHandler): """ Fallback to GET if HEAD is not allowed (405 HTTP error) """ - def http_error_405(self, req, fp, code, msg, headers): + def http_error_405(self, req, fp, code, msg, headers): fp.read() fp.close() newheaders = dict((k,v) for k,v in req.headers.items() if k.lower() not in ("content-length", "content-type")) - return self.parent.open(compat_urllib_request.Request(req.get_full_url(), - headers=newheaders, - origin_req_host=req.get_origin_req_host(), + return self.parent.open(compat_urllib_request.Request(req.get_full_url(), + headers=newheaders, + origin_req_host=req.get_origin_req_host(), unverifiable=True)) # Build our opener - opener = compat_urllib_request.OpenerDirector() + opener = compat_urllib_request.OpenerDirector() for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler, HTTPMethodFallback, HEADRedirectHandler, - compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]: + compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]: opener.add_handler(handler()) response = opener.open(HeadRequest(url)) @@ -1354,24 +1352,19 @@ class GenericIE(InfoExtractor): return False self.report_following_redirect(new_url) - self._downloader.download([new_url]) - return True + return new_url def _real_extract(self, url): - if self._test_redirect(url): return + new_url = self._test_redirect(url) + if new_url: return [self.url_result(new_url)] video_id = url.split('/')[-1] - request = compat_urllib_request.Request(url) try: - self.report_download_webpage(video_id) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) - return + webpage = self._download_webpage(url, video_id) except ValueError as err: # since this is the last-resort InfoExtractor, if # this error is thrown, it'll be thrown here - self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) + self._downloader.report_error(u'Invalid URL: %s' % url) return self.report_extraction(video_id) @@ -1381,13 +1374,16 @@ class GenericIE(InfoExtractor): # Broaden the search a little bit mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) + # Broaden the search a little bit: JWPlayer JS loader + mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage) + if mobj is None: + self._downloader.report_error(u'Invalid URL: %s' % url) return # It's possible that one of the regexes # matched, but returned an empty group: if mobj.group(1) is None: - self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) + self._downloader.report_error(u'Invalid URL: %s' % url) return video_url = compat_urllib_parse.unquote(mobj.group(1)) @@ -1405,24 +1401,24 @@ class GenericIE(InfoExtractor): # and so on and so forth; it's just not practical mobj = re.search(r'(.*)', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract title') + self._downloader.report_error(u'unable to extract title') return - video_title = mobj.group(1).decode('utf-8') + video_title = mobj.group(1) # video uploader is domain name mobj = re.match(r'(?:https?://)?([^/]*)/.*', url) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract title') + self._downloader.report_error(u'unable to extract title') return - video_uploader = mobj.group(1).decode('utf-8') + video_uploader = mobj.group(1) return [{ - 'id': video_id.decode('utf-8'), - 'url': video_url.decode('utf-8'), + 'id': video_id, + 'url': video_url, 'uploader': video_uploader, 'upload_date': None, 'title': video_title, - 'ext': video_extension.decode('utf-8'), + 'ext': video_extension, }] @@ -1433,9 +1429,6 @@ class YoutubeSearchIE(InfoExtractor): _max_youtube_results = 1000 IE_NAME = u'youtube:search' - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - def report_download_page(self, query, pagenum): """Report attempt to download search page with given number.""" query = query.decode(preferredencoding()) @@ -1444,35 +1437,31 @@ class YoutubeSearchIE(InfoExtractor): def _real_extract(self, query): mobj = re.match(self._VALID_URL, query) if mobj is None: - self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) + self._downloader.report_error(u'invalid search query "%s"' % query) return prefix, query = query.split(':') prefix = prefix[8:] query = query.encode('utf-8') if prefix == '': - self._download_n_results(query, 1) - return + return self._get_n_results(query, 1) elif prefix == 'all': - self._download_n_results(query, self._max_youtube_results) - return + self._get_n_results(query, self._max_youtube_results) else: try: n = int(prefix) if n <= 0: - self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) + self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query)) return elif n > self._max_youtube_results: - self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n)) + self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n)) n = self._max_youtube_results - self._download_n_results(query, n) - return + return self._get_n_results(query, n) except ValueError: # parsing prefix as integer fails - self._download_n_results(query, 1) - return + return self._get_n_results(query, 1) - def _download_n_results(self, query, n): - """Downloads a specified number of results for a query""" + def _get_n_results(self, query, n): + """Get a specified number of results for a query""" video_ids = [] pagenum = 0 @@ -1483,12 +1472,16 @@ class YoutubeSearchIE(InfoExtractor): result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1) request = compat_urllib_request.Request(result_url) try: - data = compat_urllib_request.urlopen(request).read() + data = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download API page: %s' % compat_str(err)) return api_response = json.loads(data)['data'] + if not 'items' in api_response: + self._downloader.report_error(u'[youtube] No video results') + return + new_ids = list(video['id'] for video in api_response['items']) video_ids += new_ids @@ -1497,9 +1490,8 @@ class YoutubeSearchIE(InfoExtractor): if len(video_ids) > n: video_ids = video_ids[:n] - for id in video_ids: - self._downloader.download(['http://www.youtube.com/watch?v=%s' % id]) - return + videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids] + return videos class GoogleSearchIE(InfoExtractor): @@ -1511,18 +1503,15 @@ class GoogleSearchIE(InfoExtractor): _max_google_results = 1000 IE_NAME = u'video.google:search' - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - def report_download_page(self, query, pagenum): """Report attempt to download playlist page with given number.""" query = query.decode(preferredencoding()) - self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum)) + self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum)) def _real_extract(self, query): mobj = re.match(self._VALID_URL, query) if mobj is None: - self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) + self._downloader.report_error(u'invalid search query "%s"' % query) return prefix, query = query.split(':') @@ -1538,10 +1527,10 @@ class GoogleSearchIE(InfoExtractor): try: n = int(prefix) if n <= 0: - self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) + self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query)) return elif n > self._max_google_results: - self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n)) + self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n)) n = self._max_google_results self._download_n_results(query, n) return @@ -1562,7 +1551,7 @@ class GoogleSearchIE(InfoExtractor): try: page = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) return # Extract video identifiers @@ -1586,6 +1575,8 @@ class GoogleSearchIE(InfoExtractor): class YahooSearchIE(InfoExtractor): """Information Extractor for Yahoo! Video search queries.""" + + _WORKING = False _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+' _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s' _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"' @@ -1593,18 +1584,15 @@ class YahooSearchIE(InfoExtractor): _max_yahoo_results = 1000 IE_NAME = u'video.yahoo:search' - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - def report_download_page(self, query, pagenum): """Report attempt to download playlist page with given number.""" query = query.decode(preferredencoding()) - self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum)) + self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum)) def _real_extract(self, query): mobj = re.match(self._VALID_URL, query) if mobj is None: - self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) + self._downloader.report_error(u'invalid search query "%s"' % query) return prefix, query = query.split(':') @@ -1620,10 +1608,10 @@ class YahooSearchIE(InfoExtractor): try: n = int(prefix) if n <= 0: - self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) + self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query)) return elif n > self._max_yahoo_results: - self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n)) + self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n)) n = self._max_yahoo_results self._download_n_results(query, n) return @@ -1645,7 +1633,7 @@ class YahooSearchIE(InfoExtractor): try: page = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) return # Extract video identifiers @@ -1671,14 +1659,28 @@ class YahooSearchIE(InfoExtractor): class YoutubePlaylistIE(InfoExtractor): """Information Extractor for YouTube playlists.""" - _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*' - _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en' - _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s' - _MORE_PAGES_INDICATOR = r'yt-uix-pager-next' + _VALID_URL = r"""(?: + (?:https?://)? + (?:\w+\.)? + youtube\.com/ + (?: + (?:course|view_play_list|my_playlists|artist|playlist|watch) + \? (?:.*?&)*? (?:p|a|list)= + | p/ + ) + ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,}) + .* + | + ((?:PL|EC|UU)[0-9A-Za-z-_]{10,}) + )""" + _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json' + _MAX_RESULTS = 50 IE_NAME = u'youtube:playlist' - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) + @classmethod + def suitable(cls, url): + """Receives a URL and returns True if suitable for this IE.""" + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def report_download_page(self, playlist_id, pagenum): """Report attempt to download playlist page with given number.""" @@ -1686,109 +1688,126 @@ class YoutubePlaylistIE(InfoExtractor): def _real_extract(self, url): # Extract playlist id - mobj = re.match(self._VALID_URL, url) + mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: - self._downloader.trouble(u'ERROR: invalid url: %s' % url) - return - - # Single video case - if mobj.group(3) is not None: - self._downloader.download([mobj.group(3)]) + self._downloader.report_error(u'invalid url: %s' % url) return - # Download playlist pages - # prefix is 'p' as default for playlists but there are other types that need extra care - playlist_prefix = mobj.group(1) - if playlist_prefix == 'a': - playlist_access = 'artist' - else: - playlist_prefix = 'p' - playlist_access = 'view_play_list' - playlist_id = mobj.group(2) - video_ids = [] - pagenum = 1 + # Download playlist videos from API + playlist_id = mobj.group(1) or mobj.group(2) + page_num = 1 + videos = [] while True: - self.report_download_page(playlist_id, pagenum) - url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum) - request = compat_urllib_request.Request(url) + self.report_download_page(playlist_id, page_num) + + url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1) try: - page = compat_urllib_request.urlopen(request).read() + page = compat_urllib_request.urlopen(url).read().decode('utf8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) return - # Extract video identifiers - ids_in_page = [] - for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page): - if mobj.group(1) not in ids_in_page: - ids_in_page.append(mobj.group(1)) - video_ids.extend(ids_in_page) + try: + response = json.loads(page) + except ValueError as err: + self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err)) + return - if re.search(self._MORE_PAGES_INDICATOR, page) is None: + if 'feed' not in response: + self._downloader.report_error(u'Got a malformed response from YouTube API') + return + playlist_title = response['feed']['title']['$t'] + if 'entry' not in response['feed']: + # Number of videos is a multiple of self._MAX_RESULTS break - pagenum = pagenum + 1 - playliststart = self._downloader.params.get('playliststart', 1) - 1 - playlistend = self._downloader.params.get('playlistend', -1) - if playlistend == -1: - video_ids = video_ids[playliststart:] - else: - video_ids = video_ids[playliststart:playlistend] + videos += [ (entry['yt$position']['$t'], entry['content']['src']) + for entry in response['feed']['entry'] + if 'content' in entry ] + + if len(response['feed']['entry']) < self._MAX_RESULTS: + break + page_num += 1 + + videos = [v[1] for v in sorted(videos)] - for id in video_ids: - self._downloader.download(['http://www.youtube.com/watch?v=%s' % id]) - return + url_results = [self.url_result(url, 'Youtube') for url in videos] + return [self.playlist_result(url_results, playlist_id, playlist_title)] class YoutubeChannelIE(InfoExtractor): """Information Extractor for YouTube channels.""" - _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$" + _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)" _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en' - _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO + _MORE_PAGES_INDICATOR = 'yt-uix-load-more' + _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' IE_NAME = u'youtube:channel' def report_download_page(self, channel_id, pagenum): """Report attempt to download channel page with given number.""" self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum)) + def extract_videos_from_page(self, page): + ids_in_page = [] + for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page): + if mobj.group(1) not in ids_in_page: + ids_in_page.append(mobj.group(1)) + return ids_in_page + def _real_extract(self, url): # Extract channel id mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid url: %s' % url) + self._downloader.report_error(u'invalid url: %s' % url) return - # Download channel pages + # Download channel page channel_id = mobj.group(1) video_ids = [] pagenum = 1 - while True: - self.report_download_page(channel_id, pagenum) - url = self._TEMPLATE_URL % (channel_id, pagenum) - request = compat_urllib_request.Request(url) - try: - page = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err)) - return + self.report_download_page(channel_id, pagenum) + url = self._TEMPLATE_URL % (channel_id, pagenum) + request = compat_urllib_request.Request(url) + try: + page = compat_urllib_request.urlopen(request).read().decode('utf8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) + return - # Extract video identifiers - ids_in_page = [] - for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page): - if mobj.group(1) not in ids_in_page: - ids_in_page.append(mobj.group(1)) - video_ids.extend(ids_in_page) + # Extract video identifiers + ids_in_page = self.extract_videos_from_page(page) + video_ids.extend(ids_in_page) - if re.search(self._MORE_PAGES_INDICATOR, page) is None: - break - pagenum = pagenum + 1 + # Download any subsequent channel pages using the json-based channel_ajax query + if self._MORE_PAGES_INDICATOR in page: + while True: + pagenum = pagenum + 1 + + self.report_download_page(channel_id, pagenum) + url = self._MORE_PAGES_URL % (pagenum, channel_id) + request = compat_urllib_request.Request(url) + try: + page = compat_urllib_request.urlopen(request).read().decode('utf8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) + return + + page = json.loads(page) + + ids_in_page = self.extract_videos_from_page(page['content_html']) + video_ids.extend(ids_in_page) + + if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']: + break + + self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) - for id in video_ids: - self._downloader.download(['http://www.youtube.com/watch?v=%s' % id]) - return + urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids] + url_entries = [self.url_result(url, 'Youtube') for url in urls] + return [self.playlist_result(url_entries, channel_id)] class YoutubeUserIE(InfoExtractor): @@ -1801,9 +1820,6 @@ class YoutubeUserIE(InfoExtractor): _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]' IE_NAME = u'youtube:user' - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - def report_download_page(self, username, start_index): """Report attempt to download user page.""" self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' % @@ -1813,7 +1829,7 @@ class YoutubeUserIE(InfoExtractor): # Extract username mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid url: %s' % url) + self._downloader.report_error(u'invalid url: %s' % url) return username = mobj.group(1) @@ -1833,9 +1849,9 @@ class YoutubeUserIE(InfoExtractor): request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)) try: - page = compat_urllib_request.urlopen(request).read() + page = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) return # Extract video identifiers @@ -1858,20 +1874,9 @@ class YoutubeUserIE(InfoExtractor): pagenum += 1 - all_ids_count = len(video_ids) - playliststart = self._downloader.params.get('playliststart', 1) - 1 - playlistend = self._downloader.params.get('playlistend', -1) - - if playlistend == -1: - video_ids = video_ids[playliststart:] - else: - video_ids = video_ids[playliststart:playlistend] - - self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" % - (username, all_ids_count, len(video_ids))) - - for video_id in video_ids: - self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id]) + urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids] + url_results = [self.url_result(url, 'Youtube') for url in urls] + return [self.playlist_result(url_results, playlist_title = username)] class BlipTVUserIE(InfoExtractor): @@ -1881,19 +1886,16 @@ class BlipTVUserIE(InfoExtractor): _PAGE_SIZE = 12 IE_NAME = u'blip.tv:user' - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - def report_download_page(self, username, pagenum): """Report attempt to download user page.""" - self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' % - (self.IE_NAME, username, pagenum)) + self.to_screen(u'user %s: Downloading video ids from page %d' % + (username, pagenum)) def _real_extract(self, url): # Extract username mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid url: %s' % url) + self._downloader.report_error(u'invalid url: %s' % url) return username = mobj.group(1) @@ -1907,7 +1909,7 @@ class BlipTVUserIE(InfoExtractor): mobj = re.search(r'data-users-id="([^"]+)"', page) page_base = page_base % mobj.group(1) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) return @@ -1921,13 +1923,12 @@ class BlipTVUserIE(InfoExtractor): while True: self.report_download_page(username, pagenum) - - request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) ) - + url = page_base + "&page=" + str(pagenum) + request = compat_urllib_request.Request( url ) try: page = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) + self._downloader.report_error(u'unable to download webpage: %s' % str(err)) return # Extract video identifiers @@ -1950,38 +1951,15 @@ class BlipTVUserIE(InfoExtractor): pagenum += 1 - all_ids_count = len(video_ids) - playliststart = self._downloader.params.get('playliststart', 1) - 1 - playlistend = self._downloader.params.get('playlistend', -1) - - if playlistend == -1: - video_ids = video_ids[playliststart:] - else: - video_ids = video_ids[playliststart:playlistend] - - self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" % - (self.IE_NAME, username, all_ids_count, len(video_ids))) - - for video_id in video_ids: - self._downloader.download([u'http://blip.tv/'+video_id]) + urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids] + url_entries = [self.url_result(url, 'BlipTV') for url in urls] + return [self.playlist_result(url_entries, playlist_title = username)] class DepositFilesIE(InfoExtractor): """Information extractor for depositfiles.com""" _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)' - IE_NAME = u'DepositFiles' - - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - def report_download_webpage(self, file_id): - """Report webpage download.""" - self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id) - - def report_extraction(self, file_id): - """Report information extraction.""" - self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id) def _real_extract(self, url): file_id = url.split('/')[-1] @@ -1995,7 +1973,7 @@ class DepositFilesIE(InfoExtractor): self.report_download_webpage(file_id) webpage = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err)) + self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err)) return # Search for the real file URL @@ -2005,9 +1983,9 @@ class DepositFilesIE(InfoExtractor): mobj = re.search(r'(Attention.*?)', webpage, re.DOTALL) if (mobj is not None) and (mobj.group(1) is not None): restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip() - self._downloader.trouble(u'ERROR: %s' % restriction_message) + self._downloader.report_error(u'%s' % restriction_message) else: - self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url) + self._downloader.report_error(u'unable to extract download URL from: %s' % url) return file_url = mobj.group(1) @@ -2016,7 +1994,7 @@ class DepositFilesIE(InfoExtractor): # Search for file title mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract title') + self._downloader.report_error(u'unable to extract title') return file_title = mobj.group(1).decode('utf-8') @@ -2033,62 +2011,14 @@ class DepositFilesIE(InfoExtractor): class FacebookIE(InfoExtractor): """Information Extractor for Facebook""" - _WORKING = False _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P\d+)(?:.*)' _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&' _NETRC_MACHINE = 'facebook' - _available_formats = ['video', 'highqual', 'lowqual'] - _video_extensions = { - 'video': 'mp4', - 'highqual': 'mp4', - 'lowqual': 'mp4', - } IE_NAME = u'facebook' - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - def _reporter(self, message): - """Add header and report message.""" - self._downloader.to_screen(u'[facebook] %s' % message) - def report_login(self): """Report attempt to log in.""" - self._reporter(u'Logging in') - - def report_video_webpage_download(self, video_id): - """Report attempt to download video webpage.""" - self._reporter(u'%s: Downloading video webpage' % video_id) - - def report_information_extraction(self, video_id): - """Report attempt to extract video information.""" - self._reporter(u'%s: Extracting video information' % video_id) - - def _parse_page(self, video_webpage): - """Extract video information from page""" - # General data - data = {'title': r'\("video_title", "(.*?)"\)', - 'description': r'(.*?)', - 'owner': r'\("video_owner_name", "(.*?)"\)', - 'thumbnail': r'\("thumb_url", "(?P.*?)"\)', - } - video_info = {} - for piece in data.keys(): - mobj = re.search(data[piece], video_webpage) - if mobj is not None: - video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape")) - - # Video urls - video_urls = {} - for fmt in self._available_formats: - mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage) - if mobj is not None: - # URL is in a Javascript segment inside an escaped Unicode format within - # the generally utf-8 page - video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape")) - video_info['video_urls'] = video_urls - - return video_info + self.to_screen(u'Logging in') def _real_initialize(self): if self._downloader is None: @@ -2111,7 +2041,7 @@ class FacebookIE(InfoExtractor): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError) as err: - self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err)) + self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) return if useremail is None: @@ -2128,113 +2058,54 @@ class FacebookIE(InfoExtractor): self.report_login() login_results = compat_urllib_request.urlopen(request).read() if re.search(r'', login_results) is not None: - self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.') + self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.') return except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err)) + self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) return def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return video_id = mobj.group('ID') - # Get video webpage - self.report_video_webpage_download(video_id) - request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id) - try: - page = compat_urllib_request.urlopen(request) - video_webpage = page.read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err)) - return - - # Start extracting information - self.report_information_extraction(video_id) - - # Extract information - video_info = self._parse_page(video_webpage) - - # uploader - if 'owner' not in video_info: - self._downloader.trouble(u'ERROR: unable to extract uploader nickname') - return - video_uploader = video_info['owner'] - - # title - if 'title' not in video_info: - self._downloader.trouble(u'ERROR: unable to extract video title') - return - video_title = video_info['title'] - video_title = video_title.decode('utf-8') - - # thumbnail image - if 'thumbnail' not in video_info: - self._downloader.trouble(u'WARNING: unable to extract video thumbnail') - video_thumbnail = '' - else: - video_thumbnail = video_info['thumbnail'] - - # upload date - upload_date = None - if 'upload_date' in video_info: - upload_time = video_info['upload_date'] - timetuple = email.utils.parsedate_tz(upload_time) - if timetuple is not None: - try: - upload_date = time.strftime('%Y%m%d', timetuple[0:9]) - except: - pass - - # description - video_description = video_info.get('description', 'No description available.') - - url_map = video_info['video_urls'] - if len(url_map.keys()) > 0: - # Decide which formats to download - req_format = self._downloader.params.get('format', None) - format_limit = self._downloader.params.get('format_limit', None) - - if format_limit is not None and format_limit in self._available_formats: - format_list = self._available_formats[self._available_formats.index(format_limit):] - else: - format_list = self._available_formats - existing_formats = [x for x in format_list if x in url_map] - if len(existing_formats) == 0: - self._downloader.trouble(u'ERROR: no known formats available for video') - return - if req_format is None: - video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality - elif req_format == 'worst': - video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality - elif req_format == '-1': - video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats - else: - # Specific format - if req_format not in url_map: - self._downloader.trouble(u'ERROR: requested format not available') - return - video_url_list = [(req_format, url_map[req_format])] # Specific format + url = 'https://www.facebook.com/video/video.php?v=%s' % video_id + webpage = self._download_webpage(url, video_id) + + BEFORE = '{swf.addParam(param[0], param[1]);});\n' + AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});' + m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage) + if not m: + raise ExtractorError(u'Cannot parse data') + data = dict(json.loads(m.group(1))) + params_raw = compat_urllib_parse.unquote(data['params']) + params = json.loads(params_raw) + video_data = params['video_data'][0] + video_url = video_data.get('hd_src') + if not video_url: + video_url = video_data['sd_src'] + if not video_url: + raise ExtractorError(u'Cannot find video URL') + video_duration = int(video_data['video_duration']) + thumbnail = video_data['thumbnail_src'] + + m = re.search('([^<]+)', webpage) + if not m: + raise ExtractorError(u'Cannot find title in webpage') + video_title = unescapeHTML(m.group(1)) - results = [] - for format_param, video_real_url in video_url_list: - # Extension - video_extension = self._video_extensions.get(format_param, 'mp4') + info = { + 'id': video_id, + 'title': video_title, + 'url': video_url, + 'ext': 'mp4', + 'duration': video_duration, + 'thumbnail': thumbnail, + } + return [info] - results.append({ - 'id': video_id.decode('utf-8'), - 'url': video_real_url.decode('utf-8'), - 'uploader': video_uploader.decode('utf-8'), - 'upload_date': upload_date, - 'title': video_title, - 'ext': video_extension.decode('utf-8'), - 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), - 'thumbnail': video_thumbnail.decode('utf-8'), - 'description': video_description.decode('utf-8'), - }) - return results class BlipTVIE(InfoExtractor): """Information extractor for blip.tv""" @@ -2243,26 +2114,34 @@ class BlipTVIE(InfoExtractor): _URL_EXT = r'^.*\.([a-z0-9]+)$' IE_NAME = u'blip.tv' - def report_extraction(self, file_id): - """Report information extraction.""" - self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id)) - def report_direct_download(self, title): """Report information extraction.""" - self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title)) + self.to_screen(u'%s: Direct download detected' % title) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return + urlp = compat_urllib_parse_urlparse(url) + if urlp.path.startswith('/play/'): + request = compat_urllib_request.Request(url) + response = compat_urllib_request.urlopen(request) + redirecturl = response.geturl() + rurlp = compat_urllib_parse_urlparse(redirecturl) + file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2] + url = 'http://blip.tv/a/a-' + file_id + return self._real_extract(url) + + if '?' in url: cchar = '&' else: cchar = '?' json_url = url + cchar + 'skin=json&version=2&no_wrap=1' request = compat_urllib_request.Request(json_url) + request.add_header('User-Agent', 'iTunes/10.6.1') self.report_extraction(mobj.group(1)) info = None try: @@ -2283,14 +2162,13 @@ class BlipTVIE(InfoExtractor): 'urlhandle': urlh } except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err)) - return + raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err)) if info is None: # Regular URL try: json_code_bytes = urlh.read() json_code = json_code_bytes.decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err)) return try: @@ -2317,13 +2195,13 @@ class BlipTVIE(InfoExtractor): 'format': data['media']['mimeType'], 'thumbnail': data['thumbnailUrl'], 'description': data['description'], - 'player_url': data['embedUrl'] + 'player_url': data['embedUrl'], + 'user_agent': 'iTunes/10.6.1', } except (ValueError,KeyError) as err: - self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err)) + self._downloader.report_error(u'unable to parse video information: %s' % repr(err)) return - std_headers['User-Agent'] = 'iTunes/10.6.1' return [info] @@ -2333,45 +2211,29 @@ class MyVideoIE(InfoExtractor): _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*' IE_NAME = u'myvideo' - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id) - def _real_extract(self,url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._download.trouble(u'ERROR: invalid URL: %s' % url) + self._download.report_error(u'invalid URL: %s' % url) return video_id = mobj.group(1) # Get video webpage - request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id) - try: - self.report_download_webpage(video_id) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) - return + webpage_url = 'http://www.myvideo.de/watch/%s' % video_id + webpage = self._download_webpage(webpage_url, video_id) self.report_extraction(video_id) - mobj = re.search(r'', + mobj = re.search(r'([^<]+)', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract title') + self._downloader.report_error(u'unable to extract title') return video_title = mobj.group(1) @@ -2388,8 +2250,19 @@ class MyVideoIE(InfoExtractor): class ComedyCentralIE(InfoExtractor): """Information extractor for The Daily Show and Colbert Report """ - _VALID_URL = r'^(:(?Ptds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?Pthedailyshow|colbertnation)\.com/full-episodes/(?P.*)$' - IE_NAME = u'comedycentral' + # urls can be abbreviations like :thedailyshow or :colbert + # urls for episodes like: + # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day + # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news + # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524 + _VALID_URL = r"""^(:(?Ptds|thedailyshow|cr|colbert|colbertnation|colbertreport) + |(https?://)?(www\.)? + (?Pthedailyshow|colbertnation)\.com/ + (full-episodes/(?P.*)| + (?P + (the-colbert-report-(videos|collections)/(?P[0-9]+)/[^/]*/(?P.*?)) + |(watch/(?P[^/]*)/(?P.*))))) + $""" _available_formats = ['3500', '2200', '1700', '1200', '750', '400'] @@ -2410,18 +2283,16 @@ class ComedyCentralIE(InfoExtractor): '400': '384x216', } - def report_extraction(self, episode_id): - self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id) + @classmethod + def suitable(cls, url): + """Receives a URL and returns True if suitable for this IE.""" + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None - def report_config_download(self, episode_id): - self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id) + def report_config_download(self, episode_id, media_id): + self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id)) def report_index_download(self, episode_id): - self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id) - - def report_player_url(self, episode_id): - self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id) - + self.to_screen(u'%s: Downloading show index' % episode_id) def _print_formats(self, formats): print('Available formats:') @@ -2430,9 +2301,9 @@ class ComedyCentralIE(InfoExtractor): def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return if mobj.group('shortname'): @@ -2440,56 +2311,55 @@ class ComedyCentralIE(InfoExtractor): url = u'http://www.thedailyshow.com/full-episodes/' else: url = u'http://www.colbertnation.com/full-episodes/' - mobj = re.match(self._VALID_URL, url) + mobj = re.match(self._VALID_URL, url, re.VERBOSE) assert mobj is not None - dlNewest = not mobj.group('episode') - if dlNewest: - epTitle = mobj.group('showname') + if mobj.group('clip'): + if mobj.group('showname') == 'thedailyshow': + epTitle = mobj.group('tdstitle') + else: + epTitle = mobj.group('cntitle') + dlNewest = False else: - epTitle = mobj.group('episode') + dlNewest = not mobj.group('episode') + if dlNewest: + epTitle = mobj.group('showname') + else: + epTitle = mobj.group('episode') req = compat_urllib_request.Request(url) self.report_extraction(epTitle) try: htmlHandle = compat_urllib_request.urlopen(req) html = htmlHandle.read() + webpage = html.decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) return if dlNewest: url = htmlHandle.geturl() - mobj = re.match(self._VALID_URL, url) + mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: - self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url) + self._downloader.report_error(u'Invalid redirected URL: ' + url) return if mobj.group('episode') == '': - self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url) + self._downloader.report_error(u'Redirected URL is still not specific: ' + url) return epTitle = mobj.group('episode') - mMovieParams = re.findall('(?:gsp.comedystor/.*)$', rtmp_video_url) + if not m: + raise ExtractorError(u'Cannot transform RTMP url') + base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/' + video_url = base + m.group('finalid') - if video_url.startswith(broken_cdn): - video_url = video_url.replace(broken_cdn, better_cdn) - - effTitle = showId + u'-' + epTitle + effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1) info = { 'id': shortMediaId, 'url': video_url, @@ -2566,11 +2434,9 @@ class ComedyCentralIE(InfoExtractor): 'format': format, 'thumbnail': None, 'description': officialTitle, - 'player_url': None #playerUrl } - results.append(info) - + return results @@ -2580,16 +2446,13 @@ class EscapistIE(InfoExtractor): _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P[^/]+)/(?P[^/?]+)[/?]?.*$' IE_NAME = u'escapist' - def report_extraction(self, showName): - self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName) - def report_config_download(self, showName): - self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName) + self.to_screen(u'%s: Downloading configuration' % showName) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return showName = mobj.group('showname') videoId = mobj.group('episode') @@ -2601,7 +2464,7 @@ class EscapistIE(InfoExtractor): m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type']) webPage = webPageBytes.decode(m.group(1) if m else 'utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err)) + self._downloader.report_error(u'unable to download webpage: ' + compat_str(err)) return descMatch = re.search('(.*?)\s+-\s+XVID', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video title') + self._downloader.report_error(u'unable to extract video title') return - video_title = mobj.group(1).decode('utf-8') + video_title = mobj.group(1) # Extract video thumbnail mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video thumbnail') + self._downloader.report_error(u'unable to extract video thumbnail') return - video_thumbnail = mobj.group(0).decode('utf-8') + video_thumbnail = mobj.group(0) info = { 'id': video_id, @@ -2801,21 +2646,14 @@ class SoundcloudIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)' IE_NAME = u'soundcloud' - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - def report_resolve(self, video_id): """Report information extraction.""" - self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id)) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id)) + self.to_screen(u'%s: Resolving id' % video_id) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return # extract uploader (which is in the url) @@ -2833,7 +2671,7 @@ class SoundcloudIE(InfoExtractor): info_json_bytes = compat_urllib_request.urlopen(request).read() info_json = info_json_bytes.decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err)) return info = json.loads(info_json) @@ -2846,75 +2684,131 @@ class SoundcloudIE(InfoExtractor): stream_json_bytes = compat_urllib_request.urlopen(request).read() stream_json = stream_json_bytes.decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err)) return streams = json.loads(stream_json) mediaURL = streams['http_mp3_128_url'] + upload_date = unified_strdate(info['created_at']) return [{ 'id': info['id'], 'url': mediaURL, 'uploader': info['user']['username'], - 'upload_date': info['created_at'], + 'upload_date': upload_date, 'title': info['title'], 'ext': u'mp3', 'description': info['description'], }] +class SoundcloudSetIE(InfoExtractor): + """Information extractor for soundcloud.com sets + To access the media, the uid of the song and a stream token + must be extracted from the page source and the script must make + a request to media.soundcloud.com/crossdomain.xml. Then + the media can be grabbed by requesting from an url composed + of the stream token and uid + """ -class InfoQIE(InfoExtractor): - """Information extractor for infoq.com""" - - _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$' - IE_NAME = u'infoq' - - def report_webpage(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) + _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)' + IE_NAME = u'soundcloud:set' - def report_extraction(self, video_id): + def report_resolve(self, video_id): """Report information extraction.""" - self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) + self.to_screen(u'%s: Resolving id' % video_id) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return - self.report_webpage(url) + # extract uploader (which is in the url) + uploader = mobj.group(1) + # extract simple title (uploader + slug of song title) + slug_title = mobj.group(2) + simple_title = uploader + u'-' + slug_title + + self.report_resolve('%s/sets/%s' % (uploader, slug_title)) - request = compat_urllib_request.Request(url) + url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title) + resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28' + request = compat_urllib_request.Request(resolv_url) try: - webpage = compat_urllib_request.urlopen(request).read() + info_json_bytes = compat_urllib_request.urlopen(request).read() + info_json = info_json_bytes.decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err)) return - self.report_extraction(url) + videos = [] + info = json.loads(info_json) + if 'errors' in info: + for err in info['errors']: + self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message'])) + return + for track in info['tracks']: + video_id = track['id'] + self.report_extraction('%s/sets/%s' % (uploader, slug_title)) - # Extract video URL - mobj = re.search(r"jsclassref='([^']*)'", webpage) + streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28' + request = compat_urllib_request.Request(streams_url) + try: + stream_json_bytes = compat_urllib_request.urlopen(request).read() + stream_json = stream_json_bytes.decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err)) + return + + streams = json.loads(stream_json) + mediaURL = streams['http_mp3_128_url'] + + videos.append({ + 'id': video_id, + 'url': mediaURL, + 'uploader': track['user']['username'], + 'upload_date': track['created_at'], + 'title': track['title'], + 'ext': u'mp3', + 'description': track['description'], + }) + return videos + + +class InfoQIE(InfoExtractor): + """Information extractor for infoq.com""" + _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video url') + self._downloader.report_error(u'invalid URL: %s' % url) return - video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64')) + webpage = self._download_webpage(url, video_id=url) + self.report_extraction(url) + + # Extract video URL + mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage) + if mobj is None: + self._downloader.report_error(u'unable to extract video url') + return + real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8')) + video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id # Extract title mobj = re.search(r'contentTitle = "(.*?)";', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video title') + self._downloader.report_error(u'unable to extract video title') return - video_title = mobj.group(1).decode('utf-8') + video_title = mobj.group(1) # Extract description video_description = u'No description available.' mobj = re.search(r'', webpage) if mobj is not None: - video_description = mobj.group(1).decode('utf-8') + video_description = mobj.group(1) video_filename = video_url.split('/')[-1] video_id, extension = video_filename.split('.') @@ -2934,19 +2828,14 @@ class InfoQIE(InfoExtractor): class MixcloudIE(InfoExtractor): """Information extractor for www.mixcloud.com""" + + _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/ _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)' IE_NAME = u'mixcloud' - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - def report_download_json(self, file_id): """Report JSON download.""" - self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME) - - def report_extraction(self, file_id): - """Report information extraction.""" - self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id)) + self.to_screen(u'Downloading json') def get_urls(self, jsonData, fmt, bitrate='best'): """Get urls from 'audio_formats' section in json""" @@ -2987,7 +2876,7 @@ class MixcloudIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return # extract uploader & filename from url uploader = mobj.group(1).decode('utf-8') @@ -3001,7 +2890,7 @@ class MixcloudIE(InfoExtractor): self.report_download_json(file_url) jsonData = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err)) + self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err)) return # parse JSON @@ -3024,8 +2913,8 @@ class MixcloudIE(InfoExtractor): if file_url is not None: break # got it! else: - if req_format not in formats.keys(): - self._downloader.trouble(u'ERROR: format is not available') + if req_format not in formats: + self._downloader.report_error(u'format is not available') return url_list = self.get_urls(formats, req_format) @@ -3051,19 +2940,10 @@ class StanfordOpenClassroomIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P[^&]+)(&video=(?P[^&]+))?(&.*)?)?))$' IE_NAME = u'stanfordoc' - def report_download_webpage(self, objid): - """Report information extraction.""" - self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid)) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) - return + raise ExtractorError(u'Invalid URL: %s' % url) if mobj.group('course') and mobj.group('video'): # A specific video course = mobj.group('course') @@ -3080,14 +2960,14 @@ class StanfordOpenClassroomIE(InfoExtractor): try: metaXml = compat_urllib_request.urlopen(xmlUrl).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err)) return mdoc = xml.etree.ElementTree.fromstring(metaXml) try: info['title'] = mdoc.findall('./title')[0].text info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text except IndexError: - self._downloader.trouble(u'\nERROR: Invalid metadata XML file') + self._downloader.report_error(u'Invalid metadata XML file') return info['ext'] = info['url'].rpartition('.')[2] return [info] @@ -3100,12 +2980,9 @@ class StanfordOpenClassroomIE(InfoExtractor): 'upload_date': None, } - self.report_download_webpage(info['id']) - try: - coursepage = compat_urllib_request.urlopen(url).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err)) - return + coursepage = self._download_webpage(url, info['id'], + note='Downloading course info page', + errnote='Unable to download course info page') m = re.search('([^<]+)', coursepage) if m: @@ -3129,7 +3006,6 @@ class StanfordOpenClassroomIE(InfoExtractor): assert entry['type'] == 'reference' results += self.extract(entry['url']) return results - else: # Root page info = { 'id': 'Stanford OpenClassroom', @@ -3143,7 +3019,7 @@ class StanfordOpenClassroomIE(InfoExtractor): try: rootpage = compat_urllib_request.urlopen(rootURL).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err)) + self._downloader.report_error(u'unable to download course info page: ' + compat_str(err)) return info['title'] = info['id'] @@ -3168,52 +3044,38 @@ class MTVIE(InfoExtractor): _VALID_URL = r'^(?Phttps?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P[0-9]+)/[^/]+$' IE_NAME = u'mtv' - def report_webpage(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return if not mobj.group('proto'): url = 'http://' + url video_id = mobj.group('videoid') - self.report_webpage(video_id) - request = compat_urllib_request.Request(url) - try: - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err)) - return + webpage = self._download_webpage(url, video_id) mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract song name') + self._downloader.report_error(u'unable to extract song name') return song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1')) mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract performer') + self._downloader.report_error(u'unable to extract performer') return performer = unescapeHTML(mobj.group(1).decode('iso-8859-1')) - video_title = performer + ' - ' + song_name + video_title = performer + ' - ' + song_name mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to mtvn_uri') + self._downloader.report_error(u'unable to mtvn_uri') return mtvn_uri = mobj.group(1) mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract content id') + self._downloader.report_error(u'unable to extract content id') return content_id = mobj.group(1) @@ -3223,7 +3085,7 @@ class MTVIE(InfoExtractor): try: metadataXml = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err)) return mdoc = xml.etree.ElementTree.fromstring(metadataXml) @@ -3237,7 +3099,7 @@ class MTVIE(InfoExtractor): format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate'] video_url = rendition.find('./src').text except KeyError: - self._downloader.trouble('Invalid rendition field.') + self._downloader.report_error('Invalid rendition field.') return info = { @@ -3254,20 +3116,7 @@ class MTVIE(InfoExtractor): class YoukuIE(InfoExtractor): - _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P[A-Za-z0-9]+)\.html' - IE_NAME = u'Youku' - - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - def report_download_webpage(self, file_id): - """Report webpage download.""" - self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id) - - def report_extraction(self, file_id): - """Report information extraction.""" - self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id) def _gen_sid(self): nowTime = int(time.time() * 1000) @@ -3300,7 +3149,7 @@ class YoukuIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return video_id = mobj.group('ID') @@ -3311,18 +3160,19 @@ class YoukuIE(InfoExtractor): self.report_download_webpage(video_id) jsondata = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err)) return self.report_extraction(video_id) try: - config = json.loads(jsondata) + jsonstr = jsondata.decode('utf-8') + config = json.loads(jsonstr) video_title = config['data'][0]['title'] seed = config['data'][0]['seed'] format = self._downloader.params.get('format', None) - supported_format = config['data'][0]['streamfileids'].keys() + supported_format = list(config['data'][0]['streamfileids'].keys()) if format is None or format == 'best': if 'hd2' in supported_format: @@ -3339,16 +3189,9 @@ class YoukuIE(InfoExtractor): fileid = config['data'][0]['streamfileids'][format] - seg_number = len(config['data'][0]['segs'][format]) - - keys=[] - for i in xrange(seg_number): - keys.append(config['data'][0]['segs'][format][i]['k']) - - #TODO check error - #youku only could be viewed from mainland china - except: - self._downloader.trouble(u'ERROR: unable to extract info section') + keys = [s['k'] for s in config['data'][0]['segs'][format]] + except (UnicodeDecodeError, ValueError, KeyError): + self._downloader.report_error(u'unable to extract info section') return files_info=[] @@ -3378,53 +3221,46 @@ class YoukuIE(InfoExtractor): class XNXXIE(InfoExtractor): """Information extractor for xnxx.com""" - _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)' + _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)' IE_NAME = u'xnxx' VIDEO_URL_RE = r'flv_url=(.*?)&' VIDEO_TITLE_RE = r'(.*?)\s+-\s+XNXX.COM' VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&' - def report_webpage(self, video_id): - """Report information extraction""" - self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) - - def report_extraction(self, video_id): - """Report information extraction""" - self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return - video_id = mobj.group(1).decode('utf-8') + video_id = mobj.group(1) - self.report_webpage(video_id) + self.report_download_webpage(video_id) # Get webpage content try: - webpage = compat_urllib_request.urlopen(url).read() + webpage_bytes = compat_urllib_request.urlopen(url).read() + webpage = webpage_bytes.decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err) + self._downloader.report_error(u'unable to download video webpage: %s' % err) return result = re.search(self.VIDEO_URL_RE, webpage) if result is None: - self._downloader.trouble(u'ERROR: unable to extract video url') + self._downloader.report_error(u'unable to extract video url') return - video_url = compat_urllib_parse.unquote(result.group(1).decode('utf-8')) + video_url = compat_urllib_parse.unquote(result.group(1)) result = re.search(self.VIDEO_TITLE_RE, webpage) if result is None: - self._downloader.trouble(u'ERROR: unable to extract video title') + self._downloader.report_error(u'unable to extract video title') return - video_title = result.group(1).decode('utf-8') + video_title = result.group(1) result = re.search(self.VIDEO_THUMB_RE, webpage) if result is None: - self._downloader.trouble(u'ERROR: unable to extract video thumbnail') + self._downloader.report_error(u'unable to extract video thumbnail') return - video_thumbnail = result.group(1).decode('utf-8') + video_thumbnail = result.group(1) return [{ 'id': video_id, @@ -3441,41 +3277,38 @@ class XNXXIE(InfoExtractor): class GooglePlusIE(InfoExtractor): """Information extractor for plus.google.com.""" - _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)' + _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)' IE_NAME = u'plus.google' - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - def report_extract_entry(self, url): """Report downloading extry""" - self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8')) + self.to_screen(u'Downloading entry: %s' % url) def report_date(self, upload_date): """Report downloading extry""" - self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date) + self.to_screen(u'Entry date: %s' % upload_date) def report_uploader(self, uploader): """Report downloading extry""" - self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8')) + self.to_screen(u'Uploader: %s' % uploader) def report_title(self, video_title): """Report downloading extry""" - self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8')) + self.to_screen(u'Title: %s' % video_title) def report_extract_vid_page(self, video_page): """Report information extraction.""" - self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8')) + self.to_screen(u'Extracting video page: %s' % video_page) def _real_extract(self, url): # Extract id from URL mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) + self._downloader.report_error(u'Invalid URL: %s' % url) return post_url = mobj.group(0) - video_id = mobj.group(2) + video_id = mobj.group(1) video_extension = 'flv' @@ -3483,9 +3316,9 @@ class GooglePlusIE(InfoExtractor): self.report_extract_entry(post_url) request = compat_urllib_request.Request(post_url) try: - webpage = compat_urllib_request.urlopen(request).read() + webpage = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err)) + self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err)) return # Extract update date @@ -3520,14 +3353,14 @@ class GooglePlusIE(InfoExtractor): pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]' mobj = re.search(pattern, webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video page URL') + self._downloader.report_error(u'unable to extract video page URL') video_page = mobj.group(1) request = compat_urllib_request.Request(video_page) try: - webpage = compat_urllib_request.urlopen(request).read() + webpage = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err)) return self.report_extract_vid_page(video_page) @@ -3537,7 +3370,7 @@ class GooglePlusIE(InfoExtractor): pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"' mobj = re.findall(pattern, webpage) if len(mobj) == 0: - self._downloader.trouble(u'ERROR: unable to extract video links') + self._downloader.report_error(u'unable to extract video links') # Sort in resolution links = sorted(mobj) @@ -3547,14 +3380,961 @@ class GooglePlusIE(InfoExtractor): # Only get the url. The resolution part in the tuple has no use anymore video_url = video_url[-1] # Treat escaped \u0026 style hex - video_url = unicode(video_url, "unicode_escape") + try: + video_url = video_url.decode("unicode_escape") + except AttributeError: # Python 3 + video_url = bytes(video_url, 'ascii').decode('unicode-escape') return [{ - 'id': video_id.decode('utf-8'), + 'id': video_id, 'url': video_url, - 'uploader': uploader.decode('utf-8'), - 'upload_date': upload_date.decode('utf-8'), - 'title': video_title.decode('utf-8'), - 'ext': video_extension.decode('utf-8'), + 'uploader': uploader, + 'upload_date': upload_date, + 'title': video_title, + 'ext': video_extension, }] + +class NBAIE(InfoExtractor): + _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$' + IE_NAME = u'nba' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.report_error(u'invalid URL: %s' % url) + return + + video_id = mobj.group(1) + if video_id.endswith('/index.html'): + video_id = video_id[:-len('/index.html')] + + webpage = self._download_webpage(url, video_id) + + video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' + def _findProp(rexp, default=None): + m = re.search(rexp, webpage) + if m: + return unescapeHTML(m.group(1)) + else: + return default + + shortened_video_id = video_id.rpartition('/')[2] + title = _findProp(r'Date: (.*?)'), + 'description': _findProp(r'(.*?)'), + } + return [info] + +class JustinTVIE(InfoExtractor): + """Information extractor for justin.tv and twitch.tv""" + # TODO: One broadcast may be split into multiple videos. The key + # 'broadcast_id' is the same for all parts, and 'broadcast_part' + # starts at 1 and increases. Can we treat all parts as one video? + + _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/ + ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$""" + _JUSTIN_PAGE_LIMIT = 100 + IE_NAME = u'justin.tv' + + def report_download_page(self, channel, offset): + """Report attempt to download a single page of videos.""" + self.to_screen(u'%s: Downloading video information from %d to %d' % + (channel, offset, offset + self._JUSTIN_PAGE_LIMIT)) + + # Return count of items, list of *valid* items + def _parse_page(self, url): + try: + urlh = compat_urllib_request.urlopen(url) + webpage_bytes = urlh.read() + webpage = webpage_bytes.decode('utf-8', 'ignore') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err)) + return + + response = json.loads(webpage) + if type(response) != list: + error_text = response.get('error', 'unknown error') + self._downloader.report_error(u'Justin.tv API: %s' % error_text) + return + info = [] + for clip in response: + video_url = clip['video_file_url'] + if video_url: + video_extension = os.path.splitext(video_url)[1][1:] + video_date = re.sub('-', '', clip['start_time'][:10]) + video_uploader_id = clip.get('user_id', clip.get('channel_id')) + video_id = clip['id'] + video_title = clip.get('title', video_id) + info.append({ + 'id': video_id, + 'url': video_url, + 'title': video_title, + 'uploader': clip.get('channel_name', video_uploader_id), + 'uploader_id': video_uploader_id, + 'upload_date': video_date, + 'ext': video_extension, + }) + return (len(response), info) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.report_error(u'invalid URL: %s' % url) + return + + api = 'http://api.justin.tv' + video_id = mobj.group(mobj.lastindex) + paged = False + if mobj.lastindex == 1: + paged = True + api += '/channel/archives/%s.json' + else: + api += '/broadcast/by_archive/%s.json' + api = api % (video_id,) + + self.report_extraction(video_id) + + info = [] + offset = 0 + limit = self._JUSTIN_PAGE_LIMIT + while True: + if paged: + self.report_download_page(video_id, offset) + page_url = api + ('?offset=%d&limit=%d' % (offset, limit)) + page_count, page_info = self._parse_page(page_url) + info.extend(page_info) + if not paged or page_count != limit: + break + offset += limit + return info + +class FunnyOrDieIE(InfoExtractor): + _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P[0-9a-f]+)/.*$' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.report_error(u'invalid URL: %s' % url) + return + + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + + m = re.search(r']*>\s*]*>\s*(?P.*?)", webpage, flags=re.DOTALL) + if not m: + m = re.search(r'(?P[^<]+?)', webpage) + if not m: + self._downloader.report_error(u'Cannot find video title') + title = clean_html(m.group('title')) + + m = re.search(r'video|app)/ #If the page is only for videos or for a game + (?P\d+)/? + (?P\d*)(?P\??) #For urltype == video we sometimes get the videoID + """ + + @classmethod + def suitable(cls, url): + """Receives a URL and returns True if suitable for this IE.""" + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url, re.VERBOSE) + gameID = m.group('gameID') + videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID + self.report_age_confirmation() + webpage = self._download_webpage(videourl, gameID) + game_title = re.search(r'(?P.*?)', webpage).group('game_title') + + urlRE = r"'movie_(?P\d+)': \{\s*FILENAME: \"(?P[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P[\w:/\.\?=\+-]+)\")?\s*\}," + mweb = re.finditer(urlRE, webpage) + namesRE = r'(?P.+?)' + titles = re.finditer(namesRE, webpage) + thumbsRE = r'' + thumbs = re.finditer(thumbsRE, webpage) + videos = [] + for vid,vtitle,thumb in zip(mweb,titles,thumbs): + video_id = vid.group('videoID') + title = vtitle.group('videoName') + video_url = vid.group('videoURL') + video_thumb = thumb.group('thumbnail') + if not video_url: + self._downloader.report_error(u'Cannot find video url for %s' % video_id) + info = { + 'id':video_id, + 'url':video_url, + 'ext': 'flv', + 'title': unescapeHTML(title), + 'thumbnail': video_thumb + } + videos.append(info) + return [self.playlist_result(videos, gameID, game_title)] + +class UstreamIE(InfoExtractor): + _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P\d+)' + IE_NAME = u'ustream' + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('videoID') + video_url = u'http://tcdn.ustream.tv/video/%s' % video_id + webpage = self._download_webpage(url, video_id) + m = re.search(r'data-title="(?P.+)"',webpage) + title = m.group('title') + m = re.search(r'.*)' + IE_NAME = u'WorldStarHipHop' + + def _real_extract(self, url): + _src_url = r"""(http://hw-videos.*(?:mp4|flv))""" + + webpage_src = compat_urllib_request.urlopen(url).read() + webpage_src = webpage_src.decode('utf-8') + + mobj = re.search(_src_url, webpage_src) + + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + + if mobj is not None: + video_url = mobj.group() + if 'mp4' in video_url: + ext = 'mp4' + else: + ext = 'flv' + else: + self._downloader.report_error(u'Cannot find video url for %s' % video_id) + return + + _title = r"""(.*)""" + + mobj = re.search(_title, webpage_src) + + if mobj is not None: + title = mobj.group(1) + else: + title = 'World Start Hip Hop - %s' % time.ctime() + + _thumbnail = r"""rel="image_src" href="(.*)" />""" + mobj = re.search(_thumbnail, webpage_src) + + # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. + if mobj is not None: + thumbnail = mobj.group(1) + else: + _title = r"""candytitles.*>(.*)""" + mobj = re.search(_title, webpage_src) + if mobj is not None: + title = mobj.group(1) + thumbnail = None + + results = [{ + 'id': video_id, + 'url' : video_url, + 'title' : title, + 'thumbnail' : thumbnail, + 'ext' : ext, + }] + return results + +class RBMARadioIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P[^/]+)$' + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('videoID') + + webpage = self._download_webpage(url, video_id) + m = re.search(r'', webpage) + if not m: + raise ExtractorError(u'Cannot find metadata') + json_data = m.group(1) + + try: + data = json.loads(json_data) + except ValueError as e: + raise ExtractorError(u'Invalid JSON: ' + str(e)) + + video_url = data['akamai_url'] + '&cbr=256' + url_parts = compat_urllib_parse_urlparse(video_url) + video_ext = url_parts.path.rpartition('.')[2] + info = { + 'id': video_id, + 'url': video_url, + 'ext': video_ext, + 'title': data['title'], + 'description': data.get('teaser_text'), + 'location': data.get('country_of_origin'), + 'uploader': data.get('host', {}).get('name'), + 'uploader_id': data.get('host', {}).get('slug'), + 'thumbnail': data.get('image', {}).get('large_url_2x'), + 'duration': data.get('duration'), + } + return [info] + + +class YouPornIE(InfoExtractor): + """Information extractor for youporn.com.""" + _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P[0-9]+)/(?P[^/]+)' + + def _print_formats(self, formats): + """Print all available formats""" + print(u'Available formats:') + print(u'ext\t\tformat') + print(u'---------------------------------') + for format in formats: + print(u'%s\t\t%s' % (format['ext'], format['format'])) + + def _specific(self, req_format, formats): + for x in formats: + if(x["format"]==req_format): + return x + return None + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.report_error(u'invalid URL: %s' % url) + return + + video_id = mobj.group('videoid') + + req = compat_urllib_request.Request(url) + req.add_header('Cookie', 'age_verified=1') + webpage = self._download_webpage(req, video_id) + + # Get the video title + result = re.search(r'(?P.*)', webpage) + if result is None: + raise ExtractorError(u'Unable to extract video title') + video_title = result.group('title').strip() + + # Get the video date + result = re.search(r'Date:(?P.*) ', webpage) + if result is None: + self._downloader.report_warning(u'unable to extract video date') + upload_date = None + else: + upload_date = unified_strdate(result.group('date').strip()) + + # Get the video uploader + result = re.search(r'Submitted:(?P.*)', webpage) + if result is None: + self._downloader.report_warning(u'unable to extract uploader') + video_uploader = None + else: + video_uploader = result.group('uploader').strip() + video_uploader = clean_html( video_uploader ) + + # Get all of the formats available + DOWNLOAD_LIST_RE = r'(?s)(?P.*?)' + result = re.search(DOWNLOAD_LIST_RE, webpage) + if result is None: + raise ExtractorError(u'Unable to extract download list') + download_list_html = result.group('download_list').strip() + + # Get all of the links from the page + LINK_RE = r'(?s)' + links = re.findall(LINK_RE, download_list_html) + if(len(links) == 0): + raise ExtractorError(u'ERROR: no known formats available for video') + + self.to_screen(u'Links found: %d' % len(links)) + + formats = [] + for link in links: + + # A link looks like this: + # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0 + # A path looks like this: + # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4 + video_url = unescapeHTML( link ) + path = compat_urllib_parse_urlparse( video_url ).path + extension = os.path.splitext( path )[1][1:] + format = path.split('/')[4].split('_')[:2] + size = format[0] + bitrate = format[1] + format = "-".join( format ) + title = u'%s-%s-%s' % (video_title, size, bitrate) + + formats.append({ + 'id': video_id, + 'url': video_url, + 'uploader': video_uploader, + 'upload_date': upload_date, + 'title': title, + 'ext': extension, + 'format': format, + 'thumbnail': None, + 'description': None, + 'player_url': None + }) + + if self._downloader.params.get('listformats', None): + self._print_formats(formats) + return + + req_format = self._downloader.params.get('format', None) + self.to_screen(u'Format: %s' % req_format) + + if req_format is None or req_format == 'best': + return [formats[0]] + elif req_format == 'worst': + return [formats[-1]] + elif req_format in ('-1', 'all'): + return formats + else: + format = self._specific( req_format, formats ) + if result is None: + self._downloader.report_error(u'requested format not available') + return + return [format] + + + +class PornotubeIE(InfoExtractor): + """Information extractor for pornotube.com.""" + _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P[0-9]+))?(/m/(?P[0-9]+))(/(?P.+))$' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.report_error(u'invalid URL: %s' % url) + return + + video_id = mobj.group('videoid') + video_title = mobj.group('title') + + # Get webpage content + webpage = self._download_webpage(url, video_id) + + # Get the video URL + VIDEO_URL_RE = r'url: "(?Phttp://video[0-9].pornotube.com/.+\.flv)",' + result = re.search(VIDEO_URL_RE, webpage) + if result is None: + self._downloader.report_error(u'unable to extract video url') + return + video_url = compat_urllib_parse.unquote(result.group('url')) + + #Get the uploaded date + VIDEO_UPLOADED_RE = r'Added (?P[0-9\/]+) by' + result = re.search(VIDEO_UPLOADED_RE, webpage) + if result is None: + self._downloader.report_error(u'unable to extract video title') + return + upload_date = unified_strdate(result.group('date')) + + info = {'id': video_id, + 'url': video_url, + 'uploader': None, + 'upload_date': upload_date, + 'title': video_title, + 'ext': 'flv', + 'format': 'flv'} + + return [info] + +class YouJizzIE(InfoExtractor): + """Information extractor for youjizz.com.""" + _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P[^.]+).html$' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.report_error(u'invalid URL: %s' % url) + return + + video_id = mobj.group('videoid') + + # Get webpage content + webpage = self._download_webpage(url, video_id) + + # Get the video title + result = re.search(r'(?P.*)', webpage) + if result is None: + raise ExtractorError(u'ERROR: unable to extract video title') + video_title = result.group('title').strip() + + # Get the embed page + result = re.search(r'https?://www.youjizz.com/videos/embed/(?P[0-9]+)', webpage) + if result is None: + raise ExtractorError(u'ERROR: unable to extract embed page') + + embed_page_url = result.group(0).strip() + video_id = result.group('videoid') + + webpage = self._download_webpage(embed_page_url, video_id) + + # Get the video URL + result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P[^"]+)"\)\);', webpage) + if result is None: + raise ExtractorError(u'ERROR: unable to extract video url') + video_url = result.group('source') + + info = {'id': video_id, + 'url': video_url, + 'title': video_title, + 'ext': 'flv', + 'format': 'flv', + 'player_url': embed_page_url} + + return [info] + +class EightTracksIE(InfoExtractor): + IE_NAME = '8tracks' + _VALID_URL = r'https?://8tracks.com/(?P[^/]+)/(?P[^/#]+)(?:#.*)?$' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + playlist_id = mobj.group('id') + + webpage = self._download_webpage(url, playlist_id) + + m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL) + if not m: + raise ExtractorError(u'Cannot find trax information') + json_like = m.group(1) + data = json.loads(json_like) + + session = str(random.randint(0, 1000000000)) + mix_id = data['id'] + track_count = data['tracks_count'] + first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id) + next_url = first_url + res = [] + for i in itertools.count(): + api_json = self._download_webpage(next_url, playlist_id, + note=u'Downloading song information %s/%s' % (str(i+1), track_count), + errnote=u'Failed to download song information') + api_data = json.loads(api_json) + track_data = api_data[u'set']['track'] + info = { + 'id': track_data['id'], + 'url': track_data['track_file_stream_url'], + 'title': track_data['performer'] + u' - ' + track_data['name'], + 'raw_title': track_data['name'], + 'uploader_id': data['user']['login'], + 'ext': 'm4a', + } + res.append(info) + if api_data['set']['at_last_track']: + break + next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id']) + return res + +class KeekIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P\w+)' + IE_NAME = u'keek' + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('videoID') + video_url = u'http://cdn.keek.com/keek/video/%s' % video_id + thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id + webpage = self._download_webpage(url, video_id) + m = re.search(r'[\S\s]+?(?P.+?)', webpage) + uploader = clean_html(m.group('uploader')) + info = { + 'id': video_id, + 'url': video_url, + 'ext': 'mp4', + 'title': title, + 'thumbnail': thumbnail, + 'uploader': uploader + } + return [info] + +class TEDIE(InfoExtractor): + _VALID_URL=r'''http://www.ted.com/ + ( + ((?Pplaylists)/(?P\d+)) # We have a playlist + | + ((?Ptalks)) # We have a simple talk + ) + /(?P\w+) # Here goes the name and then ".html" + ''' + + @classmethod + def suitable(cls, url): + """Receives a URL and returns True if suitable for this IE.""" + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + + def _real_extract(self, url): + m=re.match(self._VALID_URL, url, re.VERBOSE) + if m.group('type_talk'): + return [self._talk_info(url)] + else : + playlist_id=m.group('playlist_id') + name=m.group('name') + self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name)) + return [self._playlist_videos_info(url,name,playlist_id)] + + def _talk_video_link(self,mediaSlug): + '''Returns the video link for that mediaSlug''' + return 'http://download.ted.com/talks/%s.mp4' % mediaSlug + + def _playlist_videos_info(self,url,name,playlist_id=0): + '''Returns the videos of the playlist''' + video_RE=r''' + \d+)" + ([.\s]*?)data-playlist_item_id="(\d+)" + ([.\s]*?)data-mediaslug="(?P.+?)" + ''' + video_name_RE=r'(?P.+?)' + webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage') + m_videos=re.finditer(video_RE,webpage,re.VERBOSE) + m_names=re.finditer(video_name_RE,webpage) + + playlist_RE = r'div class="headline">(\s*?)(\s*?)(?P.*?)' + m_playlist = re.search(playlist_RE, webpage) + playlist_title = m_playlist.group('playlist_title') + + playlist_entries = [] + for m_video, m_name in zip(m_videos,m_names): + video_id=m_video.group('video_id') + talk_url='http://www.ted.com%s' % m_name.group('talk_url') + playlist_entries.append(self.url_result(talk_url, 'TED')) + return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title) + + def _talk_info(self, url, video_id=0): + """Return the video for the talk in the url""" + m=re.match(self._VALID_URL, url,re.VERBOSE) + videoName=m.group('name') + webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName) + # If the url includes the language we get the title translated + title_RE=r'(?P.*)' + title=re.search(title_RE, webpage).group('title') + info_RE=r'''
(?P.+?)