X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=34a28c126e068aa55f072cc011016c63d945a409;hb=39d60b715a75e92d169167876bb07424301c0b67;hp=dc508050413c8490882323e01d91ea3a3ba88c9a;hpb=03950c90f778293e65e2b79f2cf8ea3274695460;p=youtube-dl

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index dc5080504..34a28c126 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -10,18 +10,17 @@ import re
 import socket
 import sys
 import time
-import xml.etree.ElementTree
 
 from ..compat import (
     compat_cookiejar,
     compat_cookies,
-    compat_HTTPError,
+    compat_getpass,
     compat_http_client,
     compat_urllib_error,
-    compat_urllib_parse_urlparse,
-    compat_urllib_request,
+    compat_urllib_parse,
     compat_urlparse,
     compat_str,
+    compat_etree_fromstring,
 )
 from ..utils import (
     NO_DEFAULT,
@@ -30,13 +29,20 @@ from ..utils import (
     clean_html,
     compiled_regex_type,
     determine_ext,
+    error_to_compat_str,
     ExtractorError,
     fix_xml_ampersands,
     float_or_none,
     int_or_none,
     RegexNotFoundError,
     sanitize_filename,
+    sanitized_Request,
     unescapeHTML,
+    unified_strdate,
+    url_basename,
+    xpath_text,
+    xpath_with_ns,
+    determine_protocol,
 )
 
 
@@ -147,6 +153,7 @@ class InfoExtractor(object):
     description:    Full video description.
     uploader:       Full name of the video uploader.
     creator:        The main artist who created the video.
+    release_date:   The date (YYYYMMDD) when the video was released.
     timestamp:      UNIX timestamp of the moment the video became available.
     upload_date:    Video upload date (YYYYMMDD).
                     If not explicitly set, calculated from timestamp.
@@ -158,12 +165,14 @@ class InfoExtractor(object):
                     with the "ext" entry and one of:
                         * "data": The subtitles file contents
                         * "url": A URL pointing to the subtitles file
+                    "ext" will be calculated from URL if missing
     automatic_captions: Like 'subtitles', used by the YoutubeIE for
                         automatically generated captions
-    duration:       Length of the video in seconds, as an integer.
+    duration:       Length of the video in seconds, as an integer or float.
     view_count:     How many users have watched the video on the platform.
     like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
+    repost_count:   Number of reposts of the video
     average_rating: Average rating give by users, the scale used depends on the webpage
     comment_count:  Number of comments on the video
     comments:       A list of comments, each with one or more of the following
@@ -200,8 +209,8 @@
     There must be a key "entries", which is a list, an iterable, or a PagedList
     object, each element of which is a valid dictionary by this specification.
 
-    Additionally, playlists can have "title" and "id" attributes with the same
-    semantics as videos (see above).
+    Additionally, playlists can have "title", "description" and "id" attributes
+    with the same semantics as videos (see above).
_type "multi_video" indicates that there are multiple videos that @@ -302,11 +311,11 @@ class InfoExtractor(object): @classmethod def ie_key(cls): """A string for getting the InfoExtractor with get_info_extractor""" - return cls.__name__[:-2] + return compat_str(cls.__name__[:-2]) @property def IE_NAME(self): - return type(self).__name__[:-2] + return compat_str(type(self).__name__[:-2]) def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns the response handle """ @@ -324,7 +333,8 @@ class InfoExtractor(object): return False if errnote is None: errnote = 'Unable to download webpage' - errmsg = '%s: %s' % (errnote, compat_str(err)) + + errmsg = '%s: %s' % (errnote, error_to_compat_str(err)) if fatal: raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) else: @@ -453,7 +463,7 @@ class InfoExtractor(object): return xml_string if transform_source: xml_string = transform_source(xml_string) - return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) + return compat_etree_fromstring(xml_string.encode('utf-8')) def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', @@ -505,6 +515,18 @@ class InfoExtractor(object): """Report attempt to log in.""" self.to_screen('Logging in') + @staticmethod + def raise_login_required(msg='This video is only available for registered users'): + raise ExtractorError( + '%s. Use --username and --password or --netrc to provide account credentials.' % msg, + expected=True) + + @staticmethod + def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'): + raise ExtractorError( + '%s. You might want to use --proxy to workaround.' % msg, + expected=True) + # Methods for following #608 @staticmethod def url_result(url, ie=None, video_id=None, video_title=None): @@ -602,11 +624,11 @@ class InfoExtractor(object): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError) as err: - self._downloader.report_warning('parsing .netrc: %s' % compat_str(err)) + self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err)) return (username, password) - def _get_tfa_info(self): + def _get_tfa_info(self, note='two-factor verification code'): """ Get the two-factor authentication info TODO - asking the user will be required for sms/phone verify @@ -620,13 +642,14 @@ class InfoExtractor(object): if downloader_params.get('twofactor', None) is not None: return downloader_params['twofactor'] - return None + return compat_getpass('Type %s and press [Return]: ' % note) # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')' - property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop) + content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' + property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)' + % {'prop': re.escape(prop)}) template = r']+?%s[^>]+?%s' return [ template % (property_re, content_re), @@ -636,7 +659,7 @@ class InfoExtractor(object): @staticmethod def _meta_regex(prop): return r'''(?isx)]+(?:itemprop|name|property)=(["\']?)%s\1) + (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1) [^>]+?content=(["\'])(?P.*?)\2''' % re.escape(prop) def _og_search_property(self, prop, html, name=None, **kargs): @@ -720,20 +743,23 @@ class InfoExtractor(object): @staticmethod def _hidden_inputs(html): - return dict([ - 
-            (input.group('name'), input.group('value')) for input in re.finditer(
-                r'''(?x)
-                    <input\s+
-                        type=(?P<q_hidden>["\'])hidden(?P=q_hidden)\s+
-                        name=(?P<q_name>["\'])(?P<name>.+?)(?P=q_name)\s+
-                        (?:id=(?P<q_id>["\']).+?(?P=q_id)\s+)?
-                        value=(?P<q_value>["\'])(?P<value>.*?)(?P=q_value)
-                ''', html)
-        ])
+        html = re.sub(r'<!--.*?-->', '', html)
+        hidden_inputs = {}
+        for input in re.findall(r'(?i)<input([^>]+)>', html):
+            if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
+                continue
+            name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
+            if not name:
+                continue
+            value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
+            if not value:
+                continue
+            hidden_inputs[name.group('value')] = value.group('value')
+        return hidden_inputs
 
     def _form_hidden_inputs(self, form_id, html):
         form = self._search_regex(
-            r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
+            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
             html, '%s form' % form_id, group='form')
         return self._hidden_inputs(form)
 
@@ -752,14 +778,12 @@
 
             preference = f.get('preference')
             if preference is None:
-                proto = f.get('protocol')
-                if proto is None:
-                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
-
-                preference = 0 if proto in ['http', 'https'] else -0.1
+                preference = 0
                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                     preference -= 0.5
 
+            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
+
             if f.get('vcodec') == 'none':  # audio only
                 if self._downloader.params.get('prefer_free_formats'):
                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
@@ -790,6 +814,7 @@ class InfoExtractor(object):
                 f.get('vbr') if f.get('vbr') is not None else -1,
                 f.get('height') if f.get('height') is not None else -1,
                 f.get('width') if f.get('width') is not None else -1,
+                proto_preference,
                 ext_preference,
                 f.get('abr') if f.get('abr') is not None else -1,
                 audio_ext_preference,
@@ -817,7 +842,7 @@ class InfoExtractor(object):
             self._request_webpage(url, video_id, 'Checking %s URL' % item)
             return True
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError):
+            if isinstance(e.cause, compat_urllib_error.URLError):
                 self.to_screen(
                     '%s: %s URL is invalid, skipping' % (video_id, item))
                 return False
@@ -848,13 +873,18 @@
         time.sleep(timeout)
 
     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
-                             transform_source=lambda s: fix_xml_ampersands(s).strip()):
+                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
+                             fatal=True):
         manifest = self._download_xml(
             manifest_url, video_id, 'Downloading f4m manifest',
             'Unable to download f4m manifest',
             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
-            transform_source=transform_source)
+            transform_source=transform_source,
+            fatal=fatal)
+
+        if manifest is False:
+            return []
 
         formats = []
         manifest_version = '1.0'
@@ -862,6 +892,11 @@
         if not media_nodes:
             manifest_version = '2.0'
             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
+        base_url = xpath_text(
+            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
+            'base URL', default=None)
+        if base_url:
+            base_url = base_url.strip()
         for i, media_el in enumerate(media_nodes):
             if manifest_version == '2.0':
                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
@@ -869,13 +904,16 @@
                     continue
                 manifest_url = (
                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
-                    else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
+                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                 # If media_url is itself a f4m manifest do the recursive extraction
                 # since bitrates in parent manifest (this one) and media_url manifest
                 # may differ leading to inability to resolve the format by requested
                 # bitrate in f4m downloader
                 if determine_ext(manifest_url) == 'f4m':
-                    formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
+                    f4m_formats = self._extract_f4m_formats(
+                        manifest_url, video_id, preference, f4m_id, fatal=fatal)
+                    if f4m_formats:
+                        formats.extend(f4m_formats)
                     continue
             tbr = int_or_none(media_el.attrib.get('bitrate'))
             formats.append({
@@ -911,13 +949,15 @@
             if re.match(r'^https?://', u)
             else compat_urlparse.urljoin(m3u8_url, u))
 
-        m3u8_doc = self._download_webpage(
+        res = self._download_webpage_handle(
             m3u8_url, video_id,
             note=note or 'Downloading m3u8 information',
             errnote=errnote or 'Failed to download m3u8 information',
             fatal=fatal)
-        if m3u8_doc is False:
-            return m3u8_doc
+        if res is False:
+            return []
+        m3u8_doc, urlh = res
+        m3u8_url = urlh.geturl()
         last_info = None
         last_media = None
         kv_rex = re.compile(
@@ -978,69 +1018,237 @@
         self._sort_formats(formats)
         return formats
 
-    # TODO: improve extraction
-    def _extract_smil_formats(self, smil_url, video_id, fatal=True):
-        smil = self._download_xml(
-            smil_url, video_id, 'Downloading SMIL file',
-            'Unable to download SMIL file', fatal=fatal)
+    @staticmethod
+    def _xpath_ns(path, namespace=None):
+        if not namespace:
+            return path
+        out = []
+        for c in path.split('/'):
+            if not c or c == '.':
+                out.append(c)
+            else:
+                out.append('{%s}%s' % (namespace, c))
+        return '/'.join(out)
+
+    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
+        smil = self._download_smil(smil_url, video_id, fatal=fatal)
+
         if smil is False:
             assert not fatal
             return []
 
-        base = smil.find('./head/meta').get('base')
+        namespace = self._parse_smil_namespace(smil)
+
+        return self._parse_smil_formats(
+            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+
+    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
+        smil = self._download_smil(smil_url, video_id, fatal=fatal)
+        if smil is False:
+            return {}
+        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
+
+    def _download_smil(self, smil_url, video_id, fatal=True):
+        return self._download_xml(
+            smil_url, video_id, 'Downloading SMIL file',
+            'Unable to download SMIL file', fatal=fatal)
+
+    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
+        namespace = self._parse_smil_namespace(smil)
+
+        formats = self._parse_smil_formats(
+            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
+
+        video_id = os.path.splitext(url_basename(smil_url))[0]
+        title = None
+        description = None
+        upload_date = None
+        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+            name = meta.attrib.get('name')
+            content = meta.attrib.get('content')
+            if not name or not content:
+                continue
+            if not title and name == 'title':
+                title = content
+            elif not description and name in ('description', 'abstract'):
+                description = content
+            elif not upload_date and name == 'date':
+                upload_date = unified_strdate(content)
+
+        thumbnails = [{
+            'id': image.get('type'),
+            'url': image.get('src'),
+            'width': int_or_none(image.get('width')),
+            'height': int_or_none(image.get('height')),
+        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
+
+        return {
+            'id': video_id,
+            'title': title or video_id,
+            'description': description,
+            'upload_date': upload_date,
+            'thumbnails': thumbnails,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+    def _parse_smil_namespace(self, smil):
+        return self._search_regex(
+            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
+
+    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
+        base = smil_url
+        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+            b = meta.get('base') or meta.get('httpBase')
+            if b:
+                base = b
+                break
 
         formats = []
         rtmp_count = 0
-        if smil.findall('./body/seq/video'):
-            video = smil.findall('./body/seq/video')[0]
-            fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
-            formats.extend(fmts)
-        else:
-            for video in smil.findall('./body/switch/video'):
-                fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
-                formats.extend(fmts)
+        http_count = 0
+
+        videos = smil.findall(self._xpath_ns('.//video', namespace))
+        for video in videos:
+            src = video.get('src')
+            if not src:
+                continue
+
+            bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+            filesize = int_or_none(video.get('size') or video.get('fileSize'))
+            width = int_or_none(video.get('width'))
+            height = int_or_none(video.get('height'))
+            proto = video.get('proto')
+            ext = video.get('ext')
+            src_ext = determine_ext(src)
+            streamer = video.get('streamer') or base
+
+            if proto == 'rtmp' or streamer.startswith('rtmp'):
+                rtmp_count += 1
+                formats.append({
+                    'url': streamer,
+                    'play_path': src,
+                    'ext': 'flv',
+                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
+                    'tbr': bitrate,
+                    'filesize': filesize,
+                    'width': width,
+                    'height': height,
+                })
+                if transform_rtmp_url:
+                    streamer, src = transform_rtmp_url(streamer, src)
+                    formats[-1].update({
+                        'url': streamer,
+                        'play_path': src,
+                    })
+                continue
+
+            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
+
+            if proto == 'm3u8' or src_ext == 'm3u8':
+                m3u8_formats = self._extract_m3u8_formats(
+                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
+                if m3u8_formats:
+                    formats.extend(m3u8_formats)
+                continue
+
+            if src_ext == 'f4m':
+                f4m_url = src_url
+                if not f4m_params:
+                    f4m_params = {
+                        'hdcore': '3.2.0',
+                        'plugin': 'flowplayer-3.2.0.1',
+                    }
+                f4m_url += '&' if '?' in f4m_url else '?'
+                f4m_url += compat_urllib_parse.urlencode(f4m_params)
+                f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)
+                if f4m_formats:
+                    formats.extend(f4m_formats)
+                continue
+
+            if src_url.startswith('http') and self._is_valid_url(src, video_id):
+                http_count += 1
+                formats.append({
+                    'url': src_url,
+                    'ext': ext or src_ext or 'flv',
+                    'format_id': 'http-%d' % (bitrate or http_count),
+                    'tbr': bitrate,
+                    'filesize': filesize,
+                    'width': width,
+                    'height': height,
+                })
+                continue
 
         self._sort_formats(formats)
 
         return formats
 
-    def _parse_smil_video(self, video, video_id, base, rtmp_count):
-        src = video.get('src')
-        if not src:
-            return [], rtmp_count
-        bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
-        width = int_or_none(video.get('width'))
-        height = int_or_none(video.get('height'))
-        proto = video.get('proto')
-        if not proto:
-            if base:
-                if base.startswith('rtmp'):
-                    proto = 'rtmp'
-                elif base.startswith('http'):
-                    proto = 'http'
-        ext = video.get('ext')
-        if proto == 'm3u8':
-            return self._extract_m3u8_formats(src, video_id, ext), rtmp_count
-        elif proto == 'rtmp':
-            rtmp_count += 1
-            streamer = video.get('streamer') or base
-            return ([{
-                'url': streamer,
-                'play_path': src,
-                'ext': 'flv',
-                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
-                'tbr': bitrate,
-                'width': width,
-                'height': height,
-            }], rtmp_count)
-        elif proto.startswith('http'):
-            return ([{
-                'url': base + src,
-                'ext': ext or 'flv',
-                'tbr': bitrate,
-                'width': width,
-                'height': height,
-            }], rtmp_count)
+    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
+        subtitles = {}
+        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
+            src = textstream.get('src')
+            if not src:
+                continue
+            ext = textstream.get('ext') or determine_ext(src)
+            if not ext:
+                type_ = textstream.get('type')
+                SUBTITLES_TYPES = {
+                    'text/vtt': 'vtt',
+                    'text/srt': 'srt',
+                    'application/smptett+xml': 'tt',
+                }
+                if type_ in SUBTITLES_TYPES:
+                    ext = SUBTITLES_TYPES[type_]
+            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
+            subtitles.setdefault(lang, []).append({
+                'url': src,
+                'ext': ext,
+            })
+        return subtitles
+
+    def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
+        xspf = self._download_xml(
+            playlist_url, playlist_id, 'Downloading xpsf playlist',
+            'Unable to download xspf manifest', fatal=fatal)
+        if xspf is False:
+            return []
+        return self._parse_xspf(xspf, playlist_id)
+
+    def _parse_xspf(self, playlist, playlist_id):
+        NS_MAP = {
+            'xspf': 'http://xspf.org/ns/0/',
+            's1': 'http://static.streamone.nl/player/ns/0',
+        }
+
+        entries = []
+        for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
+            title = xpath_text(
+                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
+            description = xpath_text(
+                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
+            thumbnail = xpath_text(
+                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
+            duration = float_or_none(
+                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
+
+            formats = [{
+                'url': location.text,
+                'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
+                'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
+                'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
+            } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
+            self._sort_formats(formats)
+
+            entries.append({
+                'id': playlist_id,
+                'title': title,
+                'description': description,
+                'thumbnail': thumbnail,
+                'duration': duration,
+                'formats': formats,
+            })
+        return entries
 
     def _live_title(self, name):
         """ Generate the title for a live video """
@@ -1078,7 +1286,7 @@ class InfoExtractor(object):
 
     def _get_cookies(self, url):
         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
-        req = compat_urllib_request.Request(url)
+        req = sanitized_Request(url)
         self._downloader.cookiejar.add_cookie_header(req)
         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
 
@@ -1120,6 +1328,23 @@ class InfoExtractor(object):
     def _get_subtitles(self, *args, **kwargs):
         raise NotImplementedError("This method must be implemented by subclasses")
 
+    @staticmethod
+    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
+        """ Merge subtitle items for one language. Items with duplicated URLs
+        will be dropped. """
+        list1_urls = set([item['url'] for item in subtitle_list1])
+        ret = list(subtitle_list1)
+        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
+        return ret
+
+    @classmethod
+    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
+        """ Merge two subtitle dictionaries, language by language. """
+        ret = dict(subtitle_dict1)
+        for lang in subtitle_dict2:
+            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
+        return ret
+
     def extract_automatic_captions(self, *args, **kwargs):
         if (self._downloader.params.get('writeautomaticsub', False) or
                 self._downloader.params.get('listsubtitles')):