X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=199a04d1c2b3d7da1b66b65f7f1b16543484e9dd;hb=4e0cff2a50f4c297fc25dae01c460596d8f5badb;hp=5684227dcfca770be68d1feea28616a5e0d84e57;hpb=1e5bcdec0264190ed2a05ee49c1f9f5b20ba3aa6;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5684227dc..199a04d1c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -10,20 +10,17 @@ import re import socket import sys import time -import xml.etree.ElementTree from ..compat import ( compat_cookiejar, compat_cookies, compat_getpass, - compat_HTTPError, compat_http_client, compat_urllib_error, compat_urllib_parse, - compat_urllib_parse_urlparse, - compat_urllib_request, compat_urlparse, compat_str, + compat_etree_fromstring, ) from ..utils import ( NO_DEFAULT, @@ -32,16 +29,21 @@ from ..utils import ( clean_html, compiled_regex_type, determine_ext, + error_to_compat_str, ExtractorError, fix_xml_ampersands, float_or_none, int_or_none, + parse_iso8601, RegexNotFoundError, sanitize_filename, + sanitized_Request, unescapeHTML, + unified_strdate, url_basename, xpath_text, xpath_with_ns, + determine_protocol, ) @@ -107,8 +109,9 @@ class InfoExtractor(object): -2 or smaller for less than default. < -1000 to hide the format (if there is another one which is strictly better) - * language_preference Is this in the correct requested - language? + * language Language code, e.g. "de" or "en-US". + * language_preference Is this in the language mentioned in + the URL? 10 if it's what the URL is about, -1 for default (don't know), -10 otherwise, other values reserved for now. @@ -164,12 +167,14 @@ class InfoExtractor(object): with the "ext" entry and one of: * "data": The subtitles file contents * "url": A URL pointing to the subtitles file + "ext" will be calculated from URL if missing automatic_captions: Like 'subtitles', used by the YoutubeIE for automatically generated captions - duration: Length of the video in seconds, as an integer. + duration: Length of the video in seconds, as an integer or float. view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video + repost_count: Number of reposts of the video average_rating: Average rating give by users, the scale used depends on the webpage comment_count: Number of comments on the video comments: A list of comments, each with one or more of the following @@ -197,6 +202,26 @@ class InfoExtractor(object): end_time: Time in seconds where the reproduction should end, as specified in the URL. + The following fields should only be used when the video belongs to some logical + chapter or section: + + chapter: Name or title of the chapter the video belongs to. + chapter_number: Number of the chapter the video belongs to, as an integer. + chapter_id: Id of the chapter the video belongs to, as a unicode string. + + The following fields should only be used when the video is an episode of some + series or programme: + + series: Title of the series or programme the video episode belongs to. + season: Title of the season the video episode belongs to. + season_number: Number of the season the video episode belongs to, as an integer. + season_id: Id of the season the video episode belongs to, as a unicode string. + episode: Title of the video episode. Unlike mandatory video title field, + this field should denote the exact title of the video episode + without any kind of decoration. + episode_number: Number of the video episode within a season, as an integer. + episode_id: Id of the video episode, as a unicode string. + Unless mentioned otherwise, the fields should be Unicode strings. Unless mentioned otherwise, None is equivalent to absence of information. @@ -289,9 +314,9 @@ class InfoExtractor(object): except ExtractorError: raise except compat_http_client.IncompleteRead as e: - raise ExtractorError('A network error has occured.', cause=e, expected=True) + raise ExtractorError('A network error has occurred.', cause=e, expected=True) except (KeyError, StopIteration) as e: - raise ExtractorError('An extractor error has occured.', cause=e) + raise ExtractorError('An extractor error has occurred.', cause=e) def set_downloader(self, downloader): """Sets the downloader for this IE.""" @@ -308,11 +333,11 @@ class InfoExtractor(object): @classmethod def ie_key(cls): """A string for getting the InfoExtractor with get_info_extractor""" - return cls.__name__[:-2] + return compat_str(cls.__name__[:-2]) @property def IE_NAME(self): - return type(self).__name__[:-2] + return compat_str(type(self).__name__[:-2]) def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns the response handle """ @@ -330,7 +355,8 @@ class InfoExtractor(object): return False if errnote is None: errnote = 'Unable to download webpage' - errmsg = '%s: %s' % (errnote, compat_str(err)) + + errmsg = '%s: %s' % (errnote, error_to_compat_str(err)) if fatal: raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) else: @@ -459,7 +485,7 @@ class InfoExtractor(object): return xml_string if transform_source: xml_string = transform_source(xml_string) - return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) + return compat_etree_fromstring(xml_string.encode('utf-8')) def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', @@ -620,7 +646,7 @@ class InfoExtractor(object): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError) as err: - self._downloader.report_warning('parsing .netrc: %s' % compat_str(err)) + self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err)) return (username, password) @@ -643,8 +669,9 @@ class InfoExtractor(object): # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')' - property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop) + content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' + property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)' + % {'prop': re.escape(prop)}) template = r']+?%s[^>]+?%s' return [ template % (property_re, content_re), @@ -736,6 +763,42 @@ class InfoExtractor(object): return self._html_search_meta('twitter:player', html, 'twitter card player') + def _search_json_ld(self, html, video_id, **kwargs): + json_ld = self._search_regex( + r'(?s)]+type=(["\'])application/ld\+json\1[^>]*>(?P.+?)', + html, 'JSON-LD', group='json_ld', **kwargs) + if not json_ld: + return {} + return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True)) + + def _json_ld(self, json_ld, video_id, fatal=True): + if isinstance(json_ld, compat_str): + json_ld = self._parse_json(json_ld, video_id, fatal=fatal) + if not json_ld: + return {} + info = {} + if json_ld.get('@context') == 'http://schema.org': + item_type = json_ld.get('@type') + if item_type == 'TVEpisode': + info.update({ + 'episode': unescapeHTML(json_ld.get('name')), + 'episode_number': int_or_none(json_ld.get('episodeNumber')), + 'description': unescapeHTML(json_ld.get('description')), + }) + part_of_season = json_ld.get('partOfSeason') + if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': + info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) + part_of_series = json_ld.get('partOfSeries') + if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': + info['series'] = unescapeHTML(part_of_series.get('name')) + elif item_type == 'Article': + info.update({ + 'timestamp': parse_iso8601(json_ld.get('datePublished')), + 'title': unescapeHTML(json_ld.get('headline')), + 'description': unescapeHTML(json_ld.get('articleBody')), + }) + return dict((k, v) for k, v in info.items() if v is not None) + @staticmethod def _hidden_inputs(html): html = re.sub(r'', '', html) @@ -762,6 +825,12 @@ class InfoExtractor(object): if not formats: raise ExtractorError('No video formats found') + for f in formats: + # Automatically determine tbr when missing based on abr and vbr (improves + # formats sorting in some cases) + if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None: + f['tbr'] = f['abr'] + f['vbr'] + def _formats_key(f): # TODO remove the following workaround from ..utils import determine_ext @@ -773,14 +842,12 @@ class InfoExtractor(object): preference = f.get('preference') if preference is None: - proto = f.get('protocol') - if proto is None: - proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme - - preference = 0 if proto in ['http', 'https'] else -0.1 + preference = 0 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported preference -= 0.5 + proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1 + if f.get('vcodec') == 'none': # audio only if self._downloader.params.get('prefer_free_formats'): ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus'] @@ -811,6 +878,7 @@ class InfoExtractor(object): f.get('vbr') if f.get('vbr') is not None else -1, f.get('height') if f.get('height') is not None else -1, f.get('width') if f.get('width') is not None else -1, + proto_preference, ext_preference, f.get('abr') if f.get('abr') is not None else -1, audio_ext_preference, @@ -838,7 +906,7 @@ class InfoExtractor(object): self._request_webpage(url, video_id, 'Checking %s URL' % item) return True except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): + if isinstance(e.cause, compat_urllib_error.URLError): self.to_screen( '%s: %s URL is invalid, skipping' % (video_id, item)) return False @@ -869,13 +937,18 @@ class InfoExtractor(object): time.sleep(timeout) def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, - transform_source=lambda s: fix_xml_ampersands(s).strip()): + transform_source=lambda s: fix_xml_ampersands(s).strip(), + fatal=True): manifest = self._download_xml( manifest_url, video_id, 'Downloading f4m manifest', 'Unable to download f4m manifest', # Some manifests may be malformed, e.g. prosiebensat1 generated manifests # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244) - transform_source=transform_source) + transform_source=transform_source, + fatal=fatal) + + if manifest is False: + return [] formats = [] manifest_version = '1.0' @@ -883,6 +956,11 @@ class InfoExtractor(object): if not media_nodes: manifest_version = '2.0' media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') + base_url = xpath_text( + manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'], + 'base URL', default=None) + if base_url: + base_url = base_url.strip() for i, media_el in enumerate(media_nodes): if manifest_version == '2.0': media_url = media_el.attrib.get('href') or media_el.attrib.get('url') @@ -890,13 +968,14 @@ class InfoExtractor(object): continue manifest_url = ( media_url if media_url.startswith('http://') or media_url.startswith('https://') - else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url)) + else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) # If media_url is itself a f4m manifest do the recursive extraction # since bitrates in parent manifest (this one) and media_url manifest # may differ leading to inability to resolve the format by requested # bitrate in f4m downloader if determine_ext(manifest_url) == 'f4m': - formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id)) + formats.extend(self._extract_f4m_formats( + manifest_url, video_id, preference, f4m_id, fatal=fatal)) continue tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ @@ -932,13 +1011,27 @@ class InfoExtractor(object): if re.match(r'^https?://', u) else compat_urlparse.urljoin(m3u8_url, u)) - m3u8_doc = self._download_webpage( + res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', errnote=errnote or 'Failed to download m3u8 information', fatal=fatal) - if m3u8_doc is False: - return m3u8_doc + if res is False: + return [] + m3u8_doc, urlh = res + m3u8_url = urlh.geturl() + # A Media Playlist Tag MUST NOT appear in a Master Playlist + # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3 + # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlists + # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1 + if '#EXT-X-TARGETDURATION' in m3u8_doc: + return [{ + 'url': m3u8_url, + 'format_id': m3u8_id, + 'ext': ext, + 'protocol': entry_protocol, + 'preference': preference, + }] last_info = None last_media = None kv_rex = re.compile( @@ -983,9 +1076,9 @@ class InfoExtractor(object): # TODO: looks like video codec is not always necessarily goes first va_codecs = codecs.split(',') if va_codecs[0]: - f['vcodec'] = va_codecs[0].partition('.')[0] + f['vcodec'] = va_codecs[0] if len(va_codecs) > 1 and va_codecs[1]: - f['acodec'] = va_codecs[1].partition('.')[0] + f['acodec'] = va_codecs[1] resolution = last_info.get('RESOLUTION') if resolution: width_str, height_str = resolution.split('x') @@ -1044,6 +1137,7 @@ class InfoExtractor(object): video_id = os.path.splitext(url_basename(smil_url))[0] title = None description = None + upload_date = None for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): name = meta.attrib.get('name') content = meta.attrib.get('content') @@ -1053,6 +1147,8 @@ class InfoExtractor(object): title = content elif not description and name in ('description', 'abstract'): description = content + elif not upload_date and name == 'date': + upload_date = unified_strdate(content) thumbnails = [{ 'id': image.get('type'), @@ -1065,6 +1161,7 @@ class InfoExtractor(object): 'id': video_id, 'title': title or video_id, 'description': description, + 'upload_date': upload_date, 'thumbnails': thumbnails, 'formats': formats, 'subtitles': subtitles, @@ -1085,6 +1182,7 @@ class InfoExtractor(object): formats = [] rtmp_count = 0 http_count = 0 + m3u8_count = 0 videos = smil.findall(self._xpath_ns('.//video', namespace)) for video in videos: @@ -1124,8 +1222,17 @@ class InfoExtractor(object): src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) if proto == 'm3u8' or src_ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src_url, video_id, ext or 'mp4', m3u8_id='hls')) + m3u8_formats = self._extract_m3u8_formats( + src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) + if len(m3u8_formats) == 1: + m3u8_count += 1 + m3u8_formats[0].update({ + 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate), + 'tbr': bitrate, + 'width': width, + 'height': height, + }) + formats.extend(m3u8_formats) continue if src_ext == 'f4m': @@ -1137,10 +1244,10 @@ class InfoExtractor(object): } f4m_url += '&' if '?' in f4m_url else '?' f4m_url += compat_urllib_parse.urlencode(f4m_params) - formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds')) + formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) continue - if src_url.startswith('http'): + if src_url.startswith('http') and self._is_valid_url(src, video_id): http_count += 1 formats.append({ 'url': src_url, @@ -1223,6 +1330,83 @@ class InfoExtractor(object): }) return entries + def _download_dash_manifest(self, dash_manifest_url, video_id, fatal=True): + return self._download_xml( + dash_manifest_url, video_id, + note='Downloading DASH manifest', + errnote='Could not download DASH manifest', + fatal=fatal) + + def _extract_dash_manifest_formats(self, dash_manifest_url, video_id, fatal=True, namespace=None, formats_dict={}): + dash_doc = self._download_dash_manifest(dash_manifest_url, video_id, fatal) + if dash_doc is False: + return [] + + return self._parse_dash_manifest( + dash_doc, namespace=namespace, formats_dict=formats_dict) + + def _parse_dash_manifest(self, dash_doc, namespace=None, formats_dict={}): + def _add_ns(path): + return self._xpath_ns(path, namespace) + + formats = [] + for a in dash_doc.findall('.//' + _add_ns('AdaptationSet')): + mime_type = a.attrib.get('mimeType') + for r in a.findall(_add_ns('Representation')): + mime_type = r.attrib.get('mimeType') or mime_type + url_el = r.find(_add_ns('BaseURL')) + if mime_type == 'text/vtt': + # TODO implement WebVTT downloading + pass + elif mime_type.startswith('audio/') or mime_type.startswith('video/'): + segment_list = r.find(_add_ns('SegmentList')) + format_id = r.attrib['id'] + video_url = url_el.text if url_el is not None else None + filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) + f = { + 'format_id': format_id, + 'url': video_url, + 'width': int_or_none(r.attrib.get('width')), + 'height': int_or_none(r.attrib.get('height')), + 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000), + 'asr': int_or_none(r.attrib.get('audioSamplingRate')), + 'filesize': filesize, + 'fps': int_or_none(r.attrib.get('frameRate')), + } + if segment_list is not None: + initialization_url = segment_list.find(_add_ns('Initialization')).attrib['sourceURL'] + f.update({ + 'initialization_url': initialization_url, + 'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall(_add_ns('SegmentURL'))], + 'protocol': 'http_dash_segments', + }) + if not f.get('url'): + f['url'] = initialization_url + try: + existing_format = next( + fo for fo in formats + if fo['format_id'] == format_id) + except StopIteration: + full_info = formats_dict.get(format_id, {}).copy() + full_info.update(f) + codecs = r.attrib.get('codecs') + if codecs: + if mime_type.startswith('video/'): + vcodec, acodec = codecs, 'none' + else: # mime_type.startswith('audio/') + vcodec, acodec = 'none', codecs + + full_info.update({ + 'vcodec': vcodec, + 'acodec': acodec, + }) + formats.append(full_info) + else: + existing_format.update(f) + else: + self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) + return formats + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() @@ -1259,7 +1443,7 @@ class InfoExtractor(object): def _get_cookies(self, url): """ Return a compat_cookies.SimpleCookie with the cookies for the url """ - req = compat_urllib_request.Request(url) + req = sanitized_Request(url) self._downloader.cookiejar.add_cookie_header(req) return compat_cookies.SimpleCookie(req.get_header('Cookie'))