X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=b3d57dfce100f10605287a7d086281eb2d055fe7;hb=eab3c2895c66a8d2f5da181d3ccba35a901b813f;hp=c443daf2001633acc02a39c5571e2f804e3a5f0e;hpb=7109903e6140e01b5d5980fc80c6ef1b6c86797a;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index c443daf20..b3d57dfce 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -34,6 +34,7 @@ from ..utils import ( fix_xml_ampersands, float_or_none, int_or_none, + parse_iso8601, RegexNotFoundError, sanitize_filename, sanitized_Request, @@ -108,8 +109,9 @@ class InfoExtractor(object): -2 or smaller for less than default. < -1000 to hide the format (if there is another one which is strictly better) - * language_preference Is this in the correct requested - language? + * language Language code, e.g. "de" or "en-US". + * language_preference Is this in the language mentioned in + the URL? 10 if it's what the URL is about, -1 for default (don't know), -10 otherwise, other values reserved for now. @@ -204,21 +206,21 @@ class InfoExtractor(object): chapter or section: chapter: Name or title of the chapter the video belongs to. - chapter_id: Number or id of the chapter the video belongs to, as an integer - or unicode string. + chapter_number: Number of the chapter the video belongs to, as an integer. + chapter_id: Id of the chapter the video belongs to, as a unicode string. The following fields should only be used when the video is an episode of some series or programme: series: Title of the series or programme the video episode belongs to. season: Title of the season the video episode belongs to. - season_id: Number or id of the season the video episode belongs to, as an - integer or unicode string. + season_number: Number of the season the video episode belongs to, as an integer. + season_id: Id of the season the video episode belongs to, as a unicode string. episode: Title of the video episode. Unlike mandatory video title field, this field should denote the exact title of the video episode without any kind of decoration. - episode_id: Number or id of the video episode within a season, as an integer - or unicode string. + episode_number: Number of the video episode within a season, as an integer. + episode_id: Id of the video episode, as a unicode string. Unless mentioned otherwise, the fields should be Unicode strings. @@ -312,9 +314,9 @@ class InfoExtractor(object): except ExtractorError: raise except compat_http_client.IncompleteRead as e: - raise ExtractorError('A network error has occured.', cause=e, expected=True) + raise ExtractorError('A network error has occurred.', cause=e, expected=True) except (KeyError, StopIteration) as e: - raise ExtractorError('An extractor error has occured.', cause=e) + raise ExtractorError('An extractor error has occurred.', cause=e) def set_downloader(self, downloader): """Sets the downloader for this IE.""" @@ -761,6 +763,42 @@ class InfoExtractor(object): return self._html_search_meta('twitter:player', html, 'twitter card player') + def _search_json_ld(self, html, video_id, **kwargs): + json_ld = self._search_regex( + r'(?s)]+type=(["\'])application/ld\+json\1[^>]*>(?P.+?)', + html, 'JSON-LD', group='json_ld', **kwargs) + if not json_ld: + return {} + return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True)) + + def _json_ld(self, json_ld, video_id, fatal=True): + if isinstance(json_ld, compat_str): + json_ld = self._parse_json(json_ld, video_id, fatal=fatal) + if not json_ld: + return {} + info = {} + if json_ld.get('@context') == 'http://schema.org': + item_type = json_ld.get('@type') + if item_type == 'TVEpisode': + info.update({ + 'episode': unescapeHTML(json_ld.get('name')), + 'episode_number': int_or_none(json_ld.get('episodeNumber')), + 'description': unescapeHTML(json_ld.get('description')), + }) + part_of_season = json_ld.get('partOfSeason') + if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': + info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) + part_of_series = json_ld.get('partOfSeries') + if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': + info['series'] = unescapeHTML(part_of_series.get('name')) + elif item_type == 'Article': + info.update({ + 'timestamp': parse_iso8601(json_ld.get('datePublished')), + 'title': unescapeHTML(json_ld.get('headline')), + 'description': unescapeHTML(json_ld.get('articleBody')), + }) + return dict((k, v) for k, v in info.items() if v is not None) + @staticmethod def _hidden_inputs(html): html = re.sub(r'', '', html) @@ -787,6 +825,12 @@ class InfoExtractor(object): if not formats: raise ExtractorError('No video formats found') + for f in formats: + # Automatically determine tbr when missing based on abr and vbr (improves + # formats sorting in some cases) + if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None: + f['tbr'] = f['abr'] + f['vbr'] + def _formats_key(f): # TODO remove the following workaround from ..utils import determine_ext @@ -976,6 +1020,18 @@ class InfoExtractor(object): return [] m3u8_doc, urlh = res m3u8_url = urlh.geturl() + # A Media Playlist Tag MUST NOT appear in a Master Playlist + # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3 + # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlists + # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1 + if '#EXT-X-TARGETDURATION' in m3u8_doc: + return [{ + 'url': m3u8_url, + 'format_id': m3u8_id, + 'ext': ext, + 'protocol': entry_protocol, + 'preference': preference, + }] last_info = None last_media = None kv_rex = re.compile( @@ -1020,9 +1076,9 @@ class InfoExtractor(object): # TODO: looks like video codec is not always necessarily goes first va_codecs = codecs.split(',') if va_codecs[0]: - f['vcodec'] = va_codecs[0].partition('.')[0] + f['vcodec'] = va_codecs[0] if len(va_codecs) > 1 and va_codecs[1]: - f['acodec'] = va_codecs[1].partition('.')[0] + f['acodec'] = va_codecs[1] resolution = last_info.get('RESOLUTION') if resolution: width_str, height_str = resolution.split('x') @@ -1126,6 +1182,7 @@ class InfoExtractor(object): formats = [] rtmp_count = 0 http_count = 0 + m3u8_count = 0 videos = smil.findall(self._xpath_ns('.//video', namespace)) for video in videos: @@ -1165,8 +1222,17 @@ class InfoExtractor(object): src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) if proto == 'm3u8' or src_ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)) + m3u8_formats = self._extract_m3u8_formats( + src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) + if len(m3u8_formats) == 1: + m3u8_count += 1 + m3u8_formats[0].update({ + 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate), + 'tbr': bitrate, + 'width': width, + 'height': height, + }) + formats.extend(m3u8_formats) continue if src_ext == 'f4m':