X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=fcdd0fd14a85a12690031b409058d932a3d4e4db;hb=f4b742727975c2b1660a196409c3f2818a309a2d;hp=3b79b8cb41c265c3fee6568a03e9ccfd98faae05;hpb=2132edaa03857085821b6a1214ce1410e0c2e463;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3b79b8cb4..fcdd0fd14 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -174,6 +174,8 @@ class InfoExtractor(object): width : height ratio as float. * no_resume The server does not support resuming the (HTTP or RTMP) download. Boolean. + * downloader_options A dictionary of downloader options as + described in FileDownloader url: Final video URL. ext: Video filename extension. @@ -1027,7 +1029,7 @@ class InfoExtractor(object): part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): info['series'] = unescapeHTML(part_of_series.get('name')) - elif item_type == 'Article': + elif item_type in ('Article', 'NewsArticle'): info.update({ 'timestamp': parse_iso8601(e.get('datePublished')), 'title': unescapeHTML(e.get('headline')), @@ -1880,6 +1882,7 @@ class InfoExtractor(object): 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, 'format_note': 'DASH %s' % content_type, 'filesize': filesize, + 'container': mimetype2ext(mime_type) + '_dash', } f.update(parse_codecs(representation_attrib.get('codecs'))) representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) @@ -2007,16 +2010,14 @@ class InfoExtractor(object): f['url'] = initialization_url f['fragments'].append({location_key(initialization_url): initialization_url}) f['fragments'].extend(representation_ms_info['fragments']) - try: - existing_format = next( - fo for fo in formats - if fo['format_id'] == representation_id) - except StopIteration: - full_info = formats_dict.get(representation_id, {}).copy() - full_info.update(f) - formats.append(full_info) - else: - existing_format.update(f) + # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation + # is not necessarily unique within a Period thus formats with + # the same `format_id` are quite possible. There are numerous examples + # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111, + # https://github.com/rg3/youtube-dl/issues/13919) + full_info = formats_dict.get(representation_id, {}).copy() + full_info.update(f) + formats.append(full_info) else: self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats @@ -2056,7 +2057,7 @@ class InfoExtractor(object): stream_timescale = int_or_none(stream.get('TimeScale')) or timescale stream_name = stream.get('Name') for track in stream.findall('QualityLevel'): - fourcc = track.get('FourCC') + fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None) # TODO: add support for WVC1 and WMAP if fourcc not in ('H264', 'AVC1', 'AACL'): self.report_warning('%s is not a supported codec' % fourcc) @@ -2249,9 +2250,10 @@ class InfoExtractor(object): def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): query = compat_urlparse.urlparse(url).query url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) - url_base = self._search_regex( - r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url') - http_base_url = '%s:%s' % ('http', url_base) + mobj = re.search( + r'(?:(?:http|rtmp|rtsp)(?Ps)?:)?(?P//[^?]+)', url) + url_base = mobj.group('url') + http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base) formats = [] def manifest_url(manifest): @@ -2351,7 +2353,10 @@ class InfoExtractor(object): for track in tracks: if not isinstance(track, dict): continue - if track.get('kind') != 'captions': + track_kind = track.get('kind') + if not track_kind or not isinstance(track_kind, compat_str): + continue + if track_kind.lower() not in ('captions', 'subtitles'): continue track_url = urljoin(base_url, track.get('file')) if not track_url: @@ -2405,7 +2410,7 @@ class InfoExtractor(object): formats.extend(self._extract_m3u8_formats( source_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=m3u8_id, fatal=False)) - elif ext == 'mpd': + elif source_type == 'dash' or ext == 'mpd': formats.extend(self._extract_mpd_formats( source_url, video_id, mpd_id=mpd_id, fatal=False)) elif ext == 'smil':