X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=2e2a02948d1ad66753cb23729ce7fcf74d983556;hb=47a5cb77344536ca79d81a04904ac9ef9b02050f;hp=a67ac441191ae0d2342f3bb5979589a0e5004240;hpb=044eeb145556cb41485b6b6644f40b2161a4e0f1;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a67ac4411..2e2a02948 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -29,7 +29,10 @@ from ..compat import ( compat_urlparse, compat_xml_parse_error, ) -from ..downloader.f4m import remove_encrypted_media +from ..downloader.f4m import ( + get_base_url, + remove_encrypted_media, +) from ..utils import ( NO_DEFAULT, age_restricted, @@ -171,6 +174,8 @@ class InfoExtractor(object): width : height ratio as float. * no_resume The server does not support resuming the (HTTP or RTMP) download. Boolean. + * downloader_options A dictionary of downloader options as + described in FileDownloader url: Final video URL. ext: Video filename extension. @@ -298,8 +303,9 @@ class InfoExtractor(object): There must be a key "entries", which is a list, an iterable, or a PagedList object, each element of which is a valid dictionary by this specification. - Additionally, playlists can have "title", "description" and "id" attributes - with the same semantics as videos (see above). + Additionally, playlists can have "id", "title", "description", "uploader", + "uploader_id", "uploader_url" attributes with the same semantics as videos + (see above). _type "multi_video" indicates that there are multiple videos that @@ -491,6 +497,16 @@ class InfoExtractor(object): self.to_screen('%s' % (note,)) else: self.to_screen('%s: %s' % (video_id, note)) + + # Some sites check X-Forwarded-For HTTP header in order to figure out + # the origin of the client behind proxy. This allows bypassing geo + # restriction by faking this header's value to IP that belongs to some + # geo unrestricted country. We will do so once we encounter any + # geo restriction error. + if self._x_forwarded_for_ip: + if 'X-Forwarded-For' not in headers: + headers['X-Forwarded-For'] = self._x_forwarded_for_ip + if isinstance(url_or_request, compat_urllib_request.Request): url_or_request = update_Request( url_or_request, data=data, headers=headers, query=query) @@ -520,15 +536,6 @@ class InfoExtractor(object): if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] - # Some sites check X-Forwarded-For HTTP header in order to figure out - # the origin of the client behind proxy. This allows bypassing geo - # restriction by faking this header's value to IP that belongs to some - # geo unrestricted country. We will do so once we encounter any - # geo restriction error. - if self._x_forwarded_for_ip: - if 'X-Forwarded-For' not in headers: - headers['X-Forwarded-For'] = self._x_forwarded_for_ip - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query) if urlh is False: assert not fatal @@ -589,19 +596,11 @@ class InfoExtractor(object): if not encoding: encoding = self._guess_encoding_from_content(content_type, webpage_bytes) if self._downloader.params.get('dump_intermediate_pages', False): - try: - url = url_or_request.get_full_url() - except AttributeError: - url = url_or_request - self.to_screen('Dumping request to ' + url) + self.to_screen('Dumping request to ' + urlh.geturl()) dump = base64.b64encode(webpage_bytes).decode('ascii') self._downloader.to_screen(dump) if self._downloader.params.get('write_pages', False): - try: - url = url_or_request.get_full_url() - except AttributeError: - url = url_or_request - basen = '%s_%s' % (video_id, url) + basen = '%s_%s' % (video_id, urlh.geturl()) if len(basen) > 240: h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() basen = basen[:240 - len(h)] + h @@ -645,19 +644,31 @@ class InfoExtractor(object): content, _ = res return content + def _download_xml_handle( + self, url_or_request, video_id, note='Downloading XML', + errnote='Unable to download XML', transform_source=None, + fatal=True, encoding=None, data=None, headers={}, query={}): + """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)""" + res = self._download_webpage_handle( + url_or_request, video_id, note, errnote, fatal=fatal, + encoding=encoding, data=data, headers=headers, query=query) + if res is False: + return res + xml_string, urlh = res + return self._parse_xml( + xml_string, video_id, transform_source=transform_source, + fatal=fatal), urlh + def _download_xml(self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}): """Return the xml as an xml.etree.ElementTree.Element""" - xml_string = self._download_webpage( - url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query) - if xml_string is False: - return xml_string - return self._parse_xml( - xml_string, video_id, transform_source=transform_source, - fatal=fatal) + res = self._download_xml_handle( + url_or_request, video_id, note=note, errnote=errnote, + transform_source=transform_source, fatal=fatal, encoding=encoding, + data=data, headers=headers, query=query) + return res if res is False else res[0] def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): if transform_source: @@ -1030,7 +1041,7 @@ class InfoExtractor(object): part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): info['series'] = unescapeHTML(part_of_series.get('name')) - elif item_type == 'Article': + elif item_type in ('Article', 'NewsArticle'): info.update({ 'timestamp': parse_iso8601(e.get('datePublished')), 'title': unescapeHTML(e.get('headline')), @@ -1239,11 +1250,8 @@ class InfoExtractor(object): media_nodes = remove_encrypted_media(media_nodes) if not media_nodes: return formats - base_url = xpath_text( - manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'], - 'base URL', default=None) - if base_url: - base_url = base_url.strip() + + manifest_base_url = get_base_url(manifest) bootstrap_info = xpath_element( manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'], @@ -1275,7 +1283,7 @@ class InfoExtractor(object): continue manifest_url = ( media_url if media_url.startswith('http://') or media_url.startswith('https://') - else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) + else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) # If media_url is itself a f4m manifest do the recursive extraction # since bitrates in parent manifest (this one) and media_url manifest # may differ leading to inability to resolve the format by requested @@ -1310,6 +1318,7 @@ class InfoExtractor(object): 'url': manifest_url, 'manifest_url': manifest_url, 'ext': 'flv' if bootstrap_info is not None else None, + 'protocol': 'f4m', 'tbr': tbr, 'width': width, 'height': height, @@ -1355,6 +1364,9 @@ class InfoExtractor(object): if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access return [] + if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay + return [] + formats = [] format_url = lambda u: ( @@ -1694,22 +1706,24 @@ class InfoExtractor(object): }) return subtitles - def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True): + def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True): xspf = self._download_xml( - playlist_url, playlist_id, 'Downloading xpsf playlist', + xspf_url, playlist_id, 'Downloading xpsf playlist', 'Unable to download xspf manifest', fatal=fatal) if xspf is False: return [] - return self._parse_xspf(xspf, playlist_id) + return self._parse_xspf( + xspf, playlist_id, xspf_url=xspf_url, + xspf_base_url=base_url(xspf_url)) - def _parse_xspf(self, playlist, playlist_id): + def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None): NS_MAP = { 'xspf': 'http://xspf.org/ns/0/', 's1': 'http://static.streamone.nl/player/ns/0', } entries = [] - for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)): + for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)): title = xpath_text( track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id) description = xpath_text( @@ -1719,12 +1733,18 @@ class InfoExtractor(object): duration = float_or_none( xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000) - formats = [{ - 'url': location.text, - 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)), - 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), - 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), - } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))] + formats = [] + for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)): + format_url = urljoin(xspf_base_url, location.text) + if not format_url: + continue + formats.append({ + 'url': format_url, + 'manifest_url': xspf_url, + 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)), + 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), + 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), + }) self._sort_formats(formats) entries.append({ @@ -1738,18 +1758,18 @@ class InfoExtractor(object): return entries def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}): - res = self._download_webpage_handle( + res = self._download_xml_handle( mpd_url, video_id, note=note or 'Downloading MPD manifest', errnote=errnote or 'Failed to download MPD manifest', fatal=fatal) if res is False: return [] - mpd, urlh = res + mpd_doc, urlh = res mpd_base_url = base_url(urlh.geturl()) return self._parse_mpd_formats( - compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, + mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url, formats_dict=formats_dict, mpd_url=mpd_url) def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None): @@ -1882,6 +1902,7 @@ class InfoExtractor(object): 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, 'format_note': 'DASH %s' % content_type, 'filesize': filesize, + 'container': mimetype2ext(mime_type) + '_dash', } f.update(parse_codecs(representation_attrib.get('codecs'))) representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) @@ -1979,6 +2000,22 @@ class InfoExtractor(object): }) segment_index += 1 representation_ms_info['fragments'] = fragments + elif 'segment_urls' in representation_ms_info: + # Segment URLs with no SegmentTimeline + # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091 + # https://github.com/rg3/youtube-dl/pull/14844 + fragments = [] + segment_duration = float_or_none( + representation_ms_info['segment_duration'], + representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None + for segment_url in representation_ms_info['segment_urls']: + fragment = { + location_key(segment_url): segment_url, + } + if segment_duration: + fragment['duration'] = segment_duration + fragments.append(fragment) + representation_ms_info['fragments'] = fragments # NB: MPD manifest may contain direct URLs to unfragmented media. # No fragments key is present in this case. if 'fragments' in representation_ms_info: @@ -1993,32 +2030,29 @@ class InfoExtractor(object): f['url'] = initialization_url f['fragments'].append({location_key(initialization_url): initialization_url}) f['fragments'].extend(representation_ms_info['fragments']) - try: - existing_format = next( - fo for fo in formats - if fo['format_id'] == representation_id) - except StopIteration: - full_info = formats_dict.get(representation_id, {}).copy() - full_info.update(f) - formats.append(full_info) - else: - existing_format.update(f) + # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation + # is not necessarily unique within a Period thus formats with + # the same `format_id` are quite possible. There are numerous examples + # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111, + # https://github.com/rg3/youtube-dl/issues/13919) + full_info = formats_dict.get(representation_id, {}).copy() + full_info.update(f) + formats.append(full_info) else: self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True): - res = self._download_webpage_handle( + res = self._download_xml_handle( ism_url, video_id, note=note or 'Downloading ISM manifest', errnote=errnote or 'Failed to download ISM manifest', fatal=fatal) if res is False: return [] - ism, urlh = res + ism_doc, urlh = res - return self._parse_ism_formats( - compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id) + return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id) def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): """ @@ -2042,7 +2076,7 @@ class InfoExtractor(object): stream_timescale = int_or_none(stream.get('TimeScale')) or timescale stream_name = stream.get('Name') for track in stream.findall('QualityLevel'): - fourcc = track.get('FourCC') + fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None) # TODO: add support for WVC1 and WMAP if fourcc not in ('H264', 'AVC1', 'AACL'): self.report_warning('%s is not a supported codec' % fourcc) @@ -2235,9 +2269,10 @@ class InfoExtractor(object): def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): query = compat_urlparse.urlparse(url).query url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) - url_base = self._search_regex( - r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url') - http_base_url = '%s:%s' % ('http', url_base) + mobj = re.search( + r'(?:(?:http|rtmp|rtsp)(?Ps)?:)?(?P//[^?]+)', url) + url_base = mobj.group('url') + http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base) formats = [] def manifest_url(manifest): @@ -2337,7 +2372,10 @@ class InfoExtractor(object): for track in tracks: if not isinstance(track, dict): continue - if track.get('kind') != 'captions': + track_kind = track.get('kind') + if not track_kind or not isinstance(track_kind, compat_str): + continue + if track_kind.lower() not in ('captions', 'subtitles'): continue track_url = urljoin(base_url, track.get('file')) if not track_url: @@ -2391,7 +2429,7 @@ class InfoExtractor(object): formats.extend(self._extract_m3u8_formats( source_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=m3u8_id, fatal=False)) - elif ext == 'mpd': + elif source_type == 'dash' or ext == 'mpd': formats.extend(self._extract_mpd_formats( source_url, video_id, mpd_id=mpd_id, fatal=False)) elif ext == 'smil':