X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=8a8c07226868abb79fb5fa51666507d7da5a81e8;hb=ed56f260399728f1975dd30f4c8ee110cf106d84;hp=07bd2cbe2f9f6c4213e31e4c6e90d2df7b611f58;hpb=e9c0cdd3895436170de33324c1762364380c6a5a;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 07bd2cbe2..8a8c07226 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -21,9 +21,11 @@ from ..compat import ( compat_os_name, compat_str, compat_urllib_error, - compat_urllib_parse, + compat_urllib_parse_urlencode, + compat_urllib_request, compat_urlparse, ) +from ..downloader.f4m import remove_encrypted_media from ..utils import ( NO_DEFAULT, age_restricted, @@ -48,6 +50,8 @@ from ..utils import ( determine_protocol, parse_duration, mimetype2ext, + update_Request, + update_url_query, ) @@ -105,7 +109,7 @@ class InfoExtractor(object): * protocol The protocol that will be used for the actual download, lower-case. "http", "https", "rtsp", "rtmp", "rtmpe", - "m3u8", or "m3u8_native". + "m3u8", "m3u8_native" or "http_dash_segments". * preference Order number of this format. If this field is present and not None, the formats get sorted by this field, regardless of all other values. @@ -159,7 +163,7 @@ class InfoExtractor(object): description: Full video description. uploader: Full name of the video uploader. license: License name the video is licensed under. - creator: The main artist who created the video. + creator: The creator of the video. release_date: The date (YYYYMMDD) when the video was released. timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). @@ -228,6 +232,24 @@ class InfoExtractor(object): episode_number: Number of the video episode within a season, as an integer. episode_id: Id of the video episode, as a unicode string. + The following fields should only be used when the media is a track or a part of + a music album: + + track: Title of the track. + track_number: Number of the track within an album or a disc, as an integer. + track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii), + as a unicode string. + artist: Artist(s) of the track. + genre: Genre(s) of the track. + album: Title of the album the track belongs to. + album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc). + album_artist: List of all artists appeared on the album (e.g. + "Ash Borer / Fell Voices" or "Various Artists", useful for splits + and compilations). + disc_number: Number of the disc or other physical medium the track belongs to, + as an integer. + release_year: Year (YYYY) when the album was released. + Unless mentioned otherwise, the fields should be Unicode strings. Unless mentioned otherwise, None is equivalent to absence of information. @@ -345,7 +367,7 @@ class InfoExtractor(object): def IE_NAME(self): return compat_str(type(self).__name__[:-2]) - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): """ Returns the response handle """ if note is None: self.report_download_webpage(video_id) @@ -354,6 +376,14 @@ class InfoExtractor(object): self.to_screen('%s' % (note,)) else: self.to_screen('%s: %s' % (video_id, note)) + if isinstance(url_or_request, compat_urllib_request.Request): + url_or_request = update_Request( + url_or_request, data=data, headers=headers, query=query) + else: + if query: + url_or_request = update_url_query(url_or_request, query) + if data is not None or headers: + url_or_request = sanitized_Request(url_or_request, data, headers) try: return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: @@ -369,13 +399,13 @@ class InfoExtractor(object): self._downloader.report_warning(errmsg) return False - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None): + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}): """ Returns a tuple (page content as string, URL handle) """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal) + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query) if urlh is False: assert not fatal return False @@ -462,13 +492,13 @@ class InfoExtractor(object): return content - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}): """ Returns the data of the page as a string """ success = False try_count = 0 while success is False: try: - res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding) + res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query) success = True except compat_http_client.IncompleteRead as e: try_count += 1 @@ -483,10 +513,10 @@ class InfoExtractor(object): def _download_xml(self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None): + transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = self._download_webpage( - url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding) + url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) if xml_string is False: return xml_string if transform_source: @@ -497,10 +527,10 @@ class InfoExtractor(object): note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None): + fatal=True, encoding=None, data=None, headers={}, query={}): json_string = self._download_webpage( url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding) + encoding=encoding, data=data, headers=headers, query=query) if (not fatal) and json_string is False: return None return self._parse_json( @@ -812,7 +842,7 @@ class InfoExtractor(object): for input in re.findall(r'(?i)]+)>', html): if not re.search(r'type=(["\'])(?:hidden|submit)\1', input): continue - name = re.search(r'name=(["\'])(?P.+?)\1', input) + name = re.search(r'(?:name|id)=(["\'])(?P.+?)\1', input) if not name: continue value = re.search(r'value=(["\'])(?P.*?)\1', input) @@ -855,6 +885,7 @@ class InfoExtractor(object): proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1 if f.get('vcodec') == 'none': # audio only + preference -= 50 if self._downloader.params.get('prefer_free_formats'): ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus'] else: @@ -865,6 +896,8 @@ class InfoExtractor(object): except ValueError: audio_ext_preference = -1 else: + if f.get('acodec') == 'none': # video only + preference -= 40 if self._downloader.params.get('prefer_free_formats'): ORDER = ['flv', 'mp4', 'webm'] else: @@ -966,12 +999,31 @@ class InfoExtractor(object): if manifest is False: return [] + return self._parse_f4m_formats( + manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id, + transform_source=transform_source, fatal=fatal) + + def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None, + transform_source=lambda s: fix_xml_ampersands(s).strip(), + fatal=True): + # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy + akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0') + if akamai_pv is not None and ';' in akamai_pv.text: + playerVerificationChallenge = akamai_pv.text.split(';')[0] + if playerVerificationChallenge.strip() != '': + return [] + formats = [] manifest_version = '1.0' media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') if not media_nodes: manifest_version = '2.0' media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') + # Remove unsupported DRM protected media from final formats + # rendition (see https://github.com/rg3/youtube-dl/issues/8573). + media_nodes = remove_encrypted_media(media_nodes) + if not media_nodes: + return formats base_url = xpath_text( manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'], 'base URL', default=None) @@ -991,7 +1043,8 @@ class InfoExtractor(object): # bitrate in f4m downloader if determine_ext(manifest_url) == 'f4m': formats.extend(self._extract_f4m_formats( - manifest_url, video_id, preference, f4m_id, fatal=fatal)) + manifest_url, video_id, preference=preference, f4m_id=f4m_id, + transform_source=transform_source, fatal=fatal)) continue tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ @@ -1003,14 +1056,12 @@ class InfoExtractor(object): 'height': int_or_none(media_el.attrib.get('height')), 'preference': preference, }) - self._sort_formats(formats) - return formats def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, m3u8_id=None, note=None, errnote=None, - fatal=True): + fatal=True, live=False): formats = [{ 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), @@ -1088,7 +1139,14 @@ class InfoExtractor(object): if m3u8_id: format_id.append(m3u8_id) last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None - format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats))) + # Despite specification does not mention NAME attribute for + # EXT-X-STREAM-INF it still sometimes may be present + stream_name = last_info.get('NAME') or last_media_name + # Bandwidth of live streams may differ over time thus making + # format_id unpredictable. So it's better to keep provided + # format_id intact. + if not live: + format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats))) f = { 'format_id': '-'.join(format_id), 'url': format_url(line.strip()), @@ -1125,7 +1183,6 @@ class InfoExtractor(object): last_media = None formats.append(f) last_info = {} - self._sort_formats(formats) return formats @staticmethod @@ -1140,8 +1197,8 @@ class InfoExtractor(object): out.append('{%s}%s' % (namespace, c)) return '/'.join(out) - def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None): - smil = self._download_smil(smil_url, video_id, fatal=fatal) + def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None): + smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source) if smil is False: assert not fatal @@ -1158,10 +1215,10 @@ class InfoExtractor(object): return {} return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) - def _download_smil(self, smil_url, video_id, fatal=True): + def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None): return self._download_xml( smil_url, video_id, 'Downloading SMIL file', - 'Unable to download SMIL file', fatal=fatal) + 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source) def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): namespace = self._parse_smil_namespace(smil) @@ -1282,7 +1339,7 @@ class InfoExtractor(object): 'plugin': 'flowplayer-3.2.0.1', } f4m_url += '&' if '?' in f4m_url else '?' - f4m_url += compat_urllib_parse.urlencode(f4m_params) + f4m_url += compat_urllib_parse_urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) continue @@ -1299,8 +1356,6 @@ class InfoExtractor(object): }) continue - self._sort_formats(formats) - return formats def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): @@ -1311,7 +1366,7 @@ class InfoExtractor(object): if not src or src in urls: continue urls.append(src) - ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type')) + ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src) lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang subtitles.setdefault(lang, []).append({ 'url': src, @@ -1447,8 +1502,9 @@ class InfoExtractor(object): continue representation_attrib = adaptation_set.attrib.copy() representation_attrib.update(representation.attrib) - mime_type = representation_attrib.get('mimeType') - content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType') + # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory + mime_type = representation_attrib['mimeType'] + content_type = mime_type.split('/')[0] if content_type == 'text': # TODO implement WebVTT downloading pass @@ -1471,6 +1527,7 @@ class InfoExtractor(object): f = { 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, 'url': base_url, + 'ext': mimetype2ext(mime_type), 'width': int_or_none(representation_attrib.get('width')), 'height': int_or_none(representation_attrib.get('height')), 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000), @@ -1489,9 +1546,16 @@ class InfoExtractor(object): representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) media_template = representation_ms_info['media_template'] media_template = media_template.replace('$RepresentationID$', representation_id) - media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template) + media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template) + media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template) media_template.replace('$$', '$') - representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])] + representation_ms_info['segment_urls'] = [ + media_template % { + 'Number': segment_number, + 'Bandwidth': representation_attrib.get('bandwidth')} + for segment_number in range( + representation_ms_info['start_number'], + representation_ms_info['total_number'] + representation_ms_info['start_number'])] if 'segment_urls' in representation_ms_info: f.update({ 'segment_urls': representation_ms_info['segment_urls'], @@ -1516,7 +1580,6 @@ class InfoExtractor(object): existing_format.update(f) else: self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) - self._sort_formats(formats) return formats def _live_title(self, name):