X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=a285ee7d898e0777005ff625b5fcc44f1013d683;hb=d6712378e73951bede475569c887a1ac73f660a9;hp=aaca25a12a872eabfe3f43734e0599e7e3d4d78e;hpb=15bf934de589f0598b763f8351869c5de69ecad3;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index aaca25a12..a285ee7d8 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -21,9 +21,11 @@ from ..compat import ( compat_os_name, compat_str, compat_urllib_error, - compat_urllib_parse, + compat_urllib_parse_urlencode, + compat_urllib_request, compat_urlparse, ) +from ..downloader.f4m import remove_encrypted_media from ..utils import ( NO_DEFAULT, age_restricted, @@ -48,6 +50,7 @@ from ..utils import ( determine_protocol, parse_duration, mimetype2ext, + update_Request, update_url_query, ) @@ -229,6 +232,24 @@ class InfoExtractor(object): episode_number: Number of the video episode within a season, as an integer. episode_id: Id of the video episode, as a unicode string. + The following fields should only be used when the media is a track or a part of + a music album: + + track: Title of the track. + track_number: Number of the track within an album or a disc, as an integer. + track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii), + as a unicode string. + artist: Artist(s) of the track. + genre: Genre(s) of the track. + album: Title of the album the track belongs to. + album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc). + album_artist: List of all artists appeared on the album (e.g. + "Ash Borer / Fell Voices" or "Various Artists", useful for splits + and compilations). + disc_number: Number of the disc or other physical medium the track belongs to, + as an integer. + release_year: Year (YYYY) when the album was released. + Unless mentioned otherwise, the fields should be Unicode strings. Unless mentioned otherwise, None is equivalent to absence of information. @@ -346,7 +367,7 @@ class InfoExtractor(object): def IE_NAME(self): return compat_str(type(self).__name__[:-2]) - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None): + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): """ Returns the response handle """ if note is None: self.report_download_webpage(video_id) @@ -355,12 +376,14 @@ class InfoExtractor(object): self.to_screen('%s' % (note,)) else: self.to_screen('%s: %s' % (video_id, note)) - # data, headers and query params will be ignored for `Request` objects - if isinstance(url_or_request, compat_str): + if isinstance(url_or_request, compat_urllib_request.Request): + url_or_request = update_Request( + url_or_request, data=data, headers=headers, query=query) + else: if query: url_or_request = update_url_query(url_or_request, query) - if data or headers: - url_or_request = sanitized_Request(url_or_request, data, headers or {}) + if data is not None or headers: + url_or_request = sanitized_Request(url_or_request, data, headers) try: return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: @@ -376,7 +399,7 @@ class InfoExtractor(object): self._downloader.report_warning(errmsg) return False - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers=None, query=None): + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}): """ Returns a tuple (page content as string, URL handle) """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): @@ -469,7 +492,7 @@ class InfoExtractor(object): return content - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers=None, query=None): + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}): """ Returns the data of the page as a string """ success = False try_count = 0 @@ -490,7 +513,7 @@ class InfoExtractor(object): def _download_xml(self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None, data=None, headers=None, query=None): + transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = self._download_webpage( url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) @@ -504,7 +527,7 @@ class InfoExtractor(object): note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers=None, query=None): + fatal=True, encoding=None, data=None, headers={}, query={}): json_string = self._download_webpage( url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) @@ -819,7 +842,7 @@ class InfoExtractor(object): for input in re.findall(r'(?i)]+)>', html): if not re.search(r'type=(["\'])(?:hidden|submit)\1', input): continue - name = re.search(r'name=(["\'])(?P.+?)\1', input) + name = re.search(r'(?:name|id)=(["\'])(?P.+?)\1', input) if not name: continue value = re.search(r'value=(["\'])(?P.*?)\1', input) @@ -862,6 +885,7 @@ class InfoExtractor(object): proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1 if f.get('vcodec') == 'none': # audio only + preference -= 50 if self._downloader.params.get('prefer_free_formats'): ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus'] else: @@ -872,6 +896,8 @@ class InfoExtractor(object): except ValueError: audio_ext_preference = -1 else: + if f.get('acodec') == 'none': # video only + preference -= 40 if self._downloader.params.get('prefer_free_formats'): ORDER = ['flv', 'mp4', 'webm'] else: @@ -973,12 +999,31 @@ class InfoExtractor(object): if manifest is False: return [] + return self._parse_f4m_formats( + manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id, + transform_source=transform_source, fatal=fatal) + + def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None, + transform_source=lambda s: fix_xml_ampersands(s).strip(), + fatal=True): + # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy + akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0') + if akamai_pv is not None and ';' in akamai_pv.text: + playerVerificationChallenge = akamai_pv.text.split(';')[0] + if playerVerificationChallenge.strip() != '': + return [] + formats = [] manifest_version = '1.0' media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') if not media_nodes: manifest_version = '2.0' media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') + # Remove unsupported DRM protected media from final formats + # rendition (see https://github.com/rg3/youtube-dl/issues/8573). + media_nodes = remove_encrypted_media(media_nodes) + if not media_nodes: + return formats base_url = xpath_text( manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'], 'base URL', default=None) @@ -998,7 +1043,8 @@ class InfoExtractor(object): # bitrate in f4m downloader if determine_ext(manifest_url) == 'f4m': formats.extend(self._extract_f4m_formats( - manifest_url, video_id, preference, f4m_id, fatal=fatal)) + manifest_url, video_id, preference=preference, f4m_id=f4m_id, + transform_source=transform_source, fatal=fatal)) continue tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ @@ -1010,8 +1056,6 @@ class InfoExtractor(object): 'height': int_or_none(media_el.attrib.get('height')), 'preference': preference, }) - self._sort_formats(formats) - return formats def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, @@ -1132,7 +1176,6 @@ class InfoExtractor(object): last_media = None formats.append(f) last_info = {} - self._sort_formats(formats) return formats @staticmethod @@ -1147,8 +1190,8 @@ class InfoExtractor(object): out.append('{%s}%s' % (namespace, c)) return '/'.join(out) - def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None): - smil = self._download_smil(smil_url, video_id, fatal=fatal) + def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None): + smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source) if smil is False: assert not fatal @@ -1165,10 +1208,10 @@ class InfoExtractor(object): return {} return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) - def _download_smil(self, smil_url, video_id, fatal=True): + def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None): return self._download_xml( smil_url, video_id, 'Downloading SMIL file', - 'Unable to download SMIL file', fatal=fatal) + 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source) def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): namespace = self._parse_smil_namespace(smil) @@ -1289,7 +1332,7 @@ class InfoExtractor(object): 'plugin': 'flowplayer-3.2.0.1', } f4m_url += '&' if '?' in f4m_url else '?' - f4m_url += compat_urllib_parse.urlencode(f4m_params) + f4m_url += compat_urllib_parse_urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) continue @@ -1306,8 +1349,6 @@ class InfoExtractor(object): }) continue - self._sort_formats(formats) - return formats def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): @@ -1318,7 +1359,7 @@ class InfoExtractor(object): if not src or src in urls: continue urls.append(src) - ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type')) + ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src) lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang subtitles.setdefault(lang, []).append({ 'url': src, @@ -1498,9 +1539,16 @@ class InfoExtractor(object): representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) media_template = representation_ms_info['media_template'] media_template = media_template.replace('$RepresentationID$', representation_id) - media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template) + media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template) + media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template) media_template.replace('$$', '$') - representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])] + representation_ms_info['segment_urls'] = [ + media_template % { + 'Number': segment_number, + 'Bandwidth': representation_attrib.get('bandwidth')} + for segment_number in range( + representation_ms_info['start_number'], + representation_ms_info['total_number'] + representation_ms_info['start_number'])] if 'segment_urls' in representation_ms_info: f.update({ 'segment_urls': representation_ms_info['segment_urls'], @@ -1525,7 +1573,6 @@ class InfoExtractor(object): existing_format.update(f) else: self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) - self._sort_formats(formats) return formats def _live_title(self, name):