X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=bfd4321605a906862987063b243703445136dc5e;hb=b5aad37f6bdc72acaca198202dc9f7eaa3185e51;hp=85ac0400c1e01e160828de9b6805437631cd61aa;hpb=41d06b042427aa019200bb80a1935d12110f50b1;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 85ac0400c..bfd432160 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -45,6 +45,7 @@ from ..utils import ( unescapeHTML, unified_strdate, url_basename, + xpath_element, xpath_text, xpath_with_ns, determine_protocol, @@ -163,7 +164,7 @@ class InfoExtractor(object): description: Full video description. uploader: Full name of the video uploader. license: License name the video is licensed under. - creator: The main artist who created the video. + creator: The creator of the video. release_date: The date (YYYYMMDD) when the video was released. timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). @@ -232,6 +233,24 @@ class InfoExtractor(object): episode_number: Number of the video episode within a season, as an integer. episode_id: Id of the video episode, as a unicode string. + The following fields should only be used when the media is a track or a part of + a music album: + + track: Title of the track. + track_number: Number of the track within an album or a disc, as an integer. + track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii), + as a unicode string. + artist: Artist(s) of the track. + genre: Genre(s) of the track. + album: Title of the album the track belongs to. + album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc). + album_artist: List of all artists appeared on the album (e.g. + "Ash Borer / Fell Voices" or "Various Artists", useful for splits + and compilations). + disc_number: Number of the disc or other physical medium the track belongs to, + as an integer. + release_year: Year (YYYY) when the album was released. + Unless mentioned otherwise, the fields should be Unicode strings. Unless mentioned otherwise, None is equivalent to absence of information. @@ -358,14 +377,13 @@ class InfoExtractor(object): self.to_screen('%s' % (note,)) else: self.to_screen('%s: %s' % (video_id, note)) - # data, headers and query params will be ignored for `Request` objects if isinstance(url_or_request, compat_urllib_request.Request): url_or_request = update_Request( url_or_request, data=data, headers=headers, query=query) else: if query: url_or_request = update_url_query(url_or_request, query) - if data or headers: + if data is not None or headers: url_or_request = sanitized_Request(url_or_request, data, headers) try: return self._downloader.urlopen(url_or_request) @@ -825,7 +843,7 @@ class InfoExtractor(object): for input in re.findall(r'(?i)]+)>', html): if not re.search(r'type=(["\'])(?:hidden|submit)\1', input): continue - name = re.search(r'name=(["\'])(?P.+?)\1', input) + name = re.search(r'(?:name|id)=(["\'])(?P.+?)\1', input) if not name: continue value = re.search(r'value=(["\'])(?P.*?)\1', input) @@ -970,7 +988,7 @@ class InfoExtractor(object): def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, transform_source=lambda s: fix_xml_ampersands(s).strip(), - fatal=True): + fatal=True, m3u8_id=None): manifest = self._download_xml( manifest_url, video_id, 'Downloading f4m manifest', 'Unable to download f4m manifest', @@ -984,11 +1002,18 @@ class InfoExtractor(object): return self._parse_f4m_formats( manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id, - transform_source=transform_source, fatal=fatal) + transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id) def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None, transform_source=lambda s: fix_xml_ampersands(s).strip(), - fatal=True): + fatal=True, m3u8_id=None): + # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy + akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0') + if akamai_pv is not None and ';' in akamai_pv.text: + playerVerificationChallenge = akamai_pv.text.split(';')[0] + if playerVerificationChallenge.strip() != '': + return [] + formats = [] manifest_version = '1.0' media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') @@ -1005,9 +1030,26 @@ class InfoExtractor(object): 'base URL', default=None) if base_url: base_url = base_url.strip() + + bootstrap_info = xpath_element( + manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'], + 'bootstrap info', default=None) + for i, media_el in enumerate(media_nodes): - if manifest_version == '2.0': - media_url = media_el.attrib.get('href') or media_el.attrib.get('url') + tbr = int_or_none(media_el.attrib.get('bitrate')) + width = int_or_none(media_el.attrib.get('width')) + height = int_or_none(media_el.attrib.get('height')) + format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])) + # If is present, the specified f4m is a + # stream-level manifest, and only set-level manifests may refer to + # external resources. See section 11.4 and section 4 of F4M spec + if bootstrap_info is None: + media_url = None + # @href is introduced in 2.0, see section 11.6 of F4M spec + if manifest_version == '2.0': + media_url = media_el.attrib.get('href') + if media_url is None: + media_url = media_el.attrib.get('url') if not media_url: continue manifest_url = ( @@ -1017,29 +1059,43 @@ class InfoExtractor(object): # since bitrates in parent manifest (this one) and media_url manifest # may differ leading to inability to resolve the format by requested # bitrate in f4m downloader - if determine_ext(manifest_url) == 'f4m': - formats.extend(self._extract_f4m_formats( + ext = determine_ext(manifest_url) + if ext == 'f4m': + f4m_formats = self._extract_f4m_formats( manifest_url, video_id, preference=preference, f4m_id=f4m_id, - transform_source=transform_source, fatal=fatal)) + transform_source=transform_source, fatal=fatal) + # Sometimes stream-level manifest contains single media entry that + # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player). + # At the same time parent's media entry in set-level manifest may + # contain it. We will copy it from parent in such cases. + if len(f4m_formats) == 1: + f = f4m_formats[0] + f.update({ + 'tbr': f.get('tbr') or tbr, + 'width': f.get('width') or width, + 'height': f.get('height') or height, + 'format_id': f.get('format_id') if not tbr else format_id, + }) + formats.extend(f4m_formats) + continue + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', preference=preference, + m3u8_id=m3u8_id, fatal=fatal)) continue - tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ - 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])), + 'format_id': format_id, 'url': manifest_url, - 'ext': 'flv', + 'ext': 'flv' if bootstrap_info is not None else None, 'tbr': tbr, - 'width': int_or_none(media_el.attrib.get('width')), - 'height': int_or_none(media_el.attrib.get('height')), + 'width': width, + 'height': height, 'preference': preference, }) return formats - def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, - entry_protocol='m3u8', preference=None, - m3u8_id=None, note=None, errnote=None, - fatal=True): - - formats = [{ + def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None): + return { 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), 'url': m3u8_url, 'ext': ext, @@ -1047,7 +1103,14 @@ class InfoExtractor(object): 'preference': preference - 1 if preference else -1, 'resolution': 'multiple', 'format_note': 'Quality selection URL', - }] + } + + def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, + entry_protocol='m3u8', preference=None, + m3u8_id=None, note=None, errnote=None, + fatal=True, live=False): + + formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)] format_url = lambda u: ( u @@ -1114,8 +1177,15 @@ class InfoExtractor(object): format_id = [] if m3u8_id: format_id.append(m3u8_id) - last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None - format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats))) + last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') not in ('SUBTITLES', 'CLOSED-CAPTIONS') else None + # Despite specification does not mention NAME attribute for + # EXT-X-STREAM-INF it still sometimes may be present + stream_name = last_info.get('NAME') or last_media_name + # Bandwidth of live streams may differ over time thus making + # format_id unpredictable. So it's better to keep provided + # format_id intact. + if not live: + format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats))) f = { 'format_id': '-'.join(format_id), 'url': format_url(line.strip()), @@ -1247,21 +1317,21 @@ class InfoExtractor(object): m3u8_count = 0 srcs = [] - videos = smil.findall(self._xpath_ns('.//video', namespace)) - for video in videos: - src = video.get('src') + media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace)) + for medium in media: + src = medium.get('src') if not src or src in srcs: continue srcs.append(src) - bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) - filesize = int_or_none(video.get('size') or video.get('fileSize')) - width = int_or_none(video.get('width')) - height = int_or_none(video.get('height')) - proto = video.get('proto') - ext = video.get('ext') + bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000) + filesize = int_or_none(medium.get('size') or medium.get('fileSize')) + width = int_or_none(medium.get('width')) + height = int_or_none(medium.get('height')) + proto = medium.get('proto') + ext = medium.get('ext') src_ext = determine_ext(src) - streamer = video.get('streamer') or base + streamer = medium.get('streamer') or base if proto == 'rtmp' or streamer.startswith('rtmp'): rtmp_count += 1 @@ -1335,7 +1405,7 @@ class InfoExtractor(object): if not src or src in urls: continue urls.append(src) - ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type')) + ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src) lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang subtitles.setdefault(lang, []).append({ 'url': src, @@ -1515,9 +1585,16 @@ class InfoExtractor(object): representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) media_template = representation_ms_info['media_template'] media_template = media_template.replace('$RepresentationID$', representation_id) - media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template) + media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template) + media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template) media_template.replace('$$', '$') - representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])] + representation_ms_info['segment_urls'] = [ + media_template % { + 'Number': segment_number, + 'Bandwidth': representation_attrib.get('bandwidth')} + for segment_number in range( + representation_ms_info['start_number'], + representation_ms_info['total_number'] + representation_ms_info['start_number'])] if 'segment_urls' in representation_ms_info: f.update({ 'segment_urls': representation_ms_info['segment_urls'],