X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=05c51fac9b0b4162fb126cb79a79d871b591ead8;hb=c2530d3319fd32adfc43cc349b9491040ee631d5;hp=e637b33d51689756b569b752c55e63fe4503de26;hpb=b4c1d6e800a5b28accf4ba588b8fa3f0c420ce13;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e637b33d5..05c51fac9 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -21,6 +21,7 @@ from ..compat import ( compat_os_name, compat_str, compat_urllib_error, + compat_urllib_parse_unquote, compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, @@ -29,6 +30,7 @@ from ..downloader.f4m import remove_encrypted_media from ..utils import ( NO_DEFAULT, age_restricted, + base_url, bug_reports_message, clean_html, compiled_regex_type, @@ -86,9 +88,10 @@ class InfoExtractor(object): from worst to best quality. Potential fields: - * url Mandatory. The URL of the video file or URL of - the manifest file in case of fragmented media - (DASH, hls, hds). + * url Mandatory. The URL of the video file + * manifest_url + The URL of the manifest file in case of + fragmented media (DASH, hls, hds) * ext Will be calculated from URL if missing * format A human-readable description of the format ("mp4 container with h264/opus"). @@ -233,7 +236,7 @@ class InfoExtractor(object): chapter_id: Id of the chapter the video belongs to, as a unicode string. The following fields should only be used when the video is an episode of some - series or programme: + series, programme or podcast: series: Title of the series or programme the video episode belongs to. season: Title of the season the video episode belongs to. @@ -883,7 +886,7 @@ class InfoExtractor(object): 'url': e.get('contentUrl'), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), - 'thumbnail': e.get('thumbnailUrl'), + 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), 'filesize': float_or_none(e.get('contentSize')), @@ -1098,6 +1101,13 @@ class InfoExtractor(object): manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'], 'bootstrap info', default=None) + vcodec = None + mime_type = xpath_text( + manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'], + 'base URL', default=None) + if mime_type and mime_type.startswith('audio/'): + vcodec = 'none' + for i, media_el in enumerate(media_nodes): tbr = int_or_none(media_el.attrib.get('bitrate')) width = int_or_none(media_el.attrib.get('width')) @@ -1138,6 +1148,7 @@ class InfoExtractor(object): 'width': f.get('width') or width, 'height': f.get('height') or height, 'format_id': f.get('format_id') if not tbr else format_id, + 'vcodec': vcodec, }) formats.extend(f4m_formats) continue @@ -1149,10 +1160,12 @@ class InfoExtractor(object): formats.append({ 'format_id': format_id, 'url': manifest_url, + 'manifest_url': manifest_url, 'ext': 'flv' if bootstrap_info is not None else None, 'tbr': tbr, 'width': width, 'height': height, + 'vcodec': vcodec, 'preference': preference, }) return formats @@ -1254,9 +1267,11 @@ class InfoExtractor(object): # format_id intact. if not live: format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats))) + manifest_url = format_url(line.strip()) f = { 'format_id': '-'.join(format_id), - 'url': format_url(line.strip()), + 'url': manifest_url, + 'manifest_url': manifest_url, 'tbr': tbr, 'ext': ext, 'fps': float_or_none(last_info.get('FRAME-RATE')), @@ -1265,9 +1280,10 @@ class InfoExtractor(object): } resolution = last_info.get('RESOLUTION') if resolution: - width_str, height_str = resolution.split('x') - f['width'] = int(width_str) - f['height'] = int(height_str) + mobj = re.search(r'(?P\d+)[xX](?P\d+)', resolution) + if mobj: + f['width'] = int(mobj.group('width')) + f['height'] = int(mobj.group('height')) # Unified Streaming Platform mobj = re.search( r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url']) @@ -1525,12 +1541,13 @@ class InfoExtractor(object): if res is False: return [] mpd, urlh = res - mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group() + mpd_base_url = base_url(urlh.geturl()) return self._parse_mpd_formats( - compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict) + compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, + formats_dict=formats_dict, mpd_url=mpd_url) - def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}): + def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None): """ Parse formats from MPD manifest. References: @@ -1654,6 +1671,7 @@ class InfoExtractor(object): f = { 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, 'url': base_url, + 'manifest_url': mpd_url, 'ext': mimetype2ext(mime_type), 'width': int_or_none(representation_attrib.get('width')), 'height': int_or_none(representation_attrib.get('height')), @@ -1682,18 +1700,10 @@ class InfoExtractor(object): if 'total_number' not in representation_ms_info and 'segment_duration': segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) - representation_ms_info['segment_urls'] = [ - media_template % { - 'Number': segment_number, - 'Bandwidth': representation_attrib.get('bandwidth'), - } - for segment_number in range( - representation_ms_info['start_number'], - representation_ms_info['total_number'] + representation_ms_info['start_number'])] representation_ms_info['fragments'] = [{ 'url': media_template % { 'Number': segment_number, - 'Bandwidth': representation_attrib.get('bandwidth'), + 'Bandwidth': int_or_none(representation_attrib.get('bandwidth')), }, 'duration': segment_duration, } for segment_number in range( @@ -1703,7 +1713,6 @@ class InfoExtractor(object): # $Number*$ or $Time$ in media template with S list available # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411 - representation_ms_info['segment_urls'] = [] representation_ms_info['fragments'] = [] segment_time = 0 segment_d = None @@ -1712,10 +1721,9 @@ class InfoExtractor(object): def add_segment_url(): segment_url = media_template % { 'Time': segment_time, - 'Bandwidth': representation_attrib.get('bandwidth'), + 'Bandwidth': int_or_none(representation_attrib.get('bandwidth')), 'Number': segment_number, } - representation_ms_info['segment_urls'].append(segment_url) representation_ms_info['fragments'].append({ 'url': segment_url, 'duration': float_or_none(segment_d, representation_ms_info['timescale']), @@ -1745,17 +1753,15 @@ class InfoExtractor(object): 'duration': float_or_none(s['d'], representation_ms_info['timescale']), }) representation_ms_info['fragments'] = fragments - if 'segment_urls' in representation_ms_info: + # NB: MPD manifest may contain direct URLs to unfragmented media. + # No fragments key is present in this case. + if 'fragments' in representation_ms_info: f.update({ - 'segment_urls': representation_ms_info['segment_urls'], 'fragments': [], 'protocol': 'http_dash_segments', }) if 'initialization_url' in representation_ms_info: initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id) - f.update({ - 'initialization_url': initialization_url, - }) if not f.get('url'): f['url'] = initialization_url f['fragments'].append({'url': initialization_url}) @@ -1776,6 +1782,105 @@ class InfoExtractor(object): self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats + def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True): + res = self._download_webpage_handle( + ism_url, video_id, + note=note or 'Downloading ISM manifest', + errnote=errnote or 'Failed to download ISM manifest', + fatal=fatal) + if res is False: + return [] + ism, urlh = res + + return self._parse_ism_formats( + compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id) + + def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): + if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None: + return [] + + duration = int(ism_doc.attrib['Duration']) + timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000 + + formats = [] + for stream in ism_doc.findall('StreamIndex'): + stream_type = stream.get('Type') + if stream_type not in ('video', 'audio'): + continue + url_pattern = stream.attrib['Url'] + stream_timescale = int_or_none(stream.get('TimeScale')) or timescale + stream_name = stream.get('Name') + for track in stream.findall('QualityLevel'): + fourcc = track.get('FourCC') + # TODO: add support for WVC1 and WMAP + if fourcc not in ('H264', 'AVC1', 'AACL'): + self.report_warning('%s is not a supported codec' % fourcc) + continue + tbr = int(track.attrib['Bitrate']) // 1000 + width = int_or_none(track.get('MaxWidth')) + height = int_or_none(track.get('MaxHeight')) + sampling_rate = int_or_none(track.get('SamplingRate')) + + track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern) + track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern) + + fragments = [] + fragment_ctx = { + 'time': 0, + } + stream_fragments = stream.findall('c') + for stream_fragment_index, stream_fragment in enumerate(stream_fragments): + fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time'] + fragment_repeat = int_or_none(stream_fragment.get('r')) or 1 + fragment_ctx['duration'] = int_or_none(stream_fragment.get('d')) + if not fragment_ctx['duration']: + try: + next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t']) + except IndexError: + next_fragment_time = duration + fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat + for _ in range(fragment_repeat): + fragments.append({ + 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern), + 'duration': fragment_ctx['duration'] / stream_timescale, + }) + fragment_ctx['time'] += fragment_ctx['duration'] + + format_id = [] + if ism_id: + format_id.append(ism_id) + if stream_name: + format_id.append(stream_name) + format_id.append(compat_str(tbr)) + + formats.append({ + 'format_id': '-'.join(format_id), + 'url': ism_url, + 'manifest_url': ism_url, + 'ext': 'ismv' if stream_type == 'video' else 'isma', + 'width': width, + 'height': height, + 'tbr': tbr, + 'asr': sampling_rate, + 'vcodec': 'none' if stream_type == 'audio' else fourcc, + 'acodec': 'none' if stream_type == 'video' else fourcc, + 'protocol': 'ism', + 'fragments': fragments, + '_download_params': { + 'duration': duration, + 'timescale': stream_timescale, + 'width': width or 0, + 'height': height or 0, + 'fourcc': fourcc, + 'codec_private_data': track.get('CodecPrivateData'), + 'sampling_rate': sampling_rate, + 'channels': int_or_none(track.get('Channels', 2)), + 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)), + 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)), + }, + }) + return formats + def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8'): def absolute_url(video_url): return compat_urlparse.urljoin(base_url, video_url) @@ -1807,7 +1912,11 @@ class InfoExtractor(object): return is_plain_url, formats entries = [] - for media_tag, media_type, media_content in re.findall(r'(?s)(<(?Pvideo|audio)[^>]*>)(.*?)', webpage): + media_tags = [(media_tag, media_type, '') + for media_tag, media_type + in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)] + media_tags.extend(re.findall(r'(?s)(<(?Pvideo|audio)[^>]*>)(.*?)', webpage)) + for media_tag, media_type, media_content in media_tags: media_info = { 'formats': [], 'subtitles': {}, @@ -1834,7 +1943,7 @@ class InfoExtractor(object): for track_tag in re.findall(r']+>', media_content): track_attributes = extract_attributes(track_tag) kind = track_attributes.get('kind') - if not kind or kind == 'subtitles': + if not kind or kind in ('subtitles', 'captions'): src = track_attributes.get('src') if not src: continue @@ -1842,16 +1951,21 @@ class InfoExtractor(object): media_info['subtitles'].setdefault(lang, []).append({ 'url': absolute_url(src), }) - if media_info['formats']: + if media_info['formats'] or media_info['subtitles']: entries.append(media_info) return entries def _extract_akamai_formats(self, manifest_url, video_id): formats = [] + hdcore_sign = 'hdcore=3.7.0' f4m_url = re.sub(r'(https?://.+?)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') - formats.extend(self._extract_f4m_formats( - update_url_query(f4m_url, {'hdcore': '3.7.0'}), - video_id, f4m_id='hds', fatal=False)) + if 'hdcore=' not in f4m_url: + f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign + f4m_formats = self._extract_f4m_formats( + f4m_url, video_id, f4m_id='hds', fatal=False) + for entry in f4m_formats: + entry.update({'extra_param_to_segment_url': hdcore_sign}) + formats.extend(f4m_formats) m3u8_url = re.sub(r'(https?://.+?)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', @@ -1871,11 +1985,11 @@ class InfoExtractor(object): formats.extend(self._extract_f4m_formats( http_base_url + '/manifest.f4m', video_id, f4m_id='hds', fatal=False)) + if 'dash' not in skip_protocols: + formats.extend(self._extract_mpd_formats( + http_base_url + '/manifest.mpd', + video_id, mpd_id='dash', fatal=False)) if re.search(r'(?:/smil:|\.smil)', url_base): - if 'dash' not in skip_protocols: - formats.extend(self._extract_mpd_formats( - http_base_url + '/manifest.mpd', - video_id, mpd_id='dash', fatal=False)) if 'smil' not in skip_protocols: rtmp_formats = self._extract_smil_formats( http_base_url + '/jwplayer.smil', @@ -2021,6 +2135,12 @@ class InfoExtractor(object): headers['Ytdl-request-proxy'] = geo_verification_proxy return headers + def _generic_id(self, url): + return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) + + def _generic_title(self, url): + return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) + class SearchInfoExtractor(InfoExtractor): """