X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=dce8c7d0d5ad389aa84bf84f25731a2e680e91e3;hb=4201ba13e674788c36ae69fbfbffc4b246717d6a;hp=415dc84c82d537fad9493faf38d5348143a7e521;hpb=0384932e3d3566835014f8f48edd47d2ccfa4ea9;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 415dc84c8..dce8c7d0d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -30,6 +30,7 @@ from ..downloader.f4m import remove_encrypted_media from ..utils import ( NO_DEFAULT, age_restricted, + base_url, bug_reports_message, clean_html, compiled_regex_type, @@ -58,6 +59,7 @@ from ..utils import ( parse_m3u8_attributes, extract_attributes, parse_codecs, + urljoin, ) @@ -187,9 +189,10 @@ class InfoExtractor(object): uploader_url: Full URL to a personal webpage of the video uploader. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format - {language: subformats}. "subformats" is a list sorted from - lower to higher preference, each element is a dictionary - with the "ext" entry and one of: + {tag: subformats}. "tag" is usually a language code, and + "subformats" is a list sorted from lower to higher + preference, each element is a dictionary with the "ext" + entry and one of: * "data": The subtitles file contents * "url": A URL pointing to the subtitles file "ext" will be calculated from URL if missing @@ -885,7 +888,7 @@ class InfoExtractor(object): 'url': e.get('contentUrl'), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), - 'thumbnail': e.get('thumbnailUrl'), + 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), 'filesize': float_or_none(e.get('contentSize')), @@ -1223,6 +1226,7 @@ class InfoExtractor(object): 'protocol': entry_protocol, 'preference': preference, }] + audio_in_video_stream = {} last_info = {} last_media = {} for line in m3u8_doc.splitlines(): @@ -1232,25 +1236,32 @@ class InfoExtractor(object): media = parse_m3u8_attributes(line) media_type = media.get('TYPE') if media_type in ('VIDEO', 'AUDIO'): + group_id = media.get('GROUP-ID') media_url = media.get('URI') if media_url: format_id = [] - for v in (media.get('GROUP-ID'), media.get('NAME')): + for v in (group_id, media.get('NAME')): if v: format_id.append(v) - formats.append({ + f = { 'format_id': '-'.join(format_id), 'url': format_url(media_url), 'language': media.get('LANGUAGE'), - 'vcodec': 'none' if media_type == 'AUDIO' else None, 'ext': ext, 'protocol': entry_protocol, 'preference': preference, - }) + } + if media_type == 'AUDIO': + f['vcodec'] = 'none' + if group_id and not audio_in_video_stream.get(group_id): + audio_in_video_stream[group_id] = False + formats.append(f) else: # When there is no URI in EXT-X-MEDIA let this tag's # data be used by regular URI lines below last_media = media + if media_type == 'AUDIO' and group_id: + audio_in_video_stream[group_id] = True elif line.startswith('#') or not line.strip(): continue else: @@ -1279,9 +1290,10 @@ class InfoExtractor(object): } resolution = last_info.get('RESOLUTION') if resolution: - width_str, height_str = resolution.split('x') - f['width'] = int(width_str) - f['height'] = int(height_str) + mobj = re.search(r'(?P\d+)[xX](?P\d+)', resolution) + if mobj: + f['width'] = int(mobj.group('width')) + f['height'] = int(mobj.group('height')) # Unified Streaming Platform mobj = re.search( r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url']) @@ -1293,6 +1305,9 @@ class InfoExtractor(object): 'abr': abr, }) f.update(parse_codecs(last_info.get('CODECS'))) + if audio_in_video_stream.get(last_info.get('AUDIO')) is False: + # TODO: update acodec for for audio only formats with the same GROUP-ID + f['acodec'] = 'none' formats.append(f) last_info = {} last_media = {} @@ -1539,7 +1554,7 @@ class InfoExtractor(object): if res is False: return [] mpd, urlh = res - mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group() + mpd_base_url = base_url(urlh.geturl()) return self._parse_mpd_formats( compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, @@ -1622,11 +1637,6 @@ class InfoExtractor(object): extract_Initialization(segment_template) return ms_info - def combine_url(base_url, target_url): - if re.match(r'^https?://', target_url): - return target_url - return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url) - mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) formats = [] for period in mpd_doc.findall(_add_ns('Period')): @@ -1676,12 +1686,11 @@ class InfoExtractor(object): 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000), 'asr': int_or_none(representation_attrib.get('audioSamplingRate')), 'fps': int_or_none(representation_attrib.get('frameRate')), - 'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'), - 'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'), 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, 'format_note': 'DASH %s' % content_type, 'filesize': filesize, } + f.update(parse_codecs(representation_attrib.get('codecs'))) representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info: @@ -1701,7 +1710,7 @@ class InfoExtractor(object): representation_ms_info['fragments'] = [{ 'url': media_template % { 'Number': segment_number, - 'Bandwidth': representation_attrib.get('bandwidth'), + 'Bandwidth': int_or_none(representation_attrib.get('bandwidth')), }, 'duration': segment_duration, } for segment_number in range( @@ -1719,7 +1728,7 @@ class InfoExtractor(object): def add_segment_url(): segment_url = media_template % { 'Time': segment_time, - 'Bandwidth': representation_attrib.get('bandwidth'), + 'Bandwidth': int_or_none(representation_attrib.get('bandwidth')), 'Number': segment_number, } representation_ms_info['fragments'].append({ @@ -1765,7 +1774,7 @@ class InfoExtractor(object): f['fragments'].append({'url': initialization_url}) f['fragments'].extend(representation_ms_info['fragments']) for fragment in f['fragments']: - fragment['url'] = combine_url(base_url, fragment['url']) + fragment['url'] = urljoin(base_url, fragment['url']) try: existing_format = next( fo for fo in formats @@ -1780,7 +1789,106 @@ class InfoExtractor(object): self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats - def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8'): + def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True): + res = self._download_webpage_handle( + ism_url, video_id, + note=note or 'Downloading ISM manifest', + errnote=errnote or 'Failed to download ISM manifest', + fatal=fatal) + if res is False: + return [] + ism, urlh = res + + return self._parse_ism_formats( + compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id) + + def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): + if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None: + return [] + + duration = int(ism_doc.attrib['Duration']) + timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000 + + formats = [] + for stream in ism_doc.findall('StreamIndex'): + stream_type = stream.get('Type') + if stream_type not in ('video', 'audio'): + continue + url_pattern = stream.attrib['Url'] + stream_timescale = int_or_none(stream.get('TimeScale')) or timescale + stream_name = stream.get('Name') + for track in stream.findall('QualityLevel'): + fourcc = track.get('FourCC') + # TODO: add support for WVC1 and WMAP + if fourcc not in ('H264', 'AVC1', 'AACL'): + self.report_warning('%s is not a supported codec' % fourcc) + continue + tbr = int(track.attrib['Bitrate']) // 1000 + width = int_or_none(track.get('MaxWidth')) + height = int_or_none(track.get('MaxHeight')) + sampling_rate = int_or_none(track.get('SamplingRate')) + + track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern) + track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern) + + fragments = [] + fragment_ctx = { + 'time': 0, + } + stream_fragments = stream.findall('c') + for stream_fragment_index, stream_fragment in enumerate(stream_fragments): + fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time'] + fragment_repeat = int_or_none(stream_fragment.get('r')) or 1 + fragment_ctx['duration'] = int_or_none(stream_fragment.get('d')) + if not fragment_ctx['duration']: + try: + next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t']) + except IndexError: + next_fragment_time = duration + fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat + for _ in range(fragment_repeat): + fragments.append({ + 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern), + 'duration': fragment_ctx['duration'] / stream_timescale, + }) + fragment_ctx['time'] += fragment_ctx['duration'] + + format_id = [] + if ism_id: + format_id.append(ism_id) + if stream_name: + format_id.append(stream_name) + format_id.append(compat_str(tbr)) + + formats.append({ + 'format_id': '-'.join(format_id), + 'url': ism_url, + 'manifest_url': ism_url, + 'ext': 'ismv' if stream_type == 'video' else 'isma', + 'width': width, + 'height': height, + 'tbr': tbr, + 'asr': sampling_rate, + 'vcodec': 'none' if stream_type == 'audio' else fourcc, + 'acodec': 'none' if stream_type == 'video' else fourcc, + 'protocol': 'ism', + 'fragments': fragments, + '_download_params': { + 'duration': duration, + 'timescale': stream_timescale, + 'width': width or 0, + 'height': height or 0, + 'fourcc': fourcc, + 'codec_private_data': track.get('CodecPrivateData'), + 'sampling_rate': sampling_rate, + 'channels': int_or_none(track.get('Channels', 2)), + 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)), + 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)), + }, + }) + return formats + + def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None): def absolute_url(video_url): return compat_urlparse.urljoin(base_url, video_url) @@ -1797,11 +1905,16 @@ class InfoExtractor(object): def _media_formats(src, cur_media_type): full_url = absolute_url(src) - if determine_ext(full_url) == 'm3u8': + ext = determine_ext(full_url) + if ext == 'm3u8': is_plain_url = False formats = self._extract_m3u8_formats( full_url, video_id, ext='mp4', entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id) + elif ext == 'mpd': + is_plain_url = False + formats = self._extract_mpd_formats( + full_url, video_id, mpd_id=mpd_id) else: is_plain_url = True formats = [{ @@ -1854,10 +1967,13 @@ class InfoExtractor(object): entries.append(media_info) return entries - def _extract_akamai_formats(self, manifest_url, video_id): + def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): formats = [] hdcore_sign = 'hdcore=3.7.0' - f4m_url = re.sub(r'(https?://.+?)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') + f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') + hds_host = hosts.get('hds') + if hds_host: + f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url) if 'hdcore=' not in f4m_url: f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign f4m_formats = self._extract_f4m_formats( @@ -1865,7 +1981,10 @@ class InfoExtractor(object): for entry in f4m_formats: entry.update({'extra_param_to_segment_url': hdcore_sign}) formats.extend(f4m_formats) - m3u8_url = re.sub(r'(https?://.+?)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') + m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') + hls_host = hosts.get('hls') + if hls_host: + m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url) formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))