X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=6fa7c334ea9e7ebafbb7805768e55e4f43ca00df;hb=4606c34e19af395a1ddd31d2941d4ccd90e5e279;hp=2e9f05ae3b36583f721a045cdb16aa1a3e8f849e;hpb=b2758123c5e759fdb0c7d23d380e4dd9e245cd4a;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 2e9f05ae3..6fa7c334e 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -30,6 +30,7 @@ from ..downloader.f4m import remove_encrypted_media from ..utils import ( NO_DEFAULT, age_restricted, + base_url, bug_reports_message, clean_html, compiled_regex_type, @@ -58,6 +59,7 @@ from ..utils import ( parse_m3u8_attributes, extract_attributes, parse_codecs, + urljoin, ) @@ -187,9 +189,10 @@ class InfoExtractor(object): uploader_url: Full URL to a personal webpage of the video uploader. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format - {language: subformats}. "subformats" is a list sorted from - lower to higher preference, each element is a dictionary - with the "ext" entry and one of: + {tag: subformats}. "tag" is usually a language code, and + "subformats" is a list sorted from lower to higher + preference, each element is a dictionary with the "ext" + entry and one of: * "data": The subtitles file contents * "url": A URL pointing to the subtitles file "ext" will be calculated from URL if missing @@ -885,7 +888,7 @@ class InfoExtractor(object): 'url': e.get('contentUrl'), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), - 'thumbnail': e.get('thumbnailUrl'), + 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), 'filesize': float_or_none(e.get('contentSize')), @@ -1223,6 +1226,7 @@ class InfoExtractor(object): 'protocol': entry_protocol, 'preference': preference, }] + audio_in_video_stream = {} last_info = {} last_media = {} for line in m3u8_doc.splitlines(): @@ -1232,25 +1236,32 @@ class InfoExtractor(object): media = parse_m3u8_attributes(line) media_type = media.get('TYPE') if media_type in ('VIDEO', 'AUDIO'): + group_id = media.get('GROUP-ID') media_url = media.get('URI') if media_url: format_id = [] - for v in (media.get('GROUP-ID'), media.get('NAME')): + for v in (group_id, media.get('NAME')): if v: format_id.append(v) - formats.append({ + f = { 'format_id': '-'.join(format_id), 'url': format_url(media_url), 'language': media.get('LANGUAGE'), - 'vcodec': 'none' if media_type == 'AUDIO' else None, 'ext': ext, 'protocol': entry_protocol, 'preference': preference, - }) + } + if media_type == 'AUDIO': + f['vcodec'] = 'none' + if group_id and not audio_in_video_stream.get(group_id): + audio_in_video_stream[group_id] = False + formats.append(f) else: # When there is no URI in EXT-X-MEDIA let this tag's # data be used by regular URI lines below last_media = media + if media_type == 'AUDIO' and group_id: + audio_in_video_stream[group_id] = True elif line.startswith('#') or not line.strip(): continue else: @@ -1279,9 +1290,10 @@ class InfoExtractor(object): } resolution = last_info.get('RESOLUTION') if resolution: - width_str, height_str = resolution.split('x') - f['width'] = int(width_str) - f['height'] = int(height_str) + mobj = re.search(r'(?P\d+)[xX](?P\d+)', resolution) + if mobj: + f['width'] = int(mobj.group('width')) + f['height'] = int(mobj.group('height')) # Unified Streaming Platform mobj = re.search( r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url']) @@ -1293,6 +1305,9 @@ class InfoExtractor(object): 'abr': abr, }) f.update(parse_codecs(last_info.get('CODECS'))) + if audio_in_video_stream.get(last_info.get('AUDIO')) is False: + # TODO: update acodec for for audio only formats with the same GROUP-ID + f['acodec'] = 'none' formats.append(f) last_info = {} last_media = {} @@ -1539,7 +1554,7 @@ class InfoExtractor(object): if res is False: return [] mpd, urlh = res - mpd_base_url = re.match(r'https?://[^?#&]+/', urlh.geturl()).group() + mpd_base_url = base_url(urlh.geturl()) return self._parse_mpd_formats( compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, @@ -1622,11 +1637,6 @@ class InfoExtractor(object): extract_Initialization(segment_template) return ms_info - def combine_url(base_url, target_url): - if re.match(r'^https?://', target_url): - return target_url - return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url) - mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) formats = [] for period in mpd_doc.findall(_add_ns('Period')): @@ -1676,12 +1686,11 @@ class InfoExtractor(object): 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000), 'asr': int_or_none(representation_attrib.get('audioSamplingRate')), 'fps': int_or_none(representation_attrib.get('frameRate')), - 'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'), - 'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'), 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, 'format_note': 'DASH %s' % content_type, 'filesize': filesize, } + f.update(parse_codecs(representation_attrib.get('codecs'))) representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info: @@ -1701,7 +1710,7 @@ class InfoExtractor(object): representation_ms_info['fragments'] = [{ 'url': media_template % { 'Number': segment_number, - 'Bandwidth': representation_attrib.get('bandwidth'), + 'Bandwidth': int_or_none(representation_attrib.get('bandwidth')), }, 'duration': segment_duration, } for segment_number in range( @@ -1719,7 +1728,7 @@ class InfoExtractor(object): def add_segment_url(): segment_url = media_template % { 'Time': segment_time, - 'Bandwidth': representation_attrib.get('bandwidth'), + 'Bandwidth': int_or_none(representation_attrib.get('bandwidth')), 'Number': segment_number, } representation_ms_info['fragments'].append({ @@ -1765,7 +1774,7 @@ class InfoExtractor(object): f['fragments'].append({'url': initialization_url}) f['fragments'].extend(representation_ms_info['fragments']) for fragment in f['fragments']: - fragment['url'] = combine_url(base_url, fragment['url']) + fragment['url'] = urljoin(base_url, fragment['url']) try: existing_format = next( fo for fo in formats @@ -1797,8 +1806,6 @@ class InfoExtractor(object): if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None: return [] - ism_base_url = re.match(r'https?://.+/', ism_url).group() - duration = int(ism_doc.attrib['Duration']) timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000 @@ -1838,10 +1845,10 @@ class InfoExtractor(object): next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t']) except IndexError: next_fragment_time = duration - fragment_ctx['duration'] = (next_fragment_time - frgament_time) / fragment_repeat + fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat for _ in range(fragment_repeat): fragments.append({ - 'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern), + 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern), 'duration': fragment_ctx['duration'] / stream_timescale, }) fragment_ctx['time'] += fragment_ctx['duration'] @@ -1881,7 +1888,7 @@ class InfoExtractor(object): }) return formats - def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8'): + def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None): def absolute_url(video_url): return compat_urlparse.urljoin(base_url, video_url) @@ -1898,11 +1905,16 @@ class InfoExtractor(object): def _media_formats(src, cur_media_type): full_url = absolute_url(src) - if determine_ext(full_url) == 'm3u8': + ext = determine_ext(full_url) + if ext == 'm3u8': is_plain_url = False formats = self._extract_m3u8_formats( full_url, video_id, ext='mp4', entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id) + elif ext == 'mpd': + is_plain_url = False + formats = self._extract_mpd_formats( + full_url, video_id, mpd_id=mpd_id) else: is_plain_url = True formats = [{