X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=70909fc1ccd1c272e5203decc2e5616984e51e93;hb=a7e5f274123dcfe9afd53134a33f83d53d1e497e;hp=bfd4321605a906862987063b243703445136dc5e;hpb=a6571f1073eab6c9a4cc9800a0bff31cf12fe09f;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index bfd432160..70909fc1c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -44,6 +44,7 @@ from ..utils import ( sanitized_Request, unescapeHTML, unified_strdate, + unified_timestamp, url_basename, xpath_element, xpath_text, @@ -53,6 +54,9 @@ from ..utils import ( mimetype2ext, update_Request, update_url_query, + parse_m3u8_attributes, + extract_attributes, + parse_codecs, ) @@ -160,6 +164,7 @@ class InfoExtractor(object): * "height" (optional, int) * "resolution" (optional, string "{width}x{height"}, deprecated) + * "filesize" (optional, int) thumbnail: Full URL to a video thumbnail image. description: Full video description. uploader: Full name of the video uploader. @@ -722,9 +727,14 @@ class InfoExtractor(object): [^>]+?content=(["\'])(?P.*?)\2''' % re.escape(prop) def _og_search_property(self, prop, html, name=None, **kargs): + if not isinstance(prop, (list, tuple)): + prop = [prop] if name is None: - name = 'OpenGraph %s' % prop - escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs) + name = 'OpenGraph %s' % prop[0] + og_regexes = [] + for p in prop: + og_regexes.extend(self._og_regexes(p)) + escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs) if escaped is None: return None return unescapeHTML(escaped) @@ -748,10 +758,12 @@ class InfoExtractor(object): return self._og_search_property('url', html, **kargs) def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): + if not isinstance(name, (list, tuple)): + name = [name] if display_name is None: - display_name = name + display_name = name[0] return self._html_search_regex( - self._meta_regex(name), + [self._meta_regex(n) for n in name], html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): @@ -800,40 +812,63 @@ class InfoExtractor(object): return self._html_search_meta('twitter:player', html, 'twitter card player') - def _search_json_ld(self, html, video_id, **kwargs): + def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): json_ld = self._search_regex( r'(?s)]+type=(["\'])application/ld\+json\1[^>]*>(?P.+?)', html, 'JSON-LD', group='json_ld', **kwargs) if not json_ld: return {} - return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True)) + return self._json_ld( + json_ld, video_id, fatal=kwargs.get('fatal', True), + expected_type=expected_type) - def _json_ld(self, json_ld, video_id, fatal=True): + def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): if isinstance(json_ld, compat_str): json_ld = self._parse_json(json_ld, video_id, fatal=fatal) if not json_ld: return {} info = {} - if json_ld.get('@context') == 'http://schema.org': - item_type = json_ld.get('@type') - if item_type == 'TVEpisode': - info.update({ - 'episode': unescapeHTML(json_ld.get('name')), - 'episode_number': int_or_none(json_ld.get('episodeNumber')), - 'description': unescapeHTML(json_ld.get('description')), - }) - part_of_season = json_ld.get('partOfSeason') - if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': - info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) - part_of_series = json_ld.get('partOfSeries') - if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': - info['series'] = unescapeHTML(part_of_series.get('name')) - elif item_type == 'Article': - info.update({ - 'timestamp': parse_iso8601(json_ld.get('datePublished')), - 'title': unescapeHTML(json_ld.get('headline')), - 'description': unescapeHTML(json_ld.get('articleBody')), - }) + if not isinstance(json_ld, (list, tuple, dict)): + return info + if isinstance(json_ld, dict): + json_ld = [json_ld] + for e in json_ld: + if e.get('@context') == 'http://schema.org': + item_type = e.get('@type') + if expected_type is not None and expected_type != item_type: + return info + if item_type == 'TVEpisode': + info.update({ + 'episode': unescapeHTML(e.get('name')), + 'episode_number': int_or_none(e.get('episodeNumber')), + 'description': unescapeHTML(e.get('description')), + }) + part_of_season = e.get('partOfSeason') + if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': + info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) + part_of_series = e.get('partOfSeries') + if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': + info['series'] = unescapeHTML(part_of_series.get('name')) + elif item_type == 'Article': + info.update({ + 'timestamp': parse_iso8601(e.get('datePublished')), + 'title': unescapeHTML(e.get('headline')), + 'description': unescapeHTML(e.get('articleBody')), + }) + elif item_type == 'VideoObject': + info.update({ + 'url': e.get('contentUrl'), + 'title': unescapeHTML(e.get('name')), + 'description': unescapeHTML(e.get('description')), + 'thumbnail': e.get('thumbnailUrl'), + 'duration': parse_duration(e.get('duration')), + 'timestamp': unified_timestamp(e.get('uploadDate')), + 'filesize': float_or_none(e.get('contentSize')), + 'tbr': int_or_none(e.get('bitrate')), + 'width': int_or_none(e.get('width')), + 'height': int_or_none(e.get('height')), + }) + break return dict((k, v) for k, v in info.items() if v is not None) @staticmethod @@ -875,7 +910,11 @@ class InfoExtractor(object): f['ext'] = determine_ext(f['url']) if isinstance(field_preference, (list, tuple)): - return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference) + return tuple( + f.get(field) + if f.get(field) is not None + else ('' if field == 'format_id' else -1) + for field in field_preference) preference = f.get('preference') if preference is None: @@ -883,7 +922,8 @@ class InfoExtractor(object): if f.get('ext') in ['f4f', 'f4m']: # Not yet supported preference -= 0.5 - proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1 + protocol = f.get('protocol') or determine_protocol(f) + proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1) if f.get('vcodec') == 'none': # audio only preference -= 50 @@ -1150,23 +1190,11 @@ class InfoExtractor(object): }] last_info = None last_media = None - kv_rex = re.compile( - r'(?P[a-zA-Z_-]+)=(?P"[^"]+"|[^",]+)(?:,|$)') for line in m3u8_doc.splitlines(): if line.startswith('#EXT-X-STREAM-INF:'): - last_info = {} - for m in kv_rex.finditer(line): - v = m.group('val') - if v.startswith('"'): - v = v[1:-1] - last_info[m.group('key')] = v + last_info = parse_m3u8_attributes(line) elif line.startswith('#EXT-X-MEDIA:'): - last_media = {} - for m in kv_rex.finditer(line): - v = m.group('val') - if v.startswith('"'): - v = v[1:-1] - last_media[m.group('key')] = v + last_media = parse_m3u8_attributes(line) elif line.startswith('#') or not line.strip(): continue else: @@ -1191,6 +1219,7 @@ class InfoExtractor(object): 'url': format_url(line.strip()), 'tbr': tbr, 'ext': ext, + 'fps': float_or_none(last_info.get('FRAME-RATE')), 'protocol': entry_protocol, 'preference': preference, } @@ -1199,24 +1228,17 @@ class InfoExtractor(object): width_str, height_str = resolution.split('x') f['width'] = int(width_str) f['height'] = int(height_str) - codecs = last_info.get('CODECS') - if codecs: - vcodec, acodec = [None] * 2 - va_codecs = codecs.split(',') - if len(va_codecs) == 1: - # Audio only entries usually come with single codec and - # no resolution. For more robustness we also check it to - # be mp4 audio. - if not resolution and va_codecs[0].startswith('mp4a'): - vcodec, acodec = 'none', va_codecs[0] - else: - vcodec = va_codecs[0] - else: - vcodec, acodec = va_codecs[:2] + # Unified Streaming Platform + mobj = re.search( + r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url']) + if mobj: + abr, vbr = mobj.groups() + abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000) f.update({ - 'acodec': acodec, - 'vcodec': vcodec, + 'vbr': vbr, + 'abr': abr, }) + f.update(parse_codecs(last_info.get('CODECS'))) if last_media is not None: f['m3u8_media'] = last_media last_media = None @@ -1471,6 +1493,13 @@ class InfoExtractor(object): compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict) def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}): + """ + Parse formats from MPD manifest. + References: + 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E), + http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip + 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP + """ if mpd_doc.get('type') == 'dynamic': return [] @@ -1503,8 +1532,16 @@ class InfoExtractor(object): s_e = segment_timeline.findall(_add_ns('S')) if s_e: ms_info['total_number'] = 0 + ms_info['s'] = [] for s in s_e: - ms_info['total_number'] += 1 + int(s.get('r', '0')) + r = int(s.get('r', 0)) + ms_info['total_number'] += 1 + r + ms_info['s'].append({ + 't': int(s.get('t', 0)), + # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60]) + 'd': int(s.attrib['d']), + 'r': r, + }) else: timescale = segment_template.get('timescale') if timescale: @@ -1541,7 +1578,7 @@ class InfoExtractor(object): continue representation_attrib = adaptation_set.attrib.copy() representation_attrib.update(representation.attrib) - # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory + # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory mime_type = representation_attrib['mimeType'] content_type = mime_type.split('/')[0] if content_type == 'text': @@ -1585,16 +1622,40 @@ class InfoExtractor(object): representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) media_template = representation_ms_info['media_template'] media_template = media_template.replace('$RepresentationID$', representation_id) - media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template) - media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template) + media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template) + media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template) media_template.replace('$$', '$') - representation_ms_info['segment_urls'] = [ - media_template % { - 'Number': segment_number, - 'Bandwidth': representation_attrib.get('bandwidth')} - for segment_number in range( - representation_ms_info['start_number'], - representation_ms_info['total_number'] + representation_ms_info['start_number'])] + + # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ + # can't be used at the same time + if '%(Number' in media_template: + representation_ms_info['segment_urls'] = [ + media_template % { + 'Number': segment_number, + 'Bandwidth': representation_attrib.get('bandwidth'), + } + for segment_number in range( + representation_ms_info['start_number'], + representation_ms_info['total_number'] + representation_ms_info['start_number'])] + else: + representation_ms_info['segment_urls'] = [] + segment_time = 0 + + def add_segment_url(): + representation_ms_info['segment_urls'].append( + media_template % { + 'Time': segment_time, + 'Bandwidth': representation_attrib.get('bandwidth'), + } + ) + + for num, s in enumerate(representation_ms_info['s']): + segment_time = s.get('t') or segment_time + add_segment_url() + for r in range(s.get('r', 0)): + segment_time += s['d'] + add_segment_url() + segment_time += s['d'] if 'segment_urls' in representation_ms_info: f.update({ 'segment_urls': representation_ms_info['segment_urls'], @@ -1621,6 +1682,62 @@ class InfoExtractor(object): self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats + def _parse_html5_media_entries(self, base_url, webpage): + def absolute_url(video_url): + return compat_urlparse.urljoin(base_url, video_url) + + def parse_content_type(content_type): + if not content_type: + return {} + ctr = re.search(r'(?P[^/]+/[^;]+)(?:;\s*codecs="?(?P[^"]+))?', content_type) + if ctr: + mimetype, codecs = ctr.groups() + f = parse_codecs(codecs) + f['ext'] = mimetype2ext(mimetype) + return f + return {} + + entries = [] + for media_tag, media_type, media_content in re.findall(r'(?s)(<(?Pvideo|audio)[^>]*>)(.*?)', webpage): + media_info = { + 'formats': [], + 'subtitles': {}, + } + media_attributes = extract_attributes(media_tag) + src = media_attributes.get('src') + if src: + media_info['formats'].append({ + 'url': absolute_url(src), + 'vcodec': 'none' if media_type == 'audio' else None, + }) + media_info['thumbnail'] = media_attributes.get('poster') + if media_content: + for source_tag in re.findall(r']+>', media_content): + source_attributes = extract_attributes(source_tag) + src = source_attributes.get('src') + if not src: + continue + f = parse_content_type(source_attributes.get('type')) + f.update({ + 'url': absolute_url(src), + 'vcodec': 'none' if media_type == 'audio' else None, + }) + media_info['formats'].append(f) + for track_tag in re.findall(r']+>', media_content): + track_attributes = extract_attributes(track_tag) + kind = track_attributes.get('kind') + if not kind or kind == 'subtitles': + src = track_attributes.get('src') + if not src: + continue + lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label') + media_info['subtitles'].setdefault(lang, []).append({ + 'url': absolute_url(src), + }) + if media_info['formats']: + entries.append(media_info) + return entries + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() @@ -1681,7 +1798,7 @@ class InfoExtractor(object): any_restricted = False for tc in self.get_testcases(include_onlymatching=False): - if 'playlist' in tc: + if tc.get('playlist', []): tc = tc['playlist'][0] is_restricted = age_restricted( tc.get('info_dict', {}).get('age_limit'), age_limit) @@ -1734,6 +1851,13 @@ class InfoExtractor(object): def _mark_watched(self, *args, **kwargs): raise NotImplementedError('This method must be implemented by subclasses') + def geo_verification_headers(self): + headers = {} + geo_verification_proxy = self._downloader.params.get('geo_verification_proxy') + if geo_verification_proxy: + headers['Ytdl-request-proxy'] = geo_verification_proxy + return headers + class SearchInfoExtractor(InfoExtractor): """