X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=816baa424e2a9b8efc5f9ce0c27ff320cca77e74;hb=9558dcec9c7806c811f4fe8e7758977eaa01a702;hp=e53b7ad643e073f758bf5fcb3e1654b90c35c875;hpb=448bb5f333c6c4c8084e479e1035ff674e4f8fd4;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e53b7ad64..816baa424 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -44,7 +44,9 @@ from ..utils import ( sanitized_Request, unescapeHTML, unified_strdate, + unified_timestamp, url_basename, + xpath_element, xpath_text, xpath_with_ns, determine_protocol, @@ -52,6 +54,7 @@ from ..utils import ( mimetype2ext, update_Request, update_url_query, + parse_m3u8_attributes, ) @@ -159,6 +162,7 @@ class InfoExtractor(object): * "height" (optional, int) * "resolution" (optional, string "{width}x{height"}, deprecated) + * "filesize" (optional, int) thumbnail: Full URL to a video thumbnail image. description: Full video description. uploader: Full name of the video uploader. @@ -747,10 +751,12 @@ class InfoExtractor(object): return self._og_search_property('url', html, **kargs) def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): + if not isinstance(name, (list, tuple)): + name = [name] if display_name is None: - display_name = name + display_name = name[0] return self._html_search_regex( - self._meta_regex(name), + [self._meta_regex(n) for n in name], html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): @@ -799,15 +805,17 @@ class InfoExtractor(object): return self._html_search_meta('twitter:player', html, 'twitter card player') - def _search_json_ld(self, html, video_id, **kwargs): + def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): json_ld = self._search_regex( r'(?s)]+type=(["\'])application/ld\+json\1[^>]*>(?P.+?)', html, 'JSON-LD', group='json_ld', **kwargs) if not json_ld: return {} - return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True)) + return self._json_ld( + json_ld, video_id, fatal=kwargs.get('fatal', True), + expected_type=expected_type) - def _json_ld(self, json_ld, video_id, fatal=True): + def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): if isinstance(json_ld, compat_str): json_ld = self._parse_json(json_ld, video_id, fatal=fatal) if not json_ld: @@ -815,6 +823,8 @@ class InfoExtractor(object): info = {} if json_ld.get('@context') == 'http://schema.org': item_type = json_ld.get('@type') + if expected_type is not None and expected_type != item_type: + return info if item_type == 'TVEpisode': info.update({ 'episode': unescapeHTML(json_ld.get('name')), @@ -833,6 +843,19 @@ class InfoExtractor(object): 'title': unescapeHTML(json_ld.get('headline')), 'description': unescapeHTML(json_ld.get('articleBody')), }) + elif item_type == 'VideoObject': + info.update({ + 'url': json_ld.get('contentUrl'), + 'title': unescapeHTML(json_ld.get('name')), + 'description': unescapeHTML(json_ld.get('description')), + 'thumbnail': json_ld.get('thumbnailUrl'), + 'duration': parse_duration(json_ld.get('duration')), + 'timestamp': unified_timestamp(json_ld.get('uploadDate')), + 'filesize': float_or_none(json_ld.get('contentSize')), + 'tbr': int_or_none(json_ld.get('bitrate')), + 'width': int_or_none(json_ld.get('width')), + 'height': int_or_none(json_ld.get('height')), + }) return dict((k, v) for k, v in info.items() if v is not None) @staticmethod @@ -874,7 +897,11 @@ class InfoExtractor(object): f['ext'] = determine_ext(f['url']) if isinstance(field_preference, (list, tuple)): - return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference) + return tuple( + f.get(field) + if f.get(field) is not None + else ('' if field == 'format_id' else -1) + for field in field_preference) preference = f.get('preference') if preference is None: @@ -1030,11 +1057,15 @@ class InfoExtractor(object): if base_url: base_url = base_url.strip() - bootstrap_info = xpath_text( + bootstrap_info = xpath_element( manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'], 'bootstrap info', default=None) for i, media_el in enumerate(media_nodes): + tbr = int_or_none(media_el.attrib.get('bitrate')) + width = int_or_none(media_el.attrib.get('width')) + height = int_or_none(media_el.attrib.get('height')) + format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])) # If is present, the specified f4m is a # stream-level manifest, and only set-level manifests may refer to # external resources. See section 11.4 and section 4 of F4M spec @@ -1056,23 +1087,35 @@ class InfoExtractor(object): # bitrate in f4m downloader ext = determine_ext(manifest_url) if ext == 'f4m': - formats.extend(self._extract_f4m_formats( + f4m_formats = self._extract_f4m_formats( manifest_url, video_id, preference=preference, f4m_id=f4m_id, - transform_source=transform_source, fatal=fatal)) + transform_source=transform_source, fatal=fatal) + # Sometimes stream-level manifest contains single media entry that + # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player). + # At the same time parent's media entry in set-level manifest may + # contain it. We will copy it from parent in such cases. + if len(f4m_formats) == 1: + f = f4m_formats[0] + f.update({ + 'tbr': f.get('tbr') or tbr, + 'width': f.get('width') or width, + 'height': f.get('height') or height, + 'format_id': f.get('format_id') if not tbr else format_id, + }) + formats.extend(f4m_formats) continue elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( manifest_url, video_id, 'mp4', preference=preference, - m3u8_id=m3u8_id, fatal=False)) + m3u8_id=m3u8_id, fatal=fatal)) continue - tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ - 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])), + 'format_id': format_id, 'url': manifest_url, - 'ext': 'flv' if bootstrap_info else None, + 'ext': 'flv' if bootstrap_info is not None else None, 'tbr': tbr, - 'width': int_or_none(media_el.attrib.get('width')), - 'height': int_or_none(media_el.attrib.get('height')), + 'width': width, + 'height': height, 'preference': preference, }) return formats @@ -1133,23 +1176,11 @@ class InfoExtractor(object): }] last_info = None last_media = None - kv_rex = re.compile( - r'(?P[a-zA-Z_-]+)=(?P"[^"]+"|[^",]+)(?:,|$)') for line in m3u8_doc.splitlines(): if line.startswith('#EXT-X-STREAM-INF:'): - last_info = {} - for m in kv_rex.finditer(line): - v = m.group('val') - if v.startswith('"'): - v = v[1:-1] - last_info[m.group('key')] = v + last_info = parse_m3u8_attributes(line) elif line.startswith('#EXT-X-MEDIA:'): - last_media = {} - for m in kv_rex.finditer(line): - v = m.group('val') - if v.startswith('"'): - v = v[1:-1] - last_media[m.group('key')] = v + last_media = parse_m3u8_attributes(line) elif line.startswith('#') or not line.strip(): continue else: @@ -1717,6 +1748,13 @@ class InfoExtractor(object): def _mark_watched(self, *args, **kwargs): raise NotImplementedError('This method must be implemented by subclasses') + def geo_verification_headers(self): + headers = {} + geo_verification_proxy = self._downloader.params.get('geo_verification_proxy') + if geo_verification_proxy: + headers['Ytdl-request-proxy'] = geo_verification_proxy + return headers + class SearchInfoExtractor(InfoExtractor): """