X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=df546da2736c441428e941f845853f0205ce107a;hb=0b68de3cc1f99ce8c49a497245c02d4d03201aa8;hp=4bfa610c168ce354c681220d89600a8e2b229143;hpb=7b2fcbfd4ea34e6d29484f5987a36665117aefaa;p=youtube-dl
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 4bfa610c1..df546da27 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -44,7 +44,9 @@ from ..utils import (
sanitized_Request,
unescapeHTML,
unified_strdate,
+ unified_timestamp,
url_basename,
+ xpath_element,
xpath_text,
xpath_with_ns,
determine_protocol,
@@ -52,6 +54,9 @@ from ..utils import (
mimetype2ext,
update_Request,
update_url_query,
+ parse_m3u8_attributes,
+ extract_attributes,
+ parse_codecs,
)
@@ -159,6 +164,7 @@ class InfoExtractor(object):
* "height" (optional, int)
* "resolution" (optional, string "{width}x{height"},
deprecated)
+ * "filesize" (optional, int)
thumbnail: Full URL to a video thumbnail image.
description: Full video description.
uploader: Full name of the video uploader.
@@ -747,10 +753,12 @@ class InfoExtractor(object):
return self._og_search_property('url', html, **kargs)
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
+ if not isinstance(name, (list, tuple)):
+ name = [name]
if display_name is None:
- display_name = name
+ display_name = name[0]
return self._html_search_regex(
- self._meta_regex(name),
+ [self._meta_regex(n) for n in name],
html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
@@ -799,15 +807,17 @@ class InfoExtractor(object):
return self._html_search_meta('twitter:player', html,
'twitter card player')
- def _search_json_ld(self, html, video_id, **kwargs):
+ def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
json_ld = self._search_regex(
r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
html, 'JSON-LD', group='json_ld', **kwargs)
if not json_ld:
return {}
- return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
+ return self._json_ld(
+ json_ld, video_id, fatal=kwargs.get('fatal', True),
+ expected_type=expected_type)
- def _json_ld(self, json_ld, video_id, fatal=True):
+ def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
if isinstance(json_ld, compat_str):
json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
if not json_ld:
@@ -815,6 +825,8 @@ class InfoExtractor(object):
info = {}
if json_ld.get('@context') == 'http://schema.org':
item_type = json_ld.get('@type')
+ if expected_type is not None and expected_type != item_type:
+ return info
if item_type == 'TVEpisode':
info.update({
'episode': unescapeHTML(json_ld.get('name')),
@@ -833,6 +845,19 @@ class InfoExtractor(object):
'title': unescapeHTML(json_ld.get('headline')),
'description': unescapeHTML(json_ld.get('articleBody')),
})
+ elif item_type == 'VideoObject':
+ info.update({
+ 'url': json_ld.get('contentUrl'),
+ 'title': unescapeHTML(json_ld.get('name')),
+ 'description': unescapeHTML(json_ld.get('description')),
+ 'thumbnail': json_ld.get('thumbnailUrl'),
+ 'duration': parse_duration(json_ld.get('duration')),
+ 'timestamp': unified_timestamp(json_ld.get('uploadDate')),
+ 'filesize': float_or_none(json_ld.get('contentSize')),
+ 'tbr': int_or_none(json_ld.get('bitrate')),
+ 'width': int_or_none(json_ld.get('width')),
+ 'height': int_or_none(json_ld.get('height')),
+ })
return dict((k, v) for k, v in info.items() if v is not None)
@staticmethod
@@ -874,7 +899,11 @@ class InfoExtractor(object):
f['ext'] = determine_ext(f['url'])
if isinstance(field_preference, (list, tuple)):
- return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
+ return tuple(
+ f.get(field)
+ if f.get(field) is not None
+ else ('' if field == 'format_id' else -1)
+ for field in field_preference)
preference = f.get('preference')
if preference is None:
@@ -987,7 +1016,7 @@ class InfoExtractor(object):
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
transform_source=lambda s: fix_xml_ampersands(s).strip(),
- fatal=True):
+ fatal=True, m3u8_id=None):
manifest = self._download_xml(
manifest_url, video_id, 'Downloading f4m manifest',
'Unable to download f4m manifest',
@@ -1001,11 +1030,11 @@ class InfoExtractor(object):
return self._parse_f4m_formats(
manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
- transform_source=transform_source, fatal=fatal)
+ transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
transform_source=lambda s: fix_xml_ampersands(s).strip(),
- fatal=True):
+ fatal=True, m3u8_id=None):
# currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
if akamai_pv is not None and ';' in akamai_pv.text:
@@ -1029,9 +1058,26 @@ class InfoExtractor(object):
'base URL', default=None)
if base_url:
base_url = base_url.strip()
+
+ bootstrap_info = xpath_element(
+ manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
+ 'bootstrap info', default=None)
+
for i, media_el in enumerate(media_nodes):
- if manifest_version == '2.0':
- media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
+ tbr = int_or_none(media_el.attrib.get('bitrate'))
+ width = int_or_none(media_el.attrib.get('width'))
+ height = int_or_none(media_el.attrib.get('height'))
+ format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
+ # If <bootstrapInfo> is present, the specified f4m is a
+ # stream-level manifest, and only set-level manifests may refer to
+ # external resources. See section 11.4 and section 4 of F4M spec
+ if bootstrap_info is None:
+ media_url = None
+ # @href is introduced in 2.0, see section 11.6 of F4M spec
+ if manifest_version == '2.0':
+ media_url = media_el.attrib.get('href')
+ if media_url is None:
+ media_url = media_el.attrib.get('url')
if not media_url:
continue
manifest_url = (
@@ -1041,19 +1087,37 @@ class InfoExtractor(object):
# since bitrates in parent manifest (this one) and media_url manifest
# may differ leading to inability to resolve the format by requested
# bitrate in f4m downloader
- if determine_ext(manifest_url) == 'f4m':
- formats.extend(self._extract_f4m_formats(
+ ext = determine_ext(manifest_url)
+ if ext == 'f4m':
+ f4m_formats = self._extract_f4m_formats(
manifest_url, video_id, preference=preference, f4m_id=f4m_id,
- transform_source=transform_source, fatal=fatal))
+ transform_source=transform_source, fatal=fatal)
+ # Sometimes stream-level manifest contains single media entry that
+ # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
+ # At the same time parent's media entry in set-level manifest may
+ # contain it. We will copy it from parent in such cases.
+ if len(f4m_formats) == 1:
+ f = f4m_formats[0]
+ f.update({
+ 'tbr': f.get('tbr') or tbr,
+ 'width': f.get('width') or width,
+ 'height': f.get('height') or height,
+ 'format_id': f.get('format_id') if not tbr else format_id,
+ })
+ formats.extend(f4m_formats)
+ continue
+ elif ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ manifest_url, video_id, 'mp4', preference=preference,
+ m3u8_id=m3u8_id, fatal=fatal))
continue
- tbr = int_or_none(media_el.attrib.get('bitrate'))
formats.append({
- 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
+ 'format_id': format_id,
'url': manifest_url,
- 'ext': 'flv',
+ 'ext': 'flv' if bootstrap_info is not None else None,
'tbr': tbr,
- 'width': int_or_none(media_el.attrib.get('width')),
- 'height': int_or_none(media_el.attrib.get('height')),
+ 'width': width,
+ 'height': height,
'preference': preference,
})
return formats
@@ -1114,23 +1178,11 @@ class InfoExtractor(object):
}]
last_info = None
last_media = None
- kv_rex = re.compile(
- r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
for line in m3u8_doc.splitlines():
if line.startswith('#EXT-X-STREAM-INF:'):
- last_info = {}
- for m in kv_rex.finditer(line):
- v = m.group('val')
- if v.startswith('"'):
- v = v[1:-1]
- last_info[m.group('key')] = v
+ last_info = parse_m3u8_attributes(line)
elif line.startswith('#EXT-X-MEDIA:'):
- last_media = {}
- for m in kv_rex.finditer(line):
- v = m.group('val')
- if v.startswith('"'):
- v = v[1:-1]
- last_media[m.group('key')] = v
+ last_media = parse_m3u8_attributes(line)
elif line.startswith('#') or not line.strip():
continue
else:
@@ -1585,6 +1637,62 @@ class InfoExtractor(object):
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
return formats
+ def _parse_html5_media_entries(self, base_url, webpage):
+ def absolute_url(video_url):
+ return compat_urlparse.urljoin(base_url, video_url)
+
+ def parse_content_type(content_type):
+ if not content_type:
+ return {}
+ ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
+ if ctr:
+ mimetype, codecs = ctr.groups()
+ f = parse_codecs(codecs)
+ f['ext'] = mimetype2ext(mimetype)
+ return f
+ return {}
+
+ entries = []
+ for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
+ media_info = {
+ 'formats': [],
+ 'subtitles': {},
+ }
+ media_attributes = extract_attributes(media_tag)
+ src = media_attributes.get('src')
+ if src:
+ media_info['formats'].append({
+ 'url': absolute_url(src),
+ 'vcodec': 'none' if media_type == 'audio' else None,
+ })
+ media_info['thumbnail'] = media_attributes.get('poster')
+ if media_content:
+ for source_tag in re.findall(r'