X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=df546da2736c441428e941f845853f0205ce107a;hb=0b68de3cc1f99ce8c49a497245c02d4d03201aa8;hp=0cb5e5bb06604bd9bb07adc91d4b075e80e0645d;hpb=2de624fdd5b2d94bcf548633d6fe1897ccb7cf46;p=youtube-dl
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 0cb5e5bb0..df546da27 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -44,6 +44,7 @@ from ..utils import (
sanitized_Request,
unescapeHTML,
unified_strdate,
+ unified_timestamp,
url_basename,
xpath_element,
xpath_text,
@@ -54,6 +55,8 @@ from ..utils import (
update_Request,
update_url_query,
parse_m3u8_attributes,
+ extract_attributes,
+ parse_codecs,
)
@@ -804,15 +807,17 @@ class InfoExtractor(object):
return self._html_search_meta('twitter:player', html,
'twitter card player')
- def _search_json_ld(self, html, video_id, **kwargs):
+ def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
json_ld = self._search_regex(
            r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
html, 'JSON-LD', group='json_ld', **kwargs)
if not json_ld:
return {}
- return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
+ return self._json_ld(
+ json_ld, video_id, fatal=kwargs.get('fatal', True),
+ expected_type=expected_type)
- def _json_ld(self, json_ld, video_id, fatal=True):
+ def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
if isinstance(json_ld, compat_str):
json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
if not json_ld:
@@ -820,6 +825,8 @@ class InfoExtractor(object):
info = {}
if json_ld.get('@context') == 'http://schema.org':
item_type = json_ld.get('@type')
+ if expected_type is not None and expected_type != item_type:
+ return info
if item_type == 'TVEpisode':
info.update({
'episode': unescapeHTML(json_ld.get('name')),
@@ -840,10 +847,16 @@ class InfoExtractor(object):
})
elif item_type == 'VideoObject':
info.update({
+ 'url': json_ld.get('contentUrl'),
'title': unescapeHTML(json_ld.get('name')),
'description': unescapeHTML(json_ld.get('description')),
- 'upload_date': unified_strdate(json_ld.get('upload_date')),
- 'url': unescapeHTML(json_ld.get('contentUrl')),
+ 'thumbnail': json_ld.get('thumbnailUrl'),
+ 'duration': parse_duration(json_ld.get('duration')),
+ 'timestamp': unified_timestamp(json_ld.get('uploadDate')),
+ 'filesize': float_or_none(json_ld.get('contentSize')),
+ 'tbr': int_or_none(json_ld.get('bitrate')),
+ 'width': int_or_none(json_ld.get('width')),
+ 'height': int_or_none(json_ld.get('height')),
})
return dict((k, v) for k, v in info.items() if v is not None)
@@ -1624,6 +1637,62 @@ class InfoExtractor(object):
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
return formats
+ def _parse_html5_media_entries(self, base_url, webpage):
+ def absolute_url(video_url):
+ return compat_urlparse.urljoin(base_url, video_url)
+
+ def parse_content_type(content_type):
+ if not content_type:
+ return {}
+            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
+ if ctr:
+ mimetype, codecs = ctr.groups()
+ f = parse_codecs(codecs)
+ f['ext'] = mimetype2ext(mimetype)
+ return f
+ return {}
+
+ entries = []
+        for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
+ media_info = {
+ 'formats': [],
+ 'subtitles': {},
+ }
+ media_attributes = extract_attributes(media_tag)
+ src = media_attributes.get('src')
+ if src:
+ media_info['formats'].append({
+ 'url': absolute_url(src),
+ 'vcodec': 'none' if media_type == 'audio' else None,
+ })
+ media_info['thumbnail'] = media_attributes.get('poster')
+ if media_content:
+                for source_tag in re.findall(r'<source[^>]+>', media_content):