+ def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None):
+ def absolute_url(video_url):
+ return compat_urlparse.urljoin(base_url, video_url)
+
+ def parse_content_type(content_type):
+ if not content_type:
+ return {}
+ ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
+ if ctr:
+ mimetype, codecs = ctr.groups()
+ f = parse_codecs(codecs)
+ f['ext'] = mimetype2ext(mimetype)
+ return f
+ return {}
+
+ def _media_formats(src, cur_media_type):
+ full_url = absolute_url(src)
+ if determine_ext(full_url) == 'm3u8':
+ is_plain_url = False
+ formats = self._extract_m3u8_formats(
+ full_url, video_id, ext='mp4', entry_protocol='m3u8_native',
+ m3u8_id=m3u8_id)
+ else:
+ is_plain_url = True
+ formats = [{
+ 'url': full_url,
+ 'vcodec': 'none' if cur_media_type == 'audio' else None,
+ }]
+ return is_plain_url, formats
+
+ entries = []
+ for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
+ media_info = {
+ 'formats': [],
+ 'subtitles': {},
+ }
+ media_attributes = extract_attributes(media_tag)
+ src = media_attributes.get('src')
+ if src:
+ _, formats = _media_formats(src)
+ media_info['formats'].extend(formats)
+ media_info['thumbnail'] = media_attributes.get('poster')
+ if media_content:
+ for source_tag in re.findall(r'<source[^>]+>', media_content):
+ source_attributes = extract_attributes(source_tag)
+ src = source_attributes.get('src')
+ if not src:
+ continue
+ is_plain_url, formats = _media_formats(src, media_type)
+ if is_plain_url:
+ f = parse_content_type(source_attributes.get('type'))
+ f.update(formats[0])
+ media_info['formats'].append(f)
+ else:
+ media_info['formats'].extend(formats)
+ for track_tag in re.findall(r'<track[^>]+>', media_content):
+ track_attributes = extract_attributes(track_tag)
+ kind = track_attributes.get('kind')
+ if not kind or kind == 'subtitles':
+ src = track_attributes.get('src')
+ if not src:
+ continue
+ lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
+ media_info['subtitles'].setdefault(lang, []).append({
+ 'url': absolute_url(src),
+ })
+ if media_info['formats']:
+ entries.append(media_info)
+ return entries
+