projects
/
youtube-dl
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
[extractor/common] Add protocol for f4m formats
[youtube-dl]
/
youtube_dl
/
extractor
/
common.py
diff --git
a/youtube_dl/extractor/common.py
b/youtube_dl/extractor/common.py
index ceba4ca1c415cec0381d9d152dc72a7d6bf255fe..e2d9f52b018c25abc5a58a93785473ff88d90b74 100644
(file)
--- a/
youtube_dl/extractor/common.py
+++ b/
youtube_dl/extractor/common.py
@@
-27,8
+27,12
@@
from ..compat import (
compat_urllib_parse_urlencode,
compat_urllib_request,
compat_urlparse,
compat_urllib_parse_urlencode,
compat_urllib_request,
compat_urlparse,
+ compat_xml_parse_error,
+)
+from ..downloader.f4m import (
+ get_base_url,
+ remove_encrypted_media,
)
)
-from ..downloader.f4m import remove_encrypted_media
from ..utils import (
NO_DEFAULT,
age_restricted,
from ..utils import (
NO_DEFAULT,
age_restricted,
@@
-646,15
+650,29
@@
class InfoExtractor(object):
def _download_xml(self, url_or_request, video_id,
note='Downloading XML', errnote='Unable to download XML',
def _download_xml(self, url_or_request, video_id,
note='Downloading XML', errnote='Unable to download XML',
- transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
+ transform_source=None, fatal=True, encoding=None,
+ data=None, headers={}, query={}):
"""Return the xml as an xml.etree.ElementTree.Element"""
xml_string = self._download_webpage(
"""Return the xml as an xml.etree.ElementTree.Element"""
xml_string = self._download_webpage(
- url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
+ url_or_request, video_id, note, errnote, fatal=fatal,
+ encoding=encoding, data=data, headers=headers, query=query)
if xml_string is False:
return xml_string
if xml_string is False:
return xml_string
+ return self._parse_xml(
+ xml_string, video_id, transform_source=transform_source,
+ fatal=fatal)
+
+ def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
if transform_source:
xml_string = transform_source(xml_string)
if transform_source:
xml_string = transform_source(xml_string)
- return compat_etree_fromstring(xml_string.encode('utf-8'))
+ try:
+ return compat_etree_fromstring(xml_string.encode('utf-8'))
+ except compat_xml_parse_error as ve:
+ errmsg = '%s: Failed to parse XML ' % video_id
+ if fatal:
+ raise ExtractorError(errmsg, cause=ve)
+ else:
+ self.report_warning(errmsg + str(ve))
def _download_json(self, url_or_request, video_id,
note='Downloading JSON metadata',
def _download_json(self, url_or_request, video_id,
note='Downloading JSON metadata',
@@
-1224,11
+1242,8
@@
class InfoExtractor(object):
media_nodes = remove_encrypted_media(media_nodes)
if not media_nodes:
return formats
media_nodes = remove_encrypted_media(media_nodes)
if not media_nodes:
return formats
- base_url = xpath_text(
- manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
- 'base URL', default=None)
- if base_url:
- base_url = base_url.strip()
+
+ manifest_base_url = get_base_url(manifest)
bootstrap_info = xpath_element(
manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
bootstrap_info = xpath_element(
manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
@@
-1260,7
+1275,7
@@
class InfoExtractor(object):
continue
manifest_url = (
media_url if media_url.startswith('http://') or media_url.startswith('https://')
continue
manifest_url = (
media_url if media_url.startswith('http://') or media_url.startswith('https://')
- else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
+ else ((
manifest_
base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
# If media_url is itself a f4m manifest do the recursive extraction
# since bitrates in parent manifest (this one) and media_url manifest
# may differ leading to inability to resolve the format by requested
# If media_url is itself a f4m manifest do the recursive extraction
# since bitrates in parent manifest (this one) and media_url manifest
# may differ leading to inability to resolve the format by requested
@@
-1295,6
+1310,7
@@
class InfoExtractor(object):
'url': manifest_url,
'manifest_url': manifest_url,
'ext': 'flv' if bootstrap_info is not None else None,
'url': manifest_url,
'manifest_url': manifest_url,
'ext': 'flv' if bootstrap_info is not None else None,
+ 'protocol': 'f4m',
'tbr': tbr,
'width': width,
'height': height,
'tbr': tbr,
'width': width,
'height': height,
@@
-1386,7
+1402,7
@@
class InfoExtractor(object):
media_url = media.get('URI')
if media_url:
format_id = []
media_url = media.get('URI')
if media_url:
format_id = []
- for v in (group_id, name):
+ for v in (
m3u8_id,
group_id, name):
if v:
format_id.append(v)
f = {
if v:
format_id.append(v)
f = {
@@
-1905,7
+1921,7
@@
class InfoExtractor(object):
# can't be used at the same time
if '%(Number' in media_template and 's' not in representation_ms_info:
segment_duration = None
# can't be used at the same time
if '%(Number' in media_template and 's' not in representation_ms_info:
segment_duration = None
- if 'total_number' not in representation_ms_info and 'segment_duration':
+ if 'total_number' not in representation_ms_info and 'segment_duration'
in representation_ms_info
:
segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
representation_ms_info['fragments'] = [{
segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
representation_ms_info['fragments'] = [{
@@
-2169,6
+2185,12
@@
class InfoExtractor(object):
f = parse_content_type(source_attributes.get('type'))
is_plain_url, formats = _media_formats(src, media_type, f)
if is_plain_url:
f = parse_content_type(source_attributes.get('type'))
is_plain_url, formats = _media_formats(src, media_type, f)
if is_plain_url:
+ # res attribute is not standard but seen several times
+ # in the wild
+ f.update({
+ 'height': int_or_none(source_attributes.get('res')),
+ 'format_id': source_attributes.get('label'),
+ })
f.update(formats[0])
media_info['formats'].append(f)
else:
f.update(formats[0])
media_info['formats'].append(f)
else:
@@
-2212,27
+2234,35
@@
class InfoExtractor(object):
return formats
def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
return formats
def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
+ query = compat_urlparse.urlparse(url).query
url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
url_base = self._search_regex(
r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url')
http_base_url = '%s:%s' % ('http', url_base)
formats = []
url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
url_base = self._search_regex(
r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url')
http_base_url = '%s:%s' % ('http', url_base)
formats = []
+
+ def manifest_url(manifest):
+ m_url = '%s/%s' % (http_base_url, manifest)
+ if query:
+ m_url += '?%s' % query
+ return m_url
+
if 'm3u8' not in skip_protocols:
formats.extend(self._extract_m3u8_formats(
if 'm3u8' not in skip_protocols:
formats.extend(self._extract_m3u8_formats(
-
http_base_url + '/playlist.m3u8'
, video_id, 'mp4',
+
manifest_url('playlist.m3u8')
, video_id, 'mp4',
m3u8_entry_protocol, m3u8_id='hls', fatal=False))
if 'f4m' not in skip_protocols:
formats.extend(self._extract_f4m_formats(
m3u8_entry_protocol, m3u8_id='hls', fatal=False))
if 'f4m' not in skip_protocols:
formats.extend(self._extract_f4m_formats(
-
http_base_url + '/manifest.f4m'
,
+
manifest_url('manifest.f4m')
,
video_id, f4m_id='hds', fatal=False))
if 'dash' not in skip_protocols:
formats.extend(self._extract_mpd_formats(
video_id, f4m_id='hds', fatal=False))
if 'dash' not in skip_protocols:
formats.extend(self._extract_mpd_formats(
-
http_base_url + '/manifest.mpd'
,
+
manifest_url('manifest.mpd')
,
video_id, mpd_id='dash', fatal=False))
if re.search(r'(?:/smil:|\.smil)', url_base):
if 'smil' not in skip_protocols:
rtmp_formats = self._extract_smil_formats(
video_id, mpd_id='dash', fatal=False))
if re.search(r'(?:/smil:|\.smil)', url_base):
if 'smil' not in skip_protocols:
rtmp_formats = self._extract_smil_formats(
-
http_base_url + '/jwplayer.smil'
,
+
manifest_url('jwplayer.smil')
,
video_id, fatal=False)
for rtmp_format in rtmp_formats:
rtsp_format = rtmp_format.copy()
video_id, fatal=False)
for rtmp_format in rtmp_formats:
rtsp_format = rtmp_format.copy()
@@
-2301,7
+2331,6
@@
class InfoExtractor(object):
formats = self._parse_jwplayer_formats(
video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
formats = self._parse_jwplayer_formats(
video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
- self._sort_formats(formats)
subtitles = {}
tracks = video_data.get('tracks')
subtitles = {}
tracks = video_data.get('tracks')
@@
-2318,16
+2347,25
@@
class InfoExtractor(object):
'url': self._proto_relative_url(track_url)
})
'url': self._proto_relative_url(track_url)
})
- entr
ies.append(
{
+ entr
y =
{
'id': this_video_id,
'id': this_video_id,
- 'title':
video_data['title'] if require_title else video_data.get('title'
),
+ 'title':
unescapeHTML(video_data['title'] if require_title else video_data.get('title')
),
'description': video_data.get('description'),
'thumbnail': self._proto_relative_url(video_data.get('image')),
'timestamp': int_or_none(video_data.get('pubdate')),
'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
'subtitles': subtitles,
'description': video_data.get('description'),
'thumbnail': self._proto_relative_url(video_data.get('image')),
'timestamp': int_or_none(video_data.get('pubdate')),
'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
'subtitles': subtitles,
- 'formats': formats,
- })
+ }
+ # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
+ if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
+ entry.update({
+ '_type': 'url_transparent',
+ 'url': formats[0]['url'],
+ })
+ else:
+ self._sort_formats(formats)
+ entry['formats'] = formats
+ entries.append(entry)
if len(entries) == 1:
return entries[0]
else:
if len(entries) == 1:
return entries[0]
else:
@@
-2428,10
+2466,12
@@
class InfoExtractor(object):
self._downloader.report_warning(msg)
return res
self._downloader.report_warning(msg)
return res
- def _set_cookie(self, domain, name, value, expire_time=None):
+ def _set_cookie(self, domain, name, value, expire_time=None, port=None,
+ path='/', secure=False, discard=False, rest={}, **kwargs):
cookie = compat_cookiejar.Cookie(
cookie = compat_cookiejar.Cookie(
- 0, name, value, None, None, domain, None,
- None, '/', True, False, expire_time, '', None, None, None)
+ 0, name, value, port, port is not None, domain, True,
+ domain.startswith('.'), path, True, secure, expire_time,
+ discard, None, None, rest)
self._downloader.cookiejar.set_cookie(cookie)
def _get_cookies(self, url):
self._downloader.cookiejar.set_cookie(cookie)
def _get_cookies(self, url):