projects
/
youtube-dl
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
Merge pull request #14225 from Tithen-Firion/openload-phantomjs-method
[youtube-dl]
/
youtube_dl
/
extractor
/
common.py
diff --git
a/youtube_dl/extractor/common.py
b/youtube_dl/extractor/common.py
index 74b6f1197bc77f3ad4c87b25127ce914b2359da8..317a9a76fc417e9ad4455bc99b30e782849eeabc 100644
(file)
--- a/
youtube_dl/extractor/common.py
+++ b/
youtube_dl/extractor/common.py
@@
-27,6
+27,7
@@
from ..compat import (
compat_urllib_parse_urlencode,
compat_urllib_request,
compat_urlparse,
compat_urllib_parse_urlencode,
compat_urllib_request,
compat_urlparse,
+ compat_xml_parse_error,
)
from ..downloader.f4m import remove_encrypted_media
from ..utils import (
)
from ..downloader.f4m import remove_encrypted_media
from ..utils import (
@@
-376,7
+377,7
@@
class InfoExtractor(object):
cls._VALID_URL_RE = re.compile(cls._VALID_URL)
m = cls._VALID_URL_RE.match(url)
assert m
cls._VALID_URL_RE = re.compile(cls._VALID_URL)
m = cls._VALID_URL_RE.match(url)
assert m
- return
m.group('id'
)
+ return
compat_str(m.group('id')
)
@classmethod
def working(cls):
@classmethod
def working(cls):
@@
-420,7
+421,7
@@
class InfoExtractor(object):
if country_code:
self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
if self._downloader.params.get('verbose', False):
if country_code:
self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
if self._downloader.params.get('verbose', False):
- self._downloader.to_s
tdout
(
+ self._downloader.to_s
creen
(
'[debug] Using fake IP %s (%s) as X-Forwarded-For.'
% (self._x_forwarded_for_ip, country_code.upper()))
'[debug] Using fake IP %s (%s) as X-Forwarded-For.'
% (self._x_forwarded_for_ip, country_code.upper()))
@@
-646,15
+647,29
@@
class InfoExtractor(object):
def _download_xml(self, url_or_request, video_id,
note='Downloading XML', errnote='Unable to download XML',
def _download_xml(self, url_or_request, video_id,
note='Downloading XML', errnote='Unable to download XML',
- transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
+ transform_source=None, fatal=True, encoding=None,
+ data=None, headers={}, query={}):
"""Return the xml as an xml.etree.ElementTree.Element"""
xml_string = self._download_webpage(
"""Return the xml as an xml.etree.ElementTree.Element"""
xml_string = self._download_webpage(
- url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
+ url_or_request, video_id, note, errnote, fatal=fatal,
+ encoding=encoding, data=data, headers=headers, query=query)
if xml_string is False:
return xml_string
if xml_string is False:
return xml_string
+ return self._parse_xml(
+ xml_string, video_id, transform_source=transform_source,
+ fatal=fatal)
+
+ def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
if transform_source:
xml_string = transform_source(xml_string)
if transform_source:
xml_string = transform_source(xml_string)
- return compat_etree_fromstring(xml_string.encode('utf-8'))
+ try:
+ return compat_etree_fromstring(xml_string.encode('utf-8'))
+ except compat_xml_parse_error as ve:
+ errmsg = '%s: Failed to parse XML ' % video_id
+ if fatal:
+ raise ExtractorError(errmsg, cause=ve)
+ else:
+ self.report_warning(errmsg + str(ve))
def _download_json(self, url_or_request, video_id,
note='Downloading JSON metadata',
def _download_json(self, url_or_request, video_id,
note='Downloading JSON metadata',
@@
-730,12
+745,12
@@
class InfoExtractor(object):
video_info['title'] = video_title
return video_info
video_info['title'] = video_title
return video_info
- def playlist_from_matches(self, matches,
video_id, video_titl
e, getter=None, ie=None):
- url
r
s = orderedSet(
+ def playlist_from_matches(self, matches,
playlist_id=None, playlist_title=Non
e, getter=None, ie=None):
+ urls = orderedSet(
self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
for m in matches)
return self.playlist_result(
self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
for m in matches)
return self.playlist_result(
- url
rs, playlist_id=video_id, playlist_title=video
_title)
+ url
s, playlist_id=playlist_id, playlist_title=playlist
_title)
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
@@
-940,7
+955,8
@@
class InfoExtractor(object):
def _family_friendly_search(self, html):
# See http://schema.org/VideoObject
def _family_friendly_search(self, html):
# See http://schema.org/VideoObject
- family_friendly = self._html_search_meta('isFamilyFriendly', html)
+ family_friendly = self._html_search_meta(
+ 'isFamilyFriendly', html, default=None)
if not family_friendly:
return None
if not family_friendly:
return None
@@
-1002,17
+1018,17
@@
class InfoExtractor(object):
item_type = e.get('@type')
if expected_type is not None and expected_type != item_type:
return info
item_type = e.get('@type')
if expected_type is not None and expected_type != item_type:
return info
- if item_type
== 'TVEpisode'
:
+ if item_type
in ('TVEpisode', 'Episode')
:
info.update({
'episode': unescapeHTML(e.get('name')),
'episode_number': int_or_none(e.get('episodeNumber')),
'description': unescapeHTML(e.get('description')),
})
part_of_season = e.get('partOfSeason')
info.update({
'episode': unescapeHTML(e.get('name')),
'episode_number': int_or_none(e.get('episodeNumber')),
'description': unescapeHTML(e.get('description')),
})
part_of_season = e.get('partOfSeason')
- if isinstance(part_of_season, dict) and part_of_season.get('@type')
== 'TVSeason'
:
+ if isinstance(part_of_season, dict) and part_of_season.get('@type')
in ('TVSeason', 'Season', 'CreativeWorkSeason')
:
info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
- if isinstance(part_of_series, dict) and part_of_series.get('@type')
== 'TVSeries'
:
+ if isinstance(part_of_series, dict) and part_of_series.get('@type')
in ('TVSeries', 'Series', 'CreativeWorkSeries')
:
info['series'] = unescapeHTML(part_of_series.get('name'))
elif item_type == 'Article':
info.update({
info['series'] = unescapeHTML(part_of_series.get('name'))
elif item_type == 'Article':
info.update({
@@
-1022,10
+1038,10
@@
class InfoExtractor(object):
})
elif item_type == 'VideoObject':
extract_video_object(e)
})
elif item_type == 'VideoObject':
extract_video_object(e)
- elif item_type == 'WebPage':
-
video = e.get('video')
-
if isinstance(video, dict) and video.get('@type') == 'VideoObject':
-
extract_video_object(video)
+ continue
+ video = e.get('video')
+ if isinstance(video, dict) and video.get('@type') == 'VideoObject':
+ extract_video_object(video)
break
return dict((k, v) for k, v in info.items() if v is not None)
break
return dict((k, v) for k, v in info.items() if v is not None)
@@
-1785,7
+1801,7
@@
class InfoExtractor(object):
ms_info['timescale'] = int(timescale)
segment_duration = source.get('duration')
if segment_duration:
ms_info['timescale'] = int(timescale)
segment_duration = source.get('duration')
if segment_duration:
- ms_info['segment_duration'] =
in
t(segment_duration)
+ ms_info['segment_duration'] =
floa
t(segment_duration)
def extract_Initialization(source):
initialization = source.find(_add_ns('Initialization'))
def extract_Initialization(source):
initialization = source.find(_add_ns('Initialization'))
@@
-1892,9
+1908,13
@@
class InfoExtractor(object):
'Bandwidth': bandwidth,
}
'Bandwidth': bandwidth,
}
+ def location_key(location):
+ return 'url' if re.match(r'^https?://', location) else 'path'
+
if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
+ media_location_key = location_key(media_template)
# As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
# can't be used at the same time
# As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
# can't be used at the same time
@@
-1904,7
+1924,7
@@
class InfoExtractor(object):
segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
representation_ms_info['fragments'] = [{
segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
representation_ms_info['fragments'] = [{
-
'url'
: media_template % {
+
media_location_key
: media_template % {
'Number': segment_number,
'Bandwidth': bandwidth,
},
'Number': segment_number,
'Bandwidth': bandwidth,
},
@@
-1928,7
+1948,7
@@
class InfoExtractor(object):
'Number': segment_number,
}
representation_ms_info['fragments'].append({
'Number': segment_number,
}
representation_ms_info['fragments'].append({
-
'url'
: segment_url,
+
media_location_key
: segment_url,
'duration': float_or_none(segment_d, representation_ms_info['timescale']),
})
'duration': float_or_none(segment_d, representation_ms_info['timescale']),
})
@@
-1952,8
+1972,9
@@
class InfoExtractor(object):
for s in representation_ms_info['s']:
duration = float_or_none(s['d'], timescale)
for r in range(s.get('r', 0) + 1):
for s in representation_ms_info['s']:
duration = float_or_none(s['d'], timescale)
for r in range(s.get('r', 0) + 1):
+ segment_uri = representation_ms_info['segment_urls'][segment_index]
fragments.append({
fragments.append({
-
'url': representation_ms_info['segment_urls'][segment_index]
,
+
location_key(segment_uri): segment_uri
,
'duration': duration,
})
segment_index += 1
'duration': duration,
})
segment_index += 1
@@
-1962,6
+1983,7
@@
class InfoExtractor(object):
# No fragments key is present in this case.
if 'fragments' in representation_ms_info:
f.update({
# No fragments key is present in this case.
if 'fragments' in representation_ms_info:
f.update({
+ 'fragment_base_url': base_url,
'fragments': [],
'protocol': 'http_dash_segments',
})
'fragments': [],
'protocol': 'http_dash_segments',
})
@@
-1969,10
+1991,8
@@
class InfoExtractor(object):
initialization_url = representation_ms_info['initialization_url']
if not f.get('url'):
f['url'] = initialization_url
initialization_url = representation_ms_info['initialization_url']
if not f.get('url'):
f['url'] = initialization_url
- f['fragments'].append({
'url'
: initialization_url})
+ f['fragments'].append({
location_key(initialization_url)
: initialization_url})
f['fragments'].extend(representation_ms_info['fragments'])
f['fragments'].extend(representation_ms_info['fragments'])
- for fragment in f['fragments']:
- fragment['url'] = urljoin(base_url, fragment['url'])
try:
existing_format = next(
fo for fo in formats
try:
existing_format = next(
fo for fo in formats
@@
-2110,19
+2130,19
@@
class InfoExtractor(object):
return f
return {}
return f
return {}
- def _media_formats(src, cur_media_type):
+ def _media_formats(src, cur_media_type
, type_info={}
):
full_url = absolute_url(src)
full_url = absolute_url(src)
- ext = determine_ext(full_url)
+ ext =
type_info.get('ext') or
determine_ext(full_url)
if ext == 'm3u8':
is_plain_url = False
formats = self._extract_m3u8_formats(
full_url, video_id, ext='mp4',
entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
if ext == 'm3u8':
is_plain_url = False
formats = self._extract_m3u8_formats(
full_url, video_id, ext='mp4',
entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
- preference=preference)
+ preference=preference
, fatal=False
)
elif ext == 'mpd':
is_plain_url = False
formats = self._extract_mpd_formats(
elif ext == 'mpd':
is_plain_url = False
formats = self._extract_mpd_formats(
- full_url, video_id, mpd_id=mpd_id)
+ full_url, video_id, mpd_id=mpd_id
, fatal=False
)
else:
is_plain_url = True
formats = [{
else:
is_plain_url = True
formats = [{
@@
-2132,15
+2152,18
@@
class InfoExtractor(object):
return is_plain_url, formats
entries = []
return is_plain_url, formats
entries = []
+ # amp-video and amp-audio are very similar to their HTML5 counterparts
+ # so we will include them right here (see
+ # https://www.ampproject.org/docs/reference/components/amp-video)
media_tags = [(media_tag, media_type, '')
for media_tag, media_type
media_tags = [(media_tag, media_type, '')
for media_tag, media_type
- in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
+ in re.findall(r'(?s)(<(
?:amp-)?(
video|audio)[^>]*/>)', webpage)]
media_tags.extend(re.findall(
# We only allow video|audio followed by a whitespace or '>'.
# Allowing more characters may end up in significant slow down (see
# https://github.com/rg3/youtube-dl/issues/11979, example URL:
# http://www.porntrex.com/maps/videositemap.xml).
media_tags.extend(re.findall(
# We only allow video|audio followed by a whitespace or '>'.
# Allowing more characters may end up in significant slow down (see
# https://github.com/rg3/youtube-dl/issues/11979, example URL:
# http://www.porntrex.com/maps/videositemap.xml).
- r'(?s)(<(?P<tag>
video|audio
)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
+ r'(?s)(<(?P<tag>
(?:amp-)?(?:video|audio)
)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
for media_tag, media_type, media_content in media_tags:
media_info = {
'formats': [],
for media_tag, media_type, media_content in media_tags:
media_info = {
'formats': [],
@@
-2158,9
+2181,15
@@
class InfoExtractor(object):
src = source_attributes.get('src')
if not src:
continue
src = source_attributes.get('src')
if not src:
continue
- is_plain_url, formats = _media_formats(src, media_type)
+ f = parse_content_type(source_attributes.get('type'))
+ is_plain_url, formats = _media_formats(src, media_type, f)
if is_plain_url:
if is_plain_url:
- f = parse_content_type(source_attributes.get('type'))
+ # res attribute is not standard but seen several times
+ # in the wild
+ f.update({
+ 'height': int_or_none(source_attributes.get('res')),
+ 'format_id': source_attributes.get('label'),
+ })
f.update(formats[0])
media_info['formats'].append(f)
else:
f.update(formats[0])
media_info['formats'].append(f)
else:
@@
-2207,7
+2236,7
@@
class InfoExtractor(object):
url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
url_base = self._search_regex(
r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url')
url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
url_base = self._search_regex(
r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url')
- http_base_url =
self._proto_relative_url(url_base, scheme='http:'
)
+ http_base_url =
'%s:%s' % ('http', url_base
)
formats = []
if 'm3u8' not in skip_protocols:
formats.extend(self._extract_m3u8_formats(
formats = []
if 'm3u8' not in skip_protocols:
formats.extend(self._extract_m3u8_formats(
@@
-2241,7
+2270,7
@@
class InfoExtractor(object):
for protocol in ('rtmp', 'rtsp'):
if protocol not in skip_protocols:
formats.append({
for protocol in ('rtmp', 'rtsp'):
if protocol not in skip_protocols:
formats.append({
- 'url':
protocol + url_base
,
+ 'url':
'%s:%s' % (protocol, url_base)
,
'format_id': protocol,
'protocol': protocol,
})
'format_id': protocol,
'protocol': protocol,
})
@@
-2299,6
+2328,8
@@
class InfoExtractor(object):
tracks = video_data.get('tracks')
if tracks and isinstance(tracks, list):
for track in tracks:
tracks = video_data.get('tracks')
if tracks and isinstance(tracks, list):
for track in tracks:
+ if not isinstance(track, dict):
+ continue
if track.get('kind') != 'captions':
continue
track_url = urljoin(base_url, track.get('file'))
if track.get('kind') != 'captions':
continue
track_url = urljoin(base_url, track.get('file'))
@@
-2328,6
+2359,8
@@
class InfoExtractor(object):
urls = []
formats = []
for source in jwplayer_sources_data:
urls = []
formats = []
for source in jwplayer_sources_data:
+ if not isinstance(source, dict):
+ continue
source_url = self._proto_relative_url(source.get('file'))
if not source_url:
continue
source_url = self._proto_relative_url(source.get('file'))
if not source_url:
continue
@@
-2416,10
+2449,12
@@
class InfoExtractor(object):
self._downloader.report_warning(msg)
return res
self._downloader.report_warning(msg)
return res
- def _set_cookie(self, domain, name, value, expire_time=None):
+ def _set_cookie(self, domain, name, value, expire_time=None, port=None,
+ path='/', secure=False, discard=False, rest={}, **kwargs):
cookie = compat_cookiejar.Cookie(
cookie = compat_cookiejar.Cookie(
- 0, name, value, None, None, domain, None,
- None, '/', True, False, expire_time, '', None, None, None)
+ 0, name, value, port, not port is None, domain, True,
+ domain.startswith('.'), path, True, secure, expire_time,
+ discard, None, None, rest)
self._downloader.cookiejar.set_cookie(cookie)
def _get_cookies(self, url):
self._downloader.cookiejar.set_cookie(cookie)
def _get_cookies(self, url):