X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=a227aeb9cfbbdccbe025ba4bdbb79874a6a06263;hb=5316bf7487b608b7c085950ff2fb0444f2c36dc0;hp=87fce9cd89425150baff91577199f706db2a1e81;hpb=bfc993cc9183d5f001e30267551bcdf9f0a98be9;p=youtube-dl
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 87fce9cd8..a227aeb9c 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -22,17 +22,20 @@ from ..compat import (
compat_str,
)
from ..utils import (
+ NO_DEFAULT,
age_restricted,
+ bug_reports_message,
clean_html,
compiled_regex_type,
+ determine_ext,
ExtractorError,
+ fix_xml_ampersands,
float_or_none,
int_or_none,
RegexNotFoundError,
sanitize_filename,
unescapeHTML,
)
-_NO_DEFAULT = object()
class InfoExtractor(object):
@@ -46,7 +49,7 @@ class InfoExtractor(object):
information possibly downloading the video to the file system, among
other possible outcomes.
- The type field determines the the type of the result.
+ The type field determines the type of the result.
By far the most common value (and the default if _type is missing) is
"video", which indicates a single video.
@@ -62,7 +65,7 @@ class InfoExtractor(object):
Potential fields:
* url Mandatory. The URL of the video file
- * ext Will be calculated from url if missing
+ * ext Will be calculated from URL if missing
* format A human-readable description of the format
("mp4 container with h264/opus").
Calculated from the format_id, width, height.
@@ -110,11 +113,8 @@ class InfoExtractor(object):
(quality takes higher priority)
-1 for default (order by other properties),
-2 or smaller for less than default.
- * http_method HTTP method to use for the download.
* http_headers A dictionary of additional HTTP headers
to add to the request.
- * http_post_data Additional data to send with a POST
- request.
* stretched_ratio If given and not 1, indicates that the
video's pixels are not square.
width : height ratio as float.
@@ -155,7 +155,7 @@ class InfoExtractor(object):
lower to higher preference, each element is a dictionary
with the "ext" entry and one of:
* "data": The subtitles file contents
- * "url": A url pointing to the subtitles file
+ * "url": A URL pointing to the subtitles file
automatic_captions: Like 'subtitles', used by the YoutubeIE for
automatically generated captions
duration: Length of the video in seconds, as an integer.
@@ -176,13 +176,18 @@ class InfoExtractor(object):
Set to "root" to indicate that this is a
comment to the original video.
age_limit: Age restriction for the video, as an integer (years)
- webpage_url: The url to the video webpage, if given to youtube-dl it
+ webpage_url: The URL to the video webpage, if given to youtube-dl it
should allow to get the same result again. (It will be set
by YoutubeDL if it's missing)
categories: A list of categories that the video falls in, for example
["Sports", "Berlin"]
is_live: True, False, or None (=unknown). Whether this video is a
live stream that goes on instead of a fixed-length video.
+ start_time: Time in seconds where the reproduction should start, as
+ specified in the URL.
+ end_time: Time in seconds where the reproduction should end, as
+ specified in the URL.
+ tags: A list of keywords attached to the video.
Unless mentioned otherwise, the fields should be Unicode strings.
@@ -324,7 +329,7 @@ class InfoExtractor(object):
self._downloader.report_warning(errmsg)
return False
- def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
+ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
""" Returns a tuple (page content as string, URL handle) """
# Strip hashes from the URL (#1038)
if isinstance(url_or_request, (compat_str, str)):
@@ -334,14 +339,11 @@ class InfoExtractor(object):
if urlh is False:
assert not fatal
return False
- content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
+ content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
return (content, urlh)
- def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
- content_type = urlh.headers.get('Content-Type', '')
- webpage_bytes = urlh.read()
- if prefix is not None:
- webpage_bytes = prefix + webpage_bytes
+ @staticmethod
+ def _guess_encoding_from_content(content_type, webpage_bytes):
m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
if m:
encoding = m.group(1)
@@ -354,6 +356,16 @@ class InfoExtractor(object):
encoding = 'utf-16'
else:
encoding = 'utf-8'
+
+ return encoding
+
+ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
+ content_type = urlh.headers.get('Content-Type', '')
+ webpage_bytes = urlh.read()
+ if prefix is not None:
+ webpage_bytes = prefix + webpage_bytes
+ if not encoding:
+ encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
if self._downloader.params.get('dump_intermediate_pages', False):
try:
url = url_or_request.get_full_url()
@@ -410,13 +422,13 @@ class InfoExtractor(object):
return content
- def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
+ def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
""" Returns the data of the page as a string """
success = False
try_count = 0
while success is False:
try:
- res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
+ res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
success = True
except compat_http_client.IncompleteRead as e:
try_count += 1
@@ -431,10 +443,10 @@ class InfoExtractor(object):
def _download_xml(self, url_or_request, video_id,
note='Downloading XML', errnote='Unable to download XML',
- transform_source=None, fatal=True):
+ transform_source=None, fatal=True, encoding=None):
"""Return the xml as an xml.etree.ElementTree.Element"""
xml_string = self._download_webpage(
- url_or_request, video_id, note, errnote, fatal=fatal)
+ url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
if xml_string is False:
return xml_string
if transform_source:
@@ -445,9 +457,10 @@ class InfoExtractor(object):
note='Downloading JSON metadata',
errnote='Unable to download JSON metadata',
transform_source=None,
- fatal=True):
+ fatal=True, encoding=None):
json_string = self._download_webpage(
- url_or_request, video_id, note, errnote, fatal=fatal)
+ url_or_request, video_id, note, errnote, fatal=fatal,
+ encoding=encoding)
if (not fatal) and json_string is False:
return None
return self._parse_json(
@@ -492,14 +505,16 @@ class InfoExtractor(object):
# Methods for following #608
@staticmethod
- def url_result(url, ie=None, video_id=None):
- """Returns a url that points to a page that should be processed"""
+ def url_result(url, ie=None, video_id=None, video_title=None):
+ """Returns a URL that points to a page that should be processed"""
# TODO: ie should be the class used for getting the info
video_info = {'_type': 'url',
'url': url,
'ie_key': ie}
if video_id is not None:
video_info['id'] = video_id
+ if video_title is not None:
+ video_info['title'] = video_title
return video_info
@staticmethod
@@ -515,7 +530,7 @@ class InfoExtractor(object):
video_info['description'] = playlist_description
return video_info
- def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
+ def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Perform a regex search on the given string, using a single or a list of
patterns returning the first matching group.
@@ -541,16 +556,15 @@ class InfoExtractor(object):
return next(g for g in mobj.groups() if g is not None)
else:
return mobj.group(group)
- elif default is not _NO_DEFAULT:
+ elif default is not NO_DEFAULT:
return default
elif fatal:
raise RegexNotFoundError('Unable to extract %s' % _name)
else:
- self._downloader.report_warning('unable to extract %s; '
- 'please report this issue on http://yt-dl.org/bug' % _name)
+ self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
return None
- def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
+ def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Like _search_regex, but strips HTML tags and unescapes entities.
"""
@@ -562,7 +576,7 @@ class InfoExtractor(object):
def _get_login_info(self):
"""
- Get the the login info as (username, password)
+ Get the login info as (username, password)
It will look in the netrc file using the _NETRC_MACHINE value
If there's no info available, return (None, None)
"""
@@ -626,7 +640,7 @@ class InfoExtractor(object):
return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
- return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
+ return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
def _og_search_description(self, html, **kargs):
return self._og_search_property('description', html, fatal=False, **kargs)
@@ -698,7 +712,26 @@ class InfoExtractor(object):
return self._html_search_meta('twitter:player', html,
'twitter card player')
- def _sort_formats(self, formats):
+ @staticmethod
+ def _hidden_inputs(html):
+ return dict([
+ (input.group('name'), input.group('value')) for input in re.finditer(
+ r'''(?x)
+ ["\'])hidden(?P=q_hidden)\s+
+ name=(?P["\'])(?P.+?)(?P=q_name)\s+
+ (?:id=(?P["\']).+?(?P=q_id)\s+)?
+ value=(?P["\'])(?P.*?)(?P=q_value)
+ ''', html)
+ ])
+
+ def _form_hidden_inputs(self, form_id, html):
+ form = self._search_regex(
+ r'(?s)' % form_id,
+ html, '%s form' % form_id, group='form')
+ return self._hidden_inputs(form)
+
+ def _sort_formats(self, formats, field_preference=None):
if not formats:
raise ExtractorError('No video formats found')
@@ -708,6 +741,9 @@ class InfoExtractor(object):
if not f.get('ext') and 'url' in f:
f['ext'] = determine_ext(f['url'])
+ if isinstance(field_preference, (list, tuple)):
+ return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
+
preference = f.get('preference')
if preference is None:
proto = f.get('protocol')
@@ -754,7 +790,7 @@ class InfoExtractor(object):
f.get('fps') if f.get('fps') is not None else -1,
f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
f.get('source_preference') if f.get('source_preference') is not None else -1,
- f.get('format_id'),
+ f.get('format_id') if f.get('format_id') is not None else '',
)
formats.sort(key=_formats_key)
@@ -767,13 +803,17 @@ class InfoExtractor(object):
formats)
def _is_valid_url(self, url, video_id, item='video'):
+ url = self._proto_relative_url(url, scheme='http:')
+ # For now assume non HTTP(S) URLs always valid
+ if not (url.startswith('http://') or url.startswith('https://')):
+ return True
try:
self._request_webpage(url, video_id, 'Checking %s URL' % item)
return True
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError):
- self.report_warning(
- '%s URL is invalid, skipping' % item, video_id)
+ self.to_screen(
+ '%s: %s URL is invalid, skipping' % (video_id, item))
return False
raise
@@ -801,10 +841,14 @@ class InfoExtractor(object):
self.to_screen(msg)
time.sleep(timeout)
- def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None):
+ def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
+ transform_source=lambda s: fix_xml_ampersands(s).strip()):
manifest = self._download_xml(
manifest_url, video_id, 'Downloading f4m manifest',
- 'Unable to download f4m manifest')
+ 'Unable to download f4m manifest',
+ # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
+ # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
+ transform_source=transform_source)
formats = []
manifest_version = '1.0'
@@ -814,11 +858,22 @@ class InfoExtractor(object):
media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
for i, media_el in enumerate(media_nodes):
if manifest_version == '2.0':
- manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/' +
- (media_el.attrib.get('href') or media_el.attrib.get('url')))
+ media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
+ if not media_url:
+ continue
+ manifest_url = (
+ media_url if media_url.startswith('http://') or media_url.startswith('https://')
+ else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
+ # If media_url is itself a f4m manifest do the recursive extraction
+ # since bitrates in parent manifest (this one) and media_url manifest
+ # may differ leading to inability to resolve the format by requested
+ # bitrate in f4m downloader
+ if determine_ext(manifest_url) == 'f4m':
+ formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
+ continue
tbr = int_or_none(media_el.attrib.get('bitrate'))
formats.append({
- 'format_id': '-'.join(filter(None, [f4m_id, 'f4m-%d' % (i if tbr is None else tbr)])),
+ 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
'url': manifest_url,
'ext': 'flv',
'tbr': tbr,
@@ -832,10 +887,11 @@ class InfoExtractor(object):
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
entry_protocol='m3u8', preference=None,
- m3u8_id=None):
+ m3u8_id=None, note=None, errnote=None,
+ fatal=True):
formats = [{
- 'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-meta'])),
+ 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
'url': m3u8_url,
'ext': ext,
'protocol': 'm3u8',
@@ -851,8 +907,11 @@ class InfoExtractor(object):
m3u8_doc = self._download_webpage(
m3u8_url, video_id,
- note='Downloading m3u8 information',
- errnote='Failed to download m3u8 information')
+ note=note or 'Downloading m3u8 information',
+ errnote=errnote or 'Failed to download m3u8 information',
+ fatal=fatal)
+ if m3u8_doc is False:
+ return m3u8_doc
last_info = None
last_media = None
kv_rex = re.compile(
@@ -879,8 +938,13 @@ class InfoExtractor(object):
formats.append({'url': format_url(line)})
continue
tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
+ format_id = []
+ if m3u8_id:
+ format_id.append(m3u8_id)
+ last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
+ format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
f = {
- 'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-%d' % (tbr if tbr else len(formats))])),
+ 'format_id': '-'.join(format_id),
'url': format_url(line.strip()),
'tbr': tbr,
'ext': ext,
@@ -921,39 +985,57 @@ class InfoExtractor(object):
formats = []
rtmp_count = 0
- for video in smil.findall('./body/switch/video'):
- src = video.get('src')
- if not src:
- continue
- bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
- width = int_or_none(video.get('width'))
- height = int_or_none(video.get('height'))
- proto = video.get('proto')
- if not proto:
- if base:
- if base.startswith('rtmp'):
- proto = 'rtmp'
- elif base.startswith('http'):
- proto = 'http'
- ext = video.get('ext')
- if proto == 'm3u8':
- formats.extend(self._extract_m3u8_formats(src, video_id, ext))
- elif proto == 'rtmp':
- rtmp_count += 1
- streamer = video.get('streamer') or base
- formats.append({
- 'url': streamer,
- 'play_path': src,
- 'ext': 'flv',
- 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
- 'tbr': bitrate,
- 'width': width,
- 'height': height,
- })
+ if smil.findall('./body/seq/video'):
+ video = smil.findall('./body/seq/video')[0]
+ fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
+ formats.extend(fmts)
+ else:
+ for video in smil.findall('./body/switch/video'):
+ fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
+ formats.extend(fmts)
+
self._sort_formats(formats)
return formats
+ def _parse_smil_video(self, video, video_id, base, rtmp_count):
+ src = video.get('src')
+ if not src:
+ return [], rtmp_count
+ bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+ width = int_or_none(video.get('width'))
+ height = int_or_none(video.get('height'))
+ proto = video.get('proto')
+ if not proto:
+ if base:
+ if base.startswith('rtmp'):
+ proto = 'rtmp'
+ elif base.startswith('http'):
+ proto = 'http'
+ ext = video.get('ext')
+ if proto == 'm3u8':
+ return self._extract_m3u8_formats(src, video_id, ext), rtmp_count
+ elif proto == 'rtmp':
+ rtmp_count += 1
+ streamer = video.get('streamer') or base
+ return ([{
+ 'url': streamer,
+ 'play_path': src,
+ 'ext': 'flv',
+ 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
+ 'tbr': bitrate,
+ 'width': width,
+ 'height': height,
+ }], rtmp_count)
+ elif proto.startswith('http'):
+ return ([{
+ 'url': base + src,
+ 'ext': ext or 'flv',
+ 'tbr': bitrate,
+ 'width': width,
+ 'height': height,
+ }], rtmp_count)
+
def _live_title(self, name):
""" Generate the title for a live video """
now = datetime.datetime.now()
@@ -1039,7 +1121,7 @@ class InfoExtractor(object):
class SearchInfoExtractor(InfoExtractor):
"""
Base class for paged search queries extractors.
- They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
+ They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
Instances should define _SEARCH_KEY and _MAX_RESULTS.
"""