X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=3c2d46dd5c8ee780a04cd0f3fedb05e33707b71c;hb=27f8b0994e9924724c974f46435552d401f5fc08;hp=611cf95f1125ec9340a96554df9542d1fcbd62b4;hpb=e7b6d12254702a4aa6a9f54420f80e6ea456b120;p=youtube-dl
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 611cf95f1..3c2d46dd5 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -12,13 +12,14 @@ import sys
import time
import xml.etree.ElementTree
-from ..utils import (
+from ..compat import (
compat_http_client,
compat_urllib_error,
compat_urllib_parse_urlparse,
compat_urlparse,
compat_str,
-
+)
+from ..utils import (
clean_html,
compiled_regex_type,
ExtractorError,
@@ -42,7 +43,11 @@ class InfoExtractor(object):
information possibly downloading the video to the file system, among
other possible outcomes.
- The dictionaries must include the following fields:
+ The type field determines the the type of the result.
+ By far the most common value (and the default if _type is missing) is
+ "video", which indicates a single video.
+
+ For a video, the dictionaries must include the following fields:
id: Video identifier.
title: Video title, unescaped.
@@ -72,6 +77,7 @@ class InfoExtractor(object):
* acodec Name of the audio codec in use
* asr Audio sampling rate in Hertz
* vbr Average video bitrate in KBit/s
+ * fps Frame rate
* vcodec Name of the video codec in use
* container Name of the container format
* filesize The number of bytes, if known in advance
@@ -85,10 +91,19 @@ class InfoExtractor(object):
by this field, regardless of all other values.
-1 for default (order by other properties),
-2 or smaller for less than default.
+ * language_preference Is this in the correct requested
+ language?
+ 10 if it's what the URL is about,
+ -1 for default (don't know),
+ -10 otherwise, other values reserved for now.
* quality Order number of the video quality of this
format, irrespective of the file format.
-1 for default (order by other properties),
-2 or smaller for less than default.
+ * source_preference Order number for this video source
+ (quality takes higher priority)
+ -1 for default (order by other properties),
+ -2 or smaller for less than default.
* http_referer HTTP Referer header value to set.
* http_method HTTP method to use for the download.
* http_headers A dictionary of additional HTTP headers
@@ -138,6 +153,40 @@ class InfoExtractor(object):
Unless mentioned otherwise, the fields should be Unicode strings.
+ Unless mentioned otherwise, None is equivalent to absence of information.
+
+
+ _type "playlist" indicates multiple videos.
+ There must be a key "entries", which is a list or a PagedList object, each
+ element of which is a valid dictionary under this specfication.
+
+ Additionally, playlists can have "title" and "id" attributes with the same
+ semantics as videos (see above).
+
+
+ _type "multi_video" indicates that there are multiple videos that
+ form a single show, for examples multiple acts of an opera or TV episode.
+ It must have an entries key like a playlist and contain all the keys
+ required for a video at the same time.
+
+
+ _type "url" indicates that the video must be extracted from another
+ location, possibly by a different extractor. Its only required key is:
+ "url" - the next URL to extract.
+
+ Additionally, it may have properties believed to be identical to the
+ resolved entity, for example "title" if the title of the referred video is
+ known ahead of time.
+
+
+ _type "url_transparent" entities have the same specification as "url", but
+ indicate that the given additional information is more precise than the one
+ associated with the resolved URL.
+ This is useful when a site employs a video service that hosts the video and
+ its technical metadata, but that video service does not embed a useful
+ title, description etc.
+
+
Subclasses of this one should re-define the _real_initialize() and
_real_extract() methods and define a _VALID_URL regexp.
Probably, they should also be added to the list of extractors.
@@ -236,7 +285,6 @@ class InfoExtractor(object):
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
""" Returns a tuple (page content as string, URL handle) """
-
# Strip hashes from the URL (#1038)
if isinstance(url_or_request, (compat_str, str)):
url_or_request = url_or_request.partition('#')[0]
@@ -245,6 +293,10 @@ class InfoExtractor(object):
if urlh is False:
assert not fatal
return False
+ content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
+ return (content, urlh)
+
+ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True):
content_type = urlh.headers.get('Content-Type', '')
webpage_bytes = urlh.read()
m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
@@ -279,6 +331,12 @@ class InfoExtractor(object):
raw_filename = basen + '.dump'
filename = sanitize_filename(raw_filename, restricted=True)
self.to_screen('Saving request to ' + filename)
+ # Working around MAX_PATH limitation on Windows (see
+ # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
+ if os.name == 'nt':
+ absfilepath = os.path.abspath(filename)
+ if len(absfilepath) > 259:
+ filename = '\\\\?\\' + absfilepath
with open(filename, 'wb') as outf:
outf.write(webpage_bytes)
@@ -297,7 +355,7 @@ class InfoExtractor(object):
msg += ' Visit %s for more details' % blocked_iframe
raise ExtractorError(msg, expected=True)
- return (content, urlh)
+ return content
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
""" Returns the data of the page as a string """
@@ -365,17 +423,18 @@ class InfoExtractor(object):
"""Report attempt to log in."""
self.to_screen('Logging in')
- #Methods for following #608
+ # Methods for following #608
@staticmethod
def url_result(url, ie=None, video_id=None):
"""Returns a url that points to a page that should be processed"""
- #TODO: ie should be the class used for getting the info
+ # TODO: ie should be the class used for getting the info
video_info = {'_type': 'url',
'url': url,
'ie_key': ie}
if video_id is not None:
video_info['id'] = video_id
return video_info
+
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None):
"""Returns a playlist"""
@@ -387,7 +446,7 @@ class InfoExtractor(object):
video_info['title'] = playlist_title
return video_info
- def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
+ def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Perform a regex search on the given string, using a single or a list of
patterns returning the first matching group.
@@ -408,22 +467,25 @@ class InfoExtractor(object):
_name = name
if mobj:
- # return the first matching group
- return next(g for g in mobj.groups() if g is not None)
+ if group is None:
+ # return the first matching group
+ return next(g for g in mobj.groups() if g is not None)
+ else:
+ return mobj.group(group)
elif default is not _NO_DEFAULT:
return default
elif fatal:
raise RegexNotFoundError('Unable to extract %s' % _name)
else:
self._downloader.report_warning('unable to extract %s; '
- 'please report this issue on http://yt-dl.org/bug' % _name)
+ 'please report this issue on http://yt-dl.org/bug' % _name)
return None
- def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
+ def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Like _search_regex, but strips HTML tags and unescapes entities.
"""
- res = self._search_regex(pattern, string, name, default, fatal, flags)
+ res = self._search_regex(pattern, string, name, default, fatal, flags, group)
if res:
return clean_html(res).strip()
else:
@@ -456,7 +518,7 @@ class InfoExtractor(object):
raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
except (IOError, netrc.NetrcParseError) as err:
self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
-
+
return (username, password)
def _get_tfa_info(self):
@@ -517,9 +579,9 @@ class InfoExtractor(object):
display_name = name
return self._html_search_regex(
r'''(?ix)]+(?:itemprop|name|property)=["\']?%s["\']?)
- [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
- html, display_name, fatal=fatal, **kwargs)
+ (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
+ [^>]+content=(["\'])(?P.*?)\1''' % re.escape(name),
+ html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
return self._html_search_meta('dc.creator', html, 'uploader')
@@ -550,7 +612,7 @@ class InfoExtractor(object):
def _twitter_search_player(self, html):
return self._html_search_meta('twitter:player', html,
- 'twitter card player')
+ 'twitter card player')
def _sort_formats(self, formats):
if not formats:
@@ -595,6 +657,7 @@ class InfoExtractor(object):
return (
preference,
+ f.get('language_preference') if f.get('language_preference') is not None else -1,
f.get('quality') if f.get('quality') is not None else -1,
f.get('height') if f.get('height') is not None else -1,
f.get('width') if f.get('width') is not None else -1,
@@ -603,14 +666,16 @@ class InfoExtractor(object):
f.get('vbr') if f.get('vbr') is not None else -1,
f.get('abr') if f.get('abr') is not None else -1,
audio_ext_preference,
+ f.get('fps') if f.get('fps') is not None else -1,
f.get('filesize') if f.get('filesize') is not None else -1,
f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
+ f.get('source_preference') if f.get('source_preference') is not None else -1,
f.get('format_id'),
)
formats.sort(key=_formats_key)
def http_scheme(self):
- """ Either "https:" or "https:", depending on the user's preferences """
+ """ Either "http:" or "https:", depending on the user's preferences """
return (
'http:'
if self._downloader.params.get('prefer_insecure', False)
@@ -673,7 +738,10 @@ class InfoExtractor(object):
if re.match(r'^https?://', u)
else compat_urlparse.urljoin(m3u8_url, u))
- m3u8_doc = self._download_webpage(m3u8_url, video_id)
+ m3u8_doc = self._download_webpage(
+ m3u8_url, video_id,
+ note='Downloading m3u8 information',
+ errnote='Failed to download m3u8 information')
last_info = None
kv_rex = re.compile(
r'(?P[a-zA-Z_-]+)=(?P"[^"]+"|[^",]+)(?:,|$)')