X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=dc508050413c8490882323e01d91ea3a3ba88c9a;hb=3b58d94f71b0e7b3ace6f5965f7335aa95fd0c1e;hp=271bf85968690f00e25252294b51aa0e4cdecb15;hpb=5eb778bf4d52c2d1b53182e48c630a3458f2fb22;p=youtube-dl
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 271bf8596..dc5080504 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -14,10 +14,12 @@ import xml.etree.ElementTree
from ..compat import (
compat_cookiejar,
+ compat_cookies,
compat_HTTPError,
compat_http_client,
compat_urllib_error,
compat_urllib_parse_urlparse,
+ compat_urllib_request,
compat_urlparse,
compat_str,
)
@@ -65,7 +67,7 @@ class InfoExtractor(object):
Potential fields:
* url Mandatory. The URL of the video file
- * ext Will be calculated from url if missing
+ * ext Will be calculated from URL if missing
* format A human-readable description of the format
("mp4 container with h264/opus").
Calculated from the format_id, width, height.
@@ -155,7 +157,7 @@ class InfoExtractor(object):
lower to higher preference, each element is a dictionary
with the "ext" entry and one of:
* "data": The subtitles file contents
- * "url": A url pointing to the subtitles file
+ * "url": A URL pointing to the subtitles file
automatic_captions: Like 'subtitles', used by the YoutubeIE for
automatically generated captions
duration: Length of the video in seconds, as an integer.
@@ -176,13 +178,18 @@ class InfoExtractor(object):
Set to "root" to indicate that this is a
comment to the original video.
age_limit: Age restriction for the video, as an integer (years)
- webpage_url: The url to the video webpage, if given to youtube-dl it
+ webpage_url: The URL to the video webpage, if given to youtube-dl it
should allow to get the same result again. (It will be set
by YoutubeDL if it's missing)
categories: A list of categories that the video falls in, for example
["Sports", "Berlin"]
+ tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
is_live: True, False, or None (=unknown). Whether this video is a
live stream that goes on instead of a fixed-length video.
+ start_time: Time in seconds where the reproduction should start, as
+ specified in the URL.
+ end_time: Time in seconds where the reproduction should end, as
+ specified in the URL.
Unless mentioned otherwise, the fields should be Unicode strings.
@@ -501,7 +508,7 @@ class InfoExtractor(object):
# Methods for following #608
@staticmethod
def url_result(url, ie=None, video_id=None, video_title=None):
- """Returns a url that points to a page that should be processed"""
+ """Returns a URL that points to a page that should be processed"""
# TODO: ie should be the class used for getting the info
video_info = {'_type': 'url',
'url': url,
@@ -626,6 +633,12 @@ class InfoExtractor(object):
template % (content_re, property_re),
]
+ @staticmethod
+ def _meta_regex(prop):
+ return r'''(?isx)]+(?:itemprop|name|property)=(["\']?)%s\1)
+ [^>]+?content=(["\'])(?P.*?)\2''' % re.escape(prop)
+
def _og_search_property(self, prop, html, name=None, **kargs):
if name is None:
name = 'OpenGraph %s' % prop
@@ -635,7 +648,7 @@ class InfoExtractor(object):
return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
- return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
+ return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
def _og_search_description(self, html, **kargs):
return self._og_search_property('description', html, fatal=False, **kargs)
@@ -656,9 +669,7 @@ class InfoExtractor(object):
if display_name is None:
display_name = name
return self._html_search_regex(
- r'''(?isx)]+(?:itemprop|name|property)=(["\']?)%s\1)
- [^>]+?content=(["\'])(?P.*?)\2''' % re.escape(name),
+ self._meta_regex(name),
html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
@@ -836,13 +847,14 @@ class InfoExtractor(object):
self.to_screen(msg)
time.sleep(timeout)
- def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None):
+ def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
+ transform_source=lambda s: fix_xml_ampersands(s).strip()):
manifest = self._download_xml(
manifest_url, video_id, 'Downloading f4m manifest',
'Unable to download f4m manifest',
# Some manifests may be malformed, e.g. prosiebensat1 generated manifests
# (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
- transform_source=lambda s: fix_xml_ampersands(s).strip())
+ transform_source=transform_source)
formats = []
manifest_version = '1.0'
@@ -995,7 +1007,7 @@ class InfoExtractor(object):
def _parse_smil_video(self, video, video_id, base, rtmp_count):
src = video.get('src')
if not src:
- return ([], rtmp_count)
+ return [], rtmp_count
bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
width = int_or_none(video.get('width'))
height = int_or_none(video.get('height'))
@@ -1008,7 +1020,7 @@ class InfoExtractor(object):
proto = 'http'
ext = video.get('ext')
if proto == 'm3u8':
- return (self._extract_m3u8_formats(src, video_id, ext), rtmp_count)
+ return self._extract_m3u8_formats(src, video_id, ext), rtmp_count
elif proto == 'rtmp':
rtmp_count += 1
streamer = video.get('streamer') or base
@@ -1064,6 +1076,12 @@ class InfoExtractor(object):
None, '/', True, False, expire_time, '', None, None, None)
self._downloader.cookiejar.set_cookie(cookie)
+ def _get_cookies(self, url):
+ """ Return a compat_cookies.SimpleCookie with the cookies for the url """
+ req = compat_urllib_request.Request(url)
+ self._downloader.cookiejar.add_cookie_header(req)
+ return compat_cookies.SimpleCookie(req.get_header('Cookie'))
+
def get_testcases(self, include_onlymatching=False):
t = getattr(self, '_TEST', None)
if t:
@@ -1115,7 +1133,7 @@ class InfoExtractor(object):
class SearchInfoExtractor(InfoExtractor):
"""
Base class for paged search queries extractors.
- They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
+ They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
Instances should define _SEARCH_KEY and _MAX_RESULTS.
"""