from __future__ import unicode_literals
import base64
+import binascii
import calendar
import codecs
import contextlib
import zlib
from .compat import (
+ compat_HTMLParser,
compat_basestring,
compat_chr,
compat_etree_fromstring,
compat_str,
compat_urllib_error,
compat_urllib_parse,
+ compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse,
compat_urllib_request,
compat_urlparse,
+ compat_xpath,
shlex_quote,
)
compiled_regex_type = type(re.compile(''))
std_headers = {
- 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
+ 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November', 'December']
+KNOWN_EXTENSIONS = (
+ 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
+ 'flv', 'f4v', 'f4a', 'f4b',
+ 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
+ 'mkv', 'mka', 'mk3d',
+ 'avi', 'divx',
+ 'mov',
+ 'asf', 'wmv', 'wma',
+ '3gp', '3g2',
+ 'mp3',
+ 'flac',
+ 'ape',
+ 'wav',
+ 'f4f', 'f4m', 'm3u8', 'smil')
+
def preferredencoding():
"""Get preferred encoding.
def find_xpath_attr(node, xpath, key, val=None):
""" Find the xpath xpath[@key=val] """
assert re.match(r'^[a-zA-Z_-]+$', key)
- if val:
- assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
return node.find(expr)
else:
def find_xpath_attr(node, xpath, key, val=None):
- # Here comes the crazy part: In 2.6, if the xpath is a unicode,
- # .//node does not match if a node is a direct child of . !
- if isinstance(xpath, compat_str):
- xpath = xpath.encode('ascii')
-
- for f in node.findall(xpath):
+ for f in node.findall(compat_xpath(xpath)):
if key not in f.attrib:
continue
if val is None or f.attrib.get(key) == val:
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
def _find_xpath(xpath):
- if sys.version_info < (2, 7): # Crazy 2.6
- xpath = xpath.encode('ascii')
- return node.find(xpath)
+ return node.find(compat_xpath(xpath))
if isinstance(xpath, (str, compat_str)):
n = _find_xpath(xpath)
def get_element_by_id(id, html):
"""Return the content of the tag with the specified ID in the passed HTML document"""
- return get_element_by_attribute("id", id, html)
+ return get_element_by_attribute('id', id, html)
def get_element_by_attribute(attribute, value, html):
return unescapeHTML(res)
+class HTMLAttributeParser(compat_HTMLParser):
+ """Trivial HTML parser to gather the attributes for a single element"""
+ def __init__(self):
+ self.attrs = {}
+ compat_HTMLParser.__init__(self)
+
+ def handle_starttag(self, tag, attrs):
+ self.attrs = dict(attrs)
+
+
+def extract_attributes(html_element):
+ """Given a string for an HTML element such as
+ <el
+ a="foo" B="bar" c="&98;az" d=boz
+ empty= noval entity="&"
+ sq='"' dq="'"
+ >
+ Decode and return a dictionary of attributes.
+ {
+ 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
+ 'empty': '', 'noval': None, 'entity': '&',
+ 'sq': '"', 'dq': '\''
+ }.
+ NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
+ but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
+ """
+ parser = HTMLAttributeParser()
+ parser.feed(html_element)
+ parser.close()
+ return parser.attrs
+
+
def clean_html(html):
"""Clean an HTML snippet into a readable string"""
# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
+def sanitize_url(url):
+ return 'http:%s' % url if url.startswith('//') else url
+
+
def sanitized_Request(url, *args, **kwargs):
- return compat_urllib_request.Request(
- 'http:%s' % url if url.startswith('//') else url, *args, **kwargs)
+ return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
def orderedSet(iterable):
if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
return s
+ # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
+ if sys.platform.startswith('java'):
+ return s
+
return s.encode(get_subprocess_encoding(), 'ignore')
# Substitute URL if any change after escaping
if url != url_escaped:
- req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
- new_req = req_type(
- url_escaped, data=req.data, headers=req.headers,
- origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
- new_req.timeout = req.timeout
- req = new_req
+ req = update_Request(req, url=url_escaped)
for h, v in std_headers.items():
# Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
'%d %b %Y',
'%B %d %Y',
'%b %d %Y',
- '%b %dst %Y %I:%M%p',
- '%b %dnd %Y %I:%M%p',
- '%b %dth %Y %I:%M%p',
+ '%b %dst %Y %I:%M',
+ '%b %dnd %Y %I:%M',
+ '%b %dth %Y %I:%M',
'%Y %m %d',
'%Y-%m-%d',
'%Y/%m/%d',
guess = url.partition('?')[0].rpartition('.')[2]
if re.match(r'^[A-Za-z0-9]+$', guess):
return guess
- elif guess.rstrip('/') in (
- 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
- 'flv', 'f4v', 'f4a', 'f4b',
- 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
- 'mkv', 'mka', 'mk3d',
- 'avi', 'divx',
- 'mov',
- 'asf', 'wmv', 'wma',
- '3gp', '3g2',
- 'mp3',
- 'flac',
- 'ape',
- 'wav',
- 'f4f', 'f4m', 'm3u8', 'smil'):
+ # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
+ elif guess.rstrip('/') in KNOWN_EXTENSIONS:
return guess.rstrip('/')
else:
return default_ext
if sign == '-':
time = -time
unit = match.group('unit')
- # A bad aproximation?
+ # A bad approximation?
if unit == 'month':
unit = 'day'
time *= 30
unit += 's'
delta = datetime.timedelta(**{unit: time})
return today + delta
- return datetime.datetime.strptime(date_str, "%Y%m%d").date()
+ return datetime.datetime.strptime(date_str, '%Y%m%d').date()
def hyphenate_date(date_str):
GetStdHandle = ctypes.WINFUNCTYPE(
ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
- (b"GetStdHandle", ctypes.windll.kernel32))
+ (b'GetStdHandle', ctypes.windll.kernel32))
h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
WriteConsoleW = ctypes.WINFUNCTYPE(
ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
- ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
+ ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
written = ctypes.wintypes.DWORD(0)
- GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
+ GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
FILE_TYPE_CHAR = 0x0002
FILE_TYPE_REMOTE = 0x8000
GetConsoleMode = ctypes.WINFUNCTYPE(
ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
ctypes.POINTER(ctypes.wintypes.DWORD))(
- (b"GetConsoleMode", ctypes.windll.kernel32))
+ (b'GetConsoleMode', ctypes.windll.kernel32))
INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
def not_a_console(handle):
raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
else:
- import fcntl
+ # Some platforms, such as Jython, is missing fcntl
+ try:
+ import fcntl
- def _lock_file(f, exclusive):
- fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
+ def _lock_file(f, exclusive):
+ fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
- def _unlock_file(f):
- fcntl.flock(f, fcntl.LOCK_UN)
+ def _unlock_file(f):
+ fcntl.flock(f, fcntl.LOCK_UN)
+ except ImportError:
+ UNSUPPORTED_MSG = 'file locking is not supported on this platform'
+
+ def _lock_file(f, exclusive):
+ raise IOError(UNSUPPORTED_MSG)
+
+ def _unlock_file(f):
+ raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
def smuggle_url(url, data):
""" Pass additional data in a URL for internal use. """
- sdata = compat_urllib_parse.urlencode(
+ sdata = compat_urllib_parse_urlencode(
{'__youtubedl_smuggle': json.dumps(data)})
return url + '#' + sdata
return '%.2f%s' % (converted, suffix)
+def lookup_unit_table(unit_table, s):
+ units_re = '|'.join(re.escape(u) for u in unit_table)
+ m = re.match(
+ r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
+ if not m:
+ return None
+ num_str = m.group('num').replace(',', '.')
+ mult = unit_table[m.group('unit')]
+ return int(float(num_str) * mult)
+
+
def parse_filesize(s):
if s is None:
return None
- # The lower-case forms are of course incorrect and inofficial,
+ # The lower-case forms are of course incorrect and unofficial,
# but we support those too
_UNIT_TABLE = {
'B': 1,
'Yb': 1000 ** 8,
}
- units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
- m = re.match(
- r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
- if not m:
+ return lookup_unit_table(_UNIT_TABLE, s)
+
+
+def parse_count(s):
+ if s is None:
return None
- num_str = m.group('num').replace(',', '.')
- mult = _UNIT_TABLE[m.group('unit')]
- return int(float(num_str) * mult)
+ s = s.strip()
+
+ if re.match(r'^[\d,.]+$', s):
+ return str_to_int(s)
+
+ _UNIT_TABLE = {
+ 'k': 1000,
+ 'K': 1000,
+ 'm': 1000 ** 2,
+ 'M': 1000 ** 2,
+ 'kk': 1000 ** 2,
+ 'KK': 1000 ** 2,
+ }
+
+ return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name):
def setproctitle(title):
assert isinstance(title, compat_str)
+
+ # ctypes in Jython is not complete
+ # http://bugs.jython.org/issue2148
+ if sys.platform.startswith('java'):
+ return
+
try:
- libc = ctypes.cdll.LoadLibrary("libc.so.6")
+ libc = ctypes.cdll.LoadLibrary('libc.so.6')
except OSError:
return
title_bytes = title.encode('utf-8')
class HEADRequest(compat_urllib_request.Request):
def get_method(self):
- return "HEAD"
+ return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
s = s.strip()
- m = re.match(
- r'''(?ix)(?:P?T)?
- (?:
- (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
- (?P<only_hours>[0-9.]+)\s*(?:hours?)|
-
- \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
- (?:
+ days, hours, mins, secs, ms = [None] * 5
+ m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
+ if m:
+ days, hours, mins, secs, ms = m.groups()
+ else:
+ m = re.match(
+ r'''(?ix)(?:P?T)?
(?:
- (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
- (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
+ (?P<days>[0-9]+)\s*d(?:ays?)?\s*
)?
- (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
- )?
- (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
- )$''', s)
- if not m:
- return None
- res = 0
- if m.group('only_mins'):
- return float_or_none(m.group('only_mins'), invscale=60)
- if m.group('only_hours'):
- return float_or_none(m.group('only_hours'), invscale=60 * 60)
- if m.group('secs'):
- res += int(m.group('secs'))
- if m.group('mins_reversed'):
- res += int(m.group('mins_reversed')) * 60
- if m.group('mins'):
- res += int(m.group('mins')) * 60
- if m.group('hours'):
- res += int(m.group('hours')) * 60 * 60
- if m.group('hours_reversed'):
- res += int(m.group('hours_reversed')) * 60 * 60
- if m.group('days'):
- res += int(m.group('days')) * 24 * 60 * 60
- if m.group('ms'):
- res += float(m.group('ms'))
- return res
+ (?:
+ (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
+ )?
+ (?:
+ (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
+ )?
+ (?:
+ (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
+ )?$''', s)
+ if m:
+ days, hours, mins, secs, ms = m.groups()
+ else:
+ m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
+ if m:
+ hours, mins = m.groups()
+ else:
+ return None
+
+ duration = 0
+ if secs:
+ duration += float(secs)
+ if mins:
+ duration += float(mins) * 60
+ if hours:
+ duration += float(hours) * 60 * 60
+ if days:
+ duration += float(days) * 24 * 60 * 60
+ if ms:
+ duration += float(ms)
+ return duration
def prepend_extension(filename, ext, expected_real_ext=None):
class OnDemandPagedList(PagedList):
- def __init__(self, pagefunc, pagesize):
+ def __init__(self, pagefunc, pagesize, use_cache=False):
self._pagefunc = pagefunc
self._pagesize = pagesize
+ self._use_cache = use_cache
+ if use_cache:
+ self._cache = {}
def getslice(self, start=0, end=None):
res = []
if start >= nextfirstid:
continue
- page_results = list(self._pagefunc(pagenum))
+ page_results = None
+ if self._use_cache:
+ page_results = self._cache.get(pagenum)
+ if page_results is None:
+ page_results = list(self._pagefunc(pagenum))
+ if self._use_cache:
+ self._cache[pagenum] = page_results
startv = (
start % self._pagesize
"""Escape URL as suggested by RFC 3986"""
url_parsed = compat_urllib_parse_urlparse(url)
return url_parsed._replace(
+ netloc=url_parsed.netloc.encode('idna').decode('ascii'),
path=escape_rfc3986(url_parsed.path),
params=escape_rfc3986(url_parsed.params),
query=escape_rfc3986(url_parsed.query),
try:
struct.pack('!I', 0)
except TypeError:
- # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
+ # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument
+ # See https://bugs.python.org/issue19099
def struct_pack(spec, *args):
if isinstance(spec, compat_str):
spec = spec.encode('ascii')
def urlencode_postdata(*args, **kargs):
- return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
+ return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
-def encode_dict(d, encoding='utf-8'):
- def encode(v):
- return v.encode(encoding) if isinstance(v, compat_basestring) else v
- return dict((encode(k), encode(v)) for k, v in d.items())
+def update_url_query(url, query):
+ if not query:
+ return url
+ parsed_url = compat_urlparse.urlparse(url)
+ qs = compat_parse_qs(parsed_url.query)
+ qs.update(query)
+ return compat_urlparse.urlunparse(parsed_url._replace(
+ query=compat_urllib_parse_urlencode(qs, True)))
+
+
+def update_Request(req, url=None, data=None, headers={}, query={}):
+ req_headers = req.headers.copy()
+ req_headers.update(headers)
+ req_data = data or req.data
+ req_url = update_url_query(url or req.get_full_url(), query)
+ req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
+ new_req = req_type(
+ req_url, data=req_data, headers=req_headers,
+ origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
+ if hasattr(req, 'timeout'):
+ new_req.timeout = req.timeout
+ return new_req
+
+
+def dict_get(d, key_or_keys, default=None, skip_false_values=True):
+ if isinstance(key_or_keys, (list, tuple)):
+ for key in key_or_keys:
+ if key not in d or d[key] is None or skip_false_values and not d[key]:
+ continue
+ return d[key]
+ return default
+ return d.get(key_or_keys, default)
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
if s is None:
return None
m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
- return int(m.group('age')) if m else US_RATINGS.get(s, None)
+ return int(m.group('age')) if m else US_RATINGS.get(s)
def strip_jsonp(code):
return re.sub(
- r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
+ r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
def mimetype2ext(mt):
+ ext = {
+ 'audio/mp4': 'm4a',
+ }.get(mt)
+ if ext is not None:
+ return ext
+
_, _, res = mt.rpartition('/')
return {
- 'x-ms-wmv': 'wmv',
- 'x-mp4-fragmented': 'mp4',
+ '3gpp': '3gp',
+ 'smptett+xml': 'tt',
+ 'srt': 'srt',
+ 'ttaf+xml': 'dfxp',
'ttml+xml': 'ttml',
+ 'vtt': 'vtt',
+ 'x-flv': 'flv',
+ 'x-mp4-fragmented': 'mp4',
+ 'x-ms-wmv': 'wmv',
}.get(res, res)
_x = functools.partial(xpath_with_ns, ns_map={
'ttml': 'http://www.w3.org/ns/ttml',
'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
+ 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
})
- def parse_node(node):
- str_or_empty = functools.partial(str_or_none, default='')
+ class TTMLPElementParser(object):
+ out = ''
- out = str_or_empty(node.text)
+ def start(self, tag, attrib):
+ if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
+ self.out += '\n'
- for child in node:
- if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
- out += '\n' + str_or_empty(child.tail)
- elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
- out += str_or_empty(parse_node(child))
- else:
- out += str_or_empty(xml.etree.ElementTree.tostring(child))
+ def end(self, tag):
+ pass
+
+ def data(self, data):
+ self.out += data
- return out
+ def close(self):
+ return self.out.strip()
+
+ def parse_node(node):
+ target = TTMLPElementParser()
+ parser = xml.etree.ElementTree.XMLParser(target=target)
+ parser.feed(xml.etree.ElementTree.tostring(node))
+ return parser.close()
dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
out = []
- paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
+ paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
if not paras:
raise ValueError('Invalid dfxp/TTML subtitle')
return None # No Proxy
return compat_urllib_request.ProxyHandler.proxy_open(
self, req, proxy, type)
+
+
+def ohdave_rsa_encrypt(data, exponent, modulus):
+ '''
+ Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
+
+ Input:
+ data: data to encrypt, bytes-like object
+ exponent, modulus: parameter e and N of RSA algorithm, both integer
+ Output: hex string of encrypted data
+
+ Limitation: supports one block encryption only
+ '''
+
+ payload = int(binascii.hexlify(data[::-1]), 16)
+ encrypted = pow(payload, exponent, modulus)
+ return '%x' % encrypted
+
+
+def encode_base_n(num, n, table=None):
+ FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
+ if not table:
+ table = FULL_TABLE[:n]
+
+ if n > len(table):
+ raise ValueError('base %d exceeds table length %d' % (n, len(table)))
+
+ if num == 0:
+ return table[0]
+
+ ret = ''
+ while num:
+ ret = table[num % n] + ret
+ num = num // n
+ return ret
+
+
+def decode_packed_codes(code):
+ mobj = re.search(
+ r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
+ code)
+ obfucasted_code, base, count, symbols = mobj.groups()
+ base = int(base)
+ count = int(count)
+ symbols = symbols.split('|')
+ symbol_table = {}
+
+ while count:
+ count -= 1
+ base_n_count = encode_base_n(count, base)
+ symbol_table[base_n_count] = symbols[count] or base_n_count
+
+ return re.sub(
+ r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
+ obfucasted_code)