X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=b0255c943008b50b4af428d218be322b5aa68248;hb=8c25f81beea169c9d6540eea1a6f71dc045da6ed;hp=1081a93680c7dc1d054748bcffd68cca50cf6b28;hpb=fe556f1b0cfd5782ec379a731f4b8879f2a352a3;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 1081a9368..b0255c943 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -24,176 +24,25 @@ import socket import struct import subprocess import sys +import tempfile import traceback import xml.etree.ElementTree import zlib -try: - import urllib.request as compat_urllib_request -except ImportError: # Python 2 - import urllib2 as compat_urllib_request - -try: - import urllib.error as compat_urllib_error -except ImportError: # Python 2 - import urllib2 as compat_urllib_error - -try: - import urllib.parse as compat_urllib_parse -except ImportError: # Python 2 - import urllib as compat_urllib_parse - -try: - from urllib.parse import urlparse as compat_urllib_parse_urlparse -except ImportError: # Python 2 - from urlparse import urlparse as compat_urllib_parse_urlparse - -try: - import urllib.parse as compat_urlparse -except ImportError: # Python 2 - import urlparse as compat_urlparse - -try: - import http.cookiejar as compat_cookiejar -except ImportError: # Python 2 - import cookielib as compat_cookiejar - -try: - import html.entities as compat_html_entities -except ImportError: # Python 2 - import htmlentitydefs as compat_html_entities - -try: - import html.parser as compat_html_parser -except ImportError: # Python 2 - import HTMLParser as compat_html_parser +from .compat import ( + compat_chr, + compat_getenv, + compat_html_entities, + compat_html_parser, + compat_parse_qs, + compat_str, + compat_urllib_error, + compat_urllib_parse, + compat_urllib_parse_urlparse, + compat_urllib_request, + compat_urlparse, +) -try: - import http.client as compat_http_client -except ImportError: # Python 2 - import httplib as compat_http_client - -try: - from urllib.error import HTTPError as compat_HTTPError -except ImportError: # Python 2 - from urllib2 import HTTPError as compat_HTTPError - -try: - from urllib.request import urlretrieve as compat_urlretrieve -except ImportError: # Python 2 - from urllib import urlretrieve as compat_urlretrieve - - -try: - from subprocess import DEVNULL - compat_subprocess_get_DEVNULL = lambda: DEVNULL -except ImportError: - compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w') - -try: - from urllib.parse import unquote as compat_urllib_parse_unquote -except ImportError: - def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): - if string == '': - return string - res = string.split('%') - if len(res) == 1: - return string - if encoding is None: - encoding = 'utf-8' - if errors is None: - errors = 'replace' - # pct_sequence: contiguous sequence of percent-encoded bytes, decoded - pct_sequence = b'' - string = res[0] - for item in res[1:]: - try: - if not item: - raise ValueError - pct_sequence += item[:2].decode('hex') - rest = item[2:] - if not rest: - # This segment was just a single percent-encoded character. - # May be part of a sequence of code units, so delay decoding. - # (Stored in pct_sequence). - continue - except ValueError: - rest = '%' + item - # Encountered non-percent-encoded characters. Flush the current - # pct_sequence. - string += pct_sequence.decode(encoding, errors) + rest - pct_sequence = b'' - if pct_sequence: - # Flush the final pct_sequence - string += pct_sequence.decode(encoding, errors) - return string - - -try: - from urllib.parse import parse_qs as compat_parse_qs -except ImportError: # Python 2 - # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. - # Python 2's version is apparently totally broken - - def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace'): - qs, _coerce_result = qs, unicode - pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] - r = [] - for name_value in pairs: - if not name_value and not strict_parsing: - continue - nv = name_value.split('=', 1) - if len(nv) != 2: - if strict_parsing: - raise ValueError("bad query field: %r" % (name_value,)) - # Handle case of a control-name with no equal sign - if keep_blank_values: - nv.append('') - else: - continue - if len(nv[1]) or keep_blank_values: - name = nv[0].replace('+', ' ') - name = compat_urllib_parse_unquote( - name, encoding=encoding, errors=errors) - name = _coerce_result(name) - value = nv[1].replace('+', ' ') - value = compat_urllib_parse_unquote( - value, encoding=encoding, errors=errors) - value = _coerce_result(value) - r.append((name, value)) - return r - - def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace'): - parsed_result = {} - pairs = _parse_qsl(qs, keep_blank_values, strict_parsing, - encoding=encoding, errors=errors) - for name, value in pairs: - if name in parsed_result: - parsed_result[name].append(value) - else: - parsed_result[name] = [value] - return parsed_result - -try: - compat_str = unicode # Python 2 -except NameError: - compat_str = str - -try: - compat_chr = unichr # Python 2 -except NameError: - compat_chr = chr - -try: - from xml.etree.ElementTree import ParseError as compat_xml_parse_error -except ImportError: # Python 2.6 - from xml.parsers.expat import ExpatError as compat_xml_parse_error - -def compat_ord(c): - if type(c) is int: return c - else: return ord(c) # This is not clearly defined otherwise compiled_regex_type = type(re.compile('')) @@ -220,26 +69,42 @@ def preferredencoding(): return pref -if sys.version_info < (3,0): - def compat_print(s): - print(s.encode(preferredencoding(), 'xmlcharrefreplace')) -else: - def compat_print(s): - assert type(s) == type(u'') - print(s) - -# In Python 2.x, json.dump expects a bytestream. -# In Python 3.x, it writes to a character stream -if sys.version_info < (3,0): - def write_json_file(obj, fn): - with open(fn, 'wb') as f: - json.dump(obj, f) -else: - def write_json_file(obj, fn): - with open(fn, 'w', encoding='utf-8') as f: - json.dump(obj, f) -if sys.version_info >= (2,7): +def write_json_file(obj, fn): + """ Encode obj as JSON and write it to fn, atomically """ + + args = { + 'suffix': '.tmp', + 'prefix': os.path.basename(fn) + '.', + 'dir': os.path.dirname(fn), + 'delete': False, + } + + # In Python 2.x, json.dump expects a bytestream. + # In Python 3.x, it writes to a character stream + if sys.version_info < (3, 0): + args['mode'] = 'wb' + else: + args.update({ + 'mode': 'w', + 'encoding': 'utf-8', + }) + + tf = tempfile.NamedTemporaryFile(**args) + + try: + with tf: + json.dump(obj, tf) + os.rename(tf.name, fn) + except: + try: + os.remove(tf.name) + except OSError: + pass + raise + + +if sys.version_info >= (2, 7): def find_xpath_attr(node, xpath, key, val): """ Find the xpath xpath[@key=val] """ assert re.match(r'^[a-zA-Z-]+$', key) @@ -248,6 +113,11 @@ if sys.version_info >= (2,7): return node.find(expr) else: def find_xpath_attr(node, xpath, key, val): + # Here comes the crazy part: In 2.6, if the xpath is a unicode, + # .//node does not match if a node is a direct child of . ! + if isinstance(xpath, unicode): + xpath = xpath.encode('ascii') + for f in node.findall(xpath): if f.attrib.get(key) == val: return f @@ -266,30 +136,20 @@ def xpath_with_ns(path, ns_map): replaced.append('{%s}%s' % (ns_map[ns], tag)) return '/'.join(replaced) -def htmlentity_transform(matchobj): - """Transforms an HTML entity to a character. - - This function receives a match object and is intended to be used with - the re.sub() function. - """ - entity = matchobj.group(1) - # Known non-numeric HTML entity - if entity in compat_html_entities.name2codepoint: - return compat_chr(compat_html_entities.name2codepoint[entity]) +def xpath_text(node, xpath, name=None, fatal=False): + if sys.version_info < (2, 7): # Crazy 2.6 + xpath = xpath.encode('ascii') - mobj = re.match(u'(?u)#(x?\\d+)', entity) - if mobj is not None: - numstr = mobj.group(1) - if numstr.startswith(u'x'): - base = 16 - numstr = u'0%s' % numstr + n = node.find(xpath) + if n is None: + if fatal: + name = xpath if name is None else name + raise ExtractorError('Could not find XML element %s' % name) else: - base = 10 - return compat_chr(int(numstr, base)) + return None + return n.text - # Unknown entity in name, return its literal representation - return (u'&%s;' % entity) compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix class BaseHTMLParser(compat_html_parser.HTMLParser): @@ -511,13 +371,33 @@ def orderedSet(iterable): return res +def _htmlentity_transform(entity): + """Transforms an HTML entity to a character.""" + # Known non-numeric HTML entity + if entity in compat_html_entities.name2codepoint: + return compat_chr(compat_html_entities.name2codepoint[entity]) + + mobj = re.match(r'#(x?[0-9]+)', entity) + if mobj is not None: + numstr = mobj.group(1) + if numstr.startswith(u'x'): + base = 16 + numstr = u'0%s' % numstr + else: + base = 10 + return compat_chr(int(numstr, base)) + + # Unknown entity in name, return its literal representation + return (u'&%s;' % entity) + + def unescapeHTML(s): if s is None: return None assert type(s) == compat_str - result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s) - return result + return re.sub( + r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s) def encodeFilename(s, for_subprocess=False): @@ -589,7 +469,7 @@ def make_HTTPS_handler(opts_no_check_certificate, **kwargs): self.sock = sock self._tunnel() try: - self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3) + self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1) except ssl.SSLError: self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23) @@ -597,8 +477,14 @@ def make_HTTPS_handler(opts_no_check_certificate, **kwargs): def https_open(self, req): return self.do_open(HTTPSConnectionV3, req) return HTTPSHandlerV3(**kwargs) - else: - context = ssl.SSLContext(ssl.PROTOCOL_SSLv3) + elif hasattr(ssl, 'create_default_context'): # Python >= 3.4 + context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) + context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3 + if opts_no_check_certificate: + context.verify_mode = ssl.CERT_NONE + return compat_urllib_request.HTTPSHandler(context=context, **kwargs) + else: # Python < 3.4 + context = ssl.SSLContext(ssl.PROTOCOL_SSLv23) context.verify_mode = (ssl.CERT_NONE if opts_no_check_certificate else ssl.CERT_REQUIRED) @@ -620,6 +506,8 @@ class ExtractorError(Exception): expected = True if video_id is not None: msg = video_id + ': ' + msg + if cause: + msg += u' (caused by %r)' % cause if not expected: msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.' super(ExtractorError, self).__init__(msg) @@ -734,10 +622,9 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): return ret def http_request(self, req): - for h,v in std_headers.items(): - if h in req.headers: - del req.headers[h] - req.add_header(h, v) + for h, v in std_headers.items(): + if h not in req.headers: + req.add_header(h, v) if 'Youtubedl-no-compression' in req.headers: if 'Accept-encoding' in req.headers: del req.headers['Accept-encoding'] @@ -747,6 +634,12 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): del req.headers['User-agent'] req.headers['User-agent'] = req.headers['Youtubedl-user-agent'] del req.headers['Youtubedl-user-agent'] + + if sys.version_info < (2, 7) and '#' in req.get_full_url(): + # Python 2.6 is brain-dead when it comes to fragments + req._Request__original = req._Request__original.partition('#')[0] + req._Request__r_type = req._Request__r_type.partition('#')[0] + return req def http_response(self, req, resp): @@ -789,7 +682,7 @@ def parse_iso8601(date_str, delimiter='T'): return None m = re.search( - r'Z$| ?(?P\+|-)(?P[0-9]{2}):?(?P[0-9]{2})$', + r'(\.[0-9]+)?(?:Z$| ?(?P\+|-)(?P[0-9]{2}):?(?P[0-9]{2})$)', date_str) if not m: timezone = datetime.timedelta() @@ -802,7 +695,7 @@ def parse_iso8601(date_str, delimiter='T'): timezone = datetime.timedelta( hours=sign * int(m.group('hours')), minutes=sign * int(m.group('minutes'))) - date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) + date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) dt = datetime.datetime.strptime(date_str, date_format) - timezone return calendar.timegm(dt.timetuple()) @@ -830,8 +723,11 @@ def unified_strdate(date_str): '%Y/%m/%d', '%d.%m.%Y', '%d/%m/%Y', + '%d/%m/%y', '%Y/%m/%d %H:%M:%S', + '%d/%m/%Y %H:%M:%S', '%Y-%m-%d %H:%M:%S', + '%Y-%m-%d %H:%M:%S.%f', '%d.%m.%Y %H:%M', '%d.%m.%Y %H.%M', '%Y-%m-%dT%H:%M:%SZ', @@ -1048,12 +944,6 @@ def intlist_to_bytes(xs): return bytes(xs) -def get_cachedir(params={}): - cache_root = os.environ.get('XDG_CACHE_HOME', - os.path.expanduser('~/.cache')) - return params.get('cachedir', os.path.join(cache_root, 'youtube-dl')) - - # Cross-platform file locking if sys.platform == 'win32': import ctypes.wintypes @@ -1113,10 +1003,10 @@ else: import fcntl def _lock_file(f, exclusive): - fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) + fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) def _unlock_file(f): - fcntl.lockf(f, fcntl.LOCK_UN) + fcntl.flock(f, fcntl.LOCK_UN) class locked_file(object): @@ -1150,11 +1040,14 @@ class locked_file(object): return self.f.read(*args) +def get_filesystem_encoding(): + encoding = sys.getfilesystemencoding() + return encoding if encoding is not None else 'utf-8' + + def shell_quote(args): quoted_args = [] - encoding = sys.getfilesystemencoding() - if encoding is None: - encoding = 'utf-8' + encoding = get_filesystem_encoding() for a in args: if isinstance(a, bytes): # We may get a filename encoded with 'encodeFilename' @@ -1204,7 +1097,7 @@ def format_bytes(bytes): def get_term_width(): - columns = os.environ.get('COLUMNS', None) + columns = compat_getenv('COLUMNS', None) if columns: return int(columns) @@ -1260,6 +1153,12 @@ def remove_start(s, start): return s +def remove_end(s, end): + if s.endswith(end): + return s[:-len(end)] + return s + + def url_basename(url): path = compat_urlparse.urlparse(url).path return path.strip(u'/').split(u'/')[-1] @@ -1284,9 +1183,10 @@ def str_or_none(v, default=None): def str_to_int(int_str): + """ A more relaxed version of int_or_none """ if int_str is None: return None - int_str = re.sub(r'[,\.]', u'', int_str) + int_str = re.sub(r'[,\.\+]', u'', int_str) return int(int_str) @@ -1298,8 +1198,10 @@ def parse_duration(s): if s is None: return None + s = s.strip() + m = re.match( - r'(?:(?:(?P[0-9]+)[:h])?(?P[0-9]+)[:m])?(?P[0-9]+)s?(?::[0-9]+)?$', s) + r'(?i)(?:(?:(?P[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P[0-9]+)(?P\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s) if not m: return None res = int(m.group('secs')) @@ -1307,6 +1209,8 @@ def parse_duration(s): res += int(m.group('mins')) * 60 if m.group('hours'): res += int(m.group('hours')) * 60 * 60 + if m.group('ms'): + res += float(m.group('ms')) return res @@ -1325,15 +1229,36 @@ def check_executable(exe, args=[]): return exe -class PagedList(object): - def __init__(self, pagefunc, pagesize): - self._pagefunc = pagefunc - self._pagesize = pagesize +def get_exe_version(exe, args=['--version'], + version_re=r'version\s+([0-9._-a-zA-Z]+)', + unrecognized=u'present'): + """ Returns the version of the specified executable, + or False if the executable is not present """ + try: + out, err = subprocess.Popen( + [exe] + args, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate() + except OSError: + return False + firstline = out.partition(b'\n')[0].decode('ascii', 'ignore') + m = re.search(version_re, firstline) + if m: + return m.group(1) + else: + return unrecognized + +class PagedList(object): def __len__(self): # This is only useful for tests return len(self.getslice()) + +class OnDemandPagedList(PagedList): + def __init__(self, pagefunc, pagesize): + self._pagefunc = pagefunc + self._pagesize = pagesize + def getslice(self, start=0, end=None): res = [] for pagenum in itertools.count(start // self._pagesize): @@ -1372,6 +1297,35 @@ class PagedList(object): return res +class InAdvancePagedList(PagedList): + def __init__(self, pagefunc, pagecount, pagesize): + self._pagefunc = pagefunc + self._pagecount = pagecount + self._pagesize = pagesize + + def getslice(self, start=0, end=None): + res = [] + start_page = start // self._pagesize + end_page = ( + self._pagecount if end is None else (end // self._pagesize + 1)) + skip_elems = start - start_page * self._pagesize + only_more = None if end is None else end - start + for pagenum in range(start_page, end_page): + page = list(self._pagefunc(pagenum)) + if skip_elems: + page = page[skip_elems:] + skip_elems = None + if only_more is not None: + if len(page) < only_more: + only_more -= len(page) + else: + page = page[:only_more] + res.extend(page) + break + res.extend(page) + return res + + def uppercase_escape(s): unicode_escape = codecs.getdecoder('unicode_escape') return re.sub( @@ -1379,6 +1333,24 @@ def uppercase_escape(s): lambda m: unicode_escape(m.group(0))[0], s) + +def escape_rfc3986(s): + """Escape non-ASCII characters as suggested by RFC 3986""" + if sys.version_info < (3, 0) and isinstance(s, unicode): + s = s.encode('utf-8') + return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]") + + +def escape_url(url): + """Escape URL as suggested by RFC 3986""" + url_parsed = compat_urllib_parse_urlparse(url) + return url_parsed._replace( + path=escape_rfc3986(url_parsed.path), + params=escape_rfc3986(url_parsed.params), + query=escape_rfc3986(url_parsed.query), + fragment=escape_rfc3986(url_parsed.fragment) + ).geturl() + try: struct.pack(u'!I', 0) except TypeError: @@ -1417,6 +1389,12 @@ def urlencode_postdata(*args, **kargs): return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii') +try: + etree_iter = xml.etree.ElementTree.Element.iter +except AttributeError: # Python <=2.6 + etree_iter = lambda n: n.findall('.//*') + + def parse_xml(s): class TreeBuilder(xml.etree.ElementTree.TreeBuilder): def doctype(self, name, pubid, system): @@ -1424,16 +1402,14 @@ def parse_xml(s): parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder()) kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {} - return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs) - - -if sys.version_info < (3, 0) and sys.platform == 'win32': - def compat_getpass(prompt, *args, **kwargs): - if isinstance(prompt, compat_str): - prompt = prompt.encode(preferredencoding()) - return getpass.getpass(prompt, *args, **kwargs) -else: - compat_getpass = getpass.getpass + tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs) + # Fix up XML parser in Python 2.x + if sys.version_info < (3, 0): + for n in etree_iter(tree): + if n.text is not None: + if not isinstance(n.text, compat_str): + n.text = n.text.decode('utf-8') + return tree US_RATINGS = { @@ -1445,10 +1421,42 @@ US_RATINGS = { } +def parse_age_limit(s): + if s is None: + return None + m = re.match(r'^(?P\d{1,2})\+?$', s) + return int(m.group('age')) if m else US_RATINGS.get(s, None) + + def strip_jsonp(code): return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code) +def js_to_json(code): + def fix_kv(m): + v = m.group(0) + if v in ('true', 'false', 'null'): + return v + if v.startswith('"'): + return v + if v.startswith("'"): + v = v[1:-1] + v = re.sub(r"\\\\|\\'|\"", lambda m: { + '\\\\': '\\\\', + "\\'": "'", + '"': '\\"', + }[m.group(0)], v) + return '"%s"' % v + + res = re.sub(r'''(?x) + "(?:[^"\\]*(?:\\\\|\\")?)*"| + '(?:[^'\\]*(?:\\\\|\\')?)*'| + [a-zA-Z_][a-zA-Z_0-9]* + ''', fix_kv, code) + res = re.sub(r',(\s*\])', lambda m: m.group(1), res) + return res + + def qualities(quality_ids): """ Get a numeric quality value out of a list of possible values """ def q(qid): @@ -1461,14 +1469,25 @@ def qualities(quality_ids): DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s' -try: - subprocess_check_output = subprocess.check_output -except AttributeError: - def subprocess_check_output(*args, **kwargs): - assert 'input' not in kwargs - p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs) - output, _ = p.communicate() - ret = p.poll() - if ret: - raise subprocess.CalledProcessError(ret, p.args, output=output) - return output + +def limit_length(s, length): + """ Add ellipses to overly long strings """ + if s is None: + return None + ELLIPSES = '...' + if len(s) > length: + return s[:length - len(ELLIPSES)] + ELLIPSES + return s + + +def version_tuple(v): + return [int(e) for e in v.split('.')] + + +def is_outdated_version(version, limit, assume_new=True): + if not version: + return not assume_new + try: + return version_tuple(version) < version_tuple(limit) + except ValueError: + return not assume_new