X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=7460d87017d702dce0093049379d80ab149b490b;hb=bbefcf04bf953a24d3cfaa3ac0eeab04589aed73;hp=2864e51428e69591ba9592869de7f4bbc87072f9;hpb=7d11297f3f91e6ddd3f0caa5ad4dca1a40d6c820;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 2864e5142..7460d8701 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1,6 +1,8 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +from __future__ import unicode_literals + import calendar import codecs import contextlib @@ -8,7 +10,6 @@ import ctypes import datetime import email.utils import errno -import getpass import gzip import itertools import io @@ -29,254 +30,18 @@ import traceback import xml.etree.ElementTree import zlib -try: - import urllib.request as compat_urllib_request -except ImportError: # Python 2 - import urllib2 as compat_urllib_request - -try: - import urllib.error as compat_urllib_error -except ImportError: # Python 2 - import urllib2 as compat_urllib_error - -try: - import urllib.parse as compat_urllib_parse -except ImportError: # Python 2 - import urllib as compat_urllib_parse - -try: - from urllib.parse import urlparse as compat_urllib_parse_urlparse -except ImportError: # Python 2 - from urlparse import urlparse as compat_urllib_parse_urlparse - -try: - import urllib.parse as compat_urlparse -except ImportError: # Python 2 - import urlparse as compat_urlparse - -try: - import http.cookiejar as compat_cookiejar -except ImportError: # Python 2 - import cookielib as compat_cookiejar - -try: - import html.entities as compat_html_entities -except ImportError: # Python 2 - import htmlentitydefs as compat_html_entities - -try: - import html.parser as compat_html_parser -except ImportError: # Python 2 - import HTMLParser as compat_html_parser - -try: - import http.client as compat_http_client -except ImportError: # Python 2 - import httplib as compat_http_client - -try: - from urllib.error import HTTPError as compat_HTTPError -except ImportError: # Python 2 - from urllib2 import HTTPError as compat_HTTPError - -try: - from urllib.request import urlretrieve as compat_urlretrieve -except ImportError: # Python 2 - from urllib import urlretrieve as compat_urlretrieve - - -try: - from subprocess import DEVNULL - compat_subprocess_get_DEVNULL = lambda: DEVNULL -except ImportError: - compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w') - -try: - from urllib.parse import unquote as compat_urllib_parse_unquote -except ImportError: - def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): - if string == '': - return string - res = string.split('%') - if len(res) == 1: - return string - if encoding is None: - encoding = 'utf-8' - if errors is None: - errors = 'replace' - # pct_sequence: contiguous sequence of percent-encoded bytes, decoded - pct_sequence = b'' - string = res[0] - for item in res[1:]: - try: - if not item: - raise ValueError - pct_sequence += item[:2].decode('hex') - rest = item[2:] - if not rest: - # This segment was just a single percent-encoded character. - # May be part of a sequence of code units, so delay decoding. - # (Stored in pct_sequence). - continue - except ValueError: - rest = '%' + item - # Encountered non-percent-encoded characters. Flush the current - # pct_sequence. - string += pct_sequence.decode(encoding, errors) + rest - pct_sequence = b'' - if pct_sequence: - # Flush the final pct_sequence - string += pct_sequence.decode(encoding, errors) - return string - - -try: - from urllib.parse import parse_qs as compat_parse_qs -except ImportError: # Python 2 - # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. - # Python 2's version is apparently totally broken - - def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace'): - qs, _coerce_result = qs, unicode - pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] - r = [] - for name_value in pairs: - if not name_value and not strict_parsing: - continue - nv = name_value.split('=', 1) - if len(nv) != 2: - if strict_parsing: - raise ValueError("bad query field: %r" % (name_value,)) - # Handle case of a control-name with no equal sign - if keep_blank_values: - nv.append('') - else: - continue - if len(nv[1]) or keep_blank_values: - name = nv[0].replace('+', ' ') - name = compat_urllib_parse_unquote( - name, encoding=encoding, errors=errors) - name = _coerce_result(name) - value = nv[1].replace('+', ' ') - value = compat_urllib_parse_unquote( - value, encoding=encoding, errors=errors) - value = _coerce_result(value) - r.append((name, value)) - return r - - def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace'): - parsed_result = {} - pairs = _parse_qsl(qs, keep_blank_values, strict_parsing, - encoding=encoding, errors=errors) - for name, value in pairs: - if name in parsed_result: - parsed_result[name].append(value) - else: - parsed_result[name] = [value] - return parsed_result - -try: - compat_str = unicode # Python 2 -except NameError: - compat_str = str - -try: - compat_chr = unichr # Python 2 -except NameError: - compat_chr = chr - -try: - from xml.etree.ElementTree import ParseError as compat_xml_parse_error -except ImportError: # Python 2.6 - from xml.parsers.expat import ExpatError as compat_xml_parse_error - -try: - from shlex import quote as shlex_quote -except ImportError: # Python < 3.3 - def shlex_quote(s): - return "'" + s.replace("'", "'\"'\"'") + "'" - - -def compat_ord(c): - if type(c) is int: return c - else: return ord(c) - - -if sys.version_info >= (3, 0): - compat_getenv = os.getenv - compat_expanduser = os.path.expanduser -else: - # Environment variables should be decoded with filesystem encoding. - # Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918) - - def compat_getenv(key, default=None): - env = os.getenv(key, default) - if env: - env = env.decode(get_filesystem_encoding()) - return env - - # HACK: The default implementations of os.path.expanduser from cpython do not decode - # environment variables with filesystem encoding. We will work around this by - # providing adjusted implementations. - # The following are os.path.expanduser implementations from cpython 2.7.8 stdlib - # for different platforms with correct environment variables decoding. - - if os.name == 'posix': - def compat_expanduser(path): - """Expand ~ and ~user constructions. If user or $HOME is unknown, - do nothing.""" - if not path.startswith('~'): - return path - i = path.find('/', 1) - if i < 0: - i = len(path) - if i == 1: - if 'HOME' not in os.environ: - import pwd - userhome = pwd.getpwuid(os.getuid()).pw_dir - else: - userhome = compat_getenv('HOME') - else: - import pwd - try: - pwent = pwd.getpwnam(path[1:i]) - except KeyError: - return path - userhome = pwent.pw_dir - userhome = userhome.rstrip('/') - return (userhome + path[i:]) or '/' - elif os.name == 'nt' or os.name == 'ce': - def compat_expanduser(path): - """Expand ~ and ~user constructs. - - If user or $HOME is unknown, do nothing.""" - if path[:1] != '~': - return path - i, n = 1, len(path) - while i < n and path[i] not in '/\\': - i = i + 1 - - if 'HOME' in os.environ: - userhome = compat_getenv('HOME') - elif 'USERPROFILE' in os.environ: - userhome = compat_getenv('USERPROFILE') - elif not 'HOMEPATH' in os.environ: - return path - else: - try: - drive = compat_getenv('HOMEDRIVE') - except KeyError: - drive = '' - userhome = os.path.join(drive, compat_getenv('HOMEPATH')) - - if i != 1: #~user - userhome = os.path.join(os.path.dirname(userhome), path[1:i]) - - return userhome + path[i:] - else: - compat_expanduser = os.path.expanduser +from .compat import ( + compat_chr, + compat_getenv, + compat_html_entities, + compat_parse_qs, + compat_str, + compat_urllib_error, + compat_urllib_parse, + compat_urllib_parse_urlparse, + compat_urllib_request, + compat_urlparse, +) # This is not clearly defined otherwise @@ -304,14 +69,6 @@ def preferredencoding(): return pref -if sys.version_info < (3,0): - def compat_print(s): - print(s.encode(preferredencoding(), 'xmlcharrefreplace')) -else: - def compat_print(s): - assert type(s) == type(u'') - print(s) - def write_json_file(obj, fn): """ Encode obj as JSON and write it to fn, atomically """ @@ -394,127 +151,32 @@ def xpath_text(node, xpath, name=None, fatal=False): return n.text -compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix -class BaseHTMLParser(compat_html_parser.HTMLParser): - def __init(self): - compat_html_parser.HTMLParser.__init__(self) - self.html = None - - def loads(self, html): - self.html = html - self.feed(html) - self.close() - -class AttrParser(BaseHTMLParser): - """Modified HTMLParser that isolates a tag with the specified attribute""" - def __init__(self, attribute, value): - self.attribute = attribute - self.value = value - self.result = None - self.started = False - self.depth = {} - self.watch_startpos = False - self.error_count = 0 - BaseHTMLParser.__init__(self) - - def error(self, message): - if self.error_count > 10 or self.started: - raise compat_html_parser.HTMLParseError(message, self.getpos()) - self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line - self.error_count += 1 - self.goahead(1) - - def handle_starttag(self, tag, attrs): - attrs = dict(attrs) - if self.started: - self.find_startpos(None) - if self.attribute in attrs and attrs[self.attribute] == self.value: - self.result = [tag] - self.started = True - self.watch_startpos = True - if self.started: - if not tag in self.depth: self.depth[tag] = 0 - self.depth[tag] += 1 - - def handle_endtag(self, tag): - if self.started: - if tag in self.depth: self.depth[tag] -= 1 - if self.depth[self.result[0]] == 0: - self.started = False - self.result.append(self.getpos()) - - def find_startpos(self, x): - """Needed to put the start position of the result (self.result[1]) - after the opening tag with the requested id""" - if self.watch_startpos: - self.watch_startpos = False - self.result.append(self.getpos()) - handle_entityref = handle_charref = handle_data = handle_comment = \ - handle_decl = handle_pi = unknown_decl = find_startpos - - def get_result(self): - if self.result is None: - return None - if len(self.result) != 3: - return None - lines = self.html.split('\n') - lines = lines[self.result[1][0]-1:self.result[2][0]] - lines[0] = lines[0][self.result[1][1]:] - if len(lines) == 1: - lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]] - lines[-1] = lines[-1][:self.result[2][1]] - return '\n'.join(lines).strip() -# Hack for https://github.com/rg3/youtube-dl/issues/662 -if sys.version_info < (2, 7, 3): - AttrParser.parse_endtag = (lambda self, i: - i + len("") - if self.rawdata[i:].startswith("") - else compat_html_parser.HTMLParser.parse_endtag(self, i)) - def get_element_by_id(id, html): """Return the content of the tag with the specified ID in the passed HTML document""" return get_element_by_attribute("id", id, html) + def get_element_by_attribute(attribute, value, html): """Return the content of the tag with the specified attribute in the passed HTML document""" - parser = AttrParser(attribute, value) - try: - parser.loads(html) - except compat_html_parser.HTMLParseError: - pass - return parser.get_result() -class MetaParser(BaseHTMLParser): - """ - Modified HTMLParser that isolates a meta tag with the specified name - attribute. - """ - def __init__(self, name): - BaseHTMLParser.__init__(self) - self.name = name - self.content = None - self.result = None - - def handle_starttag(self, tag, attrs): - if tag != 'meta': - return - attrs = dict(attrs) - if attrs.get('name') == self.name: - self.result = attrs.get('content') + m = re.search(r'''(?xs) + <([a-zA-Z0-9:._-]+) + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*? + \s+%s=['"]?%s['"]? + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*? + \s*> + (?P.*?) + + ''' % (re.escape(attribute), re.escape(value)), html) - def get_result(self): - return self.result + if not m: + return None + res = m.group('content') -def get_meta_content(name, html): - """ - Return the content attribute from the meta tag with the given name attribute. - """ - parser = MetaParser(name) - try: - parser.loads(html) - except compat_html_parser.HTMLParseError: - pass - return parser.get_result() + if res.startswith('"') or res.startswith("'"): + res = res[1:-1] + + return unescapeHTML(res) def clean_html(html): @@ -1181,10 +843,7 @@ def bytes_to_intlist(bs): def intlist_to_bytes(xs): if not xs: return b'' - if isinstance(chr(0), bytes): # Python 2 - return ''.join([chr(x) for x in xs]) - else: - return bytes(xs) + return struct_pack('%dB' % len(xs), *xs) # Cross-platform file locking @@ -1472,6 +1131,25 @@ def check_executable(exe, args=[]): return exe +def get_exe_version(exe, args=['--version'], + version_re=r'version\s+([0-9._-a-zA-Z]+)', + unrecognized=u'present'): + """ Returns the version of the specified executable, + or False if the executable is not present """ + try: + out, err = subprocess.Popen( + [exe] + args, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate() + except OSError: + return False + firstline = out.partition(b'\n')[0].decode('ascii', 'ignore') + m = re.search(version_re, firstline) + if m: + return m.group(1) + else: + return unrecognized + + class PagedList(object): def __len__(self): # This is only useful for tests @@ -1562,7 +1240,7 @@ def escape_rfc3986(s): """Escape non-ASCII characters as suggested by RFC 3986""" if sys.version_info < (3, 0) and isinstance(s, unicode): s = s.encode('utf-8') - return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]") + return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]") def escape_url(url): @@ -1636,15 +1314,6 @@ def parse_xml(s): return tree -if sys.version_info < (3, 0) and sys.platform == 'win32': - def compat_getpass(prompt, *args, **kwargs): - if isinstance(prompt, compat_str): - prompt = prompt.encode(preferredencoding()) - return getpass.getpass(prompt, *args, **kwargs) -else: - compat_getpass = getpass.getpass - - US_RATINGS = { 'G': 0, 'PG': 10, @@ -1662,7 +1331,8 @@ def parse_age_limit(s): def strip_jsonp(code): - return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code) + return re.sub( + r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code) def js_to_json(code): @@ -1702,18 +1372,6 @@ def qualities(quality_ids): DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s' -try: - subprocess_check_output = subprocess.check_output -except AttributeError: - def subprocess_check_output(*args, **kwargs): - assert 'input' not in kwargs - p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs) - output, _ = p.communicate() - ret = p.poll() - if ret: - raise subprocess.CalledProcessError(ret, p.args, output=output) - return output - def limit_length(s, length): """ Add ellipses to overly long strings """