X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=229de4b39a0ee6408277fcf1f999b53c1d78ee78;hb=e21c26daf9dfb56b9bc4f18f30e7f1c724c9189e;hp=dc73f34079360e4d1411dccd938ef42a53c51208;hpb=d5ae6bb50124f8320f2b492380480038c487a6d2;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index dc73f3407..229de4b39 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -42,17 +42,18 @@ from .compat import ( compat_http_client, compat_kwargs, compat_parse_qs, + compat_shlex_quote, compat_socket_create_connection, compat_str, + compat_struct_pack, compat_urllib_error, compat_urllib_parse, compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, + compat_urllib_parse_unquote_plus, compat_urllib_request, compat_urlparse, compat_xpath, - shlex_quote, - struct_pack, ) from .socks import ( @@ -104,9 +105,9 @@ KNOWN_EXTENSIONS = ( 'f4f', 'f4m', 'm3u8', 'smil') # needed for sanitizing filenames in restricted mode -ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', - itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'], - 'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy'))) +ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ', + itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'], + 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy'))) def preferredencoding(): @@ -860,9 +861,13 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3 if sys.version_info >= (3, 0): location = location.encode('iso-8859-1').decode('utf-8') + else: + location = location.decode('utf-8') location_escaped = escape_url(location) if location != location_escaped: del resp.headers['Location'] + if sys.version_info < (3, 0): + location_escaped = location_escaped.encode('utf-8') resp.headers['Location'] = location_escaped return resp @@ -882,11 +887,17 @@ def make_socks_conn_class(base_class, socks_proxy): elif url_components.scheme.lower() == 'socks4a': socks_type = ProxyType.SOCKS4A + def unquote_if_non_empty(s): + if not s: + return s + return compat_urllib_parse_unquote_plus(s) + proxy_args = ( socks_type, url_components.hostname, url_components.port or 1080, True, # Remote DNS - url_components.username, url_components.password + unquote_if_non_empty(url_components.username), + unquote_if_non_empty(url_components.password), ) class SocksConnection(base_class): @@ -1028,6 +1039,7 @@ def unified_strdate(date_str, day_first=True): format_expressions.extend([ '%d-%m-%Y', '%d.%m.%Y', + '%d.%m.%y', '%d/%m/%Y', '%d/%m/%y', '%d/%m/%Y %H:%M:%S', @@ -1048,7 +1060,10 @@ def unified_strdate(date_str, day_first=True): if upload_date is None: timetuple = email.utils.parsedate_tz(date_str) if timetuple: - upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') + try: + upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') + except ValueError: + pass if upload_date is not None: return compat_str(upload_date) @@ -1259,7 +1274,7 @@ def bytes_to_intlist(bs): def intlist_to_bytes(xs): if not xs: return b'' - return struct_pack('%dB' % len(xs), *xs) + return compat_struct_pack('%dB' % len(xs), *xs) # Cross-platform file locking @@ -1542,15 +1557,11 @@ def setproctitle(title): def remove_start(s, start): - if s.startswith(start): - return s[len(start):] - return s + return s[len(start):] if s is not None and s.startswith(start) else s def remove_end(s, end): - if s.endswith(end): - return s[:-len(end)] - return s + return s[:-len(end)] if s is not None and s.endswith(end) else s def remove_quotes(s): @@ -1904,7 +1915,7 @@ def parse_age_limit(s): def strip_jsonp(code): return re.sub( - r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code) + r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code) def js_to_json(code): @@ -1912,24 +1923,38 @@ def js_to_json(code): v = m.group(0) if v in ('true', 'false', 'null'): return v - if v.startswith('"'): - v = re.sub(r"\\'", "'", v[1:-1]) - elif v.startswith("'"): - v = v[1:-1] - v = re.sub(r"\\\\|\\'|\"", lambda m: { - '\\\\': '\\\\', - "\\'": "'", + elif v.startswith('/*') or v == ',': + return "" + + if v[0] in ("'", '"'): + v = re.sub(r'(?s)\\.|"', lambda m: { '"': '\\"', - }[m.group(0)], v) + "\\'": "'", + '\\\n': '', + '\\x': '\\u00', + }.get(m.group(0), m.group(0)), v[1:-1]) + + INTEGER_TABLE = ( + (r'^0[xX][0-9a-fA-F]+', 16), + (r'^0+[0-7]+', 8), + ) + + for regex, base in INTEGER_TABLE: + im = re.match(regex, v) + if im: + i = int(im.group(0), base) + return '"%d":' % i if v.endswith(':') else '%d' % i + return '"%s"' % v - res = re.sub(r'''(?x) - "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"| - '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'| - [a-zA-Z_][.a-zA-Z_0-9]* + return re.sub(r'''(?sx) + "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| + '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| + /\*.*?\*/|,(?=\s*[\]}])| + [a-zA-Z_][.a-zA-Z_0-9]*| + (?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?| + [0-9]+(?=\s*:) ''', fix_kv, code) - res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res) - return res def qualities(quality_ids): @@ -1977,7 +2002,7 @@ def ytdl_is_updateable(): def args_to_str(args): # Get a short string representation for a subprocess command - return ' '.join(shlex_quote(a) for a in args) + return ' '.join(compat_shlex_quote(a) for a in args) def error_to_compat_str(err): @@ -1995,6 +2020,9 @@ def mimetype2ext(mt): ext = { 'audio/mp4': 'm4a', + # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as + # it's the most popular one + 'audio/mpeg': 'mp3', }.get(mt) if ext is not None: return ext @@ -2015,11 +2043,7 @@ def mimetype2ext(mt): def urlhandle_detect_ext(url_handle): - try: - url_handle.headers - getheader = lambda h: url_handle.headers[h] - except AttributeError: # Python < 3 - getheader = url_handle.info().getheader + getheader = url_handle.headers.get cd = getheader('Content-Disposition') if cd: