X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=1013f7c1879e0afac035327e1e60c13ee295ee21;hb=5268a05e4722d74e125a97b023d92943745bb249;hp=edeee1853e30c2b409fe53809ae7912e7966446c;hpb=ca75235d3db0d9349c2365a6d2cbaf10c2098063;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index edeee1853..1013f7c18 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -37,6 +37,7 @@ from .compat import ( compat_chr, compat_html_entities, compat_http_client, + compat_kwargs, compat_parse_qs, compat_socket_create_connection, compat_str, @@ -114,7 +115,7 @@ def write_json_file(obj, fn): 'encoding': 'utf-8', }) - tf = tempfile.NamedTemporaryFile(**args) + tf = tempfile.NamedTemporaryFile(**compat_kwargs(args)) try: with tf: @@ -371,6 +372,18 @@ def unescapeHTML(s): r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s) +def get_subprocess_encoding(): + if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: + # For subprocess calls, encode with locale encoding + # Refer to http://stackoverflow.com/a/9951851/35070 + encoding = preferredencoding() + else: + encoding = sys.getfilesystemencoding() + if encoding is None: + encoding = 'utf-8' + return encoding + + def encodeFilename(s, for_subprocess=False): """ @param s The name of the file @@ -382,21 +395,24 @@ def encodeFilename(s, for_subprocess=False): if sys.version_info >= (3, 0): return s - if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: - # Pass '' directly to use Unicode APIs on Windows 2000 and up - # (Detecting Windows NT 4 is tricky because 'major >= 4' would - # match Windows 9x series as well. Besides, NT 4 is obsolete.) - if not for_subprocess: - return s - else: - # For subprocess calls, encode with locale encoding - # Refer to http://stackoverflow.com/a/9951851/35070 - encoding = preferredencoding() - else: - encoding = sys.getfilesystemencoding() - if encoding is None: - encoding = 'utf-8' - return s.encode(encoding, 'ignore') + # Pass '' directly to use Unicode APIs on Windows 2000 and up + # (Detecting Windows NT 4 is tricky because 'major >= 4' would + # match Windows 9x series as well. Besides, NT 4 is obsolete.) + if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: + return s + + return s.encode(get_subprocess_encoding(), 'ignore') + + +def decodeFilename(b, for_subprocess=False): + + if sys.version_info >= (3, 0): + return b + + if not isinstance(b, bytes): + return b + + return b.decode(get_subprocess_encoding(), 'ignore') def encodeArgument(s): @@ -408,6 +424,10 @@ def encodeArgument(s): return encodeFilename(s, True) +def decodeArgument(b): + return decodeFilename(b, True) + + def decodeOption(optval): if optval is None: return optval @@ -1109,15 +1129,6 @@ def shell_quote(args): return ' '.join(quoted_args) -def takewhile_inclusive(pred, seq): - """ Like itertools.takewhile, but include the latest evaluated element - (the first element so that Not pred(e)) """ - for e in seq: - yield e - if not pred(e): - return - - def smuggle_url(url, data): """ Pass additional data in a URL for internal use. """ @@ -1338,9 +1349,19 @@ def parse_duration(s): return res -def prepend_extension(filename, ext): +def prepend_extension(filename, ext, expected_real_ext=None): name, real_ext = os.path.splitext(filename) - return '{0}.{1}{2}'.format(name, ext, real_ext) + return ( + '{0}.{1}{2}'.format(name, ext, real_ext) + if not expected_real_ext or real_ext[1:] == expected_real_ext + else '{0}.{1}'.format(filename, ext)) + + +def replace_extension(filename, ext, expected_real_ext=None): + name, real_ext = os.path.splitext(filename) + return '{0}.{1}'.format( + name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename, + ext) def check_executable(exe, args=[]): @@ -1465,6 +1486,14 @@ def uppercase_escape(s): s) +def lowercase_escape(s): + unicode_escape = codecs.getdecoder('unicode_escape') + return re.sub( + r'\\u[0-9a-fA-F]{4}', + lambda m: unicode_escape(m.group(0))[0], + s) + + def escape_rfc3986(s): """Escape non-ASCII characters as suggested by RFC 3986""" if sys.version_info < (3, 0) and isinstance(s, compat_str): @@ -1800,6 +1829,59 @@ def match_filter_func(filter_str): return _match_func +def parse_dfxp_time_expr(time_expr): + if not time_expr: + return 0.0 + + mobj = re.match(r'^(?P\d+(?:\.\d+)?)s?$', time_expr) + if mobj: + return float(mobj.group('time_offset')) + + mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr) + if mobj: + return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3)) + + +def format_srt_time(seconds): + (mins, secs) = divmod(seconds, 60) + (hours, mins) = divmod(mins, 60) + millisecs = (secs - int(secs)) * 1000 + secs = int(secs) + return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millisecs) + + +def dfxp2srt(dfxp_data): + _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'}) + + def parse_node(node): + str_or_empty = functools.partial(str_or_none, default='') + + out = str_or_empty(node.text) + + for child in node: + if child.tag == _x('ttml:br'): + out += '\n' + str_or_empty(child.tail) + elif child.tag == _x('ttml:span'): + out += str_or_empty(parse_node(child)) + else: + out += str_or_empty(xml.etree.ElementTree.tostring(child)) + + return out + + dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8')) + out = [] + paras = dfxp.findall(_x('.//ttml:p')) + + for para, index in zip(paras, itertools.count(1)): + out.append('%d\n%s --> %s\n%s\n\n' % ( + index, + format_srt_time(parse_dfxp_time_expr(para.attrib.get('begin'))), + format_srt_time(parse_dfxp_time_expr(para.attrib.get('end'))), + parse_node(para))) + + return ''.join(out) + + class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): def __init__(self, proxies=None): # Set default handlers