X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=a2746b2d1f0106ab9d9e651250a070df34af2d11;hb=aa5740fb61d388754e9278a3e38de12203c1b89d;hp=52f0dd09aac2ef0103212086a280fc317b36b82d;hpb=af14ded75e10653b4713c23f8c428c6cd88610ad;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 52f0dd09a..a2746b2d1 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -37,6 +37,7 @@ from .compat import ( compat_chr, compat_html_entities, compat_http_client, + compat_kwargs, compat_parse_qs, compat_socket_create_connection, compat_str, @@ -114,7 +115,7 @@ def write_json_file(obj, fn): 'encoding': 'utf-8', }) - tf = tempfile.NamedTemporaryFile(**args) + tf = tempfile.NamedTemporaryFile(**compat_kwargs(args)) try: with tf: @@ -312,27 +313,20 @@ def sanitize_path(s): """Sanitizes and normalizes path on Windows""" if sys.platform != 'win32': return s - drive, _ = os.path.splitdrive(s) - unc, _ = os.path.splitunc(s) - unc_or_drive = unc or drive - norm_path = os.path.normpath(remove_start(s, unc_or_drive)).split(os.path.sep) - if unc_or_drive: + drive_or_unc, _ = os.path.splitdrive(s) + if sys.version_info < (2, 7) and not drive_or_unc: + drive_or_unc, _ = os.path.splitunc(s) + norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep) + if drive_or_unc: norm_path.pop(0) sanitized_path = [ path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part) for path_part in norm_path] - if unc_or_drive: - sanitized_path.insert(0, unc_or_drive + os.path.sep) + if drive_or_unc: + sanitized_path.insert(0, drive_or_unc + os.path.sep) return os.path.join(*sanitized_path) -def sanitize_url_path_consecutive_slashes(url): - """Collapses consecutive slashes in URLs' path""" - parsed_url = list(compat_urlparse.urlparse(url)) - parsed_url[2] = re.sub(r'/{2,}', '/', parsed_url[2]) - return compat_urlparse.urlunparse(parsed_url) - - def orderedSet(iterable): """ Remove all duplicates from the input iterable """ res = [] @@ -371,6 +365,18 @@ def unescapeHTML(s): r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s) +def get_subprocess_encoding(): + if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: + # For subprocess calls, encode with locale encoding + # Refer to http://stackoverflow.com/a/9951851/35070 + encoding = preferredencoding() + else: + encoding = sys.getfilesystemencoding() + if encoding is None: + encoding = 'utf-8' + return encoding + + def encodeFilename(s, for_subprocess=False): """ @param s The name of the file @@ -382,21 +388,24 @@ def encodeFilename(s, for_subprocess=False): if sys.version_info >= (3, 0): return s - if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: - # Pass '' directly to use Unicode APIs on Windows 2000 and up - # (Detecting Windows NT 4 is tricky because 'major >= 4' would - # match Windows 9x series as well. Besides, NT 4 is obsolete.) - if not for_subprocess: - return s - else: - # For subprocess calls, encode with locale encoding - # Refer to http://stackoverflow.com/a/9951851/35070 - encoding = preferredencoding() - else: - encoding = sys.getfilesystemencoding() - if encoding is None: - encoding = 'utf-8' - return s.encode(encoding, 'ignore') + # Pass '' directly to use Unicode APIs on Windows 2000 and up + # (Detecting Windows NT 4 is tricky because 'major >= 4' would + # match Windows 9x series as well. Besides, NT 4 is obsolete.) + if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: + return s + + return s.encode(get_subprocess_encoding(), 'ignore') + + +def decodeFilename(b, for_subprocess=False): + + if sys.version_info >= (3, 0): + return b + + if not isinstance(b, bytes): + return b + + return b.decode(get_subprocess_encoding(), 'ignore') def encodeArgument(s): @@ -408,6 +417,10 @@ def encodeArgument(s): return encodeFilename(s, True) +def decodeArgument(b): + return decodeFilename(b, True) + + def decodeOption(optval): if optval is None: return optval @@ -452,6 +465,17 @@ def make_HTTPS_handler(params, **kwargs): return YoutubeDLHTTPSHandler(params, context=context, **kwargs) +def bug_reports_message(): + if ytdl_is_updateable(): + update_cmd = 'type youtube-dl -U to update' + else: + update_cmd = 'see https://yt-dl.org/update on how to update' + msg = '; please report this issue on https://yt-dl.org/bug .' + msg += ' Make sure you are using the latest version; %s.' % update_cmd + msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.' + return msg + + class ExtractorError(Exception): """Error during info extraction.""" @@ -467,13 +491,7 @@ class ExtractorError(Exception): if cause: msg += ' (caused by %r)' % cause if not expected: - if ytdl_is_updateable(): - update_cmd = 'type youtube-dl -U to update' - else: - update_cmd = 'see https://yt-dl.org/update on how to update' - msg += '; please report this issue on https://yt-dl.org/bug .' - msg += ' Make sure you are using the latest version; %s.' % update_cmd - msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.' + msg += bug_reports_message() super(ExtractorError, self).__init__(msg) self.traceback = tb @@ -1104,15 +1122,6 @@ def shell_quote(args): return ' '.join(quoted_args) -def takewhile_inclusive(pred, seq): - """ Like itertools.takewhile, but include the latest evaluated element - (the first element so that Not pred(e)) """ - for e in seq: - yield e - if not pred(e): - return - - def smuggle_url(url, data): """ Pass additional data in a URL for internal use. """ @@ -1333,9 +1342,19 @@ def parse_duration(s): return res -def prepend_extension(filename, ext): +def prepend_extension(filename, ext, expected_real_ext=None): + name, real_ext = os.path.splitext(filename) + return ( + '{0}.{1}{2}'.format(name, ext, real_ext) + if not expected_real_ext or real_ext[1:] == expected_real_ext + else '{0}.{1}'.format(filename, ext)) + + +def replace_extension(filename, ext, expected_real_ext=None): name, real_ext = os.path.splitext(filename) - return '{0}.{1}{2}'.format(name, ext, real_ext) + return '{0}.{1}'.format( + name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename, + ext) def check_executable(exe, args=[]): @@ -1354,7 +1373,7 @@ def get_exe_version(exe, args=['--version'], or False if the executable is not present """ try: out, _ = subprocess.Popen( - [exe] + args, + [encodeArgument(exe)] + args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate() except OSError: return False @@ -1460,6 +1479,14 @@ def uppercase_escape(s): s) +def lowercase_escape(s): + unicode_escape = codecs.getdecoder('unicode_escape') + return re.sub( + r'\\u[0-9a-fA-F]{4}', + lambda m: unicode_escape(m.group(0))[0], + s) + + def escape_rfc3986(s): """Escape non-ASCII characters as suggested by RFC 3986""" if sys.version_info < (3, 0) and isinstance(s, compat_str): @@ -1638,6 +1665,7 @@ def mimetype2ext(mt): return { 'x-ms-wmv': 'wmv', 'x-mp4-fragmented': 'mp4', + 'ttml+xml': 'ttml', }.get(res, res) @@ -1795,6 +1823,267 @@ def match_filter_func(filter_str): return _match_func +def parse_dfxp_time_expr(time_expr): + if not time_expr: + return 0.0 + + mobj = re.match(r'^(?P\d+(?:\.\d+)?)s?$', time_expr) + if mobj: + return float(mobj.group('time_offset')) + + mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr) + if mobj: + return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3)) + + +def srt_subtitles_timecode(seconds): + return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000) + + +def dfxp2srt(dfxp_data): + _x = functools.partial(xpath_with_ns, ns_map={ + 'ttml': 'http://www.w3.org/ns/ttml', + 'ttaf1': 'http://www.w3.org/2006/10/ttaf1', + }) + + def parse_node(node): + str_or_empty = functools.partial(str_or_none, default='') + + out = str_or_empty(node.text) + + for child in node: + if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'): + out += '\n' + str_or_empty(child.tail) + elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'): + out += str_or_empty(parse_node(child)) + else: + out += str_or_empty(xml.etree.ElementTree.tostring(child)) + + return out + + dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8')) + out = [] + paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p') + + if not paras: + raise ValueError('Invalid dfxp/TTML subtitle') + + for para, index in zip(paras, itertools.count(1)): + begin_time = parse_dfxp_time_expr(para.attrib['begin']) + end_time = parse_dfxp_time_expr(para.attrib.get('end')) + if not end_time: + end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur']) + out.append('%d\n%s --> %s\n%s\n\n' % ( + index, + srt_subtitles_timecode(begin_time), + srt_subtitles_timecode(end_time), + parse_node(para))) + + return ''.join(out) + + +class ISO639Utils(object): + # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt + _lang_map = { + 'aa': 'aar', + 'ab': 'abk', + 'ae': 'ave', + 'af': 'afr', + 'ak': 'aka', + 'am': 'amh', + 'an': 'arg', + 'ar': 'ara', + 'as': 'asm', + 'av': 'ava', + 'ay': 'aym', + 'az': 'aze', + 'ba': 'bak', + 'be': 'bel', + 'bg': 'bul', + 'bh': 'bih', + 'bi': 'bis', + 'bm': 'bam', + 'bn': 'ben', + 'bo': 'bod', + 'br': 'bre', + 'bs': 'bos', + 'ca': 'cat', + 'ce': 'che', + 'ch': 'cha', + 'co': 'cos', + 'cr': 'cre', + 'cs': 'ces', + 'cu': 'chu', + 'cv': 'chv', + 'cy': 'cym', + 'da': 'dan', + 'de': 'deu', + 'dv': 'div', + 'dz': 'dzo', + 'ee': 'ewe', + 'el': 'ell', + 'en': 'eng', + 'eo': 'epo', + 'es': 'spa', + 'et': 'est', + 'eu': 'eus', + 'fa': 'fas', + 'ff': 'ful', + 'fi': 'fin', + 'fj': 'fij', + 'fo': 'fao', + 'fr': 'fra', + 'fy': 'fry', + 'ga': 'gle', + 'gd': 'gla', + 'gl': 'glg', + 'gn': 'grn', + 'gu': 'guj', + 'gv': 'glv', + 'ha': 'hau', + 'he': 'heb', + 'hi': 'hin', + 'ho': 'hmo', + 'hr': 'hrv', + 'ht': 'hat', + 'hu': 'hun', + 'hy': 'hye', + 'hz': 'her', + 'ia': 'ina', + 'id': 'ind', + 'ie': 'ile', + 'ig': 'ibo', + 'ii': 'iii', + 'ik': 'ipk', + 'io': 'ido', + 'is': 'isl', + 'it': 'ita', + 'iu': 'iku', + 'ja': 'jpn', + 'jv': 'jav', + 'ka': 'kat', + 'kg': 'kon', + 'ki': 'kik', + 'kj': 'kua', + 'kk': 'kaz', + 'kl': 'kal', + 'km': 'khm', + 'kn': 'kan', + 'ko': 'kor', + 'kr': 'kau', + 'ks': 'kas', + 'ku': 'kur', + 'kv': 'kom', + 'kw': 'cor', + 'ky': 'kir', + 'la': 'lat', + 'lb': 'ltz', + 'lg': 'lug', + 'li': 'lim', + 'ln': 'lin', + 'lo': 'lao', + 'lt': 'lit', + 'lu': 'lub', + 'lv': 'lav', + 'mg': 'mlg', + 'mh': 'mah', + 'mi': 'mri', + 'mk': 'mkd', + 'ml': 'mal', + 'mn': 'mon', + 'mr': 'mar', + 'ms': 'msa', + 'mt': 'mlt', + 'my': 'mya', + 'na': 'nau', + 'nb': 'nob', + 'nd': 'nde', + 'ne': 'nep', + 'ng': 'ndo', + 'nl': 'nld', + 'nn': 'nno', + 'no': 'nor', + 'nr': 'nbl', + 'nv': 'nav', + 'ny': 'nya', + 'oc': 'oci', + 'oj': 'oji', + 'om': 'orm', + 'or': 'ori', + 'os': 'oss', + 'pa': 'pan', + 'pi': 'pli', + 'pl': 'pol', + 'ps': 'pus', + 'pt': 'por', + 'qu': 'que', + 'rm': 'roh', + 'rn': 'run', + 'ro': 'ron', + 'ru': 'rus', + 'rw': 'kin', + 'sa': 'san', + 'sc': 'srd', + 'sd': 'snd', + 'se': 'sme', + 'sg': 'sag', + 'si': 'sin', + 'sk': 'slk', + 'sl': 'slv', + 'sm': 'smo', + 'sn': 'sna', + 'so': 'som', + 'sq': 'sqi', + 'sr': 'srp', + 'ss': 'ssw', + 'st': 'sot', + 'su': 'sun', + 'sv': 'swe', + 'sw': 'swa', + 'ta': 'tam', + 'te': 'tel', + 'tg': 'tgk', + 'th': 'tha', + 'ti': 'tir', + 'tk': 'tuk', + 'tl': 'tgl', + 'tn': 'tsn', + 'to': 'ton', + 'tr': 'tur', + 'ts': 'tso', + 'tt': 'tat', + 'tw': 'twi', + 'ty': 'tah', + 'ug': 'uig', + 'uk': 'ukr', + 'ur': 'urd', + 'uz': 'uzb', + 've': 'ven', + 'vi': 'vie', + 'vo': 'vol', + 'wa': 'wln', + 'wo': 'wol', + 'xh': 'xho', + 'yi': 'yid', + 'yo': 'yor', + 'za': 'zha', + 'zh': 'zho', + 'zu': 'zul', + } + + @classmethod + def short2long(cls, code): + """Convert language code from ISO 639-1 to ISO 639-2/T""" + return cls._lang_map.get(code[:2]) + + @classmethod + def long2short(cls, code): + """Convert language code from ISO 639-2/T to ISO 639-1""" + for short_name, long_name in cls._lang_map.items(): + if long_name == code: + return short_name + + class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): def __init__(self, proxies=None): # Set default handlers