_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import contextlib
  11 import ctypes
  12 import datetime
  13 import email.utils
  14 import errno
  15 import functools
  16 import gzip
  17 import io
  18 import itertools
  19 import json
  20 import locale
  21 import math
  22 import operator
  23 import os
  24 import pipes
  25 import platform
  26 import re
  27 import socket
  28 import ssl
  29 import subprocess
  30 import sys
  31 import tempfile
  32 import traceback
  33 import xml.etree.ElementTree
  34 import zlib
  35
  36 from .compat import (
  37     compat_HTMLParser,
  38     compat_basestring,
  39     compat_chr,
  40     compat_etree_fromstring,
  41     compat_html_entities,
  42     compat_html_entities_html5,
  43     compat_http_client,
  44     compat_kwargs,
  45     compat_os_name,
  46     compat_parse_qs,
  47     compat_shlex_quote,
  48     compat_socket_create_connection,
  49     compat_str,
  50     compat_struct_pack,
  51     compat_struct_unpack,
  52     compat_urllib_error,
  53     compat_urllib_parse,
  54     compat_urllib_parse_urlencode,
  55     compat_urllib_parse_urlparse,
  56     compat_urllib_parse_unquote_plus,
  57     compat_urllib_request,
  58     compat_urlparse,
  59     compat_xpath,
  60 )
  61
  62 from .socks import (
  63     ProxyType,
  64     sockssocket,
  65 )
  66
  67
  68 def register_socks_protocols():
  69     # "Register" SOCKS protocols
  70     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  71     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  72     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  73         if scheme not in compat_urlparse.uses_netloc:
  74             compat_urlparse.uses_netloc.append(scheme)
  75
  76
# The type of compiled regular expression objects; the re module does not
# expose a clearly defined public name for it, so derive it from an instance.
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers. YoutubeDLHandler.http_request() adds each of
# these to an outgoing request that does not already carry that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Alternative User-Agent strings, keyed by browser name.
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Sentinel meaning "no default was supplied"; it lets callers pass None as a
# real default value (see the xpath_* helpers, which compare with `is`).
NO_DEFAULT = object()

# English month names in calendar order (index 0 is January).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names in calendar order, keyed by language code.
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# File extensions (without the leading dot) that are recognised as media or
# manifest formats.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. The keys and values
# are paired positionally by zip(); multi-letter replacements ('AE', 'OE',
# 'ss', 'ae', 'oe') are wrapped in lists so that they stay single items.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
 126
 127 DATE_FORMATS = (
 128     '%d %B %Y',
 129     '%d %b %Y',
 130     '%B %d %Y',
 131     '%b %d %Y',
 132     '%b %dst %Y %I:%M',
 133     '%b %dnd %Y %I:%M',
 134     '%b %dth %Y %I:%M',
 135     '%Y %m %d',
 136     '%Y-%m-%d',
 137     '%Y/%m/%d',
 138     '%Y/%m/%d %H:%M',
 139     '%Y/%m/%d %H:%M:%S',
 140     '%Y-%m-%d %H:%M:%S',
 141     '%Y-%m-%d %H:%M:%S.%f',
 142     '%d.%m.%Y %H:%M',
 143     '%d.%m.%Y %H.%M',
 144     '%Y-%m-%dT%H:%M:%SZ',
 145     '%Y-%m-%dT%H:%M:%S.%fZ',
 146     '%Y-%m-%dT%H:%M:%S.%f0Z',
 147     '%Y-%m-%dT%H:%M:%S',
 148     '%Y-%m-%dT%H:%M:%S.%f',
 149     '%Y-%m-%dT%H:%M',
 150     '%b %d %Y at %H:%M',
 151     '%b %d %Y at %H:%M:%S',
 152 )
 153
 154 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 155 DATE_FORMATS_DAY_FIRST.extend([
 156     '%d-%m-%Y',
 157     '%d.%m.%Y',
 158     '%d.%m.%y',
 159     '%d/%m/%Y',
 160     '%d/%m/%y',
 161     '%d/%m/%Y %H:%M:%S',
 162 ])
 163
 164 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 165 DATE_FORMATS_MONTH_FIRST.extend([
 166     '%m-%d-%Y',
 167     '%m.%d.%Y',
 168     '%m/%d/%Y',
 169     '%m/%d/%y',
 170     '%m/%d/%Y %H:%M:%S',
 171 ])
 172
 173 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
 174
 175
 176 def preferredencoding():
 177     """Get preferred encoding.
 178
 179     Returns the best encoding scheme for the system, based on
 180     locale.getpreferredencoding() and some further tweaks.
 181     """
 182     try:
 183         pref = locale.getpreferredencoding()
 184         'TEST'.encode(pref)
 185     except Exception:
 186         pref = 'UTF-8'
 187
 188     return pref
 189
 190
 191 def write_json_file(obj, fn):
 192     """ Encode obj as JSON and write it to fn, atomically if possible """
 193
 194     fn = encodeFilename(fn)
 195     if sys.version_info < (3, 0) and sys.platform != 'win32':
 196         encoding = get_filesystem_encoding()
 197         # os.path.basename returns a bytes object, but NamedTemporaryFile
 198         # will fail if the filename contains non ascii characters unless we
 199         # use a unicode object
 200         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 201         # the same for os.path.dirname
 202         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 203     else:
 204         path_basename = os.path.basename
 205         path_dirname = os.path.dirname
 206
 207     args = {
 208         'suffix': '.tmp',
 209         'prefix': path_basename(fn) + '.',
 210         'dir': path_dirname(fn),
 211         'delete': False,
 212     }
 213
 214     # In Python 2.x, json.dump expects a bytestream.
 215     # In Python 3.x, it writes to a character stream
 216     if sys.version_info < (3, 0):
 217         args['mode'] = 'wb'
 218     else:
 219         args.update({
 220             'mode': 'w',
 221             'encoding': 'utf-8',
 222         })
 223
 224     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 225
 226     try:
 227         with tf:
 228             json.dump(obj, tf)
 229         if sys.platform == 'win32':
 230             # Need to remove existing file on Windows, else os.rename raises
 231             # WindowsError or FileExistsError.
 232             try:
 233                 os.unlink(fn)
 234             except OSError:
 235                 pass
 236         os.rename(tf.name, fn)
 237     except Exception:
 238         try:
 239             os.remove(tf.name)
 240         except OSError:
 241             pass
 242         raise
 243
 244
 245 if sys.version_info >= (2, 7):
 246     def find_xpath_attr(node, xpath, key, val=None):
 247         """ Find the xpath xpath[@key=val] """
 248         assert re.match(r'^[a-zA-Z_-]+$', key)
 249         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 250         return node.find(expr)
 251 else:
 252     def find_xpath_attr(node, xpath, key, val=None):
 253         for f in node.findall(compat_xpath(xpath)):
 254             if key not in f.attrib:
 255                 continue
 256             if val is None or f.attrib.get(key) == val:
 257                 return f
 258         return None
 259
 260 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 261 # the namespace parameter
 262
 263
 264 def xpath_with_ns(path, ns_map):
 265     components = [c.split(':') for c in path.split('/')]
 266     replaced = []
 267     for c in components:
 268         if len(c) == 1:
 269             replaced.append(c[0])
 270         else:
 271             ns, tag = c
 272             replaced.append('{%s}%s' % (ns_map[ns], tag))
 273     return '/'.join(replaced)
 274
 275
 276 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 277     def _find_xpath(xpath):
 278         return node.find(compat_xpath(xpath))
 279
 280     if isinstance(xpath, (str, compat_str)):
 281         n = _find_xpath(xpath)
 282     else:
 283         for xp in xpath:
 284             n = _find_xpath(xp)
 285             if n is not None:
 286                 break
 287
 288     if n is None:
 289         if default is not NO_DEFAULT:
 290             return default
 291         elif fatal:
 292             name = xpath if name is None else name
 293             raise ExtractorError('Could not find XML element %s' % name)
 294         else:
 295             return None
 296     return n
 297
 298
 299 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 300     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 301     if n is None or n == default:
 302         return n
 303     if n.text is None:
 304         if default is not NO_DEFAULT:
 305             return default
 306         elif fatal:
 307             name = xpath if name is None else name
 308             raise ExtractorError('Could not find XML element\'s text %s' % name)
 309         else:
 310             return None
 311     return n.text
 312
 313
 314 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 315     n = find_xpath_attr(node, xpath, key)
 316     if n is None:
 317         if default is not NO_DEFAULT:
 318             return default
 319         elif fatal:
 320             name = '%s[@%s]' % (xpath, key) if name is None else name
 321             raise ExtractorError('Could not find XML attribute %s' % name)
 322         else:
 323             return None
 324     return n.attrib[key]
 325
 326
 327 def get_element_by_id(id, html):
 328     """Return the content of the tag with the specified ID in the passed HTML document"""
 329     return get_element_by_attribute('id', id, html)
 330
 331
 332 def get_element_by_class(class_name, html):
 333     return get_element_by_attribute(
 334         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 335         html, escape_value=False)
 336
 337
 338 def get_element_by_attribute(attribute, value, html, escape_value=True):
 339     """Return the content of the tag with the specified attribute in the passed HTML document"""
 340
 341     value = re.escape(value) if escape_value else value
 342
 343     m = re.search(r'''(?xs)
 344         <([a-zA-Z0-9:._-]+)
 345          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 346          \s+%s=['"]?%s['"]?
 347          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 348         \s*>
 349         (?P<content>.*?)
 350         </\1>
 351     ''' % (re.escape(attribute), value), html)
 352
 353     if not m:
 354         return None
 355     res = m.group('content')
 356
 357     if res.startswith('"') or res.startswith("'"):
 358         res = res[1:-1]
 359
 360     return unescapeHTML(res)
 361
 362
 363 class HTMLAttributeParser(compat_HTMLParser):
 364     """Trivial HTML parser to gather the attributes for a single element"""
 365     def __init__(self):
 366         self.attrs = {}
 367         compat_HTMLParser.__init__(self)
 368
 369     def handle_starttag(self, tag, attrs):
 370         self.attrs = dict(attrs)
 371
 372
 373 def extract_attributes(html_element):
 374     """Given a string for an HTML element such as
 375     <el
 376          a="foo" B="bar" c="&98;az" d=boz
 377          empty= noval entity="&amp;"
 378          sq='"' dq="'"
 379     >
 380     Decode and return a dictionary of attributes.
 381     {
 382         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 383         'empty': '', 'noval': None, 'entity': '&',
 384         'sq': '"', 'dq': '\''
 385     }.
 386     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 387     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 388     """
 389     parser = HTMLAttributeParser()
 390     parser.feed(html_element)
 391     parser.close()
 392     return parser.attrs
 393
 394
 395 def clean_html(html):
 396     """Clean an HTML snippet into a readable string"""
 397
 398     if html is None:  # Convenience for sanitizing descriptions etc.
 399         return html
 400
 401     # Newline vs <br />
 402     html = html.replace('\n', ' ')
 403     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 404     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 405     # Strip html tags
 406     html = re.sub('<.*?>', '', html)
 407     # Replace html entities
 408     html = unescapeHTML(html)
 409     return html.strip()
 410
 411
 412 def sanitize_open(filename, open_mode):
 413     """Try to open the given filename, and slightly tweak it if this fails.
 414
 415     Attempts to open the given filename. If this fails, it tries to change
 416     the filename slightly, step by step, until it's either able to open it
 417     or it fails and raises a final exception, like the standard open()
 418     function.
 419
 420     It returns the tuple (stream, definitive_file_name).
 421     """
 422     try:
 423         if filename == '-':
 424             if sys.platform == 'win32':
 425                 import msvcrt
 426                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 427             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 428         stream = open(encodeFilename(filename), open_mode)
 429         return (stream, filename)
 430     except (IOError, OSError) as err:
 431         if err.errno in (errno.EACCES,):
 432             raise
 433
 434         # In case of error, try to remove win32 forbidden chars
 435         alt_filename = sanitize_path(filename)
 436         if alt_filename == filename:
 437             raise
 438         else:
 439             # An exception here should be caught in the caller
 440             stream = open(encodeFilename(alt_filename), open_mode)
 441             return (stream, alt_filename)
 442
 443
 444 def timeconvert(timestr):
 445     """Convert RFC 2822 defined time string into system timestamp"""
 446     timestamp = None
 447     timetuple = email.utils.parsedate_tz(timestr)
 448     if timetuple is not None:
 449         timestamp = email.utils.mktime_tz(timetuple)
 450     return timestamp
 451
 452
 453 def sanitize_filename(s, restricted=False, is_id=False):
 454     """Sanitizes a string so it could be used as part of a filename.
 455     If restricted is set, use a stricter subset of allowed characters.
 456     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 457     """
 458     def replace_insane(char):
 459         if restricted and char in ACCENT_CHARS:
 460             return ACCENT_CHARS[char]
 461         if char == '?' or ord(char) < 32 or ord(char) == 127:
 462             return ''
 463         elif char == '"':
 464             return '' if restricted else '\''
 465         elif char == ':':
 466             return '_-' if restricted else ' -'
 467         elif char in '\\/|*<>':
 468             return '_'
 469         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 470             return '_'
 471         if restricted and ord(char) > 127:
 472             return '_'
 473         return char
 474
 475     # Handle timestamps
 476     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 477     result = ''.join(map(replace_insane, s))
 478     if not is_id:
 479         while '__' in result:
 480             result = result.replace('__', '_')
 481         result = result.strip('_')
 482         # Common case of "Foreign band name - English song title"
 483         if restricted and result.startswith('-_'):
 484             result = result[2:]
 485         if result.startswith('-'):
 486             result = '_' + result[len('-'):]
 487         result = result.lstrip('.')
 488         if not result:
 489             result = '_'
 490     return result
 491
 492
 493 def sanitize_path(s):
 494     """Sanitizes and normalizes path on Windows"""
 495     if sys.platform != 'win32':
 496         return s
 497     drive_or_unc, _ = os.path.splitdrive(s)
 498     if sys.version_info < (2, 7) and not drive_or_unc:
 499         drive_or_unc, _ = os.path.splitunc(s)
 500     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 501     if drive_or_unc:
 502         norm_path.pop(0)
 503     sanitized_path = [
 504         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
 505         for path_part in norm_path]
 506     if drive_or_unc:
 507         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 508     return os.path.join(*sanitized_path)
 509
 510
 511 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
 512 # unwanted failures due to missing protocol
 513 def sanitize_url(url):
 514     return 'http:%s' % url if url.startswith('//') else url
 515
 516
 517 def sanitized_Request(url, *args, **kwargs):
 518     return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
 519
 520
 521 def orderedSet(iterable):
 522     """ Remove all duplicates from the input iterable """
 523     res = []
 524     for el in iterable:
 525         if el not in res:
 526             res.append(el)
 527     return res
 528
 529
 530 def _htmlentity_transform(entity_with_semicolon):
 531     """Transforms an HTML entity to a character."""
 532     entity = entity_with_semicolon[:-1]
 533
 534     # Known non-numeric HTML entity
 535     if entity in compat_html_entities.name2codepoint:
 536         return compat_chr(compat_html_entities.name2codepoint[entity])
 537
 538     # TODO: HTML5 allows entities without a semicolon. For example,
 539     # '&Eacuteric' should be decoded as 'Éric'.
 540     if entity_with_semicolon in compat_html_entities_html5:
 541         return compat_html_entities_html5[entity_with_semicolon]
 542
 543     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 544     if mobj is not None:
 545         numstr = mobj.group(1)
 546         if numstr.startswith('x'):
 547             base = 16
 548             numstr = '0%s' % numstr
 549         else:
 550             base = 10
 551         # See https://github.com/rg3/youtube-dl/issues/7518
 552         try:
 553             return compat_chr(int(numstr, base))
 554         except ValueError:
 555             pass
 556
 557     # Unknown entity in name, return its literal representation
 558     return '&%s;' % entity
 559
 560
 561 def unescapeHTML(s):
 562     if s is None:
 563         return None
 564     assert type(s) == compat_str
 565
 566     return re.sub(
 567         r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 568
 569
 570 def get_subprocess_encoding():
 571     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 572         # For subprocess calls, encode with locale encoding
 573         # Refer to http://stackoverflow.com/a/9951851/35070
 574         encoding = preferredencoding()
 575     else:
 576         encoding = sys.getfilesystemencoding()
 577     if encoding is None:
 578         encoding = 'utf-8'
 579     return encoding
 580
 581
 582 def encodeFilename(s, for_subprocess=False):
 583     """
 584     @param s The name of the file
 585     """
 586
 587     assert type(s) == compat_str
 588
 589     # Python 3 has a Unicode API
 590     if sys.version_info >= (3, 0):
 591         return s
 592
 593     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 594     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 595     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 596     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 597         return s
 598
 599     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 600     if sys.platform.startswith('java'):
 601         return s
 602
 603     return s.encode(get_subprocess_encoding(), 'ignore')
 604
 605
 606 def decodeFilename(b, for_subprocess=False):
 607
 608     if sys.version_info >= (3, 0):
 609         return b
 610
 611     if not isinstance(b, bytes):
 612         return b
 613
 614     return b.decode(get_subprocess_encoding(), 'ignore')
 615
 616
 617 def encodeArgument(s):
 618     if not isinstance(s, compat_str):
 619         # Legacy code that uses byte strings
 620         # Uncomment the following line after fixing all post processors
 621         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 622         s = s.decode('ascii')
 623     return encodeFilename(s, True)
 624
 625
 626 def decodeArgument(b):
 627     return decodeFilename(b, True)
 628
 629
 630 def decodeOption(optval):
 631     if optval is None:
 632         return optval
 633     if isinstance(optval, bytes):
 634         optval = optval.decode(preferredencoding())
 635
 636     assert isinstance(optval, compat_str)
 637     return optval
 638
 639
 640 def formatSeconds(secs):
 641     if secs > 3600:
 642         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 643     elif secs > 60:
 644         return '%d:%02d' % (secs // 60, secs % 60)
 645     else:
 646         return '%d' % secs
 647
 648
 649 def make_HTTPS_handler(params, **kwargs):
 650     opts_no_check_certificate = params.get('nocheckcertificate', False)
 651     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 652         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 653         if opts_no_check_certificate:
 654             context.check_hostname = False
 655             context.verify_mode = ssl.CERT_NONE
 656         try:
 657             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 658         except TypeError:
 659             # Python 2.7.8
 660             # (create_default_context present but HTTPSHandler has no context=)
 661             pass
 662
 663     if sys.version_info < (3, 2):
 664         return YoutubeDLHTTPSHandler(params, **kwargs)
 665     else:  # Python < 3.4
 666         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 667         context.verify_mode = (ssl.CERT_NONE
 668                                if opts_no_check_certificate
 669                                else ssl.CERT_REQUIRED)
 670         context.set_default_verify_paths()
 671         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 672
 673
 674 def bug_reports_message():
 675     if ytdl_is_updateable():
 676         update_cmd = 'type  youtube-dl -U  to update'
 677     else:
 678         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 679     msg = '; please report this issue on https://yt-dl.org/bug .'
 680     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 681     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 682     return msg
 683
 684
 685 class ExtractorError(Exception):
 686     """Error during info extraction."""
 687
 688     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 689         """ tb, if given, is the original traceback (so that it can be printed out).
 690         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 691         """
 692
 693         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 694             expected = True
 695         if video_id is not None:
 696             msg = video_id + ': ' + msg
 697         if cause:
 698             msg += ' (caused by %r)' % cause
 699         if not expected:
 700             msg += bug_reports_message()
 701         super(ExtractorError, self).__init__(msg)
 702
 703         self.traceback = tb
 704         self.exc_info = sys.exc_info()  # preserve original exception
 705         self.cause = cause
 706         self.video_id = video_id
 707
 708     def format_traceback(self):
 709         if self.traceback is None:
 710             return None
 711         return ''.join(traceback.format_tb(self.traceback))
 712
 713
 714 class UnsupportedError(ExtractorError):
 715     def __init__(self, url):
 716         super(UnsupportedError, self).__init__(
 717             'Unsupported URL: %s' % url, expected=True)
 718         self.url = url
 719
 720
 721 class RegexNotFoundError(ExtractorError):
 722     """Error when a regex didn't match"""
 723     pass
 724
 725
 726 class DownloadError(Exception):
 727     """Download Error exception.
 728
 729     This exception may be thrown by FileDownloader objects if they are not
 730     configured to continue on errors. They will contain the appropriate
 731     error message.
 732     """
 733
 734     def __init__(self, msg, exc_info=None):
 735         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 736         super(DownloadError, self).__init__(msg)
 737         self.exc_info = exc_info
 738
 739
 740 class SameFileError(Exception):
 741     """Same File exception.
 742
 743     This exception will be thrown by FileDownloader objects if they detect
 744     multiple files would have to be downloaded to the same file on disk.
 745     """
 746     pass
 747
 748
 749 class PostProcessingError(Exception):
 750     """Post Processing exception.
 751
 752     This exception may be raised by PostProcessor's .run() method to
 753     indicate an error in the postprocessing task.
 754     """
 755
 756     def __init__(self, msg):
 757         self.msg = msg
 758
 759
 760 class MaxDownloadsReached(Exception):
 761     """ --max-downloads limit has been reached. """
 762     pass
 763
 764
 765 class UnavailableVideoError(Exception):
 766     """Unavailable Format exception.
 767
 768     This exception will be thrown when a video is requested
 769     in a format that is not available for that video.
 770     """
 771     pass
 772
 773
 774 class ContentTooShortError(Exception):
 775     """Content Too Short exception.
 776
 777     This exception may be raised by FileDownloader objects when a file they
 778     download is too small for what the server announced first, indicating
 779     the connection was probably interrupted.
 780     """
 781
 782     def __init__(self, downloaded, expected):
 783         # Both in bytes
 784         self.downloaded = downloaded
 785         self.expected = expected
 786
 787
 788 class XAttrMetadataError(Exception):
 789     def __init__(self, code=None, msg='Unknown error'):
 790         super(XAttrMetadataError, self).__init__(msg)
 791         self.code = code
 792         self.msg = msg
 793
 794         # Parsing code and msg
 795         if (self.code in (errno.ENOSPC, errno.EDQUOT) or
 796                 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
 797             self.reason = 'NO_SPACE'
 798         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
 799             self.reason = 'VALUE_TOO_LONG'
 800         else:
 801             self.reason = 'NOT_SUPPORTED'
 802
 803
 804 class XAttrUnavailableError(Exception):
 805     pass
 806
 807
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Create an HTTP(S) connection object, honouring the 'source_address' option.

    ydl_handler supplies the options through its ``_params`` mapping,
    http_class is the connection class to instantiate with *args/**kwargs,
    and is_https tells whether the Python 2.6 fallback has to wrap the
    socket with SSL. Returns the connection object.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        # Port 0 is used together with the requested local address.
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # The connection object has no source_address attribute here, so
            # replace its connect() with one that binds the socket itself.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            # Bind hc as 'self', because the function is set on the instance
            # rather than on the class.
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
 833
 834
 835 def handle_youtubedl_headers(headers):
 836     filtered_headers = headers
 837
 838     if 'Youtubedl-no-compression' in filtered_headers:
 839         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 840         del filtered_headers['Youtubedl-no-compression']
 841
 842     return filtered_headers
 843
 844
 845 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 846     """Handler for HTTP requests and responses.
 847
 848     This class, when installed with an OpenerDirector, automatically adds
 849     the standard headers to every HTTP request and handles gzipped and
 850     deflated responses from web servers. If compression is to be avoided in
 851     a particular request, the original request in the program code only has
 852     to include the HTTP header "Youtubedl-no-compression", which will be
 853     removed before making the real request.
 854
 855     Part of this code was copied from:
 856
 857     http://techknack.net/python-urllib2-handlers/
 858
 859     Andrew Rowls, the author of that code, agreed to release it to the
 860     public domain.
 861     """
 862
 863     def __init__(self, params, *args, **kwargs):
 864         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
 865         self._params = params
 866
 867     def http_open(self, req):
 868         conn_class = compat_http_client.HTTPConnection
 869
 870         socks_proxy = req.headers.get('Ytdl-socks-proxy')
 871         if socks_proxy:
 872             conn_class = make_socks_conn_class(conn_class, socks_proxy)
 873             del req.headers['Ytdl-socks-proxy']
 874
 875         return self.do_open(functools.partial(
 876             _create_http_connection, self, conn_class, False),
 877             req)
 878
 879     @staticmethod
 880     def deflate(data):
 881         try:
 882             return zlib.decompress(data, -zlib.MAX_WBITS)
 883         except zlib.error:
 884             return zlib.decompress(data)
 885
 886     @staticmethod
 887     def addinfourl_wrapper(stream, headers, url, code):
 888         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 889             return compat_urllib_request.addinfourl(stream, headers, url, code)
 890         ret = compat_urllib_request.addinfourl(stream, headers, url)
 891         ret.code = code
 892         return ret
 893
    def http_request(self, req):
        """Prepare an outgoing request and return it.

        The URL is percent-escaped (replacing the request object if that
        changes it), the standard headers are added where they are missing,
        and the internal 'Youtubedl-no-compression' marker is processed.
        """
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        # Drops Accept-Encoding (and the marker itself) when the request
        # carries the 'Youtubedl-no-compression' marker header.
        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            # Cut the fragment off the request's name-mangled private attributes.
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req
 924
    def http_response(self, req, resp):
        """Post-process a response and return the (possibly re-wrapped) response.

        Bodies with a 'Content-encoding' of gzip or deflate are decompressed
        in memory and wrapped in a new response object built with
        addinfourl_wrapper(); the Content-encoding header is removed
        afterwards.  For 3xx responses the Location header is percent-escaped.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; the first
                # truncation that decompresses cleanly wins
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the original failure
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                # Only touch the header when escaping changed something
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp
 971
    # HTTPS requests and responses go through the same processing as HTTP ones
    https_request = http_request
    https_response = http_response
 974
 975
 976 def make_socks_conn_class(base_class, socks_proxy):
 977     assert issubclass(base_class, (
 978         compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
 979
 980     url_components = compat_urlparse.urlparse(socks_proxy)
 981     if url_components.scheme.lower() == 'socks5':
 982         socks_type = ProxyType.SOCKS5
 983     elif url_components.scheme.lower() in ('socks', 'socks4'):
 984         socks_type = ProxyType.SOCKS4
 985     elif url_components.scheme.lower() == 'socks4a':
 986         socks_type = ProxyType.SOCKS4A
 987
 988     def unquote_if_non_empty(s):
 989         if not s:
 990             return s
 991         return compat_urllib_parse_unquote_plus(s)
 992
 993     proxy_args = (
 994         socks_type,
 995         url_components.hostname, url_components.port or 1080,
 996         True,  # Remote DNS
 997         unquote_if_non_empty(url_components.username),
 998         unquote_if_non_empty(url_components.password),
 999     )
1000
1001     class SocksConnection(base_class):
1002         def connect(self):
1003             self.sock = sockssocket()
1004             self.sock.setproxy(*proxy_args)
1005             if type(self.timeout) in (int, float):
1006                 self.sock.settimeout(self.timeout)
1007             self.sock.connect((self.host, self.port))
1008
1009             if isinstance(self, compat_http_client.HTTPSConnection):
1010                 if hasattr(self, '_context'):  # Python > 2.6
1011                     self.sock = self._context.wrap_socket(
1012                         self.sock, server_hostname=self.host)
1013                 else:
1014                     self.sock = ssl.wrap_socket(self.sock)
1015
1016     return SocksConnection
1017
1018
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler with a configurable connection class and SOCKS support."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """Store params and the connection class (HTTPSConnection by default)."""
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req over HTTPS, optionally through a SOCKS proxy."""
        connection_cls = self._https_conn_class

        # Forward only the SSL settings that this handler instance exposes:
        # '_context' exists on python > 2.6, '_check_hostname' on python 3.x
        open_kwargs = {}
        for attr_name, kwarg_name in (
                ('_context', 'context'),
                ('_check_hostname', 'check_hostname')):
            if hasattr(self, attr_name):
                open_kwargs[kwarg_name] = getattr(self, attr_name)

        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            connection_cls = make_socks_conn_class(connection_cls, proxy_url)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, connection_cls, True)
        return self.do_open(connection_factory, req, **open_kwargs)
1042
1043
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that applies the same handling to HTTP and HTTPS."""

    def __init__(self, cookiejar=None):
        """Initialise the underlying HTTPCookieProcessor with cookiejar."""
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        """Delegate to HTTPCookieProcessor.http_response.

        NOTE: the Set-Cookie escaping workaround described below is
        currently commented out, so this method is a plain pass-through.
        """
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # HTTPS traffic uses the parent's request hook and the response hook above
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1066
1067
def extract_timezone(date_str):
    """Split a trailing UTC offset off date_str.

    Returns a (timezone, date_str) tuple: timezone is a datetime.timedelta
    and date_str is the input with the recognised suffix removed.  A
    trailing 'Z' or the absence of a recognisable offset yields a zero
    timedelta; in the latter case date_str is returned unchanged.
    """
    tz_match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not tz_match:
        return datetime.timedelta(), date_str

    stripped = date_str[:-len(tz_match.group('tz'))]
    sign_symbol = tz_match.group('sign')
    if not sign_symbol:
        # Matched the bare 'Z' suffix: UTC
        return datetime.timedelta(), stripped

    direction = 1 if sign_symbol == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(tz_match.group('hours')),
        minutes=direction * int(tz_match.group('minutes')))
    return offset, stripped
1084
1085
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    delimiter separates the date and time parts.  timezone is a
    datetime.timedelta offset to subtract; when it is None the offset is
    extracted from date_str itself.  Returns None if date_str is None or
    cannot be parsed.
    """
    if date_str is None:
        return None

    # Fractional seconds are not part of the format below, so drop them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    try:
        utc_dt = datetime.datetime.strptime(date_str, date_format) - timezone
        timestamp = calendar.timegm(utc_dt.timetuple())
    except ValueError:
        return None
    return timestamp
1103
1104
def date_formats(day_first=True):
    """Return DATE_FORMATS_DAY_FIRST if day_first is true, else DATE_FORMATS_MONTH_FIRST"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1107
1108
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or no known format matches.
    """
    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    date_str = extract_timezone(date_str)[1]

    upload_date = None
    # Every format is tried; the result of the last matching one is kept
    for date_format in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, date_format).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Fall back to the RFC 2822 style date parser
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if upload_date is None:
        return None
    return compat_str(upload_date)
1135
1136
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from date_str, or None if parsing fails.

    The known date formats are tried in order and the first match is used;
    if none matches, email.utils.parsedate_tz() is tried as a fallback.
    """
    if date_str is None:
        return None

    date_str = date_str.replace(',', ' ')

    # Any occurrence of 'PM' shifts the result by twelve hours
    pm_delta = 0
    if re.search(r'(?i)PM', date_str):
        pm_delta = 12
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for date_format in date_formats(day_first):
        try:
            parsed = datetime.datetime.strptime(date_str, date_format)
            dt = parsed - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            continue

    timetuple = email.utils.parsedate_tz(date_str)
    if not timetuple:
        return None
    return calendar.timegm(timetuple) + pm_delta * 3600
1158
1159
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url.

    The text after the last '.' of the part before any '?' is returned if
    it is purely alphanumeric, or (with trailing slashes removed) if it is
    in KNOWN_EXTENSIONS.  Otherwise, or when url is None, default_ext is
    returned.
    """
    if url is None:
        return default_ext

    path_part = url.partition('?')[0]
    candidate = path_part.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1171
1172
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<sub_lang>.<sub_format>'"""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1175
1176
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    'yesterday' or (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError if date_str matches none of these formats.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # timedelta has no month/year units, so approximate them in days
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        unit += 's'
        return today + datetime.timedelta(**{unit: amount})
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1204
1205
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1214
1215
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A missing start or end leaves the range open on that side (the
        minimum respectively maximum representable date is used).  Raises
        ValueError if start is later than end.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
1245
1246
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # A byte string result is decoded with the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1255
1256
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the Win32 WriteConsoleW call; it is only used
    when out has a file descriptor of 1 or 2 and the corresponding standard
    handle is a console.  Raises OSError if WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> value passed to GetStdHandle
    # (-11 and -12 are the Win32 STD_OUTPUT_HANDLE / STD_ERROR_HANDLE values)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for invalid handles, for handles whose file type (ignoring the
        # remote flag) is not a character device, and when GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters, stopping before any non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written on its own as two units
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part was actually written
            s = s[written.value:]
    return True
1330
1331
def write_string(s, out=None, encoding=None):
    """Write the text string s to out (sys.stderr by default) and flush it.

    On win32, when no encoding is given and out has a fileno attribute,
    _windows_write_string() is tried first.  Otherwise s is encoded (ignoring
    unencodable characters) for binary streams and on Python 2, written to
    out.buffer when out has one, or written to out unchanged.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    try_console = sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno')
    if try_console and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        target_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1352
1353
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of ints ([] for empty input)"""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Items are single characters (e.g. a Python 2 str)
        return [ord(char) for char in bs]
    # Python 3: iterating bytes already yields ints
    return list(bs)
1361
1362
def intlist_to_bytes(xs):
    """Pack a sequence of byte values into a byte string (b'' for empty input)"""
    if xs:
        pack_format = '%dB' % len(xs)
        return compat_struct_pack(pack_format, *xs)
    return b''
1367
1368
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) using LockFileEx /
# UnlockFileEx on win32 and fcntl.flock elsewhere; where fcntl is missing
# both functions raise IOError.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high parts of the byte count passed to LockFileEx / UnlockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f from offset 0; raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can reuse it
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # Flag 0x2 is passed for an exclusive lock, 0x0 otherwise
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock on f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock held on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            """Always raises IOError: locking is unavailable without fcntl."""
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            """Always raises IOError: locking is unavailable without fcntl."""
            raise IOError(UNSUPPORTED_MSG)
1442
1443
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened on construction; entering the context takes a shared
    lock for mode 'r' and an exclusive lock otherwise, leaving it unlocks
    and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        wants_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, wants_exclusive)
        except IOError:
            # Do not leak the open file when the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1473
1474
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None"""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1478
1479
def shell_quote(args):
    """Return args joined into one string with every argument shell-quoted.

    Byte string arguments are decoded with the file system encoding before
    quoting.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # compat_shlex_quote instead of pipes.quote: the pipes module is
        # deprecated and no longer exists in Python 3.13
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
1489
1490
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    Data already smuggled into url is merged into data (updating the passed
    dict in place) and the result is appended to the URL as a fragment.
    """
    url, already_smuggled = unsmuggle_url(url, {})
    data.update(already_smuggled)
    payload = json.dumps(data)
    fragment = compat_urllib_parse_urlencode({'__youtubedl_smuggle': payload})
    return url + '#' + fragment
1499
1500
def unsmuggle_url(smug_url, default=None):
    """Return (url, data) with the data smuggled into smug_url removed.

    URLs without smuggled data are returned unchanged together with default.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default

    url, _, fragment = smug_url.rpartition('#')
    encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
1508
1509
def format_bytes(bytes):
    """Format a byte count using binary unit suffixes, e.g. '1.50KiB'.

    Returns 'N/A' for None; str input is converted with float().
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)

    # Largest power of 1024 that does not exceed the value
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    suffix = suffixes[exponent]
    scaled = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (scaled, suffix)
1522
1523
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' at the start of s using unit_table.

    unit_table maps unit names to multipliers; the number may use ',' or
    '.' as decimal separator.  Returns the scaled value as an int, or None
    if s does not start with a number followed by a known unit.
    """
    units_re = '|'.join(map(re.escape, unit_table))
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re
    found = re.match(pattern, s)
    if not found:
        return None

    number = float(found.group('num').replace(',', '.'))
    multiplier = unit_table[found.group('unit')]
    return int(number * multiplier)
1533
1534
def parse_filesize(s):
    """Parse a human readable file size such as '1.5 MiB' into bytes.

    Returns None if s is None; otherwise the result of looking s up in the
    unit table built below.
    """
    if s is None:
        return None

    # (symbol, decimal prefix name, binary prefix name) by increasing power
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    for power, (symbol, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal = 1000 ** power
        binary = 1024 ** power
        lower_symbol = symbol.lower()
        unit_table[symbol + 'iB'] = binary
        unit_table[symbol + 'B'] = decimal
        unit_table[lower_symbol + 'B'] = binary
        unit_table[symbol + 'b'] = decimal
        unit_table[lower_symbol + 'b'] = decimal
        unit_table[decimal_name + 'bytes'] = decimal
        unit_table[binary_name + 'bytes'] = binary

    return lookup_unit_table(unit_table, s)
1604
1605
def parse_count(s):
    """Parse a count such as '1,234' or '1.2M' into an int.

    Returns None if s is None.  Strings made only of digits, ',' and '.'
    are handled by str_to_int(); anything else is looked up with the
    k/m/kk multiplier table.
    """
    if s is None:
        return None

    s = s.strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    thousand, million = 1000, 1000 ** 2
    multipliers = {
        'k': thousand,
        'K': thousand,
        'm': million,
        'M': million,
        'kk': million,
        'KK': million,
    }
    return lookup_unit_table(multipliers, s)
1625
1626
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) its name

    The names for lang are taken from MONTH_NAMES, falling back to the
    English names for unknown languages.  Returns None for unknown names.
    """
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        position = month_names.index(name)
    except ValueError:
        return None
    return position + 1
1636
1637
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations (the first three letters of the month name), or None
        if the abbreviation is unknown """
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    try:
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
1646
1647
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the predefined entities or a
    numeric character reference are left untouched."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1654
1655
def setproctitle(title):
    """Set the name of the current process to title (best effort).

    Silently does nothing on Jython, when libc.so.6 cannot be loaded or
    when the loaded libc has no prctl function.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    # Initialising the buffer from the bytes makes it one byte longer than
    # the title, so prctl always receives a NUL-terminated string
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1675
1676
def remove_start(s, start):
    """Return s without the prefix start; s is returned as is when it is None or lacks the prefix"""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1679
1680
def remove_end(s, end):
    """Return s without the suffix end; s is returned as is when it is None or lacks the suffix"""
    if s is not None and s.endswith(end):
        # Slice by absolute length: s[:-len(end)] would be s[:0] for an
        # empty suffix and wrongly return an empty string
        return s[:len(s) - len(end)]
    return s
1683
1684
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s, if any"""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1692
1693
def url_basename(url):
    """Return the last segment of the path component of url"""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
1697
1698
def base_url(url):
    """Return the http(s) URL up to and including the last '/' before any '?', '#' or '&'"""
    prefix = re.match(r'https?://[^?#&]+/', url)
    return prefix.group()
1701
1702
def urljoin(base, path):
    """Join path onto base.

    Returns None when path is not a non-empty text string.  A path that is
    already an http(s) or protocol-relative URL is returned unchanged.
    Otherwise base must itself be such a URL (else None is returned) and
    the two are joined.
    """
    absolute_url_re = r'^(?:https?:)?//'

    if not (isinstance(path, compat_str) and path):
        return None
    if re.match(absolute_url_re, path):
        return path
    if not (isinstance(base, compat_str) and re.match(absolute_url_re, base)):
        return None
    return compat_urlparse.urljoin(base, path)
1711
1712
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
1716
1717
class PUTRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
1721
1722
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, returning default when that is not possible.

    If get_attr is given, the attribute of that name is read from v first.
    The converted value is multiplied by invscale and floor-divided by
    scale.  None, the empty string and values that int() cannot convert
    (wrong type, malformed text, infinities) all yield default.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        # int() raises TypeError for e.g. lists and OverflowError for inf
        return default
1735
1736
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default if v is None"""
    if v is None:
        return default
    return compat_str(v)
1739
1740
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are removed before conversion; None is
    passed through. """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1747
1748
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, returning default when that is not possible.

    The converted value is multiplied by invscale and divided by scale.
    None and values that float() cannot convert (wrong type or malformed
    text) yield default.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # float() raises TypeError for non-numeric types such as lists
        return default
1756
1757
def strip_or_none(v):
    """Return v.strip(), or None if v is None"""
    if v is None:
        return None
    return v.strip()
1760
1761
def parse_duration(s):
    """Parse a duration string and return its length in seconds.

    Three notations are tried in turn: colon separated values
    ([[[days:]hours:]mins:]secs[.ms]), values with unit suffixes and an
    optional PT/T prefix (e.g. '3h11m53s', 'PT1H0.040S') and a single,
    possibly fractional, amount of hours or minutes (e.g. '2.5 hours').
    Returns None if s is not a string or matches none of them.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days = hours = mins = secs = ms = None
    clock_match = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if clock_match:
        days, hours, mins, secs, ms = clock_match.groups()
    else:
        units_match = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
        if units_match:
            days, hours, mins, secs, ms = units_match.groups()
        else:
            single_match = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            if not single_match:
                return None
            hours, mins = single_match.groups()

    # Components are accumulated from seconds upwards, fraction last
    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration
1808
1809
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the real extension of filename.

    If expected_real_ext is given and the real extension differs from it,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1816
1817
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the real extension of filename with ext.

    If expected_real_ext is given and the real extension differs from it,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
1823
1824
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False if starting the binary raises OSError. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Wait for the process to finish; its output is of no interest
        process.communicate()
    except OSError:
        return False
    return exe
1833
1834
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    The executable is run with args and its combined stdout/stderr output
    is handed to detect_exe_version() along with version_re and
    unrecognized. """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out = process.communicate()[0]
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1852
1853
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of an executable.

    Returns the first group of the first version_re match in output, or
    unrecognized if there is no match.  Without version_re, the word
    "version" followed by whitespace and a version-like token is looked for.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1863
1864
class PagedList(object):
    """Base for paged lists; __len__ relies on a getslice() method that is
    not defined here and so has to come from the subclass."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1869
1870
class OnDemandPagedList(PagedList):
    """Paged list that calls pagefunc for one page at a time, as needed.

    pagefunc(pagenum) is called with a zero-based page number and must
    return an iterable of that page's entries; pagesize is the number of
    entries on a full page.  With use_cache set, every fetched page is kept
    in a dict keyed by page number and reused by later getslice() calls.
    """

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list.

        Pages are fetched starting with the one containing start, until a
        page that is not full is seen or the page ending at end was read.
        """
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # Indices of the first entry of this page and of the next one
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                # The complete page is cached, before it gets trimmed below
                self._cache[pagenum] = page_results

            # Offset of start within this page, if start lies on this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset within this page just past the last wanted entry, if
            # end lies on this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1921
1922
class InAdvancePagedList(PagedList):
    """Paged list whose number of pages is known in advance.

    pagefunc(pagenum) is called with a zero-based page number and must
    return an iterable of that page's entries; pagecount is the number of
    pages available and pagesize the number of entries per page that the
    index arithmetic in getslice() is based on.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list."""
        res = []
        start_page = start // self._pagesize
        if end is None:
            end_page = self._pagecount
        else:
            # Never request pages beyond the last one, even if end lies
            # past the end of the list
            end_page = min(self._pagecount, end // self._pagesize + 1)
        # Number of entries to drop from the beginning of the first page
        skip_elems = start - start_page * self._pagesize
        # Number of entries still wanted; None means "all the rest"
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1950
1951
def uppercase_escape(s):
    """Replace every \\UXXXXXXXX escape (8 hex digits) in s with the
    character it stands for; everything else is left alone."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns a (text, consumed length) pair
        decoded, _ = decode(match.group(0))
        return decoded

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1958
1959
def lowercase_escape(s):
    """Replace every \\uXXXX escape (4 hex digits) in s with the character
    it stands for; everything else is left alone."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns a (text, consumed length) pair
        decoded, _ = decode(match.group(0))
        return decoded

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
1966
1967
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    on_python2 = sys.version_info < (3, 0)
    if on_python2 and isinstance(s, compat_str):
        # On Python 2 the text is UTF-8 encoded before it is quoted
        s = s.encode('utf-8')
    # The second argument lists the characters that are left unquoted
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1973
1974
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    escaped = parts._replace(
        # The host name is IDNA encoded rather than percent-escaped
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment)
    )
    return escaped.geturl()
1985
1986
def read_batch_urls(batch_fd):
    """Read the URLs listed in a batch file, one per line.

    batch_fd is iterated line by line and closed afterwards.  Lines that
    are not text are decoded as UTF-8.  A leading byte order mark and
    surrounding whitespace are removed from every line; empty lines and
    lines starting with '#', ';' or ']' are left out of the returned list.
    """
    # A UTF-8 BOM reads as U+FEFF once the line has been decoded as UTF-8,
    # and as the three characters below if each of its bytes was turned
    # into a character of its own
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2001
2002
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments with compat_urllib_parse_urlencode and
    return the result as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2005
2006
def update_url_query(url, query):
    """Return url with the parameters of the query dict merged into its
    query string; parameters already present are overridden.

    url is returned unchanged if query is empty.
    """
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parsed_url.query)
    merged.update(query)
    # True = doseq: sequence values are encoded as repeated parameters
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parsed_url._replace(query=new_query))
2015
2016
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request from req with some of its parts replaced.

    url and data replace the original values when given (truthy), headers
    are merged over the original headers and query is merged into the URL's
    query string.  HEAD and PUT requests stay HEAD and PUT requests; the
    timeout attribute is carried over if req has one.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    new_data = data or req.data
    new_url = update_url_query(url or req.get_full_url(), query)
    # Pick the request class that reports the same method as req
    request_class = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(req.get_method(), compat_urllib_request.Request)
    new_req = request_class(
        new_url, data=new_data, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2035
2036
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable one of several keys, in d.

    For a single key this is d.get(key, default).  For a list or tuple of
    keys, the value of the first key that is present and not None (and,
    with skip_false_values, not falsy either) is returned, or default if
    there is no such key.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
2045
2046
def try_get(src, getter, expected_type=None):
    """Return getter(src), or None if the lookup fails.

    AttributeError, KeyError, TypeError and IndexError raised by getter
    are swallowed.  If expected_type is given, a value that is not an
    instance of it is turned into None as well.
    """
    try:
        value = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is None or isinstance(value, expected_type):
        return value
    return None
2055
2056
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as a compat_str, decoding it with encoding and errors
    unless it already is one."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2059
2060
# Age limit that parse_age_limit() returns for each of these rating labels
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
2068
2069
# Age limit that parse_age_limit() returns for each of these TV rating
# labels, consulted after US_RATINGS
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2078
2079
def parse_age_limit(s):
    """Turn an age rating into a numeric age limit.

    s may be an int (accepted when within 0..21), a string of one or two
    digits optionally followed by '+', or a label listed in US_RATINGS or
    TV_PARENTAL_GUIDELINES.  Anything else yields None.
    """
    # An exact type check, so subclasses of int such as bool do not qualify
    if type(s) is int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]
    return TV_PARENTAL_GUIDELINES.get(s)
2091
2092
def strip_jsonp(code):
    """Remove a JSONP wrapper, i.e. turn callback(...); into just the part
    between the parentheses.  Trailing // comments are dropped with it.

    code is returned unchanged if it does not look like a JSONP call.
    """
    jsonp_call = re.compile(
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$')
    return jsonp_call.sub(r'\1', code)
2096
2097
def js_to_json(code):
    """Rewrite a JavaScript object literal so that it becomes valid JSON.

    String literals are re-quoted with double quotes, bare identifiers are
    quoted, /* */ comments and trailing commas are removed, and hexadecimal
    and octal integers are converted to decimal.
    """
    # What the escape sequences (and bare double quotes) found inside a
    # JavaScript string literal are replaced with; other escapes are kept
    STRING_ESCAPES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }
    INTEGER_TABLE = (
        (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
        (r'^(0+[0-7]+)\s*:?$', 8),
    )

    def fix_escape(esc):
        sequence = esc.group(0)
        return STRING_ESCAPES.get(sequence, sequence)

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('/*') or v == ',':
            # Comments and trailing commas are dropped altogether
            return ""

        if v[0] in ("'", '"'):
            # Strip the quotes; they are put back on return
            v = re.sub(r'(?s)\\.|"', fix_escape, v[1:-1])

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # An integer followed by a colon is an object key
                return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
2135
2136
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps a quality id to its index in quality_ids,
    or to -1 if it is not listed. """
    def q(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q
2145
2146
# %-style template with the mapping keys title, id and ext
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2148
2149
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than length are cut so that, with '...' appended, they
    are length characters long.  None is passed through. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
2158
2159
def version_tuple(v):
    """Split the version string v at '.' and '-' and return its parts as a
    tuple of ints.  Raises ValueError if a part is not an integer."""
    return tuple(map(int, re.split(r'[-.]', v)))
2162
2163
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    If version is empty or one of the two cannot be parsed by
    version_tuple(), the answer is the opposite of assume_new.
    """
    if not version:
        return not assume_new
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        outdated = not assume_new
    return outdated
2171
2172
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U

    That is the case when this module was loaded by a zipimporter or when
    sys has a 'frozen' attribute. """
    from zipimport import zipimporter

    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
2178
2179
def args_to_str(args):
    """Join args into one string, each argument quoted with
    compat_shlex_quote."""
    # Get a short string representation for a subprocess command
    return ' '.join(map(compat_shlex_quote, args))
2183
2184
def error_to_compat_str(err):
    """Return str(err) as a text string."""
    err_str = str(err)
    if sys.version_info[0] >= 3:
        return err_str
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return err_str.decode(preferredencoding())
2192
2193
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    Types listed in FULL_TYPE_MAP are looked up as a whole.  Otherwise the
    subtype (lowercased, parameters after ';' dropped) is translated via
    SUBTYPE_MAP and used as is when it is not listed there.  None is passed
    through.
    """
    if mt is None:
        return None

    FULL_TYPE_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }
    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m': 'f4m',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
    }

    if mt in FULL_TYPE_MAP:
        return FULL_TYPE_MAP[mt]

    subtype = mt.rpartition('/')[2]
    subtype = subtype.split(';')[0].strip().lower()
    return SUBTYPE_MAP.get(subtype, subtype)
2230
2231
def parse_codecs(codecs_str):
    """Split a codecs string into its video and its audio codec.

    codecs_str is a comma separated list of codecs.  Returns a dict with
    the keys 'vcodec' and 'acodec':
      * if at least one codec is recognised, the first recognised video
        and audio codec, with 'none' for a kind that is absent;
      * if none is recognised but exactly two codecs are listed, these two
        in the order given (video first);
      * {} otherwise, since a lone unknown codec cannot be classified.
    A warning is written to stderr for every unrecognised codec.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v')
    AUDIO_CODECS = ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3')
    splited_codecs = [
        codec.strip()
        for codec in codecs_str.strip().strip(',').split(',')
        if codec.strip()]
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        # Only the part before the first dot identifies the codec
        codec = full_codec.split('.')[0]
        if codec in VIDEO_CODECS:
            if not vcodec:
                vcodec = full_codec
        elif codec in AUDIO_CODECS:
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    if len(splited_codecs) == 2:
        # Nothing was recognised, so vcodec and acodec are both unset here;
        # go by the position of the codecs instead
        return {
            'vcodec': splited_codecs[0],
            'acodec': splited_codecs[1],
        }
    return {}
2266
2267
def urlhandle_detect_ext(url_handle):
    """Work out a file extension from the headers of url_handle.

    The extension of the file name given in an attachment
    Content-Disposition header is preferred; failing that, the
    Content-Type header is mapped with mimetype2ext().
    """
    headers = url_handle.headers

    cd = headers.get('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            ext = determine_ext(m.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
2280
2281
def encode_data_uri(data, mime_type):
    """Return a base64 data: URI holding the bytes data, labelled with
    mime_type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2284
2285
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked when no limit is set (age_limit is None) or when
    # the content is available for everyone (content_limit is None)
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2294
2295
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    The bytes are decoded according to their byte order mark (UTF-8 if
    there is none).  Returns a match object if the text starts with '<'
    after optional whitespace, and None otherwise. """

    # The UTF-32 marks must be tried before the UTF-16 ones: the UTF-32-LE
    # mark starts with the UTF-16-LE mark
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    encoding, bom_length = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, bom_length = enc, len(bom)
            break
    s = first_bytes[bom_length:].decode(encoding, 'replace')

    return re.match(r'^\s*<', s)
2314
2315
def determine_protocol(info_dict):
    """Return the protocol of the format described by info_dict.

    An explicit 'protocol' entry wins.  Otherwise the protocol is derived
    from the 'url' entry: from its rtmp/mms/rtsp prefix, from an m3u8 or
    f4m extension, or else from its URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2336
2337
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values

    Every column but the last is left-aligned and padded to the width of
    its longest value plus one; rows are separated by newlines. """
    table = [header_row] + data
    widths = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
    cell_formats = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    format_str = ' '.join(cell_formats) + '%s'
    return '\n'.join(format_str % tuple(row) for row in table)
2344
2345
def _match_one(filter_part, dct):
    """Evaluate a single filter expression against the dict dct.

    Two forms are understood:
      * a comparison "key OP value" with OP one of <, <=, >, >=, =, !=;
        a '?' directly after OP decides the result (truthy) when dct has
        no value for key;
      * "key" or "!key", testing whether dct has a non-None value for key.
    Raises ValueError for anything else, for an ordering operator applied
    to a string, and for a number that cannot be parsed.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if (m.group('strval') is not None or
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/rg3/youtube-dl/issues/11082).
            actual_value is not None and m.group('intval') is not None and
                isinstance(actual_value, compat_str)):
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval') or m.group('intval')
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Not a plain integer: try it as a file size, first as
                # written and then with a 'B' appended
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        if actual_value is None:
            # The matched '?' text (truthy) or None, not a real boolean
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
2409
2410
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false

    filter_str consists of '&' separated parts, all of which have to match;
    evaluation stops at the first part that does not. """

    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2416
2417
def match_filter_func(filter_str):
    """Build a function that checks an info dict against filter_str.

    The returned function yields None for a dict that passes the filter
    and a message telling that it is being skipped for one that does not.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        # Name the video by its title, falling back to its id
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2426
2427
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP time expression to seconds.

    Understood are a plain number of seconds with an optional 's' suffix
    and the clock notation H:MM:SS, optionally followed by a fraction
    introduced by '.' or ':'.  Returns None for an empty or unrecognised
    expression.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        # A ':' in front of the fraction is read as a decimal point
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
    return None
2439
2440
def srt_subtitles_timecode(seconds):
    """Format a time in seconds as an SRT timecode, HH:MM:SS,mmm.

    The time is rounded to whole milliseconds first.  Truncating the
    fractional part instead loses a millisecond whenever the float is
    marginally below the intended value (1.001 used to give ...,000).
    """
    total_ms = int(round(seconds * 1000))
    total_secs, millis = divmod(total_ms, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, mins = divmod(total_mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millis)
2443
2444
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitles, given as text, to the SRT format.

    Every <p> element with a begin time and either an end time or a
    duration becomes one SRT entry; the others are left out.  Raises
    ValueError if the document contains no <p> element at all.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Parser target that collects the text of a <p> element, turning
        # <br> elements into newlines
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Serialise the element and feed it through the collecting parser
        collector = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=collector)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))

    # Try the known namespaces in turn, then elements without namespace
    for p_path in ('.//ttml:p', './/ttaf1:p', './/ttaf1_0604:p'):
        paras = dfxp.findall(_x(p_path))
        if paras:
            break
    else:
        paras = dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    out = []
    # The index counts all paragraphs, including those that get skipped
    for index, para in enumerate(paras, 1):
        attrs = para.attrib
        begin_time = parse_dfxp_time_expr(attrs.get('begin'))
        end_time = parse_dfxp_time_expr(attrs.get('end'))
        dur = parse_dfxp_time_expr(attrs.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2498
2499
def cli_option(params, command_option, param):
    """Return [command_option, value] for the value params holds under the
    key param, or [] if that value is None (or absent).

    Truthy values are converted with compat_str.
    """
    value = params.get(param)
    if value is None:
        return []
    if value:
        value = compat_str(value)
    return [command_option, value]
2505
2506
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Turn the boolean params holds under the key param into command line
    arguments.

    Returns [command_option, value], or the single argument
    command_option + separator + value if a separator is given, where
    value is true_value or false_value.
    """
    flag = params.get(param)
    assert isinstance(flag, bool)
    value = true_value if flag else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
2513
2514
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if the value params holds under the key
    param equals expected_value, and [] otherwise."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2518
2519
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra arguments params holds under the key param.

    If there is none (the value is None or absent), default is returned.
    A list default is copied first, so that a caller modifying the result
    cannot alter the default seen by later calls.
    """
    ex_args = params.get(param)
    if ex_args is None:
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2526
2527
class ISO639Utils(object):
    """Conversion between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked at; None is
        returned for an unknown code."""
        short_code = code[:2]
        return cls._lang_map.get(short_code)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        None is returned for an unknown code."""
        candidates = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(candidates, None)
2728
2729
class ISO3166Utils(object):
    """Lookup helpers for two-letter country codes."""

    # From http://data.okfn.org/data/core/country-list
    # Keys are upper-case two-letter (ISO 3166-1 alpha-2) codes, values are
    # the English country names.
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Return the full country name for a two-letter country code.

        The lookup is case-insensitive: ``code`` is upper-cased before it is
        matched against the table.  Returns ``None`` for an unknown code.
        """
        normalized_code = code.upper()
        return cls._country_map.get(normalized_code)
2988
2989
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler variant that lets an individual request pick its proxy.

    A request may carry a ``Ytdl-request-proxy`` header; when present, its
    value replaces the proxy configured on the handler for that request and
    the header is removed again.
    """

    def __init__(self, proxies=None):
        # Install default http/https openers that route through proxy_open()
        # with the '__noproxy__' marker, so proxy_open() runs for every
        # request.  The base initializer runs afterwards and may replace them
        # for schemes listed in ``proxies``.
        for scheme in ('http', 'https'):
            setattr(
                self, '%s_open' % scheme,
                lambda request, proxy='__noproxy__', scheme=scheme, opener=self.proxy_open:
                    opener(request, proxy, scheme))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Resolve the proxy for ``req`` and delegate to the base handler.

        Returns ``None`` (no proxy handling here) when the effective proxy is
        the ``'__noproxy__'`` marker or a SOCKS URL; otherwise returns
        whatever ``ProxyHandler.proxy_open`` returns.
        """
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            # The per-request setting wins; drop the internal header again.
            proxy = per_request_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy

        socks_schemes = ('socks', 'socks4', 'socks4a', 'socks5')
        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in socks_schemes:
            # youtube-dl's http/https handlers take care of wrapping the
            # socket with SOCKS; only pass the proxy URL along in a header.
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
3013
3014
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt one block with OHDave's RSA scheme. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    The bytes of data are reversed before being read as a single hexadecimal
    number, i.e. data is interpreted as a little-endian integer.

    Limitation: supports one block encryption only
    '''

    reversed_hex = binascii.hexlify(data[::-1])
    plain_number = int(reversed_hex, 16)
    cipher_number = pow(plain_number, exponent, modulus)
    return '%x' % cipher_number
3030
3031
def encode_base_n(num, n, table=None):
    """Encode the integer ``num`` as a string of base-``n`` digits.

    num: integer to encode; a negative value is encoded as ``'-'`` followed
        by the encoding of its absolute value
    n: the base
    table: string (or other sequence) of digit symbols indexed by digit
        value; defaults to the first ``n`` characters of
        ``0-9a-zA-Z``, which limits the default table to base 62

    Returns the encoded string; zero is encoded as ``table[0]``.

    Raises ValueError if ``n`` exceeds the table length, or if ``n`` is
    smaller than 2 while ``num`` is non-zero.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # With a base below 2 the division loop below can never reach zero.
    if n < 2:
        raise ValueError('base %d is too small to encode %d' % (n, num))

    # Floor division of a negative number converges to -1, not 0, so the
    # magnitude is encoded and the sign is re-attached afterwards.
    sign = ''
    if num < 0:
        sign = '-'
        num = -num

    digits = []
    while num:
        num, digit = divmod(num, n)
        digits.append(table[digit])
    return sign + ''.join(reversed(digits))
3048
3049
def decode_packed_codes(code):
    """Expand JavaScript compressed with a "p,a,c,k,e,d"-style packer.

    ``PACKED_CODES_RE`` is searched in ``code`` and its four groups are taken
    as the packed payload, the numeric base, the symbol count and the
    ``|``-separated symbol list.  Each word of the payload that spells a
    symbol index in that base is replaced by the corresponding symbol.

    Words without a replacement are left untouched.  That covers an empty
    symbol, an index past the end of the symbol list, and a word that is not
    a symbol index at all; previously the latter two raised IndexError and
    KeyError respectively.

    ``code`` must contain a match for ``PACKED_CODES_RE``: the search result
    is used unchecked, so a missing match fails with AttributeError.
    """
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)

    # Only non-empty symbols need an entry; everything else falls back to
    # the word itself during substitution.
    symbol_table = {}
    for index, symbol in enumerate(symbols.split('|')[:count]):
        if symbol:
            symbol_table[encode_base_n(index, base)] = symbol

    return re.sub(
        r'\b(\w+)\b',
        lambda word: symbol_table.get(word.group(0), word.group(0)),
        obfuscated_code)
3066
3067
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list into a dict of name -> value strings.

    ``attrib`` is a comma-separated ``KEY=value`` list.  Values may be
    double-quoted (and may then contain commas); the surrounding quotes are
    stripped from the stored value.  All values are returned as strings.
    """
    attribute_re = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    info = {}
    for match in re.finditer(attribute_re, attrib):
        value = match.group('val')
        if value.startswith('"'):
            # Quoted value: keep only what is between the quotes.
            value = value[1:-1]
        info[match.group('key')] = value
    return info
3075
3076
def urshift(val, n):
    """Shift ``val`` right by ``n`` bits, treating negative values as unsigned.

    A negative ``val`` has 2**32 added before shifting, so for values in the
    signed 32-bit range the result equals an unsigned 32-bit right shift.
    Non-negative values are shifted as they are.
    """
    if val < 0:
        val += 0x100000000
    return val >> n
3079
3080
3081 # Based on png2str() written by @gdkchan and improved by @yokrysty
3082 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG bytes into ``(width, height, pixels)``.

    ``pixels`` is a list of ``height`` rows; every row is a flat list of
    ``width * 3`` integers in the range 0-255, with the PNG row filters
    already undone.

    The decoder assumes three bytes per pixel; bit depth, colour type and
    interlacing from the IHDR chunk are not inspected, and chunk CRCs are
    not verified.

    Raises IOError if the signature or the leading IHDR chunk is missing,
    or if the file contains no IDAT data.
    """
    # Reference: https://www.w3.org/TR/PNG/
    signature, remaining = png_data[:8], png_data[8:]

    if signature != b'\x89PNG\x0d\x0a\x1a\x0a' or remaining[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    formats_by_size = {1: '>B', 2: '>H', 4: '>I'}

    def read_uint(raw):
        # Big-endian unsigned integer whose width is given by len(raw).
        return compat_struct_unpack(formats_by_size[len(raw)], raw)[0]

    # Split the stream into (type, data) chunks.
    chunks = []
    while remaining:
        length = read_uint(remaining[:4])
        chunk_type = remaining[4:8]
        chunk_data = remaining[8:8 + length]
        # 4 length bytes + 4 type bytes + payload + 4 CRC bytes (skipped)
        remaining = remaining[12 + length:]
        chunks.append((chunk_type, chunk_data))

    ihdr = chunks[0][1]
    width = read_uint(ihdr[:4])
    height = read_uint(ihdr[4:8])

    idat = b''.join(
        chunk_data for chunk_type, chunk_data in chunks
        if chunk_type == b'IDAT')
    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    for y in range(height):
        # Every scanline is one filter-type byte followed by stride bytes.
        row_start = y * (1 + stride)
        filter_type = decompressed_data[row_start]

        previous_row = pixels[y - 1] if y > 0 else None
        current_row = []
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + row_start + x]
            # Neighbours are the same channel of the pixel to the left
            # (3 bytes back) and of the pixel above; 0 outside the image.
            left = current_row[x - 3] if x > 2 else 0
            up = previous_row[x] if y > 0 else 0

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                upper_left = previous_row[x - 3] if x > 2 and y > 0 else 0

                estimate = left + up - upper_left
                distance_left = abs(estimate - left)
                distance_up = abs(estimate - up)
                distance_upper_left = abs(estimate - upper_left)

                # Pick the neighbour closest to the estimate; ties are
                # resolved in the order left, up, upper-left.
                if (distance_left <= distance_up
                        and distance_left <= distance_upper_left):
                    predictor = left
                elif distance_up <= distance_upper_left:
                    predictor = up
                else:
                    predictor = upper_left
                color = (color + predictor) & 0xff

            current_row.append(color)

    return width, height, pixels
3186
3187
def write_xattr(path, key, value):
    """Set the extended attribute ``key`` of the file ``path`` to ``value``.

    Strategies are tried in this order:
      1. the Python ``xattr`` module (either pyxattr, which exposes ``set``,
         or the module exposing ``setxattr``);
      2. if that import fails and ``compat_os_name`` is ``'nt'``: an NTFS
         Alternate Data Stream named ``path:key``;
      3. otherwise the ``setfattr`` or ``xattr`` command line tools.

    ``value`` is written in binary mode to the data stream and UTF-8 decoded
    for the command line tools, so it is presumably expected to be bytes --
    TODO confirm against callers.

    Raises XAttrMetadataError when setting the attribute fails, and
    XAttrUnavailableError when pyxattr is too old or no tool is found.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                # NOTE(review): despite the wording of the message, nothing
                # visible here falls back; the error propagates unless
                # XAttrUnavailableError is an ImportError subclass -- confirm.
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            # Re-raise OS level failures as the xattr specific error type
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        # No Python xattr module available: use platform specific fallbacks
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            # The stream is addressed as "<file path>:<stream name>"
            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            # Probe for the command line tools; setfattr is preferred when
            # both are found
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # The value is passed as a command line argument, hence text
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                # Command layout: <tool> <options...> <file path>
                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                # A non-zero exit status is reported together with the
                # tool's stderr output
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")