git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import contextlib
  11 import ctypes
  12 import datetime
  13 import email.utils
  14 import errno
  15 import functools
  16 import gzip
  17 import io
  18 import itertools
  19 import json
  20 import locale
  21 import math
  22 import operator
  23 import os
  24 import pipes
  25 import platform
  26 import re
  27 import socket
  28 import ssl
  29 import subprocess
  30 import sys
  31 import tempfile
  32 import traceback
  33 import xml.etree.ElementTree
  34 import zlib
  35
  36 from .compat import (
  37     compat_HTMLParser,
  38     compat_basestring,
  39     compat_chr,
  40     compat_etree_fromstring,
  41     compat_html_entities,
  42     compat_html_entities_html5,
  43     compat_http_client,
  44     compat_kwargs,
  45     compat_os_name,
  46     compat_parse_qs,
  47     compat_shlex_quote,
  48     compat_socket_create_connection,
  49     compat_str,
  50     compat_struct_pack,
  51     compat_struct_unpack,
  52     compat_urllib_error,
  53     compat_urllib_parse,
  54     compat_urllib_parse_urlencode,
  55     compat_urllib_parse_urlparse,
  56     compat_urllib_parse_unquote_plus,
  57     compat_urllib_request,
  58     compat_urlparse,
  59     compat_xpath,
  60 )
  61
  62 from .socks import (
  63     ProxyType,
  64     sockssocket,
  65 )
  66
  67
  68 def register_socks_protocols():
  69     # "Register" SOCKS protocols
  70     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  71     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  72     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  73         if scheme not in compat_urlparse.uses_netloc:
  74             compat_urlparse.uses_netloc.append(scheme)
  75
  76
  77 # This is not clearly defined otherwise
  78 compiled_regex_type = type(re.compile(''))
  79
  80 std_headers = {
  81     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
  82     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  83     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  84     'Accept-Encoding': 'gzip, deflate',
  85     'Accept-Language': 'en-us,en;q=0.5',
  86 }
  87
  88
  89 USER_AGENTS = {
  90     'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
  91 }
  92
  93
  94 NO_DEFAULT = object()
  95
  96 ENGLISH_MONTH_NAMES = [
  97     'January', 'February', 'March', 'April', 'May', 'June',
  98     'July', 'August', 'September', 'October', 'November', 'December']
  99
 100 MONTH_NAMES = {
 101     'en': ENGLISH_MONTH_NAMES,
 102     'fr': [
 103         'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
 104         'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
 105 }
 106
 107 KNOWN_EXTENSIONS = (
 108     'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
 109     'flv', 'f4v', 'f4a', 'f4b',
 110     'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
 111     'mkv', 'mka', 'mk3d',
 112     'avi', 'divx',
 113     'mov',
 114     'asf', 'wmv', 'wma',
 115     '3gp', '3g2',
 116     'mp3',
 117     'flac',
 118     'ape',
 119     'wav',
 120     'f4f', 'f4m', 'm3u8', 'smil')
 121
 122 # needed for sanitizing filenames in restricted mode
 123 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
 124                         itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
 125                                         'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
 126
 127 DATE_FORMATS = (
 128     '%d %B %Y',
 129     '%d %b %Y',
 130     '%B %d %Y',
 131     '%B %dst %Y',
 132     '%B %dnd %Y',
 133     '%B %dth %Y',
 134     '%b %d %Y',
 135     '%b %dst %Y',
 136     '%b %dnd %Y',
 137     '%b %dth %Y',
 138     '%b %dst %Y %I:%M',
 139     '%b %dnd %Y %I:%M',
 140     '%b %dth %Y %I:%M',
 141     '%Y %m %d',
 142     '%Y-%m-%d',
 143     '%Y/%m/%d',
 144     '%Y/%m/%d %H:%M',
 145     '%Y/%m/%d %H:%M:%S',
 146     '%Y-%m-%d %H:%M',
 147     '%Y-%m-%d %H:%M:%S',
 148     '%Y-%m-%d %H:%M:%S.%f',
 149     '%d.%m.%Y %H:%M',
 150     '%d.%m.%Y %H.%M',
 151     '%Y-%m-%dT%H:%M:%SZ',
 152     '%Y-%m-%dT%H:%M:%S.%fZ',
 153     '%Y-%m-%dT%H:%M:%S.%f0Z',
 154     '%Y-%m-%dT%H:%M:%S',
 155     '%Y-%m-%dT%H:%M:%S.%f',
 156     '%Y-%m-%dT%H:%M',
 157     '%b %d %Y at %H:%M',
 158     '%b %d %Y at %H:%M:%S',
 159 )
 160
 161 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 162 DATE_FORMATS_DAY_FIRST.extend([
 163     '%d-%m-%Y',
 164     '%d.%m.%Y',
 165     '%d.%m.%y',
 166     '%d/%m/%Y',
 167     '%d/%m/%y',
 168     '%d/%m/%Y %H:%M:%S',
 169 ])
 170
 171 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 172 DATE_FORMATS_MONTH_FIRST.extend([
 173     '%m-%d-%Y',
 174     '%m.%d.%Y',
 175     '%m/%d/%Y',
 176     '%m/%d/%y',
 177     '%m/%d/%Y %H:%M:%S',
 178 ])
 179
 180 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
 181
 182
 183 def preferredencoding():
 184     """Get preferred encoding.
 185
 186     Returns the best encoding scheme for the system, based on
 187     locale.getpreferredencoding() and some further tweaks.
 188     """
 189     try:
 190         pref = locale.getpreferredencoding()
 191         'TEST'.encode(pref)
 192     except Exception:
 193         pref = 'UTF-8'
 194
 195     return pref
 196
 197
 198 def write_json_file(obj, fn):
 199     """ Encode obj as JSON and write it to fn, atomically if possible """
 200
 201     fn = encodeFilename(fn)
 202     if sys.version_info < (3, 0) and sys.platform != 'win32':
 203         encoding = get_filesystem_encoding()
 204         # os.path.basename returns a bytes object, but NamedTemporaryFile
 205         # will fail if the filename contains non ascii characters unless we
 206         # use a unicode object
 207         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 208         # the same for os.path.dirname
 209         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 210     else:
 211         path_basename = os.path.basename
 212         path_dirname = os.path.dirname
 213
 214     args = {
 215         'suffix': '.tmp',
 216         'prefix': path_basename(fn) + '.',
 217         'dir': path_dirname(fn),
 218         'delete': False,
 219     }
 220
 221     # In Python 2.x, json.dump expects a bytestream.
 222     # In Python 3.x, it writes to a character stream
 223     if sys.version_info < (3, 0):
 224         args['mode'] = 'wb'
 225     else:
 226         args.update({
 227             'mode': 'w',
 228             'encoding': 'utf-8',
 229         })
 230
 231     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 232
 233     try:
 234         with tf:
 235             json.dump(obj, tf)
 236         if sys.platform == 'win32':
 237             # Need to remove existing file on Windows, else os.rename raises
 238             # WindowsError or FileExistsError.
 239             try:
 240                 os.unlink(fn)
 241             except OSError:
 242                 pass
 243         os.rename(tf.name, fn)
 244     except Exception:
 245         try:
 246             os.remove(tf.name)
 247         except OSError:
 248             pass
 249         raise
 250
 251
 252 if sys.version_info >= (2, 7):
 253     def find_xpath_attr(node, xpath, key, val=None):
 254         """ Find the xpath xpath[@key=val] """
 255         assert re.match(r'^[a-zA-Z_-]+$', key)
 256         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 257         return node.find(expr)
 258 else:
 259     def find_xpath_attr(node, xpath, key, val=None):
 260         for f in node.findall(compat_xpath(xpath)):
 261             if key not in f.attrib:
 262                 continue
 263             if val is None or f.attrib.get(key) == val:
 264                 return f
 265         return None
 266
 267 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 268 # the namespace parameter
 269
 270
 271 def xpath_with_ns(path, ns_map):
 272     components = [c.split(':') for c in path.split('/')]
 273     replaced = []
 274     for c in components:
 275         if len(c) == 1:
 276             replaced.append(c[0])
 277         else:
 278             ns, tag = c
 279             replaced.append('{%s}%s' % (ns_map[ns], tag))
 280     return '/'.join(replaced)
 281
 282
 283 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 284     def _find_xpath(xpath):
 285         return node.find(compat_xpath(xpath))
 286
 287     if isinstance(xpath, (str, compat_str)):
 288         n = _find_xpath(xpath)
 289     else:
 290         for xp in xpath:
 291             n = _find_xpath(xp)
 292             if n is not None:
 293                 break
 294
 295     if n is None:
 296         if default is not NO_DEFAULT:
 297             return default
 298         elif fatal:
 299             name = xpath if name is None else name
 300             raise ExtractorError('Could not find XML element %s' % name)
 301         else:
 302             return None
 303     return n
 304
 305
 306 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 307     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 308     if n is None or n == default:
 309         return n
 310     if n.text is None:
 311         if default is not NO_DEFAULT:
 312             return default
 313         elif fatal:
 314             name = xpath if name is None else name
 315             raise ExtractorError('Could not find XML element\'s text %s' % name)
 316         else:
 317             return None
 318     return n.text
 319
 320
 321 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 322     n = find_xpath_attr(node, xpath, key)
 323     if n is None:
 324         if default is not NO_DEFAULT:
 325             return default
 326         elif fatal:
 327             name = '%s[@%s]' % (xpath, key) if name is None else name
 328             raise ExtractorError('Could not find XML attribute %s' % name)
 329         else:
 330             return None
 331     return n.attrib[key]
 332
 333
 334 def get_element_by_id(id, html):
 335     """Return the content of the tag with the specified ID in the passed HTML document"""
 336     return get_element_by_attribute('id', id, html)
 337
 338
 339 def get_element_by_class(class_name, html):
 340     """Return the content of the first tag with the specified class in the passed HTML document"""
 341     retval = get_elements_by_class(class_name, html)
 342     return retval[0] if retval else None
 343
 344
 345 def get_element_by_attribute(attribute, value, html, escape_value=True):
 346     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 347     return retval[0] if retval else None
 348
 349
 350 def get_elements_by_class(class_name, html):
 351     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 352     return get_elements_by_attribute(
 353         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 354         html, escape_value=False)
 355
 356
 357 def get_elements_by_attribute(attribute, value, html, escape_value=True):
 358     """Return the content of the tag with the specified attribute in the passed HTML document"""
 359
 360     value = re.escape(value) if escape_value else value
 361
 362     retlist = []
 363     for m in re.finditer(r'''(?xs)
 364         <([a-zA-Z0-9:._-]+)
 365          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 366          \s+%s=['"]?%s['"]?
 367          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 368         \s*>
 369         (?P<content>.*?)
 370         </\1>
 371     ''' % (re.escape(attribute), value), html):
 372         res = m.group('content')
 373
 374         if res.startswith('"') or res.startswith("'"):
 375             res = res[1:-1]
 376
 377         retlist.append(unescapeHTML(res))
 378
 379     return retlist
 380
 381
 382 class HTMLAttributeParser(compat_HTMLParser):
 383     """Trivial HTML parser to gather the attributes for a single element"""
 384     def __init__(self):
 385         self.attrs = {}
 386         compat_HTMLParser.__init__(self)
 387
 388     def handle_starttag(self, tag, attrs):
 389         self.attrs = dict(attrs)
 390
 391
 392 def extract_attributes(html_element):
 393     """Given a string for an HTML element such as
 394     <el
 395          a="foo" B="bar" c="&98;az" d=boz
 396          empty= noval entity="&amp;"
 397          sq='"' dq="'"
 398     >
 399     Decode and return a dictionary of attributes.
 400     {
 401         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 402         'empty': '', 'noval': None, 'entity': '&',
 403         'sq': '"', 'dq': '\''
 404     }.
 405     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 406     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 407     """
 408     parser = HTMLAttributeParser()
 409     parser.feed(html_element)
 410     parser.close()
 411     return parser.attrs
 412
 413
 414 def clean_html(html):
 415     """Clean an HTML snippet into a readable string"""
 416
 417     if html is None:  # Convenience for sanitizing descriptions etc.
 418         return html
 419
 420     # Newline vs <br />
 421     html = html.replace('\n', ' ')
 422     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 423     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 424     # Strip html tags
 425     html = re.sub('<.*?>', '', html)
 426     # Replace html entities
 427     html = unescapeHTML(html)
 428     return html.strip()
 429
 430
 431 def sanitize_open(filename, open_mode):
 432     """Try to open the given filename, and slightly tweak it if this fails.
 433
 434     Attempts to open the given filename. If this fails, it tries to change
 435     the filename slightly, step by step, until it's either able to open it
 436     or it fails and raises a final exception, like the standard open()
 437     function.
 438
 439     It returns the tuple (stream, definitive_file_name).
 440     """
 441     try:
 442         if filename == '-':
 443             if sys.platform == 'win32':
 444                 import msvcrt
 445                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 446             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 447         stream = open(encodeFilename(filename), open_mode)
 448         return (stream, filename)
 449     except (IOError, OSError) as err:
 450         if err.errno in (errno.EACCES,):
 451             raise
 452
 453         # In case of error, try to remove win32 forbidden chars
 454         alt_filename = sanitize_path(filename)
 455         if alt_filename == filename:
 456             raise
 457         else:
 458             # An exception here should be caught in the caller
 459             stream = open(encodeFilename(alt_filename), open_mode)
 460             return (stream, alt_filename)
 461
 462
 463 def timeconvert(timestr):
 464     """Convert RFC 2822 defined time string into system timestamp"""
 465     timestamp = None
 466     timetuple = email.utils.parsedate_tz(timestr)
 467     if timetuple is not None:
 468         timestamp = email.utils.mktime_tz(timetuple)
 469     return timestamp
 470
 471
 472 def sanitize_filename(s, restricted=False, is_id=False):
 473     """Sanitizes a string so it could be used as part of a filename.
 474     If restricted is set, use a stricter subset of allowed characters.
 475     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 476     """
 477     def replace_insane(char):
 478         if restricted and char in ACCENT_CHARS:
 479             return ACCENT_CHARS[char]
 480         if char == '?' or ord(char) < 32 or ord(char) == 127:
 481             return ''
 482         elif char == '"':
 483             return '' if restricted else '\''
 484         elif char == ':':
 485             return '_-' if restricted else ' -'
 486         elif char in '\\/|*<>':
 487             return '_'
 488         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 489             return '_'
 490         if restricted and ord(char) > 127:
 491             return '_'
 492         return char
 493
 494     # Handle timestamps
 495     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 496     result = ''.join(map(replace_insane, s))
 497     if not is_id:
 498         while '__' in result:
 499             result = result.replace('__', '_')
 500         result = result.strip('_')
 501         # Common case of "Foreign band name - English song title"
 502         if restricted and result.startswith('-_'):
 503             result = result[2:]
 504         if result.startswith('-'):
 505             result = '_' + result[len('-'):]
 506         result = result.lstrip('.')
 507         if not result:
 508             result = '_'
 509     return result
 510
 511
 512 def sanitize_path(s):
 513     """Sanitizes and normalizes path on Windows"""
 514     if sys.platform != 'win32':
 515         return s
 516     drive_or_unc, _ = os.path.splitdrive(s)
 517     if sys.version_info < (2, 7) and not drive_or_unc:
 518         drive_or_unc, _ = os.path.splitunc(s)
 519     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 520     if drive_or_unc:
 521         norm_path.pop(0)
 522     sanitized_path = [
 523         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 524         for path_part in norm_path]
 525     if drive_or_unc:
 526         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 527     return os.path.join(*sanitized_path)
 528
 529
 530 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
 531 # unwanted failures due to missing protocol
 532 def sanitize_url(url):
 533     return 'http:%s' % url if url.startswith('//') else url
 534
 535
 536 def sanitized_Request(url, *args, **kwargs):
 537     return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
 538
 539
 540 def orderedSet(iterable):
 541     """ Remove all duplicates from the input iterable """
 542     res = []
 543     for el in iterable:
 544         if el not in res:
 545             res.append(el)
 546     return res
 547
 548
 549 def _htmlentity_transform(entity_with_semicolon):
 550     """Transforms an HTML entity to a character."""
 551     entity = entity_with_semicolon[:-1]
 552
 553     # Known non-numeric HTML entity
 554     if entity in compat_html_entities.name2codepoint:
 555         return compat_chr(compat_html_entities.name2codepoint[entity])
 556
 557     # TODO: HTML5 allows entities without a semicolon. For example,
 558     # '&Eacuteric' should be decoded as 'Éric'.
 559     if entity_with_semicolon in compat_html_entities_html5:
 560         return compat_html_entities_html5[entity_with_semicolon]
 561
 562     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 563     if mobj is not None:
 564         numstr = mobj.group(1)
 565         if numstr.startswith('x'):
 566             base = 16
 567             numstr = '0%s' % numstr
 568         else:
 569             base = 10
 570         # See https://github.com/rg3/youtube-dl/issues/7518
 571         try:
 572             return compat_chr(int(numstr, base))
 573         except ValueError:
 574             pass
 575
 576     # Unknown entity in name, return its literal representation
 577     return '&%s;' % entity
 578
 579
 580 def unescapeHTML(s):
 581     if s is None:
 582         return None
 583     assert type(s) == compat_str
 584
 585     return re.sub(
 586         r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 587
 588
 589 def get_subprocess_encoding():
 590     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 591         # For subprocess calls, encode with locale encoding
 592         # Refer to http://stackoverflow.com/a/9951851/35070
 593         encoding = preferredencoding()
 594     else:
 595         encoding = sys.getfilesystemencoding()
 596     if encoding is None:
 597         encoding = 'utf-8'
 598     return encoding
 599
 600
 601 def encodeFilename(s, for_subprocess=False):
 602     """
 603     @param s The name of the file
 604     """
 605
 606     assert type(s) == compat_str
 607
 608     # Python 3 has a Unicode API
 609     if sys.version_info >= (3, 0):
 610         return s
 611
 612     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 613     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 614     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 615     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 616         return s
 617
 618     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 619     if sys.platform.startswith('java'):
 620         return s
 621
 622     return s.encode(get_subprocess_encoding(), 'ignore')
 623
 624
 625 def decodeFilename(b, for_subprocess=False):
 626
 627     if sys.version_info >= (3, 0):
 628         return b
 629
 630     if not isinstance(b, bytes):
 631         return b
 632
 633     return b.decode(get_subprocess_encoding(), 'ignore')
 634
 635
 636 def encodeArgument(s):
 637     if not isinstance(s, compat_str):
 638         # Legacy code that uses byte strings
 639         # Uncomment the following line after fixing all post processors
 640         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 641         s = s.decode('ascii')
 642     return encodeFilename(s, True)
 643
 644
 645 def decodeArgument(b):
 646     return decodeFilename(b, True)
 647
 648
 649 def decodeOption(optval):
 650     if optval is None:
 651         return optval
 652     if isinstance(optval, bytes):
 653         optval = optval.decode(preferredencoding())
 654
 655     assert isinstance(optval, compat_str)
 656     return optval
 657
 658
 659 def formatSeconds(secs):
 660     if secs > 3600:
 661         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 662     elif secs > 60:
 663         return '%d:%02d' % (secs // 60, secs % 60)
 664     else:
 665         return '%d' % secs
 666
 667
 668 def make_HTTPS_handler(params, **kwargs):
 669     opts_no_check_certificate = params.get('nocheckcertificate', False)
 670     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 671         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 672         if opts_no_check_certificate:
 673             context.check_hostname = False
 674             context.verify_mode = ssl.CERT_NONE
 675         try:
 676             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 677         except TypeError:
 678             # Python 2.7.8
 679             # (create_default_context present but HTTPSHandler has no context=)
 680             pass
 681
 682     if sys.version_info < (3, 2):
 683         return YoutubeDLHTTPSHandler(params, **kwargs)
 684     else:  # Python < 3.4
 685         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 686         context.verify_mode = (ssl.CERT_NONE
 687                                if opts_no_check_certificate
 688                                else ssl.CERT_REQUIRED)
 689         context.set_default_verify_paths()
 690         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 691
 692
 693 def bug_reports_message():
 694     if ytdl_is_updateable():
 695         update_cmd = 'type  youtube-dl -U  to update'
 696     else:
 697         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 698     msg = '; please report this issue on https://yt-dl.org/bug .'
 699     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 700     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 701     return msg
 702
 703
 704 class ExtractorError(Exception):
 705     """Error during info extraction."""
 706
 707     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 708         """ tb, if given, is the original traceback (so that it can be printed out).
 709         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 710         """
 711
 712         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 713             expected = True
 714         if video_id is not None:
 715             msg = video_id + ': ' + msg
 716         if cause:
 717             msg += ' (caused by %r)' % cause
 718         if not expected:
 719             msg += bug_reports_message()
 720         super(ExtractorError, self).__init__(msg)
 721
 722         self.traceback = tb
 723         self.exc_info = sys.exc_info()  # preserve original exception
 724         self.cause = cause
 725         self.video_id = video_id
 726
 727     def format_traceback(self):
 728         if self.traceback is None:
 729             return None
 730         return ''.join(traceback.format_tb(self.traceback))
 731
 732
 733 class UnsupportedError(ExtractorError):
 734     def __init__(self, url):
 735         super(UnsupportedError, self).__init__(
 736             'Unsupported URL: %s' % url, expected=True)
 737         self.url = url
 738
 739
 740 class RegexNotFoundError(ExtractorError):
 741     """Error when a regex didn't match"""
 742     pass
 743
 744
 745 class DownloadError(Exception):
 746     """Download Error exception.
 747
 748     This exception may be thrown by FileDownloader objects if they are not
 749     configured to continue on errors. They will contain the appropriate
 750     error message.
 751     """
 752
 753     def __init__(self, msg, exc_info=None):
 754         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 755         super(DownloadError, self).__init__(msg)
 756         self.exc_info = exc_info
 757
 758
 759 class SameFileError(Exception):
 760     """Same File exception.
 761
 762     This exception will be thrown by FileDownloader objects if they detect
 763     multiple files would have to be downloaded to the same file on disk.
 764     """
 765     pass
 766
 767
 768 class PostProcessingError(Exception):
 769     """Post Processing exception.
 770
 771     This exception may be raised by PostProcessor's .run() method to
 772     indicate an error in the postprocessing task.
 773     """
 774
 775     def __init__(self, msg):
 776         self.msg = msg
 777
 778
 779 class MaxDownloadsReached(Exception):
 780     """ --max-downloads limit has been reached. """
 781     pass
 782
 783
 784 class UnavailableVideoError(Exception):
 785     """Unavailable Format exception.
 786
 787     This exception will be thrown when a video is requested
 788     in a format that is not available for that video.
 789     """
 790     pass
 791
 792
 793 class ContentTooShortError(Exception):
 794     """Content Too Short exception.
 795
 796     This exception may be raised by FileDownloader objects when a file they
 797     download is too small for what the server announced first, indicating
 798     the connection was probably interrupted.
 799     """
 800
 801     def __init__(self, downloaded, expected):
 802         # Both in bytes
 803         self.downloaded = downloaded
 804         self.expected = expected
 805
 806
 807 class XAttrMetadataError(Exception):
 808     def __init__(self, code=None, msg='Unknown error'):
 809         super(XAttrMetadataError, self).__init__(msg)
 810         self.code = code
 811         self.msg = msg
 812
 813         # Parsing code and msg
 814         if (self.code in (errno.ENOSPC, errno.EDQUOT) or
 815                 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
 816             self.reason = 'NO_SPACE'
 817         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
 818             self.reason = 'VALUE_TOO_LONG'
 819         else:
 820             self.reason = 'NOT_SUPPORTED'
 821
 822
 823 class XAttrUnavailableError(Exception):
 824     pass
 825
 826
 827 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 828     # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
 829     # expected HTTP responses to meet HTTP/1.0 or later (see also
 830     # https://github.com/rg3/youtube-dl/issues/6727)
 831     if sys.version_info < (3, 0):
 832         kwargs[b'strict'] = True
 833     hc = http_class(*args, **kwargs)
 834     source_address = ydl_handler._params.get('source_address')
 835     if source_address is not None:
 836         sa = (source_address, 0)
 837         if hasattr(hc, 'source_address'):  # Python 2.7+
 838             hc.source_address = sa
 839         else:  # Python 2.6
 840             def _hc_connect(self, *args, **kwargs):
 841                 sock = compat_socket_create_connection(
 842                     (self.host, self.port), self.timeout, sa)
 843                 if is_https:
 844                     self.sock = ssl.wrap_socket(
 845                         sock, self.key_file, self.cert_file,
 846                         ssl_version=ssl.PROTOCOL_TLSv1)
 847                 else:
 848                     self.sock = sock
 849             hc.connect = functools.partial(_hc_connect, hc)
 850
 851     return hc
 852
 853
 854 def handle_youtubedl_headers(headers):
 855     filtered_headers = headers
 856
 857     if 'Youtubedl-no-compression' in filtered_headers:
 858         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 859         del filtered_headers['Youtubedl-no-compression']
 860
 861     return filtered_headers
 862
 863
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Store params on the handler; remaining arguments go to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req, going through a SOCKS connection class when the
        internal 'Ytdl-socks-proxy' header is set on the request."""
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            # Internal marker header, must not be sent to the server
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress data as a raw deflate stream, falling back to a
        zlib-wrapped stream when that fails."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the given status code."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        # addinfourl without getcode: set the code attribute by hand
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Pre-process req: escape the URL, add the standard headers and
        strip youtube-dl's internal headers. Returns the (possibly replaced)
        request."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            # (the name-mangled private attributes are patched to drop the fragment)
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Post-process resp: transparently decode gzip/deflate bodies and
        percent-encode the Location header of 3xx responses."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; if none of the
                # attempts decodes, re-raise the first error (for/else)
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            # The body is decoded now, so the header no longer applies
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        # Python 2 gets the header value back as a UTF-8 byte string
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic goes through the same request/response processing
    https_request = http_request
    https_response = http_response
 993
 994
 995 def make_socks_conn_class(base_class, socks_proxy):
 996     assert issubclass(base_class, (
 997         compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
 998
 999     url_components = compat_urlparse.urlparse(socks_proxy)
1000     if url_components.scheme.lower() == 'socks5':
1001         socks_type = ProxyType.SOCKS5
1002     elif url_components.scheme.lower() in ('socks', 'socks4'):
1003         socks_type = ProxyType.SOCKS4
1004     elif url_components.scheme.lower() == 'socks4a':
1005         socks_type = ProxyType.SOCKS4A
1006
1007     def unquote_if_non_empty(s):
1008         if not s:
1009             return s
1010         return compat_urllib_parse_unquote_plus(s)
1011
1012     proxy_args = (
1013         socks_type,
1014         url_components.hostname, url_components.port or 1080,
1015         True,  # Remote DNS
1016         unquote_if_non_empty(url_components.username),
1017         unquote_if_non_empty(url_components.password),
1018     )
1019
1020     class SocksConnection(base_class):
1021         def connect(self):
1022             self.sock = sockssocket()
1023             self.sock.setproxy(*proxy_args)
1024             if type(self.timeout) in (int, float):
1025                 self.sock.settimeout(self.timeout)
1026             self.sock.connect((self.host, self.port))
1027
1028             if isinstance(self, compat_http_client.HTTPSConnection):
1029                 if hasattr(self, '_context'):  # Python > 2.6
1030                     self.sock = self._context.wrap_socket(
1031                         self.sock, server_hostname=self.host)
1032                 else:
1033                     self.sock = ssl.wrap_socket(self.sock)
1034
1035     return SocksConnection
1036
1037
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler with a pluggable connection class and SOCKS support."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req, forwarding the handler's SSL settings when it has them
        and tunnelling through SOCKS when 'Ytdl-socks-proxy' is set."""
        open_kwargs = {}
        # _context exists on python > 2.6, _check_hostname on python 3.x
        for attr_name, kwarg_name in (
                ('_context', 'context'),
                ('_check_hostname', 'check_hostname')):
            if hasattr(self, attr_name):
                open_kwargs[kwarg_name] = getattr(self, attr_name)

        connection_class = self._https_conn_class
        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            connection_class = make_socks_conn_class(connection_class, proxy_url)
            # Internal marker header, must not be sent to the server
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, connection_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
1061
1062
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also applies the HTTP hooks to HTTPS traffic.

    The Set-Cookie escaping workaround described in http_response is
    currently disabled (commented out), so responses are handed to the base
    class unchanged.
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Reuse the HTTP request/response hooks for HTTPS
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1085
1086
def extract_timezone(date_str):
    """Split a trailing UTC designator off a date string.

    Recognizes a final 'Z' or a numeric offset such as '+0100' / '-05:30'
    (optionally preceded by one space) after at least 8 other characters.
    Returns a tuple (offset as datetime.timedelta, date_str without the
    designator); when nothing is recognized the offset is zero and date_str
    is returned unchanged.
    """
    found = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if found is None:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(found.group('tz'))]
    sign_char = found.group('sign')
    if not sign_char:
        # Plain 'Z': UTC
        return datetime.timedelta(), remainder

    direction = 1 if sign_char == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(found.group('hours')),
        minutes=direction * int(found.group('minutes')))
    return offset, remainder
1103
1104
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    date_str is expected as YYYY-MM-DD<delimiter>HH:MM:SS; fractional
    seconds are dropped. When timezone (a datetime.timedelta) is not given,
    it is taken from the end of date_str. Returns None for a None or
    unparsable input.
    """
    if date_str is None:
        return None

    # Drop every '.digits' run (fractional seconds)
    cleaned = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, cleaned = extract_timezone(cleaned)

    try:
        layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        moment = datetime.datetime.strptime(cleaned, layout) - timezone
        return calendar.timegm(moment.timetuple())
    except ValueError:
        return None
1122
1123
def date_formats(day_first=True):
    """Return the strptime format list for day-first or month-first dates."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1126
1127
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Commas become spaces, then AM/PM (+ timezone name) and a trailing
    # UTC offset are removed
    cleaned = date_str.replace(',', ' ')
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # Every format is tried; the last one that parses determines the result
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to RFC 2822 style parsing
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return compat_str(result)
1154
1155
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    Commas, an AM/PM marker (with an optional timezone name) and a trailing
    UTC offset are removed before parsing; 12 hours are added when 'PM'
    occurs in the string and the extracted UTC offset is subtracted.
    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    date_str = date_str.replace(',', ' ')

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # The numeric offset was already stripped from date_str above, so it
        # has to be applied here as well (it used to be silently dropped).
        # Computed by hand because timedelta.total_seconds() needs Python 2.7
        tz_seconds = timezone.days * 86400 + timezone.seconds
        return calendar.timegm(timetuple) + pm_delta * 3600 - tz_seconds
    return None
1177
1178
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url.

    The part after the last dot (ignoring anything from the first '?' on)
    is returned when it is purely alphanumeric, or when it is a known
    extension followed by slashes; otherwise default_ext is returned.
    """
    if url is None:
        return default_ext

    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1190
1191
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace everything after the last dot of filename with
    '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1194
1195
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    'yesterday' or (now|today)[+-][0-9](day|week|month|year)(s)?"""
    current = datetime.date.today()
    if date_str == 'yesterday':
        return current - datetime.timedelta(days=1)
    if date_str in ('now', 'today'):
        return current

    relative = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()

    # Months and years are approximated as 30 and 365 days
    days_per_unit = {'day': 1, 'week': 7, 'month': 30, 'year': 365}
    amount = int(relative.group('time')) * days_per_unit[relative.group('unit')]
    if relative.group('sign') == '-':
        amount = -amount
    return current + datetime.timedelta(days=amount)
1223
1224
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings in any other format are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1233
1234
class DateRange(object):
    """An inclusive interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing bound defaults to the earliest/latest
        representable date. Raises ValueError if start is after end."""
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.end < self.start:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a datetime.date or a date string) is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return ' - '.join((self.start.isoformat(), self.end.isoformat()))
1264
1265
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # A byte string is decoded with the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1274
1275
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE in the Win32 API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        # Only stdout and stderr are handled here
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is invalid, is not a character device
        # (ignoring the REMOTE flag) or GetConsoleMode fails on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        # Let the caller write the string the regular way
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane,
        # or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call and stop in front of the
        # next non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; it is written
        # on its own with a length of 2 (presumably its UTF-16 surrogate
        # pair -- TODO confirm)
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever was actually written
            s = s[written.value:]
    return True
1349
1350
def write_string(s, out=None, encoding=None):
    """Write the text string s to out (sys.stderr by default) and flush.

    On Windows, when no encoding is forced, the console API is tried first.
    Otherwise s is encoded (dropping unencodable characters) for byte
    streams and streams exposing a 'buffer', and written as is to anything
    else.
    """
    out = sys.stderr if out is None else out
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        stream_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(stream_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1371
1372
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3: indexing bytes yields ints
        return list(bs)
    # Python 2: indexing yields one-character strings
    return list(map(ord, bs))
1380
1381
def intlist_to_bytes(xs):
    """Pack a list of integer byte values into a byte string."""
    if xs:
        # One unsigned byte ('B') per list element
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
1386
1387
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx on Windows, fcntl.flock where fcntl is
# available, and stubs raising IOError everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the file object f; raises OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file needs it again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock on f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock held on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1461
1462
class locked_file(object):
    """Context manager around a text file that is locked while in use.

    The file is opened on construction; entering the context takes a shared
    lock for mode 'r' and an exclusive lock for 'a'/'w', leaving it unlocks
    and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        needs_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, needs_exclusive)
        except IOError:
            # Do not leak the handle when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # The file is closed even if unlocking raises
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1492
1493
def get_filesystem_encoding():
    """Return the filesystem encoding, or 'utf-8' if Python reports none."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1497
1498
def shell_quote(args):
    """Return the arguments joined into one shell-quoted command line.

    Byte-string arguments are decoded with the filesystem encoding first.
    Quoting goes through compat_shlex_quote, which this module already
    imports, rather than pipes.quote: the pipes module was deprecated in
    Python 3.11 and removed in 3.13.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
1508
1509
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded into the URL fragment. Data that is already
    smuggled in url is kept and takes precedence over equally named keys of
    data. The caller's dict is not modified.
    """

    url, idata = unsmuggle_url(url, {})
    # Work on a copy so the argument is not mutated as a side effect
    payload = dict(data)
    payload.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(payload)})
    return url + '#' + sdata
1518
1519
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    URLs without smuggled data are returned as (smug_url, default).
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        encoded = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(encoded)
    return smug_url, default
1527
1528
def format_bytes(bytes):
    """Format a byte count with a binary suffix, e.g. 1536 -> '1.50KiB'.

    bytes may be a number, a numeric string or None (giving 'N/A'). The
    parameter name shadows the builtin but is kept for compatibility with
    keyword callers.

    The unit exponent is clamped to the available suffixes: values of
    1024**9 and above are expressed in YiB instead of raising IndexError,
    and positive values far below 1 are expressed in B instead of indexing
    the suffix list from the end.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)

    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1541
1542
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' at the start of s and scale the number.

    unit_table maps unit names to multipliers. The number may use ',' or
    '.' as decimal separator. Returns the product truncated to int, or None
    when s does not start with a number followed by a known unit.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives, s)
    if found is None:
        return None
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1552
1553
def parse_filesize(s):
    """Parse a human-readable file size such as '1.5 MiB' into bytes.

    Returns None for None or when no known unit is found.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1, 'bytes': 1}
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal = 1000 ** power
        binary = 1024 ** power
        lower = letter.lower()
        unit_table.update({
            letter + 'iB': binary,
            letter + 'B': decimal,
            lower + 'B': binary,
            letter + 'b': decimal,
            lower + 'b': decimal,
            decimal_name + 'bytes': decimal,
            binary_name + 'bytes': binary,
        })

    return lookup_unit_table(unit_table, s)
1623
1624
def parse_count(s):
    """Parse a count such as '1,234' or '1.5M' into an int.

    Returns None for None or when the text is neither a plain number nor a
    number followed by a known multiplier suffix.
    """
    if s is None:
        return None

    text = s.strip()

    # Digits with separators only: no suffix to interpret
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    thousand = 1000
    million = 1000 ** 2
    multipliers = {
        'k': thousand,
        'K': thousand,
        'm': million,
        'M': million,
        'kk': million,
        'KK': million,
    }
    return lookup_unit_table(multipliers, text)
1644
1645
def month_by_name(name, lang='en'):
    """ Return the number (1-12) of a month by its full name in the given
    language, or None if the name is unknown. Unknown languages fall back
    to English. """

    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        position = names.index(name)
    except ValueError:
        return None
    return position + 1
1655
1656
def month_by_abbreviation(abbrev):
    """ Return the number (1-12) of a month by its three-letter English
        abbreviation, or None if the abbreviation is unknown """

    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    try:
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
1665
1666
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' by '&amp;' in XML

    Ampersands that already start one of the predefined entities or a
    numeric character reference of up to four digits are left alone."""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
1673
1674
def setproctitle(title):
    """Set the process name via prctl(PR_SET_NAME) on glibc systems.

    Does nothing on Jython or when libc.so.6 / prctl is unavailable.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    # Initializing the buffer from the bytes allocates len + 1 bytes, so the
    # string handed to prctl is NUL-terminated. A buffer of exactly
    # len(title_bytes) has no room for the terminator.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME (see linux/prctl.h)
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1699
1700
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged when it is
    None or does not begin with start."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
1703
1704
def remove_end(s, end):
    """Return s without the suffix end; s is returned unchanged when it is
    None, does not finish with end, or end is empty."""
    if s is None or not s.endswith(end):
        return s
    # An empty suffix must be special-cased: s[:-0] is the empty string
    return s[:-len(end)] if end else s
1707
1708
def remove_quotes(s):
    """Strip one pair of matching single or double quotes enclosing s.

    None, strings shorter than two characters and strings without a
    matching pair are returned unchanged.
    """
    if s is None or len(s) < 2:
        return s
    opening, closing = s[0], s[-1]
    if opening == closing and opening in ('"', "'"):
        return s[1:-1]
    return s
1716
1717
def url_basename(url):
    """Return the last segment of the URL's path, ignoring leading and
    trailing slashes."""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
1721
1722
def base_url(url):
    """Return the http(s) URL up to and including the last '/' that comes
    before any '?', '#' or '&'."""
    prefix = re.match(r'https?://[^?#&]+/', url)
    return prefix.group()
1725
1726
def urljoin(base, path):
    """Join path onto base, tolerating bad input.

    Returns None when path is not a non-empty text string, path itself when
    it is already an http(s) or protocol-relative URL, and None when base is
    not such a URL; otherwise the joined URL.
    """
    absolute_url = r'^(?:https?:)?//'
    if not (isinstance(path, compat_str) and path):
        return None
    if re.match(absolute_url, path):
        return path
    if isinstance(base, compat_str) and re.match(absolute_url, base):
        return compat_urlparse.urljoin(base, path)
    return None
1735
1736
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
1740
1741
class PUTRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
1745
1746
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, returning default when that is not possible.

    If get_attr is given, the attribute of that name is read from v first.
    The result is int(v) * invscale // scale. None, '' and values int()
    cannot handle (wrong type, malformed text, infinite floats) yield
    default.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        # TypeError: e.g. a list or dict; OverflowError: float infinity
        return default
1759
1760
def str_or_none(v, default=None):
    """Convert v to a text string, or return `default` when v is None."""
    if v is None:
        return default
    return compat_str(v)
1763
1764
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus
    signs are dropped before conversion. None is passed through. """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1771
1772
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, returning `default` when that is impossible.

    None yields `default`. The converted value is multiplied by invscale
    and divided by scale.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # float() raises TypeError (not ValueError) for values such as
        # lists or dicts; treat them like any other unconvertible input.
        return default
1780
1781
def strip_or_none(v):
    """Return v.strip(), or None when v is None."""
    if v is None:
        return None
    return v.strip()
1784
1785
def parse_duration(s):
    """Parse a textual duration and return it as a number of seconds.

    Three notations are tried in order: colon-separated clock values
    ([[[DD:]HH:]MM:]SS[.fraction]), unit-suffixed values (e.g. "1h 2m 3.5s",
    optionally prefixed by "PT"), and a single possibly fractional amount
    of hours or minutes (e.g. "1.5 hours"). Returns None for non-string
    input or when no notation matches.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    # Every component starts out missing; the winning match fills some in
    fields = dict.fromkeys(('days', 'hours', 'mins', 'secs', 'ms'))

    mobj = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
    if mobj is None:
        mobj = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
    if mobj is None:
        # This pattern only defines the hours and mins groups
        mobj = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
    if mobj is None:
        return None
    fields.update(mobj.groupdict())

    duration = 0
    if fields['secs']:
        duration += float(fields['secs'])
    if fields['mins']:
        duration += float(fields['mins']) * 60
    if fields['hours']:
        duration += float(fields['hours']) * 60 * 60
    if fields['days']:
        duration += float(fields['days']) * 24 * 60 * 60
    if fields['ms']:
        # The captured text includes the leading dot, e.g. ".5"
        duration += float(fields['ms'])
    return duration
1832
1833
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the filename's current extension.

    When expected_real_ext is given and the current extension differs from
    it, ext is appended to the untouched filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1840
1841
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's extension with ext.

    When expected_real_ext is given and the current extension differs from
    it, ext is appended to the untouched filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
1847
1848
def check_executable(exe, args=[]):
    """ Checks if the given binary can be launched (e.g. is installed
    somewhere in PATH) and returns its name, or False if it cannot.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Drain both pipes and wait for the process to finish
        process.communicate()
    except OSError:
        return False
    return exe
1857
1858
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.
    The combined stdout/stderr of running it with args is handed to
    detect_exe_version together with version_re and unrecognized. """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = process.communicate()
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1876
1877
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    The first group of version_re (default: the token following the word
    "version") is returned, or `unrecognized` when nothing matches.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    mobj = re.search(pattern, output)
    return mobj.group(1) if mobj else unrecognized
1887
1888
class PagedList(object):
    """Base for paged lists; subclasses are expected to supply getslice()."""

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1893
1894
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages lazily, one pagefunc(pagenum) call per
    page, and stops as soon as a page comes back shorter than pagesize.

    With use_cache=True every fetched page is remembered and reused by
    later getslice() calls.
    """

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list
        (up to the last available entry when end is None)."""
        res = []
        # An empty or inverted range selects nothing. Without this guard
        # e.g. getslice(0, 0) computed endv == pagesize and returned a whole
        # page, and an `end` lying on a page before `start` was never
        # reached, so every remaining page was fetched.
        if end is not None and end <= start:
            return res
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
                if self._use_cache:
                    self._cache[pagenum] = page_results

            # Offset of the first wanted entry inside this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry, if the range ends
            # within this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1945
1946
class InAdvancePagedList(PagedList):
    """Paged list whose total number of pages (pagecount) is known up
    front; pages are fetched with pagefunc(pagenum)."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list
        (up to the last page when end is None)."""
        res = []
        # An empty or inverted range selects nothing; a negative
        # `only_more` below would otherwise slice from the wrong end.
        if end is not None and end <= start:
            return res
        start_page = start // self._pagesize
        # Never request pages beyond the known page count, even when the
        # requested end lies past the end of the list
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page has leading entries to drop
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1974
1975
def uppercase_escape(s):
    """Replace every \\UXXXXXXXX escape sequence in s with the character
    it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def _unescape(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _unescape, s)
1982
1983
def lowercase_escape(s):
    """Replace every \\uXXXX escape sequence in s with the character it
    denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def _unescape(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _unescape, s)
1990
1991
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    on_python2 = sys.version_info < (3, 0)
    if on_python2 and isinstance(s, compat_str):
        # Text strings are handed to quote() as UTF-8 bytes on Python 2
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1997
1998
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host name is IDNA-encoded; every other textual component is
    # percent-escaped
    ascii_netloc = parts.netloc.encode('idna').decode('ascii')
    escaped = parts._replace(
        netloc=ascii_netloc,
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
2009
2010
def read_batch_urls(batch_fd):
    """Read URLs, one per line, from batch_fd and close it afterwards.

    Lines are decoded as UTF-8 when they are not text already, stripped of
    a leading byte order mark and surrounding whitespace, and dropped when
    they are empty or start with '#', ';' or ']'. Returns the remaining
    URLs as a list.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A correctly decoded BOM is U+FEFF; the three-character form is
        # what its UTF-8 bytes look like when decoded as Latin-1. Neither
        # is whitespace, so strip() alone would leave them in the URL.
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2025
2026
def urlencode_postdata(*args, **kargs):
    """URL-encode the given arguments and return the result as ASCII
    bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2029
2030
def update_url_query(url, query):
    """Return url with the parameters from the `query` mapping merged into
    its query string; parameters already present are overridden.

    url is returned untouched when query is empty.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parts.query)
    merged.update(query)
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2039
2040
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request from req with selected parts replaced.

    headers are merged over the existing ones, data and url replace the
    original values when truthy, and query is merged into the URL's query
    string. HEAD and PUT requests keep their method; the timeout attribute
    is carried over when req has one.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    new_data = data or req.data
    new_url = update_url_query(url or req.get_full_url(), query)
    # Methods with a dedicated request class; anything else falls back to
    # the plain Request
    request_types = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }
    req_type = request_types.get(req.get_method(), compat_urllib_request.Request)
    new_req = req_type(
        new_url, data=new_data, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2059
2060
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable of several keys, in d.

    With a list/tuple of keys, the value of the first key that is present
    and not None (and, if skip_false_values is set, not falsy) is returned;
    `default` if there is none. A single key behaves like d.get().
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
2069
2070
def try_get(src, getter, expected_type=None):
    """Return getter(src), or None if the lookup fails.

    AttributeError, KeyError, TypeError and IndexError raised by the getter
    are swallowed. When expected_type is given, a result that is not an
    instance of it is replaced by None as well.
    """
    try:
        value = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is None or isinstance(value, expected_type):
        return value
    return None
2079
2080
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as a text string, decoding it with the given
    encoding and error policy if it is not one already."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2083
2084
# Rating label -> age limit, consulted by parse_age_limit()
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
2092
2093
# TV rating label -> age limit, consulted by parse_age_limit()
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2102
2103
def parse_age_limit(s):
    """Turn an age rating into an integer age limit, or None.

    Accepts an int between 0 and 21, a string of one or two digits with an
    optional trailing '+', or a label found in US_RATINGS or
    TV_PARENTAL_GUIDELINES.
    """
    # Exact type check on purpose: subclasses of int (such as bool) are
    # not treated as ages
    if type(s) is int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    # US ratings take precedence over TV guidelines
    return US_RATINGS.get(s, TV_PARENTAL_GUIDELINES.get(s))
2115
2116
def strip_jsonp(code):
    """Unwrap a JSONP response: `callback(<payload>);`, optionally followed
    by // comments, becomes `<payload>`. Input that does not have this
    shape is returned unchanged."""
    jsonp_re = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_re, r'\1', code)
2120
2121
def js_to_json(code):
    """Rewrite a JavaScript object/array literal as JSON text.

    Strings are re-quoted with double quotes, bare identifiers and numeric
    keys are quoted, hexadecimal and octal integers are converted to
    decimal, and comments as well as trailing commas are removed.
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    # (pattern, base) pairs for integer literals, optionally used as keys
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )
    # Escape sequences (and the bare double quote) that need rewriting
    # inside a string body; anything else is kept verbatim
    STRING_FIXES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }

    def fix_string_part(mobj):
        part = mobj.group(0)
        return STRING_FIXES.get(part, part)

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null'):
            return token
        if token.startswith(('/*', '//')) or token == ',':
            # Comments and trailing commas are dropped
            return ""

        if token[0] in ("'", '"'):
            token = re.sub(r'(?s)\\.|"', fix_string_part, token[1:-1])

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, token)
            if im:
                number = int(im.group(1), base)
                if token.endswith(':'):
                    return '"%d":' % number
                return '%d' % number

        return '"%s"' % token

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
2161
2162
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values:
    the returned function maps an id to its position in quality_ids,
    or to -1 when the id is not listed """
    def q(qid):
        return quality_ids.index(qid) if qid in quality_ids else -1
    return q
2171
2172
# Default output filename template: "<title>-<id>.<ext>"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2174
2175
def limit_length(s, length):
    """ Add ellipses to overly long strings: anything longer than `length`
    is cut so that, ellipses included, it is `length` characters long.
    None is passed through. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
2184
2185
def version_tuple(v):
    """Split a version string on '.' and '-' and return its components as
    a tuple of ints. Raises ValueError for non-numeric components."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2188
2189
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether `version` is older than `limit`.

    A missing or unparsable version is considered outdated only when
    assume_new is false.
    """
    if version:
        try:
            return version_tuple(version) < version_tuple(limit)
        except ValueError:
            # Fall through to the "cannot tell" answer below
            pass
    return not assume_new
2197
2198
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U, i.e. this module was
    loaded by a zipimporter or the interpreter has a `frozen` attribute """
    from zipimport import zipimporter

    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
2204
2205
def args_to_str(args):
    """Get a short string representation for a subprocess command: the
    shell-quoted arguments joined by single spaces."""
    quoted_args = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted_args)
2209
2210
def error_to_compat_str(err):
    """Return the message of err as a text string."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2218
2219
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    Parameters after ';' are ignored and matching is case-insensitive.
    Unknown subtypes are returned as the extension themselves; None is
    passed through.
    """
    if mt is None:
        return None

    # Normalise before any lookup: drop parameters (which may themselves
    # contain '/') and ignore case, so that e.g.
    # 'audio/mp4; codecs="mp4a.40.2"' is still recognised as audio/mp4.
    mt = mt.split(';')[0].strip().lower()

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m': 'f4m',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
    }.get(res, res)
2256
2257
def parse_codecs(codecs_str):
    """Split an RFC 6381 "codecs" parameter into video and audio codecs.

    Returns a dict with 'vcodec' and 'acodec' keys ('none' marks a missing
    stream), or an empty dict when nothing can be determined. Unknown
    codecs are reported on stderr; if exactly two codecs are given and
    neither is recognised, they are taken to be video and audio, in that
    order.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    splited_codecs = [
        codec.strip()
        for codec in codecs_str.strip().strip(',').split(',')]
    splited_codecs = [codec for codec in splited_codecs if codec]
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    if len(splited_codecs) == 2:
        # Nothing was recognised, so vcodec/acodec are both still None
        # here; fall back to the conventional "video, audio" ordering
        # instead of returning those Nones.
        return {
            'vcodec': splited_codecs[0],
            'acodec': splited_codecs[1],
        }
    # A single unrecognised codec could be either kind: report nothing
    # rather than claiming there is no video and an acodec of None.
    return {}
2292
2293
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a response's headers.

    The filename of an `attachment` Content-Disposition is preferred; the
    Content-Type is used otherwise.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
2306
2307
def encode_data_uri(data, mime_type):
    """Return a base64 `data:` URI carrying the bytes `data` with the
    given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2310
2311
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2320
2321
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.
    Returns a match object (truthy) if the text starts with optional
    whitespace followed by '<', None otherwise. """

    # Longer marks come before their own prefixes (UTF-32-LE vs UTF-16-LE)
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    text = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            text = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if text is None:
        # No byte order mark: assume UTF-8
        text = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', text)
2340
2341
def determine_protocol(info_dict):
    """Return the protocol for info_dict.

    An explicit 'protocol' entry wins. Otherwise it is derived from the
    'url' entry: a streaming prefix (rtmp, mms, rtsp), a manifest extension
    (m3u8, f4m), or finally the URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2362
2363
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Widest cell of every column, measured on the text form of the values
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*table)]
    # All columns but the last are left-aligned and padded to width + 1
    padded_columns = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    format_str = ' '.join(padded_columns) + '%s'
    lines = [format_str % tuple(row) for row in table]
    return '\n'.join(lines)
2370
2371
def _match_one(filter_part, dct):
    """Evaluate a single filter expression against dct.

    Supported forms are comparisons ("key OP value", where a '?' after the
    operator makes a missing key pass) and presence tests ("key", "!key").
    Raises ValueError for anything else.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    comparison_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    mobj = comparison_rex.search(filter_part)
    if mobj:
        op_name = mobj.group('op')
        intval = mobj.group('intval')
        strval = mobj.group('strval')
        actual_value = dct.get(mobj.group('key'))
        # If the original field is a string and matching comparisonvalue is
        # a number we should respect the origin of the original field
        # and process comparison value as a string (see
        # https://github.com/rg3/youtube-dl/issues/11082).
        compare_as_string = (
            strval is not None or
            (actual_value is not None and intval is not None and
             isinstance(actual_value, compat_str)))
        if compare_as_string:
            if op_name not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % op_name)
            comparison_value = strval or intval
        else:
            try:
                comparison_value = int(intval)
            except ValueError:
                # Not a plain integer: try it as a file size, with and
                # without an explicit trailing 'B'
                comparison_value = parse_filesize(intval)
                if comparison_value is None:
                    comparison_value = parse_filesize(intval + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            intval, filter_part))
        if actual_value is None:
            # Truthy only if the operator was followed by '?'
            return mobj.group('none_inclusive')
        return COMPARISON_OPERATORS[op_name](actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    unary_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    mobj = unary_rex.search(filter_part)
    if mobj:
        test = UNARY_OPERATORS[mobj.group('op')]
        return test(dct.get(mobj.group('key')))

    raise ValueError('Invalid filter part %r' % filter_part)
2435
2436
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false.
    The '&'-separated parts of filter_str must all match. """

    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2442
2443
def match_filter_func(filter_str):
    """Build a function that checks an info dict against filter_str.

    The returned function yields None when the dict passes and a "skipping"
    message naming the video otherwise.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2452
2453
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP/TTML time expression into seconds.

    Understands plain offsets ("12.5" or "12.5s") and clock values
    ("H:MM:SS", with an optional fraction after '.' or ':'). Returns None
    for empty or unrecognised input.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        # "SS:fff" is read as "SS.fff"
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
2465
2466
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode, HH:MM:SS,mmm."""
    # Work on whole milliseconds, rounded to the nearest one. Truncating
    # the float fraction instead loses a millisecond whenever the value is
    # represented slightly below the intended one (1.001 came out as
    # ",000").
    total_msecs = int(round(seconds * 1000))
    total_secs, msecs = divmod(total_msecs, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, mins = divmod(total_mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, msecs)
2469
2470
def dfxp2srt(dfxp_data):
    """Convert a DFXP/TTML subtitle document (text) into SRT text.

    Every <p> element with a usable begin time and either an end time or a
    duration becomes one SRT cue; <br> elements turn into line breaks.
    Raises ValueError when the document contains no paragraphs.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        """Parser target collecting a paragraph's text content."""

        def __init__(self):
            self._pieces = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self._pieces.append('\n')

        def end(self, tag):
            pass

        def data(self, data):
            self._pieces.append(data)

        def close(self):
            return ''.join(self._pieces).strip()

    def parse_node(node):
        # Re-serialise the element and run it through the text collector
        parser = xml.etree.ElementTree.XMLParser(target=TTMLPElementParser())
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    paras = (
        dfxp.findall(_x('.//ttml:p')) or
        dfxp.findall(_x('.//ttaf1:p')) or
        dfxp.findall(_x('.//ttaf1_0604:p')) or
        dfxp.findall('.//p'))

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    cues = []
    # Cue numbers follow the paragraph position, so skipped paragraphs
    # leave gaps in the numbering
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(cues)
2524
2525
def cli_option(params, command_option, param):
    """Return [command_option, value] for the `param` entry of params, or
    an empty list when that entry is missing or None. Truthy values are
    converted to text."""
    value = params.get(param)
    if value is None:
        return []
    if value:
        value = compat_str(value)
    return [command_option, value]
2531
2532
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render the boolean `param` entry of params as command line
    arguments.

    Yields [command_option, value], or the single argument
    command_option + separator + value when a separator is given. The
    entry must be a bool.
    """
    flag = params.get(param)
    assert isinstance(flag, bool)
    value = true_value if flag else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
2539
2540
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when the `param` entry of params equals
    expected_value, and an empty list otherwise."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2544
2545
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra arguments stored under `param` in params,
    or `default` when that entry is missing or None."""
    ex_args = params.get(param)
    if ex_args is None:
        # Hand out a copy of list defaults: the default argument is a
        # single shared object, so a caller extending the returned list
        # would otherwise change what every later call gets back.
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2552
2553
class ISO639Utils(object):
    """Conversions between two-letter (ISO 639-1) and three-letter
    (ISO 639-2/T) language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are looked up, so longer tags
        # such as 'en-US' resolve through their prefix
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        matches = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matches, None)
2754
2755
class ISO3166Utils(object):
    """Lookup helper mapping two-letter country codes to country names."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """
        Convert a two-letter (ISO 3166-1 alpha-2) country code to the
        corresponding full name

        The lookup is case-insensitive; None is returned for unknown codes.
        """
        normalized_code = code.upper()
        return cls._country_map.get(normalized_code)
3014
3015
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """
    ProxyHandler that lets a single request pick its own proxy through the
    'Ytdl-request-proxy' header.
    """

    def __init__(self, proxies=None):
        def make_default_opener(scheme):
            # Defaults are bound at creation time so that every scheme keeps
            # its own value instead of sharing the last loop iteration's one
            return (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))

        # Set default handlers; the base class initializer called afterwards
        # is free to replace them
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme, make_default_opener(scheme))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """
        Open req through proxy, unless the request carries its own proxy in
        the 'Ytdl-request-proxy' header, which then takes precedence (and is
        removed from the request).

        Returns None when no proxy applies or when the proxy is a SOCKS one;
        otherwise delegates to ProxyHandler.proxy_open.
        """
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            proxy = per_request_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy

        socks_schemes = ('socks', 'socks4', 'socks4a', 'socks5')
        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in socks_schemes:
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers take care of wrapping the
            # socket with socks
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
3039
3040
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt a single block with OHDave's RSA algorithm.
    See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The byte order is reversed before the data is read as one big integer
    reversed_hex = binascii.hexlify(data[::-1])
    plain_number = int(reversed_hex, 16)
    return '%x' % pow(plain_number, exponent, modulus)
3056
3057
def encode_base_n(num, n, table=None):
    """
    Encode a non-negative integer as a string of base n digits.

    Input:
        num: non-negative integer to encode
        n: base; only the first n characters of the table are used as digits
        table: optional digit alphabet, defaults to the first n characters
               of 0-9a-zA-Z
    Output: string of digits, most significant digit first

    Raises ValueError if n exceeds the table length, if num is negative, or
    if num is non-zero and n is smaller than 2.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Floor division never reaches 0 for a negative number or with base 1
    # (and divides by zero with base 0), so reject these up front instead of
    # looping forever
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)
    if n < 2:
        raise ValueError('base %d is too small to encode %d' % (n, num))

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
3074
3075
def decode_packed_codes(code):
    """
    Unpack JavaScript obfuscated with a P.A.C.K.E.R.-style packer.

    PACKED_CODES_RE is searched in code and must capture four groups: the
    packed payload, the base, the symbol count and the '|'-separated symbols.
    Every word of the payload is then replaced by the symbol stored at the
    index the word encodes in the given base; an empty symbol stands for the
    word itself.

    Returns the payload with all known words substituted. Words that have no
    entry in the symbol table are left untouched.

    Raises AttributeError if code does not match PACKED_CODES_RE.
    """
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    while count:
        count -= 1
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

    def _substitute(word_mobj):
        word = word_mobj.group(0)
        # Keep words the packer did not encode instead of failing on them
        return symbol_table.get(word, word)

    return re.sub(r'\b(\w+)\b', _substitute, obfuscated_code)
3092
3093
def parse_m3u8_attributes(attrib):
    """
    Parse an m3u8 attribute list (comma-separated KEY=VALUE pairs) into a
    dict. Values may be quoted, in which case the surrounding double quotes
    are stripped and commas inside them are preserved.
    """
    attributes = {}
    pair_re = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    for pair in re.finditer(pair_re, attrib):
        raw_value = pair.group('val')
        is_quoted = raw_value.startswith('"')
        attributes[pair.group('key')] = raw_value[1:-1] if is_quoted else raw_value
    return attributes
3101
3102
def urshift(val, n):
    """
    Unsigned right shift: a negative val is first offset by 2**32, i.e.
    treated as an unsigned 32-bit integer, before being shifted by n bits.
    """
    if val < 0:
        val += 0x100000000
    return val >> n
3105
3106
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """
    Decode the image data of a PNG file.

    Every scanline is read as one filter-type byte followed by width * 3
    sample bytes (3 bytes per pixel); bit depth, colour type and interlacing
    from the IHDR chunk are not looked at.

    Returns a (width, height, pixels) tuple where pixels holds one list per
    scanline, each a flat list of width * 3 unfiltered byte values.

    Raises IOError if the signature or the IHDR chunk is missing or if there
    is no IDAT data.
    """
    # Reference: https://www.w3.org/TR/PNG/
    body = png_data[8:]

    has_signature = png_data[:8] == b'\x89PNG\x0d\x0a\x1a\x0a'
    if not has_signature or body[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    struct_formats = {1: '>B', 2: '>H', 4: '>I'}

    def unpack_integer(raw):
        return compat_struct_unpack(struct_formats[len(raw)], raw)[0]

    # Walk the chunk list: length (4 bytes), type (4 bytes), data, CRC (4 bytes)
    chunks = []
    offset = 0
    body_size = len(body)
    while offset < body_size:
        data_size = unpack_integer(body[offset:offset + 4])
        chunk_type = body[offset + 4:offset + 8]
        chunk_data = body[offset + 8:offset + 8 + data_size]
        chunks.append((chunk_type, chunk_data))
        offset += 8 + data_size + 4  # the CRC is skipped, not verified

    # The check above guarantees that the first chunk is IHDR
    ihdr = chunks[0][1]
    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''.join(
        chunk_data for chunk_type, chunk_data in chunks
        if chunk_type == b'IDAT')
    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    bytes_per_pixel = 3
    stride = width * bytes_per_pixel
    pixels = []

    def paeth_predictor(a, b, c):
        # Pick whichever of left (a), up (b) and upper left (c) is closest
        # to a + b - c, preferring them in that order on ties
        estimate = a + b - c
        distance_a = abs(estimate - a)
        distance_b = abs(estimate - b)
        distance_c = abs(estimate - c)
        if distance_a <= distance_b and distance_a <= distance_c:
            return a
        if distance_b <= distance_c:
            return b
        return c

    for row_index in range(height):
        row_start = row_index * (1 + stride)
        filter_type = decompressed_data[row_start]

        previous_row = pixels[-1] if pixels else None
        current_row = []
        pixels.append(current_row)

        for column in range(stride):
            color = decompressed_data[1 + row_start + column]
            has_left = column >= bytes_per_pixel

            # Neighbours outside of the image count as 0
            left = current_row[column - bytes_per_pixel] if has_left else 0
            up = previous_row[column] if previous_row is not None else 0

            if filter_type == 1:  # Sub
                predictor = left
            elif filter_type == 2:  # Up
                predictor = up
            elif filter_type == 3:  # Average
                predictor = (left + up) >> 1
            elif filter_type == 4:  # Paeth
                upper_left = 0
                if has_left and previous_row is not None:
                    upper_left = previous_row[column - bytes_per_pixel]
                predictor = paeth_predictor(left, up, upper_left)
            else:  # None (or unknown): the byte is stored as is
                predictor = 0

            current_row.append((color + predictor) & 0xff)

    return width, height, pixels
3212
3213
def write_xattr(path, key, value):
    """
    Set the extended attribute key of the file at path to value.

    Backends are tried in this order:
      1. the python 'xattr' module, which is either pyxattr (exposes
         xattr.set) or xattr (exposes xattr.setxattr)
      2. if that module cannot be imported and compat_os_name is 'nt': an
         NTFS Alternate Data Stream named path + ':' + key
      3. otherwise the 'setfattr' or the 'xattr' command line tool

    value is written in binary mode in case 2 and decoded as UTF-8 before
    being passed to the command line tools in case 3.

    Raises XAttrMetadataError if the chosen backend fails, and
    XAttrUnavailableError if pyxattr is older than 0.5.0 or if no backend is
    available.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                # NOTE: despite what the message says, no other implementation
                # is tried here; the error is not an ImportError and thus
                # propagates to the caller
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        # No python xattr module at all: use a platform specific fallback
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                value = value.decode('utf-8')
                # setfattr is preferred when both tools are available
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                # A failing tool is reported with its exit status and its
                # error output
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")