4 from __future__ import unicode_literals
34 import xml.etree.ElementTree
41 compat_etree_fromstring,
43 compat_html_entities_html5,
49 compat_socket_create_connection,
55 compat_urllib_parse_urlencode,
56 compat_urllib_parse_urlparse,
57 compat_urllib_parse_unquote_plus,
58 compat_urllib_request,
def register_socks_protocols():
    """Teach urlparse that SOCKS schemes carry a network location.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose protocol is not listed in
    urlparse.uses_netloc are not handled correctly.
    """
    netloc_schemes = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in netloc_schemes:
            netloc_schemes.append(scheme)
78 # This is not clearly defined otherwise
79 compiled_regex_type = type(re.compile(''))
82 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
83 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
84 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
85 'Accept-Encoding': 'gzip, deflate',
86 'Accept-Language': 'en-us,en;q=0.5',
91 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# English month names, index 0 = January; used when parsing textual dates.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
102 'en': ENGLISH_MONTH_NAMES,
104 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
105 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
109 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
110 'flv', 'f4v', 'f4a', 'f4b',
111 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
112 'mkv', 'mka', 'mk3d',
121 'f4f', 'f4m', 'm3u8', 'smil')
# needed for sanitizing filenames in restricted mode
# Maps each accented character to an ASCII replacement: a single letter or a
# digraph such as 'AE'/'oe'/'ss' (the bracketed list entries in the chain).
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
149 '%Y-%m-%d %H:%M:%S.%f',
152 '%Y-%m-%dT%H:%M:%SZ',
153 '%Y-%m-%dT%H:%M:%S.%fZ',
154 '%Y-%m-%dT%H:%M:%S.%f0Z',
156 '%Y-%m-%dT%H:%M:%S.%f',
159 '%b %d %Y at %H:%M:%S',
162 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
163 DATE_FORMATS_DAY_FIRST.extend([
172 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
173 DATE_FORMATS_MONTH_FIRST.extend([
181 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
184 def preferredencoding():
185 """Get preferred encoding.
187 Returns the best encoding scheme for the system, based on
188 locale.getpreferredencoding() and some further tweaks.
191 pref = locale.getpreferredencoding()
199 def write_json_file(obj, fn):
200 """ Encode obj as JSON and write it to fn, atomically if possible """
202 fn = encodeFilename(fn)
203 if sys.version_info < (3, 0) and sys.platform != 'win32':
204 encoding = get_filesystem_encoding()
205 # os.path.basename returns a bytes object, but NamedTemporaryFile
206 # will fail if the filename contains non ascii characters unless we
207 # use a unicode object
208 path_basename = lambda f: os.path.basename(fn).decode(encoding)
209 # the same for os.path.dirname
210 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
212 path_basename = os.path.basename
213 path_dirname = os.path.dirname
217 'prefix': path_basename(fn) + '.',
218 'dir': path_dirname(fn),
222 # In Python 2.x, json.dump expects a bytestream.
223 # In Python 3.x, it writes to a character stream
224 if sys.version_info < (3, 0):
232 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
237 if sys.platform == 'win32':
238 # Need to remove existing file on Windows, else os.rename raises
239 # WindowsError or FileExistsError.
244 os.rename(tf.name, fn)
253 if sys.version_info >= (2, 7):
254 def find_xpath_attr(node, xpath, key, val=None):
255 """ Find the xpath xpath[@key=val] """
256 assert re.match(r'^[a-zA-Z_-]+$', key)
257 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
258 return node.find(expr)
260 def find_xpath_attr(node, xpath, key, val=None):
261 for f in node.findall(compat_xpath(xpath)):
262 if key not in f.attrib:
264 if val is None or f.attrib.get(key) == val:
268 # On python2.6 the xml.etree.ElementTree.Element methods don't support
269 # the namespace parameter
272 def xpath_with_ns(path, ns_map):
273 components = [c.split(':') for c in path.split('/')]
277 replaced.append(c[0])
280 replaced.append('{%s}%s' % (ns_map[ns], tag))
281 return '/'.join(replaced)
284 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
285 def _find_xpath(xpath):
286 return node.find(compat_xpath(xpath))
288 if isinstance(xpath, (str, compat_str)):
289 n = _find_xpath(xpath)
297 if default is not NO_DEFAULT:
300 name = xpath if name is None else name
301 raise ExtractorError('Could not find XML element %s' % name)
307 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
308 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
309 if n is None or n == default:
312 if default is not NO_DEFAULT:
315 name = xpath if name is None else name
316 raise ExtractorError('Could not find XML element\'s text %s' % name)
322 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
323 n = find_xpath_attr(node, xpath, key)
325 if default is not NO_DEFAULT:
328 name = '%s[@%s]' % (xpath, key) if name is None else name
329 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the inner content of the tag whose id attribute equals *id*.

    Thin wrapper over the generic attribute matcher.
    """
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag carrying the given class, or None."""
    matches = get_elements_by_class(class_name, html)
    if matches:
        return matches[0]
    return None
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag whose *attribute* matches *value*,
    or None when nothing matches."""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if matches:
        return matches[0]
    return None
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # Match the class name as a whole word anywhere inside the (quoted)
    # class attribute value; the pattern is pre-escaped, so escaping is off.
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_pattern, html, escape_value=False)
358 def get_elements_by_attribute(attribute, value, html, escape_value=True):
359 """Return the content of the tag with the specified attribute in the passed HTML document"""
361 value = re.escape(value) if escape_value else value
364 for m in re.finditer(r'''(?xs)
366 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
368 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
372 ''' % (re.escape(attribute), value), html):
373 res = m.group('content')
375 if res.startswith('"') or res.startswith("'"):
378 retlist.append(unescapeHTML(res))
383 class HTMLAttributeParser(compat_HTMLParser):
384 """Trivial HTML parser to gather the attributes for a single element"""
387 compat_HTMLParser.__init__(self)
389 def handle_starttag(self, tag, attrs):
390 self.attrs = dict(attrs)
393 def extract_attributes(html_element):
394 """Given a string for an HTML element such as
396 a="foo" B="bar" c="&98;az" d=boz
397 empty= noval entity="&"
400 Decode and return a dictionary of attributes.
402 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
403 'empty': '', 'noval': None, 'entity': '&',
404 'sq': '"', 'dq': '\''
406 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
407 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
409 parser = HTMLAttributeParser()
410 parser.feed(html_element)
415 def clean_html(html):
416 """Clean an HTML snippet into a readable string"""
418 if html is None: # Convenience for sanitizing descriptions etc.
422 html = html.replace('\n', ' ')
423 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
424 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
426 html = re.sub('<.*?>', '', html)
427 # Replace html entities
428 html = unescapeHTML(html)
432 def sanitize_open(filename, open_mode):
433 """Try to open the given filename, and slightly tweak it if this fails.
435 Attempts to open the given filename. If this fails, it tries to change
436 the filename slightly, step by step, until it's either able to open it
437 or it fails and raises a final exception, like the standard open()
440 It returns the tuple (stream, definitive_file_name).
444 if sys.platform == 'win32':
446 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
447 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
448 stream = open(encodeFilename(filename), open_mode)
449 return (stream, filename)
450 except (IOError, OSError) as err:
451 if err.errno in (errno.EACCES,):
454 # In case of error, try to remove win32 forbidden chars
455 alt_filename = sanitize_path(filename)
456 if alt_filename == filename:
459 # An exception here should be caught in the caller
460 stream = open(encodeFilename(alt_filename), open_mode)
461 return (stream, alt_filename)
464 def timeconvert(timestr):
465 """Convert RFC 2822 defined time string into system timestamp"""
467 timetuple = email.utils.parsedate_tz(timestr)
468 if timetuple is not None:
469 timestamp = email.utils.mktime_tz(timetuple)
473 def sanitize_filename(s, restricted=False, is_id=False):
474 """Sanitizes a string so it could be used as part of a filename.
475 If restricted is set, use a stricter subset of allowed characters.
476 Set is_id if this is not an arbitrary string, but an ID that should be kept
479 def replace_insane(char):
480 if restricted and char in ACCENT_CHARS:
481 return ACCENT_CHARS[char]
482 if char == '?' or ord(char) < 32 or ord(char) == 127:
485 return '' if restricted else '\''
487 return '_-' if restricted else ' -'
488 elif char in '\\/|*<>':
490 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
492 if restricted and ord(char) > 127:
497 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
498 result = ''.join(map(replace_insane, s))
500 while '__' in result:
501 result = result.replace('__', '_')
502 result = result.strip('_')
503 # Common case of "Foreign band name - English song title"
504 if restricted and result.startswith('-_'):
506 if result.startswith('-'):
507 result = '_' + result[len('-'):]
508 result = result.lstrip('.')
514 def sanitize_path(s):
515 """Sanitizes and normalizes path on Windows"""
516 if sys.platform != 'win32':
518 drive_or_unc, _ = os.path.splitdrive(s)
519 if sys.version_info < (2, 7) and not drive_or_unc:
520 drive_or_unc, _ = os.path.splitunc(s)
521 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
525 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
526 for path_part in norm_path]
528 sanitized_path.insert(0, drive_or_unc + os.path.sep)
529 return os.path.join(*sanitized_path)
532 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
533 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Prepend 'http:' to protocol-relative URLs ('//host/...').

    Mitigates failures on URLs that omit the scheme; all other URLs are
    returned unchanged.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request after normalizing protocol-relative URLs."""
    safe_url = sanitize_url(url)
    return compat_urllib_request.Request(safe_url, *args, **kwargs)
542 def orderedSet(iterable):
543 """ Remove all duplicates from the input iterable """
551 def _htmlentity_transform(entity_with_semicolon):
552 """Transforms an HTML entity to a character."""
553 entity = entity_with_semicolon[:-1]
555 # Known non-numeric HTML entity
556 if entity in compat_html_entities.name2codepoint:
557 return compat_chr(compat_html_entities.name2codepoint[entity])
559 # TODO: HTML5 allows entities without a semicolon. For example,
560 # 'Éric' should be decoded as 'Éric'.
561 if entity_with_semicolon in compat_html_entities_html5:
562 return compat_html_entities_html5[entity_with_semicolon]
564 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
566 numstr = mobj.group(1)
567 if numstr.startswith('x'):
569 numstr = '0%s' % numstr
572 # See https://github.com/rg3/youtube-dl/issues/7518
574 return compat_chr(int(numstr, base))
578 # Unknown entity in name, return its literal representation
579 return '&%s;' % entity
585 assert type(s) == compat_str
588 r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
591 def get_subprocess_encoding():
592 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
593 # For subprocess calls, encode with locale encoding
594 # Refer to http://stackoverflow.com/a/9951851/35070
595 encoding = preferredencoding()
597 encoding = sys.getfilesystemencoding()
603 def encodeFilename(s, for_subprocess=False):
605 @param s The name of the file
608 assert type(s) == compat_str
610 # Python 3 has a Unicode API
611 if sys.version_info >= (3, 0):
614 # Pass '' directly to use Unicode APIs on Windows 2000 and up
615 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
616 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
617 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
620 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
621 if sys.platform.startswith('java'):
624 return s.encode(get_subprocess_encoding(), 'ignore')
627 def decodeFilename(b, for_subprocess=False):
629 if sys.version_info >= (3, 0):
632 if not isinstance(b, bytes):
635 return b.decode(get_subprocess_encoding(), 'ignore')
638 def encodeArgument(s):
639 if not isinstance(s, compat_str):
640 # Legacy code that uses byte strings
641 # Uncomment the following line after fixing all post processors
642 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
643 s = s.decode('ascii')
644 return encodeFilename(s, True)
647 def decodeArgument(b):
648 return decodeFilename(b, True)
651 def decodeOption(optval):
654 if isinstance(optval, bytes):
655 optval = optval.decode(preferredencoding())
657 assert isinstance(optval, compat_str)
661 def formatSeconds(secs):
663 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
665 return '%d:%02d' % (secs // 60, secs % 60)
670 def make_HTTPS_handler(params, **kwargs):
671 opts_no_check_certificate = params.get('nocheckcertificate', False)
672 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
673 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
674 if opts_no_check_certificate:
675 context.check_hostname = False
676 context.verify_mode = ssl.CERT_NONE
678 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
681 # (create_default_context present but HTTPSHandler has no context=)
684 if sys.version_info < (3, 2):
685 return YoutubeDLHTTPSHandler(params, **kwargs)
687 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
688 context.verify_mode = (ssl.CERT_NONE
689 if opts_no_check_certificate
690 else ssl.CERT_REQUIRED)
691 context.set_default_verify_paths()
692 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
695 def bug_reports_message():
696 if ytdl_is_updateable():
697 update_cmd = 'type youtube-dl -U to update'
699 update_cmd = 'see https://yt-dl.org/update on how to update'
700 msg = '; please report this issue on https://yt-dl.org/bug .'
701 msg += ' Make sure you are using the latest version; %s.' % update_cmd
702 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
# Root of the youtube-dl exception hierarchy; all custom errors below derive
# from it so callers can catch every youtube-dl failure with one clause.
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
711 class ExtractorError(YoutubeDLError):
712 """Error during info extraction."""
714 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
715 """ tb, if given, is the original traceback (so that it can be printed out).
716 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
719 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
721 if video_id is not None:
722 msg = video_id + ': ' + msg
724 msg += ' (caused by %r)' % cause
726 msg += bug_reports_message()
727 super(ExtractorError, self).__init__(msg)
730 self.exc_info = sys.exc_info() # preserve original exception
732 self.video_id = video_id
734 def format_traceback(self):
735 if self.traceback is None:
737 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor is able to handle the given URL.

    Marked expected=True: this is a normal user-facing condition, not a bug.
    """

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression expected to match the page did not."""
752 class GeoRestrictedError(ExtractorError):
753 """Geographic restriction Error exception.
755 This exception may be thrown when a video is not available from your
756 geographic location due to geographic restrictions imposed by a website.
758 def __init__(self, msg, countries=None):
759 super(GeoRestrictedError, self).__init__(msg, expected=True)
761 self.countries = countries
764 class DownloadError(YoutubeDLError):
765 """Download Error exception.
767 This exception may be thrown by FileDownloader objects if they are not
768 configured to continue on errors. They will contain the appropriate
772 def __init__(self, msg, exc_info=None):
773 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
774 super(DownloadError, self).__init__(msg)
775 self.exc_info = exc_info
778 class SameFileError(YoutubeDLError):
779 """Same File exception.
781 This exception will be thrown by FileDownloader objects if they detect
782 multiple files would have to be downloaded to the same file on disk.
787 class PostProcessingError(YoutubeDLError):
788 """Post Processing exception.
790 This exception may be raised by PostProcessor's .run() method to
791 indicate an error in the postprocessing task.
794 def __init__(self, msg):
795 super(PostProcessingError, self).__init__(msg)
class MaxDownloadsReached(YoutubeDLError):
    """ --max-downloads limit has been reached. """
804 class UnavailableVideoError(YoutubeDLError):
805 """Unavailable Format exception.
807 This exception will be thrown when a video is requested
808 in a format that is not available for that video.
813 class ContentTooShortError(YoutubeDLError):
814 """Content Too Short exception.
816 This exception may be raised by FileDownloader objects when a file they
817 download is too small for what the server announced first, indicating
818 the connection was probably interrupted.
821 def __init__(self, downloaded, expected):
822 super(ContentTooShortError, self).__init__(
823 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
826 self.downloaded = downloaded
827 self.expected = expected
830 class XAttrMetadataError(YoutubeDLError):
831 def __init__(self, code=None, msg='Unknown error'):
832 super(XAttrMetadataError, self).__init__(msg)
836 # Parsing code and msg
837 if (self.code in (errno.ENOSPC, errno.EDQUOT) or
838 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
839 self.reason = 'NO_SPACE'
840 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
841 self.reason = 'VALUE_TOO_LONG'
843 self.reason = 'NOT_SUPPORTED'
846 class XAttrUnavailableError(YoutubeDLError):
850 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
851 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
852 # expected HTTP responses to meet HTTP/1.0 or later (see also
853 # https://github.com/rg3/youtube-dl/issues/6727)
854 if sys.version_info < (3, 0):
855 kwargs[b'strict'] = True
856 hc = http_class(*args, **kwargs)
857 source_address = ydl_handler._params.get('source_address')
858 if source_address is not None:
859 sa = (source_address, 0)
860 if hasattr(hc, 'source_address'): # Python 2.7+
861 hc.source_address = sa
863 def _hc_connect(self, *args, **kwargs):
864 sock = compat_socket_create_connection(
865 (self.host, self.port), self.timeout, sa)
867 self.sock = ssl.wrap_socket(
868 sock, self.key_file, self.cert_file,
869 ssl_version=ssl.PROTOCOL_TLSv1)
872 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Process internal youtube-dl marker headers before a real request.

    When the 'Youtubedl-no-compression' marker is present, return a copy of
    *headers* with every Accept-Encoding header and the marker itself
    removed; otherwise return *headers* unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict((k, v) for k, v in headers.items()
                    if k.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
887 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
888 """Handler for HTTP requests and responses.
890 This class, when installed with an OpenerDirector, automatically adds
891 the standard headers to every HTTP request and handles gzipped and
892 deflated responses from web servers. If compression is to be avoided in
893 a particular request, the original request in the program code only has
894 to include the HTTP header "Youtubedl-no-compression", which will be
895 removed before making the real request.
897 Part of this code was copied from:
899 http://techknack.net/python-urllib2-handlers/
901 Andrew Rowls, the author of that code, agreed to release it to the
905 def __init__(self, params, *args, **kwargs):
906 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
907 self._params = params
909 def http_open(self, req):
910 conn_class = compat_http_client.HTTPConnection
912 socks_proxy = req.headers.get('Ytdl-socks-proxy')
914 conn_class = make_socks_conn_class(conn_class, socks_proxy)
915 del req.headers['Ytdl-socks-proxy']
917 return self.do_open(functools.partial(
918 _create_http_connection, self, conn_class, False),
924 return zlib.decompress(data, -zlib.MAX_WBITS)
926 return zlib.decompress(data)
929 def addinfourl_wrapper(stream, headers, url, code):
930 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
931 return compat_urllib_request.addinfourl(stream, headers, url, code)
932 ret = compat_urllib_request.addinfourl(stream, headers, url)
936 def http_request(self, req):
937 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
938 # always respected by websites, some tend to give out URLs with non percent-encoded
939 # non-ASCII characters (see telemb.py, ard.py [#3412])
940 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
941 # To work around aforementioned issue we will replace request's original URL with
942 # percent-encoded one
943 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
944 # the code of this workaround has been moved here from YoutubeDL.urlopen()
945 url = req.get_full_url()
946 url_escaped = escape_url(url)
948 # Substitute URL if any change after escaping
949 if url != url_escaped:
950 req = update_Request(req, url=url_escaped)
952 for h, v in std_headers.items():
953 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
954 # The dict keys are capitalized because of this bug by urllib
955 if h.capitalize() not in req.headers:
958 req.headers = handle_youtubedl_headers(req.headers)
960 if sys.version_info < (2, 7) and '#' in req.get_full_url():
961 # Python 2.6 is brain-dead when it comes to fragments
962 req._Request__original = req._Request__original.partition('#')[0]
963 req._Request__r_type = req._Request__r_type.partition('#')[0]
967 def http_response(self, req, resp):
970 if resp.headers.get('Content-encoding', '') == 'gzip':
971 content = resp.read()
972 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
974 uncompressed = io.BytesIO(gz.read())
975 except IOError as original_ioerror:
976 # There may be junk add the end of the file
977 # See http://stackoverflow.com/q/4928560/35070 for details
978 for i in range(1, 1024):
980 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
981 uncompressed = io.BytesIO(gz.read())
986 raise original_ioerror
987 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
988 resp.msg = old_resp.msg
989 del resp.headers['Content-encoding']
991 if resp.headers.get('Content-encoding', '') == 'deflate':
992 gz = io.BytesIO(self.deflate(resp.read()))
993 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
994 resp.msg = old_resp.msg
995 del resp.headers['Content-encoding']
996 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
997 # https://github.com/rg3/youtube-dl/issues/6457).
998 if 300 <= resp.code < 400:
999 location = resp.headers.get('Location')
1001 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1002 if sys.version_info >= (3, 0):
1003 location = location.encode('iso-8859-1').decode('utf-8')
1005 location = location.decode('utf-8')
1006 location_escaped = escape_url(location)
1007 if location != location_escaped:
1008 del resp.headers['Location']
1009 if sys.version_info < (3, 0):
1010 location_escaped = location_escaped.encode('utf-8')
1011 resp.headers['Location'] = location_escaped
1014 https_request = http_request
1015 https_response = http_response
1018 def make_socks_conn_class(base_class, socks_proxy):
1019 assert issubclass(base_class, (
1020 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
1022 url_components = compat_urlparse.urlparse(socks_proxy)
1023 if url_components.scheme.lower() == 'socks5':
1024 socks_type = ProxyType.SOCKS5
1025 elif url_components.scheme.lower() in ('socks', 'socks4'):
1026 socks_type = ProxyType.SOCKS4
1027 elif url_components.scheme.lower() == 'socks4a':
1028 socks_type = ProxyType.SOCKS4A
1030 def unquote_if_non_empty(s):
1033 return compat_urllib_parse_unquote_plus(s)
1037 url_components.hostname, url_components.port or 1080,
1039 unquote_if_non_empty(url_components.username),
1040 unquote_if_non_empty(url_components.password),
1043 class SocksConnection(base_class):
1045 self.sock = sockssocket()
1046 self.sock.setproxy(*proxy_args)
1047 if type(self.timeout) in (int, float):
1048 self.sock.settimeout(self.timeout)
1049 self.sock.connect((self.host, self.port))
1051 if isinstance(self, compat_http_client.HTTPSConnection):
1052 if hasattr(self, '_context'): # Python > 2.6
1053 self.sock = self._context.wrap_socket(
1054 self.sock, server_hostname=self.host)
1056 self.sock = ssl.wrap_socket(self.sock)
1058 return SocksConnection
1061 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1062 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1063 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1064 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1065 self._params = params
1067 def https_open(self, req):
1069 conn_class = self._https_conn_class
1071 if hasattr(self, '_context'): # python > 2.6
1072 kwargs['context'] = self._context
1073 if hasattr(self, '_check_hostname'): # python 3.x
1074 kwargs['check_hostname'] = self._check_hostname
1076 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1078 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1079 del req.headers['Ytdl-socks-proxy']
1081 return self.do_open(functools.partial(
1082 _create_http_connection, self, conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor with an optional cookiejar argument.

    Delegates all real work to the stdlib HTTPCookieProcessor; parent methods
    are called explicitly (not via super()) — presumably for Python 2
    old-style-class compatibility — TODO confirm.
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # NOTE(review): the workaround below is currently disabled (commented
        # out); responses are handed to the parent untouched.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # HTTPS traffic reuses the HTTP cookie logic unchanged.
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1110 def extract_timezone(date_str):
1112 r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
1115 timezone = datetime.timedelta()
1117 date_str = date_str[:-len(m.group('tz'))]
1118 if not m.group('sign'):
1119 timezone = datetime.timedelta()
1121 sign = 1 if m.group('sign') == '+' else -1
1122 timezone = datetime.timedelta(
1123 hours=sign * int(m.group('hours')),
1124 minutes=sign * int(m.group('minutes')))
1125 return timezone, date_str
1128 def parse_iso8601(date_str, delimiter='T', timezone=None):
1129 """ Return a UNIX timestamp from the given date """
1131 if date_str is None:
1134 date_str = re.sub(r'\.[0-9]+', '', date_str)
1136 if timezone is None:
1137 timezone, date_str = extract_timezone(date_str)
1140 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1141 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1142 return calendar.timegm(dt.timetuple())
def date_formats(day_first=True):
    """Return the strptime format list matching the day/month ordering."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1151 def unified_strdate(date_str, day_first=True):
1152 """Return a string with the date in the format YYYYMMDD"""
1154 if date_str is None:
1158 date_str = date_str.replace(',', ' ')
1159 # Remove AM/PM + timezone
1160 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1161 _, date_str = extract_timezone(date_str)
1163 for expression in date_formats(day_first):
1165 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1168 if upload_date is None:
1169 timetuple = email.utils.parsedate_tz(date_str)
1172 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1175 if upload_date is not None:
1176 return compat_str(upload_date)
1179 def unified_timestamp(date_str, day_first=True):
1180 if date_str is None:
1183 date_str = date_str.replace(',', ' ')
1185 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1186 timezone, date_str = extract_timezone(date_str)
1188 # Remove AM/PM + timezone
1189 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1191 for expression in date_formats(day_first):
1193 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1194 return calendar.timegm(dt.timetuple())
1197 timetuple = email.utils.parsedate_tz(date_str)
1199 return calendar.timegm(timetuple) + pm_delta * 3600
1202 def determine_ext(url, default_ext='unknown_video'):
1205 guess = url.partition('?')[0].rpartition('.')[2]
1206 if re.match(r'^[A-Za-z0-9]+$', guess):
1208 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1209 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1210 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle file name: insert the language code before a new
    subtitle extension, replacing the media extension (if any)."""
    stem = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (stem, sub_lang, sub_format)
1219 def date_from_str(date_str):
1221 Return a datetime object from a string in the format YYYYMMDD or
1222 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1223 today = datetime.date.today()
1224 if date_str in ('now', 'today'):
1226 if date_str == 'yesterday':
1227 return today - datetime.timedelta(days=1)
1228 match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1229 if match is not None:
1230 sign = match.group('sign')
1231 time = int(match.group('time'))
1234 unit = match.group('unit')
1235 # A bad approximation?
1239 elif unit == 'year':
1243 delta = datetime.timedelta(**{unit: time})
1244 return today + delta
1245 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1248 def hyphenate_date(date_str):
1250 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1251 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1252 if match is not None:
1253 return '-'.join(match.groups())
1258 class DateRange(object):
1259 """Represents a time interval between two dates"""
1261 def __init__(self, start=None, end=None):
1262 """start and end must be strings in the format accepted by date"""
1263 if start is not None:
1264 self.start = date_from_str(start)
1266 self.start = datetime.datetime.min.date()
1268 self.end = date_from_str(end)
1270 self.end = datetime.datetime.max.date()
1271 if self.start > self.end:
1272 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1276 """Returns a range that only contains the given day"""
1277 return cls(day, day)
1279 def __contains__(self, date):
1280 """Check if the date is in the range"""
1281 if not isinstance(date, datetime.date):
1282 date = date_from_str(date)
1283 return self.start <= date <= self.end
1286 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1289 def platform_name():
1290 """ Returns the platform name as a compat_str """
1291 res = platform.platform()
1292 if isinstance(res, bytes):
1293 res = res.decode(preferredencoding())
1295 assert isinstance(res, compat_str)
1299 def _windows_write_string(s, out):
1300 """ Returns True if the string was written using special methods,
1301 False if it has yet to be written out."""
1302 # Adapted from http://stackoverflow.com/a/3259271/35070
1305 import ctypes.wintypes
1313 fileno = out.fileno()
1314 except AttributeError:
1315 # If the output stream doesn't have a fileno, it's virtual
1317 except io.UnsupportedOperation:
1318 # Some strange Windows pseudo files?
1320 if fileno not in WIN_OUTPUT_IDS:
1323 GetStdHandle = ctypes.WINFUNCTYPE(
1324 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1325 (b'GetStdHandle', ctypes.windll.kernel32))
1326 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1328 WriteConsoleW = ctypes.WINFUNCTYPE(
1329 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1330 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1331 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1332 written = ctypes.wintypes.DWORD(0)
1334 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1335 FILE_TYPE_CHAR = 0x0002
1336 FILE_TYPE_REMOTE = 0x8000
1337 GetConsoleMode = ctypes.WINFUNCTYPE(
1338 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1339 ctypes.POINTER(ctypes.wintypes.DWORD))(
1340 (b'GetConsoleMode', ctypes.windll.kernel32))
1341 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1343 def not_a_console(handle):
1344 if handle == INVALID_HANDLE_VALUE or handle is None:
1346 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1347 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1349 if not_a_console(h):
1352 def next_nonbmp_pos(s):
1354 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1355 except StopIteration:
1359 count = min(next_nonbmp_pos(s), 1024)
1361 ret = WriteConsoleW(
1362 h, s, count if count else 2, ctypes.byref(written), None)
1364 raise OSError('Failed to write string')
1365 if not count: # We just wrote a non-BMP character
1366 assert written.value == 2
1369 assert written.value > 0
1370 s = s[written.value:]
1374 def write_string(s, out=None, encoding=None):
1377 assert type(s) == compat_str
1379 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1380 if _windows_write_string(s, out):
1383 if ('b' in getattr(out, 'mode', '') or
1384 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1385 byt = s.encode(encoding or preferredencoding(), 'ignore')
1387 elif hasattr(out, 'buffer'):
1388 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1389 byt = s.encode(enc, 'ignore')
1390 out.buffer.write(byt)
1396 def bytes_to_intlist(bs):
1399 if isinstance(bs[0], int): # Python 3
1402 return [ord(c) for c in bs]
1405 def intlist_to_bytes(xs):
1408 return compat_struct_pack('%dB' % len(xs), *xs)
1411 # Cross-platform file locking
1412 if sys.platform == 'win32':
1413 import ctypes.wintypes
1416 class OVERLAPPED(ctypes.Structure):
1418 ('Internal', ctypes.wintypes.LPVOID),
1419 ('InternalHigh', ctypes.wintypes.LPVOID),
1420 ('Offset', ctypes.wintypes.DWORD),
1421 ('OffsetHigh', ctypes.wintypes.DWORD),
1422 ('hEvent', ctypes.wintypes.HANDLE),
1425 kernel32 = ctypes.windll.kernel32
1426 LockFileEx = kernel32.LockFileEx
1427 LockFileEx.argtypes = [
1428 ctypes.wintypes.HANDLE, # hFile
1429 ctypes.wintypes.DWORD, # dwFlags
1430 ctypes.wintypes.DWORD, # dwReserved
1431 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1432 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1433 ctypes.POINTER(OVERLAPPED) # Overlapped
1435 LockFileEx.restype = ctypes.wintypes.BOOL
1436 UnlockFileEx = kernel32.UnlockFileEx
1437 UnlockFileEx.argtypes = [
1438 ctypes.wintypes.HANDLE, # hFile
1439 ctypes.wintypes.DWORD, # dwReserved
1440 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1441 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1442 ctypes.POINTER(OVERLAPPED) # Overlapped
1444 UnlockFileEx.restype = ctypes.wintypes.BOOL
1445 whole_low = 0xffffffff
1446 whole_high = 0x7fffffff
1448 def _lock_file(f, exclusive):
1449 overlapped = OVERLAPPED()
1450 overlapped.Offset = 0
1451 overlapped.OffsetHigh = 0
1452 overlapped.hEvent = 0
1453 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1454 handle = msvcrt.get_osfhandle(f.fileno())
1455 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1456 whole_low, whole_high, f._lock_file_overlapped_p):
1457 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1459 def _unlock_file(f):
1460 assert f._lock_file_overlapped_p
1461 handle = msvcrt.get_osfhandle(f.fileno())
1462 if not UnlockFileEx(handle, 0,
1463 whole_low, whole_high, f._lock_file_overlapped_p):
1464 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1467 # Some platforms, such as Jython, is missing fcntl
1471 def _lock_file(f, exclusive):
1472 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1474 def _unlock_file(f):
1475 fcntl.flock(f, fcntl.LOCK_UN)
1477 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1479 def _lock_file(f, exclusive):
1480 raise IOError(UNSUPPORTED_MSG)
1482 def _unlock_file(f):
1483 raise IOError(UNSUPPORTED_MSG)
1486 class locked_file(object):
1487 def __init__(self, filename, mode, encoding=None):
1488 assert mode in ['r', 'a', 'w']
1489 self.f = io.open(filename, mode, encoding=encoding)
1492 def __enter__(self):
1493 exclusive = self.mode != 'r'
1495 _lock_file(self.f, exclusive)
1501 def __exit__(self, etype, value, traceback):
1503 _unlock_file(self.f)
1510 def write(self, *args):
1511 return self.f.write(*args)
1513 def read(self, *args):
1514 return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to utf-8 when it is unknown."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        enc = 'utf-8'
    return enc
1522 def shell_quote(args):
1524 encoding = get_filesystem_encoding()
1526 if isinstance(a, bytes):
1527 # We may get a filename encoded with 'encodeFilename'
1528 a = a.decode(encoding)
1529 quoted_args.append(pipes.quote(a))
1530 return ' '.join(quoted_args)
1533 def smuggle_url(url, data):
1534 """ Pass additional data in a URL for internal use. """
1536 url, idata = unsmuggle_url(url, {})
1538 sdata = compat_urllib_parse_urlencode(
1539 {'__youtubedl_smuggle': json.dumps(data)})
1540 return url + '#' + sdata
1543 def unsmuggle_url(smug_url, default=None):
1544 if '#__youtubedl_smuggle' not in smug_url:
1545 return smug_url, default
1546 url, _, sdata = smug_url.rpartition('#')
1547 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1548 data = json.loads(jsond)
1552 def format_bytes(bytes):
1555 if type(bytes) is str:
1556 bytes = float(bytes)
1560 exponent = int(math.log(bytes, 1024.0))
1561 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1562 converted = float(bytes) / float(1024 ** exponent)
1563 return '%.2f%s' % (converted, suffix)
1566 def lookup_unit_table(unit_table, s):
1567 units_re = '|'.join(re.escape(u) for u in unit_table)
1569 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
1572 num_str = m.group('num').replace(',', '.')
1573 mult = unit_table[m.group('unit')]
1574 return int(float(num_str) * mult)
1577 def parse_filesize(s):
1581 # The lower-case forms are of course incorrect and unofficial,
1582 # but we support those too
1599 'megabytes': 1000 ** 2,
1600 'mebibytes': 1024 ** 2,
1606 'gigabytes': 1000 ** 3,
1607 'gibibytes': 1024 ** 3,
1613 'terabytes': 1000 ** 4,
1614 'tebibytes': 1024 ** 4,
1620 'petabytes': 1000 ** 5,
1621 'pebibytes': 1024 ** 5,
1627 'exabytes': 1000 ** 6,
1628 'exbibytes': 1024 ** 6,
1634 'zettabytes': 1000 ** 7,
1635 'zebibytes': 1024 ** 7,
1641 'yottabytes': 1000 ** 8,
1642 'yobibytes': 1024 ** 8,
1645 return lookup_unit_table(_UNIT_TABLE, s)
1654 if re.match(r'^[\d,.]+$', s):
1655 return str_to_int(s)
1666 return lookup_unit_table(_UNIT_TABLE, s)
1669 def month_by_name(name, lang='en'):
1670 """ Return the number of a month by (locale-independently) English name """
1672 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
1675 return month_names.index(name) + 1
1680 def month_by_abbreviation(abbrev):
1681 """ Return the number of a month by (locale-independently) English
1685 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1690 def fix_xml_ampersands(xml_str):
1691 """Replace all the '&' by '&' in XML"""
1693 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1698 def setproctitle(title):
1699 assert isinstance(title, compat_str)
1701 # ctypes in Jython is not complete
1702 # http://bugs.jython.org/issue2148
1703 if sys.platform.startswith('java'):
1707 libc = ctypes.cdll.LoadLibrary('libc.so.6')
1711 # LoadLibrary in Windows Python 2.7.13 only expects
1712 # a bytestring, but since unicode_literals turns
1713 # every string into a unicode string, it fails.
1715 title_bytes = title.encode('utf-8')
1716 buf = ctypes.create_string_buffer(len(title_bytes))
1717 buf.value = title_bytes
1719 libc.prctl(15, buf, 0, 0, 0)
1720 except AttributeError:
1721 return # Strange libc, just skip this
def remove_start(s, start):
    """Strip the prefix `start` from s when present; a None s passes through unchanged."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Strip the suffix `end` from s when present; a None s passes through unchanged.

    An empty `end` is treated as a no-op. The previous slice-based one-liner
    returned '' in that case, because s[:-0] is the empty slice — an edge-case
    bug and an inconsistency with remove_start.
    """
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
1732 def remove_quotes(s):
1733 if s is None or len(s) < 2:
1735 for quote in ('"', "'", ):
1736 if s[0] == quote and s[-1] == quote:
def url_basename(url):
    """Return the last component of the URL's path (query and fragment excluded)."""
    path = compat_urlparse.urlparse(url).path
    # rpartition('/')[2] yields everything after the last slash,
    # or the whole (stripped) path when no slash remains.
    return path.strip('/').rpartition('/')[2]
1747 return re.match(r'https?://[^?#&]+/', url).group()
1750 def urljoin(base, path):
1751 if not isinstance(path, compat_str) or not path:
1753 if re.match(r'^(?:https?:)?//', path):
1755 if not isinstance(base, compat_str) or not re.match(r'^(?:https?:)?//', base):
1757 return compat_urlparse.urljoin(base, path)
1760 class HEADRequest(compat_urllib_request.Request):
1761 def get_method(self):
1765 class PUTRequest(compat_urllib_request.Request):
1766 def get_method(self):
1770 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1773 v = getattr(v, get_attr, None)
1779 return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Convert v to compat_str, returning `default` when v is None."""
    if v is None:
        return default
    return compat_str(v)
1788 def str_to_int(int_str):
1789 """ A more relaxed version of int_or_none """
1792 int_str = re.sub(r'[,\.\+]', '', int_str)
1796 def float_or_none(v, scale=1, invscale=1, default=None):
1800 return float(v) * invscale / scale
def strip_or_none(v, default=None):
    """Return v.strip() for a string, or `default` (None by default) when v is None.

    The `default` parameter is a backward-compatible generalization mirroring
    str_or_none, so callers can choose a fallback other than None.
    """
    return default if v is None else v.strip()
1809 def parse_duration(s):
1810 if not isinstance(s, compat_basestring):
1815 days, hours, mins, secs, ms = [None] * 5
1816 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
1818 days, hours, mins, secs, ms = m.groups()
1823 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
1826 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1829 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1832 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1835 days, hours, mins, secs, ms = m.groups()
1837 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
1839 hours, mins = m.groups()
1845 duration += float(secs)
1847 duration += float(mins) * 60
1849 duration += float(hours) * 60 * 60
1851 duration += float(days) * 24 * 60 * 60
1853 duration += float(ms)
1857 def prepend_extension(filename, ext, expected_real_ext=None):
1858 name, real_ext = os.path.splitext(filename)
1860 '{0}.{1}{2}'.format(name, ext, real_ext)
1861 if not expected_real_ext or real_ext[1:] == expected_real_ext
1862 else '{0}.{1}'.format(filename, ext))
1865 def replace_extension(filename, ext, expected_real_ext=None):
1866 name, real_ext = os.path.splitext(filename)
1867 return '{0}.{1}'.format(
1868 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1872 def check_executable(exe, args=[]):
1873 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1874 args can be a list of arguments for a short output (like -version) """
1876 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1882 def get_exe_version(exe, args=['--version'],
1883 version_re=None, unrecognized='present'):
1884 """ Returns the version of the specified executable,
1885 or False if the executable is not present """
1887 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1888 # SIGTTOU if youtube-dl is run in the background.
1889 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
1890 out, _ = subprocess.Popen(
1891 [encodeArgument(exe)] + args,
1892 stdin=subprocess.PIPE,
1893 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1896 if isinstance(out, bytes): # Python 2.x
1897 out = out.decode('ascii', 'ignore')
1898 return detect_exe_version(out, version_re, unrecognized)
1901 def detect_exe_version(output, version_re=None, unrecognized='present'):
1902 assert isinstance(output, compat_str)
1903 if version_re is None:
1904 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1905 m = re.search(version_re, output)
1912 class PagedList(object):
1914 # This is only useful for tests
1915 return len(self.getslice())
1918 class OnDemandPagedList(PagedList):
1919 def __init__(self, pagefunc, pagesize, use_cache=False):
1920 self._pagefunc = pagefunc
1921 self._pagesize = pagesize
1922 self._use_cache = use_cache
1926 def getslice(self, start=0, end=None):
1928 for pagenum in itertools.count(start // self._pagesize):
1929 firstid = pagenum * self._pagesize
1930 nextfirstid = pagenum * self._pagesize + self._pagesize
1931 if start >= nextfirstid:
1936 page_results = self._cache.get(pagenum)
1937 if page_results is None:
1938 page_results = list(self._pagefunc(pagenum))
1940 self._cache[pagenum] = page_results
1943 start % self._pagesize
1944 if firstid <= start < nextfirstid
1948 ((end - 1) % self._pagesize) + 1
1949 if (end is not None and firstid <= end <= nextfirstid)
1952 if startv != 0 or endv is not None:
1953 page_results = page_results[startv:endv]
1954 res.extend(page_results)
1956 # A little optimization - if current page is not "full", ie. does
1957 # not contain page_size videos then we can assume that this page
1958 # is the last one - there are no more ids on further pages -
1959 # i.e. no need to query again.
1960 if len(page_results) + startv < self._pagesize:
1963 # If we got the whole page, but the next page is not interesting,
1964 # break out early as well
1965 if end == nextfirstid:
1970 class InAdvancePagedList(PagedList):
1971 def __init__(self, pagefunc, pagecount, pagesize):
1972 self._pagefunc = pagefunc
1973 self._pagecount = pagecount
1974 self._pagesize = pagesize
1976 def getslice(self, start=0, end=None):
1978 start_page = start // self._pagesize
1980 self._pagecount if end is None else (end // self._pagesize + 1))
1981 skip_elems = start - start_page * self._pagesize
1982 only_more = None if end is None else end - start
1983 for pagenum in range(start_page, end_page):
1984 page = list(self._pagefunc(pagenum))
1986 page = page[skip_elems:]
1988 if only_more is not None:
1989 if len(page) < only_more:
1990 only_more -= len(page)
1992 page = page[:only_more]
1999 def uppercase_escape(s):
2000 unicode_escape = codecs.getdecoder('unicode_escape')
2002 r'\\U[0-9a-fA-F]{8}',
2003 lambda m: unicode_escape(m.group(0))[0],
2007 def lowercase_escape(s):
2008 unicode_escape = codecs.getdecoder('unicode_escape')
2010 r'\\u[0-9a-fA-F]{4}',
2011 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986."""
    # Python 2's quote() needs a byte string, so encode unicode input first.
    need_bytes = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if need_bytes:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2022 def escape_url(url):
2023 """Escape URL as suggested by RFC 3986"""
2024 url_parsed = compat_urllib_parse_urlparse(url)
2025 return url_parsed._replace(
2026 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2027 path=escape_rfc3986(url_parsed.path),
2028 params=escape_rfc3986(url_parsed.params),
2029 query=escape_rfc3986(url_parsed.query),
2030 fragment=escape_rfc3986(url_parsed.fragment)
2034 def read_batch_urls(batch_fd):
2036 if not isinstance(url, compat_str):
2037 url = url.decode('utf-8', 'replace')
2038 BOM_UTF8 = '\xef\xbb\xbf'
2039 if url.startswith(BOM_UTF8):
2040 url = url[len(BOM_UTF8):]
2042 if url.startswith(('#', ';', ']')):
2046 with contextlib.closing(batch_fd) as fd:
2047 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes suitable for a request body."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2054 def update_url_query(url, query):
2057 parsed_url = compat_urlparse.urlparse(url)
2058 qs = compat_parse_qs(parsed_url.query)
2060 return compat_urlparse.urlunparse(parsed_url._replace(
2061 query=compat_urllib_parse_urlencode(qs, True)))
2064 def update_Request(req, url=None, data=None, headers={}, query={}):
2065 req_headers = req.headers.copy()
2066 req_headers.update(headers)
2067 req_data = data or req.data
2068 req_url = update_url_query(url or req.get_full_url(), query)
2069 req_get_method = req.get_method()
2070 if req_get_method == 'HEAD':
2071 req_type = HEADRequest
2072 elif req_get_method == 'PUT':
2073 req_type = PUTRequest
2075 req_type = compat_urllib_request.Request
2077 req_url, data=req_data, headers=req_headers,
2078 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2079 if hasattr(req, 'timeout'):
2080 new_req.timeout = req.timeout
2084 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
2085 if isinstance(key_or_keys, (list, tuple)):
2086 for key in key_or_keys:
2087 if key not in d or d[key] is None or skip_false_values and not d[key]:
2091 return d.get(key_or_keys, default)
2094 def try_get(src, getter, expected_type=None):
2097 except (AttributeError, KeyError, TypeError, IndexError):
2100 if expected_type is None or isinstance(v, expected_type):
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Coerce `string` to compat_str, decoding byte strings with the given encoding."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2117 TV_PARENTAL_GUIDELINES = {
2127 def parse_age_limit(s):
2129 return s if 0 <= s <= 21 else None
2130 if not isinstance(s, compat_basestring):
2132 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2134 return int(m.group('age'))
2136 return US_RATINGS[s]
2137 return TV_PARENTAL_GUIDELINES.get(s)
2140 def strip_jsonp(code):
2142 r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
2145 def js_to_json(code):
2146 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
2147 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
2149 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
2150 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
2155 if v in ('true', 'false', 'null'):
2157 elif v.startswith('/*') or v.startswith('//') or v == ',':
2160 if v[0] in ("'", '"'):
2161 v = re.sub(r'(?s)\\.|"', lambda m: {
2166 }.get(m.group(0), m.group(0)), v[1:-1])
2168 for regex, base in INTEGER_TABLE:
2169 im = re.match(regex, v)
2171 i = int(im.group(1), base)
2172 return '"%d":' % i if v.endswith(':') else '%d' % i
2176 return re.sub(r'''(?sx)
2177 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2178 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2179 {comment}|,(?={skip}[\]}}])|
2180 [a-zA-Z_][.a-zA-Z_0-9]*|
2181 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
2183 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
2186 def qualities(quality_ids):
2187 """ Get a numeric quality value out of a list of possible values """
2190 return quality_ids.index(qid)
2196 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2199 def limit_length(s, length):
2200 """ Add ellipses to overly long strings """
2205 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Parse a version string with '.' and/or '-' separators into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
2213 def is_outdated_version(version, limit, assume_new=True):
2215 return not assume_new
2217 return version_tuple(version) < version_tuple(limit)
2219 return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updatable only when running from a zip bundle or a frozen executable.
    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    """Get a short, shell-quoted string representation of a subprocess command."""
    quoted = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
2234 def error_to_compat_str(err):
2236 # On python 2 error byte string must be decoded with proper
2237 # encoding rather than ascii
2238 if sys.version_info[0] < 3:
2239 err_str = err_str.decode(preferredencoding())
2243 def mimetype2ext(mt):
2249 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2250 # it's the most popular one
2251 'audio/mpeg': 'mp3',
2256 _, _, res = mt.rpartition('/')
2257 res = res.split(';')[0].strip().lower()
2261 'smptett+xml': 'tt',
2267 'x-mp4-fragmented': 'mp4',
2270 'x-mpegurl': 'm3u8',
2271 'vnd.apple.mpegurl': 'm3u8',
2276 'vnd.ms-sstr+xml': 'ism',
2281 def parse_codecs(codecs_str):
2282 # http://tools.ietf.org/html/rfc6381
2285 splited_codecs = list(filter(None, map(
2286 lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
2287 vcodec, acodec = None, None
2288 for full_codec in splited_codecs:
2289 codec = full_codec.split('.')[0]
2290 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
2293 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
2297 write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
2298 if not vcodec and not acodec:
2299 if len(splited_codecs) == 2:
2304 elif len(splited_codecs) == 1:
2311 'vcodec': vcodec or 'none',
2312 'acodec': acodec or 'none',
2317 def urlhandle_detect_ext(url_handle):
2318 getheader = url_handle.headers.get
2320 cd = getheader('Content-Disposition')
2322 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2324 e = determine_ext(m.group('filename'), default_ext=None)
2328 return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Encode raw bytes as a base64 `data:` URI with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2335 def age_restricted(content_limit, age_limit):
2336 """ Returns True iff the content should be blocked """
2338 if age_limit is None: # No limit set
2340 if content_limit is None:
2341 return False # Content available for everyone
2342 return age_limit < content_limit
2345 def is_html(first_bytes):
2346 """ Detect whether a file contains HTML by examining its first bytes. """
2349 (b'\xef\xbb\xbf', 'utf-8'),
2350 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2351 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2352 (b'\xff\xfe', 'utf-16-le'),
2353 (b'\xfe\xff', 'utf-16-be'),
2355 for bom, enc in BOMS:
2356 if first_bytes.startswith(bom):
2357 s = first_bytes[len(bom):].decode(enc, 'replace')
2360 s = first_bytes.decode('utf-8', 'replace')
2362 return re.match(r'^\s*<', s)
2365 def determine_protocol(info_dict):
2366 protocol = info_dict.get('protocol')
2367 if protocol is not None:
2370 url = info_dict['url']
2371 if url.startswith('rtmp'):
2373 elif url.startswith('mms'):
2375 elif url.startswith('rtsp'):
2378 ext = determine_ext(url)
2384 return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Width of each column is the widest cell (as text) in that column.
    widths = []
    for column in zip(*rows):
        widths.append(max(len(compat_str(cell)) for cell in column))
    # Left-pad every column but the last to its width + 1.
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
2395 def _match_one(filter_part, dct):
2396 COMPARISON_OPERATORS = {
2404 operator_rex = re.compile(r'''(?x)\s*
2406 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2408 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2409 (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
2410 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2413 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2414 m = operator_rex.search(filter_part)
2416 op = COMPARISON_OPERATORS[m.group('op')]
2417 actual_value = dct.get(m.group('key'))
2418 if (m.group('quotedstrval') is not None or
2419 m.group('strval') is not None or
2420 # If the original field is a string and matching comparisonvalue is
2421 # a number we should respect the origin of the original field
2422 # and process comparison value as a string (see
2423 # https://github.com/rg3/youtube-dl/issues/11082).
2424 actual_value is not None and m.group('intval') is not None and
2425 isinstance(actual_value, compat_str)):
2426 if m.group('op') not in ('=', '!='):
2428 'Operator %s does not support string values!' % m.group('op'))
2429 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
2430 quote = m.group('quote')
2431 if quote is not None:
2432 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
2435 comparison_value = int(m.group('intval'))
2437 comparison_value = parse_filesize(m.group('intval'))
2438 if comparison_value is None:
2439 comparison_value = parse_filesize(m.group('intval') + 'B')
2440 if comparison_value is None:
2442 'Invalid integer value %r in filter part %r' % (
2443 m.group('intval'), filter_part))
2444 if actual_value is None:
2445 return m.group('none_inclusive')
2446 return op(actual_value, comparison_value)
2449 '': lambda v: v is not None,
2450 '!': lambda v: v is None,
2452 operator_rex = re.compile(r'''(?x)\s*
2453 (?P<op>%s)\s*(?P<key>[a-z_]+)
2455 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2456 m = operator_rex.search(filter_part)
2458 op = UNARY_OPERATORS[m.group('op')]
2459 actual_value = dct.get(m.group('key'))
2460 return op(actual_value)
2462 raise ValueError('Invalid filter part %r' % filter_part)
2465 def match_str(filter_str, dct):
2466 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2469 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2472 def match_filter_func(filter_str):
2473 def _match_func(info_dict):
2474 if match_str(filter_str, info_dict):
2477 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2478 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2482 def parse_dfxp_time_expr(time_expr):
2486 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2488 return float(mobj.group('time_offset'))
2490 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
2492 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a second offset as an SRT timecode (HH:MM:SS,mmm)."""
    hrs = seconds / 3600
    mins = (seconds % 3600) / 60
    secs = seconds % 60
    msecs = (seconds % 1) * 1000
    # %d truncates the float components, matching integer-division semantics.
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, msecs)
2499 def dfxp2srt(dfxp_data):
2500 _x = functools.partial(xpath_with_ns, ns_map={
2501 'ttml': 'http://www.w3.org/ns/ttml',
2502 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
2503 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
2506 class TTMLPElementParser(object):
2509 def start(self, tag, attrib):
2510 if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2516 def data(self, data):
2520 return self.out.strip()
2522 def parse_node(node):
2523 target = TTMLPElementParser()
2524 parser = xml.etree.ElementTree.XMLParser(target=target)
2525 parser.feed(xml.etree.ElementTree.tostring(node))
2526 return parser.close()
2528 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
2530 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
2533 raise ValueError('Invalid dfxp/TTML subtitle')
2535 for para, index in zip(paras, itertools.count(1)):
2536 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
2537 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
2538 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2539 if begin_time is None:
2544 end_time = begin_time + dur
2545 out.append('%d\n%s --> %s\n%s\n\n' % (
2547 srt_subtitles_timecode(begin_time),
2548 srt_subtitles_timecode(end_time),
2554 def cli_option(params, command_option, param):
2555 param = params.get(param)
2557 param = compat_str(param)
2558 return [command_option, param] if param is not None else []
2561 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2562 param = params.get(param)
2563 assert isinstance(param, bool)
2565 return [command_option + separator + (true_value if param else false_value)]
2566 return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else []."""
    value = params.get(param)
    if value == expected_value:
        return [command_option]
    return []
2574 def cli_configuration_args(params, param, default=[]):
2575 ex_args = params.get(param)
2578 assert isinstance(ex_args, list)
2582 class ISO639Utils(object):
2583 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2772 def short2long(cls, code):
2773 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2774 return cls._lang_map.get(code[:2])
2777 def long2short(cls, code):
2778 """Convert language code from ISO 639-2/T to ISO 639-1"""
2779 for short_name, long_name in cls._lang_map.items():
2780 if long_name == code:
2784 class ISO3166Utils(object):
2785 # From http://data.okfn.org/data/core/country-list
2787 'AF': 'Afghanistan',
2788 'AX': 'Åland Islands',
2791 'AS': 'American Samoa',
2796 'AG': 'Antigua and Barbuda',
2813 'BO': 'Bolivia, Plurinational State of',
2814 'BQ': 'Bonaire, Sint Eustatius and Saba',
2815 'BA': 'Bosnia and Herzegovina',
2817 'BV': 'Bouvet Island',
2819 'IO': 'British Indian Ocean Territory',
2820 'BN': 'Brunei Darussalam',
2822 'BF': 'Burkina Faso',
2828 'KY': 'Cayman Islands',
2829 'CF': 'Central African Republic',
2833 'CX': 'Christmas Island',
2834 'CC': 'Cocos (Keeling) Islands',
2838 'CD': 'Congo, the Democratic Republic of the',
2839 'CK': 'Cook Islands',
2841 'CI': 'Côte d\'Ivoire',
2846 'CZ': 'Czech Republic',
2850 'DO': 'Dominican Republic',
2853 'SV': 'El Salvador',
2854 'GQ': 'Equatorial Guinea',
2858 'FK': 'Falkland Islands (Malvinas)',
2859 'FO': 'Faroe Islands',
2863 'GF': 'French Guiana',
2864 'PF': 'French Polynesia',
2865 'TF': 'French Southern Territories',
2880 'GW': 'Guinea-Bissau',
2883 'HM': 'Heard Island and McDonald Islands',
2884 'VA': 'Holy See (Vatican City State)',
2891 'IR': 'Iran, Islamic Republic of',
2894 'IM': 'Isle of Man',
2904 'KP': 'Korea, Democratic People\'s Republic of',
2905 'KR': 'Korea, Republic of',
2908 'LA': 'Lao People\'s Democratic Republic',
2914 'LI': 'Liechtenstein',
2918 'MK': 'Macedonia, the Former Yugoslav Republic of',
2925 'MH': 'Marshall Islands',
2931 'FM': 'Micronesia, Federated States of',
2932 'MD': 'Moldova, Republic of',
2943 'NL': 'Netherlands',
2944 'NC': 'New Caledonia',
2945 'NZ': 'New Zealand',
2950 'NF': 'Norfolk Island',
2951 'MP': 'Northern Mariana Islands',
2956 'PS': 'Palestine, State of',
2958 'PG': 'Papua New Guinea',
2961 'PH': 'Philippines',
2965 'PR': 'Puerto Rico',
2969 'RU': 'Russian Federation',
2971 'BL': 'Saint Barthélemy',
2972 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2973 'KN': 'Saint Kitts and Nevis',
2974 'LC': 'Saint Lucia',
2975 'MF': 'Saint Martin (French part)',
2976 'PM': 'Saint Pierre and Miquelon',
2977 'VC': 'Saint Vincent and the Grenadines',
2980 'ST': 'Sao Tome and Principe',
2981 'SA': 'Saudi Arabia',
2985 'SL': 'Sierra Leone',
2987 'SX': 'Sint Maarten (Dutch part)',
2990 'SB': 'Solomon Islands',
2992 'ZA': 'South Africa',
2993 'GS': 'South Georgia and the South Sandwich Islands',
2994 'SS': 'South Sudan',
2999 'SJ': 'Svalbard and Jan Mayen',
3002 'CH': 'Switzerland',
3003 'SY': 'Syrian Arab Republic',
3004 'TW': 'Taiwan, Province of China',
3006 'TZ': 'Tanzania, United Republic of',
3008 'TL': 'Timor-Leste',
3012 'TT': 'Trinidad and Tobago',
3015 'TM': 'Turkmenistan',
3016 'TC': 'Turks and Caicos Islands',
3020 'AE': 'United Arab Emirates',
3021 'GB': 'United Kingdom',
3022 'US': 'United States',
3023 'UM': 'United States Minor Outlying Islands',
3027 'VE': 'Venezuela, Bolivarian Republic of',
3029 'VG': 'Virgin Islands, British',
3030 'VI': 'Virgin Islands, U.S.',
3031 'WF': 'Wallis and Futuna',
3032 'EH': 'Western Sahara',
def short2full(cls, code):
    """Convert an ISO 3166-2 country code to the corresponding full name"""
    # Lookup is case-insensitive: keys in _country_map are upper-case,
    # so normalize first; unknown codes yield None.
    normalized = code.upper()
    return cls._country_map.get(normalized)
3044 class GeoUtils(object):
3045 # Major IPv4 address blocks per country
3047 'AD': '85.94.160.0/19',
3048 'AE': '94.200.0.0/13',
3049 'AF': '149.54.0.0/17',
3050 'AG': '209.59.64.0/18',
3051 'AI': '204.14.248.0/21',
3052 'AL': '46.99.0.0/16',
3053 'AM': '46.70.0.0/15',
3054 'AO': '105.168.0.0/13',
3055 'AP': '159.117.192.0/21',
3056 'AR': '181.0.0.0/12',
3057 'AS': '202.70.112.0/20',
3058 'AT': '84.112.0.0/13',
3059 'AU': '1.128.0.0/11',
3060 'AW': '181.41.0.0/18',
3061 'AZ': '5.191.0.0/16',
3062 'BA': '31.176.128.0/17',
3063 'BB': '65.48.128.0/17',
3064 'BD': '114.130.0.0/16',
3066 'BF': '129.45.128.0/17',
3067 'BG': '95.42.0.0/15',
3068 'BH': '37.131.0.0/17',
3069 'BI': '154.117.192.0/18',
3070 'BJ': '137.255.0.0/16',
3071 'BL': '192.131.134.0/24',
3072 'BM': '196.12.64.0/18',
3073 'BN': '156.31.0.0/16',
3074 'BO': '161.56.0.0/16',
3075 'BQ': '161.0.80.0/20',
3076 'BR': '152.240.0.0/12',
3077 'BS': '24.51.64.0/18',
3078 'BT': '119.2.96.0/19',
3079 'BW': '168.167.0.0/16',
3080 'BY': '178.120.0.0/13',
3081 'BZ': '179.42.192.0/18',
3082 'CA': '99.224.0.0/11',
3083 'CD': '41.243.0.0/16',
3084 'CF': '196.32.200.0/21',
3085 'CG': '197.214.128.0/17',
3086 'CH': '85.0.0.0/13',
3087 'CI': '154.232.0.0/14',
3088 'CK': '202.65.32.0/19',
3089 'CL': '152.172.0.0/14',
3090 'CM': '165.210.0.0/15',
3091 'CN': '36.128.0.0/10',
3092 'CO': '181.240.0.0/12',
3093 'CR': '201.192.0.0/12',
3094 'CU': '152.206.0.0/15',
3095 'CV': '165.90.96.0/19',
3096 'CW': '190.88.128.0/17',
3097 'CY': '46.198.0.0/15',
3098 'CZ': '88.100.0.0/14',
3100 'DJ': '197.241.0.0/17',
3101 'DK': '87.48.0.0/12',
3102 'DM': '192.243.48.0/20',
3103 'DO': '152.166.0.0/15',
3104 'DZ': '41.96.0.0/12',
3105 'EC': '186.68.0.0/15',
3106 'EE': '90.190.0.0/15',
3107 'EG': '156.160.0.0/11',
3108 'ER': '196.200.96.0/20',
3109 'ES': '88.0.0.0/11',
3110 'ET': '196.188.0.0/14',
3111 'EU': '2.16.0.0/13',
3112 'FI': '91.152.0.0/13',
3113 'FJ': '144.120.0.0/16',
3114 'FM': '119.252.112.0/20',
3115 'FO': '88.85.32.0/19',
3117 'GA': '41.158.0.0/15',
3119 'GD': '74.122.88.0/21',
3120 'GE': '31.146.0.0/16',
3121 'GF': '161.22.64.0/18',
3122 'GG': '62.68.160.0/19',
3123 'GH': '45.208.0.0/14',
3124 'GI': '85.115.128.0/19',
3125 'GL': '88.83.0.0/19',
3126 'GM': '160.182.0.0/15',
3127 'GN': '197.149.192.0/18',
3128 'GP': '104.250.0.0/19',
3129 'GQ': '105.235.224.0/20',
3130 'GR': '94.64.0.0/13',
3131 'GT': '168.234.0.0/16',
3132 'GU': '168.123.0.0/16',
3133 'GW': '197.214.80.0/20',
3134 'GY': '181.41.64.0/18',
3135 'HK': '113.252.0.0/14',
3136 'HN': '181.210.0.0/16',
3137 'HR': '93.136.0.0/13',
3138 'HT': '148.102.128.0/17',
3139 'HU': '84.0.0.0/14',
3140 'ID': '39.192.0.0/10',
3141 'IE': '87.32.0.0/12',
3142 'IL': '79.176.0.0/13',
3143 'IM': '5.62.80.0/20',
3144 'IN': '117.192.0.0/10',
3145 'IO': '203.83.48.0/21',
3146 'IQ': '37.236.0.0/14',
3147 'IR': '2.176.0.0/12',
3148 'IS': '82.221.0.0/16',
3149 'IT': '79.0.0.0/10',
3150 'JE': '87.244.64.0/18',
3151 'JM': '72.27.0.0/17',
3152 'JO': '176.29.0.0/16',
3153 'JP': '126.0.0.0/8',
3154 'KE': '105.48.0.0/12',
3155 'KG': '158.181.128.0/17',
3156 'KH': '36.37.128.0/17',
3157 'KI': '103.25.140.0/22',
3158 'KM': '197.255.224.0/20',
3159 'KN': '198.32.32.0/19',
3160 'KP': '175.45.176.0/22',
3161 'KR': '175.192.0.0/10',
3162 'KW': '37.36.0.0/14',
3163 'KY': '64.96.0.0/15',
3164 'KZ': '2.72.0.0/13',
3165 'LA': '115.84.64.0/18',
3166 'LB': '178.135.0.0/16',
3167 'LC': '192.147.231.0/24',
3168 'LI': '82.117.0.0/19',
3169 'LK': '112.134.0.0/15',
3170 'LR': '41.86.0.0/19',
3171 'LS': '129.232.0.0/17',
3172 'LT': '78.56.0.0/13',
3173 'LU': '188.42.0.0/16',
3174 'LV': '46.109.0.0/16',
3175 'LY': '41.252.0.0/14',
3176 'MA': '105.128.0.0/11',
3177 'MC': '88.209.64.0/18',
3178 'MD': '37.246.0.0/16',
3179 'ME': '178.175.0.0/17',
3180 'MF': '74.112.232.0/21',
3181 'MG': '154.126.0.0/17',
3182 'MH': '117.103.88.0/21',
3183 'MK': '77.28.0.0/15',
3184 'ML': '154.118.128.0/18',
3185 'MM': '37.111.0.0/17',
3186 'MN': '49.0.128.0/17',
3187 'MO': '60.246.0.0/16',
3188 'MP': '202.88.64.0/20',
3189 'MQ': '109.203.224.0/19',
3190 'MR': '41.188.64.0/18',
3191 'MS': '208.90.112.0/22',
3192 'MT': '46.11.0.0/16',
3193 'MU': '105.16.0.0/12',
3194 'MV': '27.114.128.0/18',
3195 'MW': '105.234.0.0/16',
3196 'MX': '187.192.0.0/11',
3197 'MY': '175.136.0.0/13',
3198 'MZ': '197.218.0.0/15',
3199 'NA': '41.182.0.0/16',
3200 'NC': '101.101.0.0/18',
3201 'NE': '197.214.0.0/18',
3202 'NF': '203.17.240.0/22',
3203 'NG': '105.112.0.0/12',
3204 'NI': '186.76.0.0/15',
3205 'NL': '145.96.0.0/11',
3206 'NO': '84.208.0.0/13',
3207 'NP': '36.252.0.0/15',
3208 'NR': '203.98.224.0/19',
3209 'NU': '49.156.48.0/22',
3210 'NZ': '49.224.0.0/14',
3211 'OM': '5.36.0.0/15',
3212 'PA': '186.72.0.0/15',
3213 'PE': '186.160.0.0/14',
3214 'PF': '123.50.64.0/18',
3215 'PG': '124.240.192.0/19',
3216 'PH': '49.144.0.0/13',
3217 'PK': '39.32.0.0/11',
3218 'PL': '83.0.0.0/11',
3219 'PM': '70.36.0.0/20',
3220 'PR': '66.50.0.0/16',
3221 'PS': '188.161.0.0/16',
3222 'PT': '85.240.0.0/13',
3223 'PW': '202.124.224.0/20',
3224 'PY': '181.120.0.0/14',
3225 'QA': '37.210.0.0/15',
3226 'RE': '139.26.0.0/16',
3227 'RO': '79.112.0.0/13',
3228 'RS': '178.220.0.0/14',
3229 'RU': '5.136.0.0/13',
3230 'RW': '105.178.0.0/15',
3231 'SA': '188.48.0.0/13',
3232 'SB': '202.1.160.0/19',
3233 'SC': '154.192.0.0/11',
3234 'SD': '154.96.0.0/13',
3235 'SE': '78.64.0.0/12',
3236 'SG': '152.56.0.0/14',
3237 'SI': '188.196.0.0/14',
3238 'SK': '78.98.0.0/15',
3239 'SL': '197.215.0.0/17',
3240 'SM': '89.186.32.0/19',
3241 'SN': '41.82.0.0/15',
3242 'SO': '197.220.64.0/19',
3243 'SR': '186.179.128.0/17',
3244 'SS': '105.235.208.0/21',
3245 'ST': '197.159.160.0/19',
3246 'SV': '168.243.0.0/16',
3247 'SX': '190.102.0.0/20',
3249 'SZ': '41.84.224.0/19',
3250 'TC': '65.255.48.0/20',
3251 'TD': '154.68.128.0/19',
3252 'TG': '196.168.0.0/14',
3253 'TH': '171.96.0.0/13',
3254 'TJ': '85.9.128.0/18',
3255 'TK': '27.96.24.0/21',
3256 'TL': '180.189.160.0/20',
3257 'TM': '95.85.96.0/19',
3258 'TN': '197.0.0.0/11',
3259 'TO': '175.176.144.0/21',
3260 'TR': '78.160.0.0/11',
3261 'TT': '186.44.0.0/15',
3262 'TV': '202.2.96.0/19',
3263 'TW': '120.96.0.0/11',
3264 'TZ': '156.156.0.0/14',
3265 'UA': '93.72.0.0/13',
3266 'UG': '154.224.0.0/13',
3268 'UY': '167.56.0.0/13',
3269 'UZ': '82.215.64.0/18',
3270 'VA': '212.77.0.0/19',
3271 'VC': '24.92.144.0/20',
3272 'VE': '186.88.0.0/13',
3273 'VG': '172.103.64.0/18',
3274 'VI': '146.226.0.0/16',
3275 'VN': '14.160.0.0/11',
3276 'VU': '202.80.32.0/20',
3277 'WF': '117.20.32.0/21',
3278 'WS': '202.4.32.0/19',
3279 'YE': '134.35.0.0/16',
3280 'YT': '41.242.116.0/22',
3281 'ZA': '41.0.0.0/11',
3282 'ZM': '165.56.0.0/13',
3283 'ZW': '41.85.192.0/19',
3287 def random_ipv4(cls, code):
3288 block = cls._country_ip_map.get(code.upper())
3291 addr, preflen = block.split('/')
3292 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
3293 addr_max = addr_min | (0xffffffff >> int(preflen))
3294 return compat_str(socket.inet_ntoa(
3295 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
3298 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
def __init__(self, proxies=None):
    """Install default http/https open handlers that delegate to proxy_open.

    proxies: optional proxy mapping, passed through to the base ProxyHandler.
    """
    # Register a default handler per scheme; each forwards to proxy_open.
    # The scheme and the bound method are captured via default arguments so
    # the lambdas do not suffer from the late-binding-closure pitfall inside
    # the loop.  The loop variable is named 'scheme' (not 'type') to avoid
    # shadowing the builtin.
    for scheme in ('http', 'https'):
        setattr(self, '%s_open' % scheme,
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
    # An initializer should not return a value; invoke the base initializer
    # plainly instead of 'return base.__init__(...)'.
    compat_urllib_request.ProxyHandler.__init__(self, proxies)
3307 def proxy_open(self, req, proxy, type):
3308 req_proxy = req.headers.get('Ytdl-request-proxy')
3309 if req_proxy is not None:
3311 del req.headers['Ytdl-request-proxy']
3313 if proxy == '__noproxy__':
3314 return None # No Proxy
3315 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
3316 req.add_header('Ytdl-socks-proxy', proxy)
3317 # youtube-dl's http/https handlers do wrapping the socket with socks
3319 return compat_urllib_request.ProxyHandler.proxy_open(
3320 self, req, proxy, type)
3323 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3324 # released into Public Domain
3325 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3327 def long_to_bytes(n, blocksize=0):
3328 """long_to_bytes(n:long, blocksize:int) : string
3329 Convert a long integer to a byte string.
3331 If optional blocksize is given and greater than zero, pad the front of the
3332 byte string with binary zeros so that the length is a multiple of
3335 # after much testing, this algorithm was deemed to be the fastest
3339 s = compat_struct_pack('>I', n & 0xffffffff) + s
3341 # strip off leading zeros
3342 for i in range(len(s)):
3343 if s[i] != b'\000'[0]:
3346 # only happens when n == 0
3350 # add back some pad bytes. this could be done more efficiently w.r.t. the
3351 # de-padding being done above, but sigh...
3352 if blocksize > 0 and len(s) % blocksize:
3353 s = (blocksize - len(s) % blocksize) * b'\000' + s
3357 def bytes_to_long(s):
3358 """bytes_to_long(string) : long
3359 Convert a byte string to a long integer.
3361 This is (essentially) the inverse of long_to_bytes().
3366 extra = (4 - length % 4)
3367 s = b'\000' * extra + s
3368 length = length + extra
3369 for i in range(0, length, 4):
3370 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # Reversing the bytes and reading the hex digits big-endian interprets
    # the original data as a little-endian integer.
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    # Textbook RSA: payload ** exponent mod modulus, rendered as lowercase hex.
    return '%x' % pow(payload, exponent, modulus)
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # PKCS#1 v1.5 (RFC 2313, section 8.1) requires the padding string PS to
    # consist of *nonzero* pseudo-random octets: a zero byte would be taken
    # for the 0x00 separator and truncate the message on decryption.  Draw
    # from 1..255 rather than 0..254.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    # Encryption block layout: 00 | 02 | PS | 00 | data
    return [0, 2] + pseudo_random + [0] + data
3406 def encode_base_n(num, n, table=None):
3407 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
3409 table = FULL_TABLE[:n]
3412 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
3419 ret = table[num % n] + ret
3424 def decode_packed_codes(code):
3425 mobj = re.search(PACKED_CODES_RE, code)
3426 obfucasted_code, base, count, symbols = mobj.groups()
3429 symbols = symbols.split('|')
3434 base_n_count = encode_base_n(count, base)
3435 symbol_table[base_n_count] = symbols[count] or base_n_count
3438 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
3442 def parse_m3u8_attributes(attrib):
3444 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
3445 if val.startswith('"'):
def urshift(val, n):
    """Logical (unsigned) right shift of a 32-bit value by n bits."""
    # Python's >> is arithmetic; map a negative value onto its unsigned
    # 32-bit representation first so the shift fills with zeros.
    if val < 0:
        val += 0x100000000
    return val >> n
3455 # Based on png2str() written by @gdkchan and improved by @yokrysty
3456 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
3457 def decode_png(png_data):
3458 # Reference: https://www.w3.org/TR/PNG/
3459 header = png_data[8:]
3461 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
3462 raise IOError('Not a valid PNG file.')
3464 int_map = {1: '>B', 2: '>H', 4: '>I'}
3465 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
3470 length = unpack_integer(header[:4])
3473 chunk_type = header[:4]
3476 chunk_data = header[:length]
3477 header = header[length:]
3479 header = header[4:] # Skip CRC
3487 ihdr = chunks[0]['data']
3489 width = unpack_integer(ihdr[:4])
3490 height = unpack_integer(ihdr[4:8])
3494 for chunk in chunks:
3495 if chunk['type'] == b'IDAT':
3496 idat += chunk['data']
3499 raise IOError('Unable to read PNG data.')
3501 decompressed_data = bytearray(zlib.decompress(idat))
3506 def _get_pixel(idx):
3511 for y in range(height):
3512 basePos = y * (1 + stride)
3513 filter_type = decompressed_data[basePos]
3517 pixels.append(current_row)
3519 for x in range(stride):
3520 color = decompressed_data[1 + basePos + x]
3521 basex = y * stride + x
3526 left = _get_pixel(basex - 3)
3528 up = _get_pixel(basex - stride)
3530 if filter_type == 1: # Sub
3531 color = (color + left) & 0xff
3532 elif filter_type == 2: # Up
3533 color = (color + up) & 0xff
3534 elif filter_type == 3: # Average
3535 color = (color + ((left + up) >> 1)) & 0xff
3536 elif filter_type == 4: # Paeth
3542 c = _get_pixel(basex - stride - 3)
3550 if pa <= pb and pa <= pc:
3551 color = (color + a) & 0xff
3553 color = (color + b) & 0xff
3555 color = (color + c) & 0xff
3557 current_row.append(color)
3559 return width, height, pixels
3562 def write_xattr(path, key, value):
3563 # This mess below finds the best xattr tool for the job
3565 # try the pyxattr module...
3568 if hasattr(xattr, 'set'): # pyxattr
3569 # Unicode arguments are not supported in python-pyxattr until
3571 # See https://github.com/rg3/youtube-dl/issues/5498
3572 pyxattr_required_version = '0.5.0'
3573 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
3574 # TODO: fallback to CLI tools
3575 raise XAttrUnavailableError(
3576 'python-pyxattr is detected but is too old. '
3577 'youtube-dl requires %s or above while your version is %s. '
3578 'Falling back to other xattr implementations' % (
3579 pyxattr_required_version, xattr.__version__))
3581 setxattr = xattr.set
3583 setxattr = xattr.setxattr
3586 setxattr(path, key, value)
3587 except EnvironmentError as e:
3588 raise XAttrMetadataError(e.errno, e.strerror)
3591 if compat_os_name == 'nt':
3592 # Write xattrs to NTFS Alternate Data Streams:
3593 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
3594 assert ':' not in key
3595 assert os.path.exists(path)
3597 ads_fn = path + ':' + key
3599 with open(ads_fn, 'wb') as f:
3601 except EnvironmentError as e:
3602 raise XAttrMetadataError(e.errno, e.strerror)
3604 user_has_setfattr = check_executable('setfattr', ['--version'])
3605 user_has_xattr = check_executable('xattr', ['-h'])
3607 if user_has_setfattr or user_has_xattr:
3609 value = value.decode('utf-8')
3610 if user_has_setfattr:
3611 executable = 'setfattr'
3612 opts = ['-n', key, '-v', value]
3613 elif user_has_xattr:
3614 executable = 'xattr'
3615 opts = ['-w', key, value]
3617 cmd = ([encodeFilename(executable, True)] +
3618 [encodeArgument(o) for o in opts] +
3619 [encodeFilename(path, True)])
3622 p = subprocess.Popen(
3623 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
3624 except EnvironmentError as e:
3625 raise XAttrMetadataError(e.errno, e.strerror)
3626 stdout, stderr = p.communicate()
3627 stderr = stderr.decode('utf-8', 'replace')
3628 if p.returncode != 0:
3629 raise XAttrMetadataError(p.returncode, stderr)
3632 # On Unix, and can't find pyxattr, setfattr, or xattr.
3633 if sys.platform.startswith('linux'):
3634 raise XAttrUnavailableError(
3635 "Couldn't find a tool to set the xattrs. "
3636 "Install either the python 'pyxattr' or 'xattr' "
3637 "modules, or the GNU 'attr' package "
3638 "(which contains the 'setfattr' tool).")
3640 raise XAttrUnavailableError(
3641 "Couldn't find a tool to set the xattrs. "
3642 "Install either the python 'xattr' module, "
3643 "or the 'xattr' binary.")