_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import contextlib
  11 import ctypes
  12 import datetime
  13 import email.utils
  14 import errno
  15 import functools
  16 import gzip
  17 import io
  18 import itertools
  19 import json
  20 import locale
  21 import math
  22 import operator
  23 import os
  24 import pipes
  25 import platform
  26 import random
  27 import re
  28 import socket
  29 import ssl
  30 import subprocess
  31 import sys
  32 import tempfile
  33 import traceback
  34 import xml.etree.ElementTree
  35 import zlib
  36
  37 from .compat import (
  38     compat_HTMLParser,
  39     compat_basestring,
  40     compat_chr,
  41     compat_etree_fromstring,
  42     compat_html_entities,
  43     compat_html_entities_html5,
  44     compat_http_client,
  45     compat_kwargs,
  46     compat_os_name,
  47     compat_parse_qs,
  48     compat_shlex_quote,
  49     compat_socket_create_connection,
  50     compat_str,
  51     compat_struct_pack,
  52     compat_struct_unpack,
  53     compat_urllib_error,
  54     compat_urllib_parse,
  55     compat_urllib_parse_urlencode,
  56     compat_urllib_parse_urlparse,
  57     compat_urllib_parse_unquote_plus,
  58     compat_urllib_request,
  59     compat_urlparse,
  60     compat_xpath,
  61 )
  62
  63 from .socks import (
  64     ProxyType,
  65     sockssocket,
  66 )
  67
  68
  69 def register_socks_protocols():
  70     # "Register" SOCKS protocols
  71     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  72     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  73     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  74         if scheme not in compat_urlparse.uses_netloc:
  75             compat_urlparse.uses_netloc.append(scheme)
  76
  77
# This is not clearly defined otherwise, so the type is taken from an actual
# compiled pattern object
compiled_regex_type = type(re.compile(''))

# Standard HTTP request headers
# NOTE(review): presumably the "standard headers" that YoutubeDLHandler adds
# to every request -- confirm against the code that consumes this dict
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Alternative User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Unique sentinel object, compared by identity, meaning "no default value
# was supplied" (lets None be passed as a real default)
NO_DEFAULT = object()
  96
# English month names, ordered January to December
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names keyed by language code; every list is ordered January to
# December
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# Known file extensions, written without the leading dot
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement.  The two sequences
# are paired positionally by zip(); replacements longer than one character
# ('AE', 'OE', 'ss', 'ae', 'oe') are wrapped in lists so that
# itertools.chain yields each of them as a single item.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
 127
 128 DATE_FORMATS = (
 129     '%d %B %Y',
 130     '%d %b %Y',
 131     '%B %d %Y',
 132     '%B %dst %Y',
 133     '%B %dnd %Y',
 134     '%B %dth %Y',
 135     '%b %d %Y',
 136     '%b %dst %Y',
 137     '%b %dnd %Y',
 138     '%b %dth %Y',
 139     '%b %dst %Y %I:%M',
 140     '%b %dnd %Y %I:%M',
 141     '%b %dth %Y %I:%M',
 142     '%Y %m %d',
 143     '%Y-%m-%d',
 144     '%Y/%m/%d',
 145     '%Y/%m/%d %H:%M',
 146     '%Y/%m/%d %H:%M:%S',
 147     '%Y-%m-%d %H:%M',
 148     '%Y-%m-%d %H:%M:%S',
 149     '%Y-%m-%d %H:%M:%S.%f',
 150     '%d.%m.%Y %H:%M',
 151     '%d.%m.%Y %H.%M',
 152     '%Y-%m-%dT%H:%M:%SZ',
 153     '%Y-%m-%dT%H:%M:%S.%fZ',
 154     '%Y-%m-%dT%H:%M:%S.%f0Z',
 155     '%Y-%m-%dT%H:%M:%S',
 156     '%Y-%m-%dT%H:%M:%S.%f',
 157     '%Y-%m-%dT%H:%M',
 158     '%b %d %Y at %H:%M',
 159     '%b %d %Y at %H:%M:%S',
 160 )
 161
 162 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 163 DATE_FORMATS_DAY_FIRST.extend([
 164     '%d-%m-%Y',
 165     '%d.%m.%Y',
 166     '%d.%m.%y',
 167     '%d/%m/%Y',
 168     '%d/%m/%y',
 169     '%d/%m/%Y %H:%M:%S',
 170 ])
 171
 172 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 173 DATE_FORMATS_MONTH_FIRST.extend([
 174     '%m-%d-%Y',
 175     '%m.%d.%Y',
 176     '%m/%d/%Y',
 177     '%m/%d/%y',
 178     '%m/%d/%Y %H:%M:%S',
 179 ])
 180
 181 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
 182
 183
 184 def preferredencoding():
 185     """Get preferred encoding.
 186
 187     Returns the best encoding scheme for the system, based on
 188     locale.getpreferredencoding() and some further tweaks.
 189     """
 190     try:
 191         pref = locale.getpreferredencoding()
 192         'TEST'.encode(pref)
 193     except Exception:
 194         pref = 'UTF-8'
 195
 196     return pref
 197
 198
 199 def write_json_file(obj, fn):
 200     """ Encode obj as JSON and write it to fn, atomically if possible """
 201
 202     fn = encodeFilename(fn)
 203     if sys.version_info < (3, 0) and sys.platform != 'win32':
 204         encoding = get_filesystem_encoding()
 205         # os.path.basename returns a bytes object, but NamedTemporaryFile
 206         # will fail if the filename contains non ascii characters unless we
 207         # use a unicode object
 208         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 209         # the same for os.path.dirname
 210         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 211     else:
 212         path_basename = os.path.basename
 213         path_dirname = os.path.dirname
 214
 215     args = {
 216         'suffix': '.tmp',
 217         'prefix': path_basename(fn) + '.',
 218         'dir': path_dirname(fn),
 219         'delete': False,
 220     }
 221
 222     # In Python 2.x, json.dump expects a bytestream.
 223     # In Python 3.x, it writes to a character stream
 224     if sys.version_info < (3, 0):
 225         args['mode'] = 'wb'
 226     else:
 227         args.update({
 228             'mode': 'w',
 229             'encoding': 'utf-8',
 230         })
 231
 232     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 233
 234     try:
 235         with tf:
 236             json.dump(obj, tf)
 237         if sys.platform == 'win32':
 238             # Need to remove existing file on Windows, else os.rename raises
 239             # WindowsError or FileExistsError.
 240             try:
 241                 os.unlink(fn)
 242             except OSError:
 243                 pass
 244         os.rename(tf.name, fn)
 245     except Exception:
 246         try:
 247             os.remove(tf.name)
 248         except OSError:
 249             pass
 250         raise
 251
 252
 253 if sys.version_info >= (2, 7):
 254     def find_xpath_attr(node, xpath, key, val=None):
 255         """ Find the xpath xpath[@key=val] """
 256         assert re.match(r'^[a-zA-Z_-]+$', key)
 257         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 258         return node.find(expr)
 259 else:
 260     def find_xpath_attr(node, xpath, key, val=None):
 261         for f in node.findall(compat_xpath(xpath)):
 262             if key not in f.attrib:
 263                 continue
 264             if val is None or f.attrib.get(key) == val:
 265                 return f
 266         return None
 267
 268 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 269 # the namespace parameter
 270
 271
 272 def xpath_with_ns(path, ns_map):
 273     components = [c.split(':') for c in path.split('/')]
 274     replaced = []
 275     for c in components:
 276         if len(c) == 1:
 277             replaced.append(c[0])
 278         else:
 279             ns, tag = c
 280             replaced.append('{%s}%s' % (ns_map[ns], tag))
 281     return '/'.join(replaced)
 282
 283
 284 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 285     def _find_xpath(xpath):
 286         return node.find(compat_xpath(xpath))
 287
 288     if isinstance(xpath, (str, compat_str)):
 289         n = _find_xpath(xpath)
 290     else:
 291         for xp in xpath:
 292             n = _find_xpath(xp)
 293             if n is not None:
 294                 break
 295
 296     if n is None:
 297         if default is not NO_DEFAULT:
 298             return default
 299         elif fatal:
 300             name = xpath if name is None else name
 301             raise ExtractorError('Could not find XML element %s' % name)
 302         else:
 303             return None
 304     return n
 305
 306
 307 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 308     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 309     if n is None or n == default:
 310         return n
 311     if n.text is None:
 312         if default is not NO_DEFAULT:
 313             return default
 314         elif fatal:
 315             name = xpath if name is None else name
 316             raise ExtractorError('Could not find XML element\'s text %s' % name)
 317         else:
 318             return None
 319     return n.text
 320
 321
 322 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 323     n = find_xpath_attr(node, xpath, key)
 324     if n is None:
 325         if default is not NO_DEFAULT:
 326             return default
 327         elif fatal:
 328             name = '%s[@%s]' % (xpath, key) if name is None else name
 329             raise ExtractorError('Could not find XML attribute %s' % name)
 330         else:
 331             return None
 332     return n.attrib[key]
 333
 334
 335 def get_element_by_id(id, html):
 336     """Return the content of the tag with the specified ID in the passed HTML document"""
 337     return get_element_by_attribute('id', id, html)
 338
 339
 340 def get_element_by_class(class_name, html):
 341     """Return the content of the first tag with the specified class in the passed HTML document"""
 342     retval = get_elements_by_class(class_name, html)
 343     return retval[0] if retval else None
 344
 345
 346 def get_element_by_attribute(attribute, value, html, escape_value=True):
 347     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 348     return retval[0] if retval else None
 349
 350
 351 def get_elements_by_class(class_name, html):
 352     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 353     return get_elements_by_attribute(
 354         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 355         html, escape_value=False)
 356
 357
 358 def get_elements_by_attribute(attribute, value, html, escape_value=True):
 359     """Return the content of the tag with the specified attribute in the passed HTML document"""
 360
 361     value = re.escape(value) if escape_value else value
 362
 363     retlist = []
 364     for m in re.finditer(r'''(?xs)
 365         <([a-zA-Z0-9:._-]+)
 366          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 367          \s+%s=['"]?%s['"]?
 368          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 369         \s*>
 370         (?P<content>.*?)
 371         </\1>
 372     ''' % (re.escape(attribute), value), html):
 373         res = m.group('content')
 374
 375         if res.startswith('"') or res.startswith("'"):
 376             res = res[1:-1]
 377
 378         retlist.append(unescapeHTML(res))
 379
 380     return retlist
 381
 382
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element

    The gathered attributes are exposed as the `attrs` dict.
    """
    def __init__(self):
        # Empty until a start tag has been handled
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Every start tag replaces the stored attributes, so after several
        # tags only those of the last one remain
        self.attrs = dict(attrs)
 391
 392
 393 def extract_attributes(html_element):
 394     """Given a string for an HTML element such as
 395     <el
 396          a="foo" B="bar" c="&98;az" d=boz
 397          empty= noval entity="&amp;"
 398          sq='"' dq="'"
 399     >
 400     Decode and return a dictionary of attributes.
 401     {
 402         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 403         'empty': '', 'noval': None, 'entity': '&',
 404         'sq': '"', 'dq': '\''
 405     }.
 406     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 407     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 408     """
 409     parser = HTMLAttributeParser()
 410     parser.feed(html_element)
 411     parser.close()
 412     return parser.attrs
 413
 414
 415 def clean_html(html):
 416     """Clean an HTML snippet into a readable string"""
 417
 418     if html is None:  # Convenience for sanitizing descriptions etc.
 419         return html
 420
 421     # Newline vs <br />
 422     html = html.replace('\n', ' ')
 423     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 424     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 425     # Strip html tags
 426     html = re.sub('<.*?>', '', html)
 427     # Replace html entities
 428     html = unescapeHTML(html)
 429     return html.strip()
 430
 431
 432 def sanitize_open(filename, open_mode):
 433     """Try to open the given filename, and slightly tweak it if this fails.
 434
 435     Attempts to open the given filename. If this fails, it tries to change
 436     the filename slightly, step by step, until it's either able to open it
 437     or it fails and raises a final exception, like the standard open()
 438     function.
 439
 440     It returns the tuple (stream, definitive_file_name).
 441     """
 442     try:
 443         if filename == '-':
 444             if sys.platform == 'win32':
 445                 import msvcrt
 446                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 447             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 448         stream = open(encodeFilename(filename), open_mode)
 449         return (stream, filename)
 450     except (IOError, OSError) as err:
 451         if err.errno in (errno.EACCES,):
 452             raise
 453
 454         # In case of error, try to remove win32 forbidden chars
 455         alt_filename = sanitize_path(filename)
 456         if alt_filename == filename:
 457             raise
 458         else:
 459             # An exception here should be caught in the caller
 460             stream = open(encodeFilename(alt_filename), open_mode)
 461             return (stream, alt_filename)
 462
 463
 464 def timeconvert(timestr):
 465     """Convert RFC 2822 defined time string into system timestamp"""
 466     timestamp = None
 467     timetuple = email.utils.parsedate_tz(timestr)
 468     if timetuple is not None:
 469         timestamp = email.utils.mktime_tz(timetuple)
 470     return timestamp
 471
 472
 473 def sanitize_filename(s, restricted=False, is_id=False):
 474     """Sanitizes a string so it could be used as part of a filename.
 475     If restricted is set, use a stricter subset of allowed characters.
 476     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 477     """
 478     def replace_insane(char):
 479         if restricted and char in ACCENT_CHARS:
 480             return ACCENT_CHARS[char]
 481         if char == '?' or ord(char) < 32 or ord(char) == 127:
 482             return ''
 483         elif char == '"':
 484             return '' if restricted else '\''
 485         elif char == ':':
 486             return '_-' if restricted else ' -'
 487         elif char in '\\/|*<>':
 488             return '_'
 489         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 490             return '_'
 491         if restricted and ord(char) > 127:
 492             return '_'
 493         return char
 494
 495     # Handle timestamps
 496     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 497     result = ''.join(map(replace_insane, s))
 498     if not is_id:
 499         while '__' in result:
 500             result = result.replace('__', '_')
 501         result = result.strip('_')
 502         # Common case of "Foreign band name - English song title"
 503         if restricted and result.startswith('-_'):
 504             result = result[2:]
 505         if result.startswith('-'):
 506             result = '_' + result[len('-'):]
 507         result = result.lstrip('.')
 508         if not result:
 509             result = '_'
 510     return result
 511
 512
 513 def sanitize_path(s):
 514     """Sanitizes and normalizes path on Windows"""
 515     if sys.platform != 'win32':
 516         return s
 517     drive_or_unc, _ = os.path.splitdrive(s)
 518     if sys.version_info < (2, 7) and not drive_or_unc:
 519         drive_or_unc, _ = os.path.splitunc(s)
 520     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 521     if drive_or_unc:
 522         norm_path.pop(0)
 523     sanitized_path = [
 524         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 525         for path_part in norm_path]
 526     if drive_or_unc:
 527         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 528     return os.path.join(*sanitized_path)
 529
 530
 531 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
 532 # unwanted failures due to missing protocol
 533 def sanitize_url(url):
 534     return 'http:%s' % url if url.startswith('//') else url
 535
 536
 537 def sanitized_Request(url, *args, **kwargs):
 538     return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
 539
 540
 541 def orderedSet(iterable):
 542     """ Remove all duplicates from the input iterable """
 543     res = []
 544     for el in iterable:
 545         if el not in res:
 546             res.append(el)
 547     return res
 548
 549
 550 def _htmlentity_transform(entity_with_semicolon):
 551     """Transforms an HTML entity to a character."""
 552     entity = entity_with_semicolon[:-1]
 553
 554     # Known non-numeric HTML entity
 555     if entity in compat_html_entities.name2codepoint:
 556         return compat_chr(compat_html_entities.name2codepoint[entity])
 557
 558     # TODO: HTML5 allows entities without a semicolon. For example,
 559     # '&Eacuteric' should be decoded as 'Éric'.
 560     if entity_with_semicolon in compat_html_entities_html5:
 561         return compat_html_entities_html5[entity_with_semicolon]
 562
 563     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 564     if mobj is not None:
 565         numstr = mobj.group(1)
 566         if numstr.startswith('x'):
 567             base = 16
 568             numstr = '0%s' % numstr
 569         else:
 570             base = 10
 571         # See https://github.com/rg3/youtube-dl/issues/7518
 572         try:
 573             return compat_chr(int(numstr, base))
 574         except ValueError:
 575             pass
 576
 577     # Unknown entity in name, return its literal representation
 578     return '&%s;' % entity
 579
 580
 581 def unescapeHTML(s):
 582     if s is None:
 583         return None
 584     assert type(s) == compat_str
 585
 586     return re.sub(
 587         r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 588
 589
 590 def get_subprocess_encoding():
 591     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 592         # For subprocess calls, encode with locale encoding
 593         # Refer to http://stackoverflow.com/a/9951851/35070
 594         encoding = preferredencoding()
 595     else:
 596         encoding = sys.getfilesystemencoding()
 597     if encoding is None:
 598         encoding = 'utf-8'
 599     return encoding
 600
 601
 602 def encodeFilename(s, for_subprocess=False):
 603     """
 604     @param s The name of the file
 605     """
 606
 607     assert type(s) == compat_str
 608
 609     # Python 3 has a Unicode API
 610     if sys.version_info >= (3, 0):
 611         return s
 612
 613     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 614     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 615     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 616     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 617         return s
 618
 619     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 620     if sys.platform.startswith('java'):
 621         return s
 622
 623     return s.encode(get_subprocess_encoding(), 'ignore')
 624
 625
 626 def decodeFilename(b, for_subprocess=False):
 627
 628     if sys.version_info >= (3, 0):
 629         return b
 630
 631     if not isinstance(b, bytes):
 632         return b
 633
 634     return b.decode(get_subprocess_encoding(), 'ignore')
 635
 636
 637 def encodeArgument(s):
 638     if not isinstance(s, compat_str):
 639         # Legacy code that uses byte strings
 640         # Uncomment the following line after fixing all post processors
 641         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 642         s = s.decode('ascii')
 643     return encodeFilename(s, True)
 644
 645
 646 def decodeArgument(b):
 647     return decodeFilename(b, True)
 648
 649
 650 def decodeOption(optval):
 651     if optval is None:
 652         return optval
 653     if isinstance(optval, bytes):
 654         optval = optval.decode(preferredencoding())
 655
 656     assert isinstance(optval, compat_str)
 657     return optval
 658
 659
 660 def formatSeconds(secs):
 661     if secs > 3600:
 662         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 663     elif secs > 60:
 664         return '%d:%02d' % (secs // 60, secs % 60)
 665     else:
 666         return '%d' % secs
 667
 668
 669 def make_HTTPS_handler(params, **kwargs):
 670     opts_no_check_certificate = params.get('nocheckcertificate', False)
 671     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 672         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 673         if opts_no_check_certificate:
 674             context.check_hostname = False
 675             context.verify_mode = ssl.CERT_NONE
 676         try:
 677             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 678         except TypeError:
 679             # Python 2.7.8
 680             # (create_default_context present but HTTPSHandler has no context=)
 681             pass
 682
 683     if sys.version_info < (3, 2):
 684         return YoutubeDLHTTPSHandler(params, **kwargs)
 685     else:  # Python < 3.4
 686         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 687         context.verify_mode = (ssl.CERT_NONE
 688                                if opts_no_check_certificate
 689                                else ssl.CERT_REQUIRED)
 690         context.set_default_verify_paths()
 691         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 692
 693
 694 def bug_reports_message():
 695     if ytdl_is_updateable():
 696         update_cmd = 'type  youtube-dl -U  to update'
 697     else:
 698         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 699     msg = '; please report this issue on https://yt-dl.org/bug .'
 700     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 701     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 702     return msg
 703
 704
 705 class YoutubeDLError(Exception):
 706     """Base exception for YoutubeDL errors."""
 707     pass
 708
 709
 710 class ExtractorError(YoutubeDLError):
 711     """Error during info extraction."""
 712
 713     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 714         """ tb, if given, is the original traceback (so that it can be printed out).
 715         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 716         """
 717
 718         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 719             expected = True
 720         if video_id is not None:
 721             msg = video_id + ': ' + msg
 722         if cause:
 723             msg += ' (caused by %r)' % cause
 724         if not expected:
 725             msg += bug_reports_message()
 726         super(ExtractorError, self).__init__(msg)
 727
 728         self.traceback = tb
 729         self.exc_info = sys.exc_info()  # preserve original exception
 730         self.cause = cause
 731         self.video_id = video_id
 732
 733     def format_traceback(self):
 734         if self.traceback is None:
 735             return None
 736         return ''.join(traceback.format_tb(self.traceback))
 737
 738
 739 class UnsupportedError(ExtractorError):
 740     def __init__(self, url):
 741         super(UnsupportedError, self).__init__(
 742             'Unsupported URL: %s' % url, expected=True)
 743         self.url = url
 744
 745
 746 class RegexNotFoundError(ExtractorError):
 747     """Error when a regex didn't match"""
 748     pass
 749
 750
 751 class GeoRestrictedError(ExtractorError):
 752     """Geographic restriction Error exception.
 753
 754     This exception may be thrown when a video is not available from your
 755     geographic location due to geographic restrictions imposed by a website.
 756     """
 757     def __init__(self, msg, countries=None):
 758         super(GeoRestrictedError, self).__init__(msg, expected=True)
 759         self.msg = msg
 760         self.countries = countries
 761
 762
 763 class DownloadError(YoutubeDLError):
 764     """Download Error exception.
 765
 766     This exception may be thrown by FileDownloader objects if they are not
 767     configured to continue on errors. They will contain the appropriate
 768     error message.
 769     """
 770
 771     def __init__(self, msg, exc_info=None):
 772         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 773         super(DownloadError, self).__init__(msg)
 774         self.exc_info = exc_info
 775
 776
 777 class SameFileError(YoutubeDLError):
 778     """Same File exception.
 779
 780     This exception will be thrown by FileDownloader objects if they detect
 781     multiple files would have to be downloaded to the same file on disk.
 782     """
 783     pass
 784
 785
 786 class PostProcessingError(YoutubeDLError):
 787     """Post Processing exception.
 788
 789     This exception may be raised by PostProcessor's .run() method to
 790     indicate an error in the postprocessing task.
 791     """
 792
 793     def __init__(self, msg):
 794         super(PostProcessingError, self).__init__(msg)
 795         self.msg = msg
 796
 797
 798 class MaxDownloadsReached(YoutubeDLError):
 799     """ --max-downloads limit has been reached. """
 800     pass
 801
 802
 803 class UnavailableVideoError(YoutubeDLError):
 804     """Unavailable Format exception.
 805
 806     This exception will be thrown when a video is requested
 807     in a format that is not available for that video.
 808     """
 809     pass
 810
 811
 812 class ContentTooShortError(YoutubeDLError):
 813     """Content Too Short exception.
 814
 815     This exception may be raised by FileDownloader objects when a file they
 816     download is too small for what the server announced first, indicating
 817     the connection was probably interrupted.
 818     """
 819
 820     def __init__(self, downloaded, expected):
 821         super(ContentTooShortError, self).__init__(
 822             'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
 823         )
 824         # Both in bytes
 825         self.downloaded = downloaded
 826         self.expected = expected
 827
 828
 829 class XAttrMetadataError(YoutubeDLError):
 830     def __init__(self, code=None, msg='Unknown error'):
 831         super(XAttrMetadataError, self).__init__(msg)
 832         self.code = code
 833         self.msg = msg
 834
 835         # Parsing code and msg
 836         if (self.code in (errno.ENOSPC, errno.EDQUOT) or
 837                 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
 838             self.reason = 'NO_SPACE'
 839         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
 840             self.reason = 'VALUE_TOO_LONG'
 841         else:
 842             self.reason = 'NOT_SUPPORTED'
 843
 844
 845 class XAttrUnavailableError(YoutubeDLError):
 846     pass
 847
 848
 849 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 850     # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
 851     # expected HTTP responses to meet HTTP/1.0 or later (see also
 852     # https://github.com/rg3/youtube-dl/issues/6727)
 853     if sys.version_info < (3, 0):
 854         kwargs[b'strict'] = True
 855     hc = http_class(*args, **kwargs)
 856     source_address = ydl_handler._params.get('source_address')
 857     if source_address is not None:
 858         sa = (source_address, 0)
 859         if hasattr(hc, 'source_address'):  # Python 2.7+
 860             hc.source_address = sa
 861         else:  # Python 2.6
 862             def _hc_connect(self, *args, **kwargs):
 863                 sock = compat_socket_create_connection(
 864                     (self.host, self.port), self.timeout, sa)
 865                 if is_https:
 866                     self.sock = ssl.wrap_socket(
 867                         sock, self.key_file, self.cert_file,
 868                         ssl_version=ssl.PROTOCOL_TLSv1)
 869                 else:
 870                     self.sock = sock
 871             hc.connect = functools.partial(_hc_connect, hc)
 872
 873     return hc
 874
 875
 876 def handle_youtubedl_headers(headers):
 877     filtered_headers = headers
 878
 879     if 'Youtubedl-no-compression' in filtered_headers:
 880         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 881         del filtered_headers['Youtubedl-no-compression']
 882
 883     return filtered_headers
 884
 885
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Initialise the base handler and remember *params* on the instance."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open *req* over plain HTTP.

        If the request carries a 'Ytdl-socks-proxy' header, the connection
        class is wrapped so that it goes through that SOCKS proxy, and the
        header is removed from the request before it is sent.
        """
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        # The trailing False is the is_https argument of _create_http_connection
        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress *data*, trying a raw deflate stream first.

        Negative wbits makes zlib expect a stream without the zlib header;
        if that fails, the data is retried as a regular zlib stream.
        """
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response that carries the status *code*.

        When addinfourl has no getcode() it is constructed without the
        code argument and the code is set as an attribute afterwards.
        """
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Pre-process an outgoing request and return the request to send.

        Escapes the URL, adds every header from std_headers that the request
        does not already have, and applies handle_youtubedl_headers().
        """
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            # Cut the fragment off the Request's private URL attributes
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Post-process a response: decompress the body and fix redirects.

        Bodies with Content-encoding gzip or deflate are read completely,
        decompressed and wrapped in a new response object.  For 3xx
        responses the Location header is percent-encoded when necessary.
        """
        # Kept so that headers, URL, status code and msg of the undecoded
        # response can be copied onto the replacement response
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes removed; if no attempt
                # succeeds the first error is re-raised
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    # On Python 2 the header value is stored as UTF-8 bytes
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS requests and responses get exactly the same processing
    https_request = http_request
    https_response = http_response
1015
1016
def make_socks_conn_class(base_class, socks_proxy):
    """Create a connection class that reaches its host through a SOCKS proxy.

    base_class must be HTTPConnection, HTTPSConnection or a subclass of
    either.  socks_proxy is the proxy URL; its scheme selects the protocol
    (socks5, socks4a, or socks/socks4), the port defaults to 1080, and user
    name and password, if present, are percent-decoded.

    Returns a subclass of base_class whose connect() goes through the proxy.
    Raises ValueError when the URL scheme is not a supported SOCKS scheme.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Fail here with a clear message instead of hitting an unbound
        # local variable when the proxy arguments are assembled below
        raise ValueError('Unsupported SOCKS proxy scheme: %r' % scheme)

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            # Only plain numbers are forwarded; any other timeout value
            # leaves the socket's timeout untouched
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1058
1059
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler with optional SOCKS proxying and a custom connection class."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """Store *params*; fall back to the stock HTTPSConnection class."""
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._params = params
        if https_conn_class:
            self._https_conn_class = https_conn_class
        else:
            self._https_conn_class = compat_http_client.HTTPSConnection

    def https_open(self, req):
        """Open *req* over HTTPS, through a SOCKS proxy if the request asks for one."""
        connection_cls = self._https_conn_class

        # Forward whichever TLS settings the base handler exposes:
        # _context exists on python > 2.6, _check_hostname on python 3.x
        open_kwargs = {}
        for attr_name, kwarg_name in (
                ('_context', 'context'),
                ('_check_hostname', 'check_hostname')):
            if hasattr(self, attr_name):
                open_kwargs[kwarg_name] = getattr(self, attr_name)

        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            connection_cls = make_socks_conn_class(connection_cls, proxy_url)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, connection_cls, True)
        return self.do_open(connection_factory, req, **open_kwargs)
1083
1084
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also handles HTTPS requests and responses.

    Everything is delegated to HTTPCookieProcessor; the Set-Cookie escaping
    workaround described in http_response() is currently commented out.
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        """Pass the response on to HTTPCookieProcessor unchanged."""
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # (The workaround below is disabled.)
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Reuse the HTTP hooks for HTTPS traffic
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1107
1108
def extract_timezone(date_str):
    """Split a trailing 'Z' or numeric UTC offset off *date_str*.

    Returns a (timedelta, remaining_string) pair.  The timedelta is zero
    when no offset is found or when the suffix is 'Z'; in the first case the
    string is returned unchanged.  Offsets look like +HHMM or -HH:MM and may
    be preceded by a single space.
    """
    zero_offset = datetime.timedelta()
    found = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not found:
        return zero_offset, date_str

    remainder = date_str[:-len(found.group('tz'))]
    sign_char = found.group('sign')
    if not sign_char:
        # Plain 'Z' suffix: UTC
        return zero_offset, remainder

    direction = -1 if sign_char == '-' else 1
    offset = datetime.timedelta(
        hours=direction * int(found.group('hours')),
        minutes=direction * int(found.group('minutes')))
    return offset, remainder
1125
1126
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    date_str is expected as YYYY-MM-DD<delimiter>HH:MM:SS, optionally with
    fractional seconds (discarded).  When timezone (a timedelta) is not
    given, it is taken from the end of the string via extract_timezone().
    Returns None for None input or when the string does not parse.
    """
    if date_str is None:
        return None

    # Fractional seconds are dropped before parsing
    trimmed = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, trimmed = extract_timezone(trimmed)

    try:
        layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        moment = datetime.datetime.strptime(trimmed, layout)
        return calendar.timegm((moment - timezone).timetuple())
    except ValueError:
        return None
1144
1145
def date_formats(day_first=True):
    """Return the day-first format collection, or the month-first one."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1148
1149
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or cannot be parsed.  day_first
    selects which format list from date_formats() is tried.
    """
    if date_str is None:
        return None

    # Commas become spaces, then AM/PM (with an optional alphabetic zone
    # name after it) and a trailing numeric UTC offset are removed
    cleaned = re.sub(
        r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str.replace(',', ' '))
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # Every format is tried; if several match, the last match wins
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Last resort: RFC 2822 style dates
        fields = email.utils.parsedate_tz(cleaned)
        if fields:
            try:
                result = datetime.datetime(*fields[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return compat_str(result)
1176
1177
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    Returns None when date_str is None or cannot be parsed.  day_first
    selects which format list from date_formats() is tried.
    """
    if date_str is None:
        return None

    text = date_str.replace(',', ' ')

    # NOTE(review): any "PM" in the string adds twelve hours, which also
    # applies to times in the 12 PM hour -- confirm this is intended
    hours_shift = 12 if re.search(r'(?i)PM', text) else 0
    utc_offset, text = extract_timezone(text)

    # Remove AM/PM + timezone
    text = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', text)

    # The first format that parses determines the result
    for fmt in date_formats(day_first):
        try:
            moment = datetime.datetime.strptime(text, fmt)
            moment = moment - utc_offset + datetime.timedelta(hours=hours_shift)
            return calendar.timegm(moment.timetuple())
        except ValueError:
            continue

    # Fall back to RFC 2822 style dates
    fields = email.utils.parsedate_tz(text)
    if fields:
        return calendar.timegm(fields) + hours_shift * 3600
    return None
1199
1200
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from *url*.

    Looks at the text after the last '.' in the part of the URL before the
    first '?'.  Returns default_ext when url is None or when that text is
    neither purely alphanumeric nor (ignoring trailing slashes) one of
    KNOWN_EXTENSIONS.
    """
    if url is None:
        return default_ext

    before_query = url.partition('?')[0]
    candidate = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    without_slashes = candidate.rstrip('/')
    if without_slashes in KNOWN_EXTENSIONS:
        return without_slashes
    return default_ext
1212
1213
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the last extension of *filename* with '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1216
1217
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    'now', 'today' and 'yesterday' are accepted on their own as well.
    Months count as 30 days and years as 365 days."""
    today = datetime.date.today()
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    if date_str in ('now', 'today'):
        return today

    relative = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()

    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount
    # A bad approximation?
    days_per_unit = {'day': 1, 'week': 7, 'month': 30, 'year': 365}
    span = datetime.timedelta(days=amount * days_per_unit[relative.group('unit')])
    return today + span
1245
1246
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Any string that is not exactly eight digits is returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1255
1256
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str

        A missing start means the earliest representable date, a missing
        end the latest one.  Raises ValueError if start lies after end.
        """
        self.start = datetime.date.min if start is None else date_from_str(start)
        self.end = datetime.date.max if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return ' - '.join((self.start.isoformat(), self.end.isoformat()))
1286
1287
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte strings are decoded with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1296
1297
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is WriteConsoleW from kernel32; it is only used when
    out is file descriptor 1 or 2 and the matching standard handle is a
    console.  Raises OSError when WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> identifier passed to GetStdHandle
    # (-11 for fd 1 / stdout, -12 for fd 2 / stderr)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Function prototypes are bound by name from kernel32
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the count reported by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for a missing/invalid handle, for a handle whose file type
        # (remote flag masked out) is not a character device, or when
        # GetConsoleMode fails for it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call and stop in front of the
        # next non-BMP character; count == 0 means s starts with one
        count = min(next_nonbmp_pos(s), 1024)

        # A leading non-BMP character is written with a length of 2
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part was actually written
            s = s[written.value:]
    return True
1371
1372
def write_string(s, out=None, encoding=None):
    """Write the text *s* to *out* (sys.stderr by default) and flush it.

    On win32, when no encoding is forced and out has a fileno, the console
    writer is tried first.  Otherwise the text is encoded (unencodable
    characters are ignored) for binary streams and for streams exposing a
    .buffer, and written as text to anything else.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if try_console and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        target_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1393
1394
def bytes_to_intlist(bs):
    """Return the items of *bs* as a list of integers.

    Empty input gives an empty list.  If the first item is already an int
    (Python 3 bytes) the items are copied as they are; otherwise each item
    is converted with ord().
    """
    if not bs:
        return []
    first = bs[0]
    if isinstance(first, int):  # Python 3
        return list(bs)
    return list(map(ord, bs))
1402
1403
def intlist_to_bytes(xs):
    """Pack the integers in *xs* as unsigned bytes; empty input gives b''."""
    if xs:
        layout = '%dB' % len(xs)
        return compat_struct_pack(layout, *xs)
    return b''
1408
1409
# Cross-platform file locking
# Exactly one pair of _lock_file(f, exclusive) / _unlock_file(f) is defined,
# depending on the platform: LockFileEx/UnlockFileEx on win32, fcntl.flock
# where fcntl can be imported, and stubs that raise IOError otherwise.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure passed to LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count handed to LockFileEx/UnlockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        # Lock from offset 0; raises OSError if LockFileEx fails
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can reuse it
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # Flag 0x2 is passed for an exclusive lock, 0x0 otherwise
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Requires a preceding _lock_file(f, ...); raises OSError on failure
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            # Exclusive or shared flock on f
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        # Without fcntl both operations always fail with IOError
        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1483
1484
class locked_file(object):
    """File wrapper that holds a lock while used as a context manager.

    The file is opened right away in the constructor; the lock is taken on
    entering the ``with`` block (shared for mode 'r', exclusive for 'a' and
    'w') and released, with the file closed, on leaving it.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.mode = mode
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        needs_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, needs_exclusive)
        except IOError:
            # Do not leave the file open when locking is impossible
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # The file is closed even if unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1514
1515
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1519
1520
def shell_quote(args):
    """Quote every item of *args* for the shell and join them with spaces.

    Byte strings are decoded with the filesystem encoding first.
    """
    fs_encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        if isinstance(arg, bytes):
            return arg.decode(fs_encoding)
        return arg

    return ' '.join(pipes.quote(as_text(arg)) for arg in args)
1530
1531
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded into a '#__youtubedl_smuggle=...' fragment that is
    appended to url.  If url already carries smuggled data, both are merged
    and the values already in the URL take precedence.  The caller's data
    dict is left unmodified.
    """

    url, idata = unsmuggle_url(url, {})
    # Merge into a copy so that the argument is not changed as a side effect
    merged = dict(data)
    merged.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
1540
1541
def unsmuggle_url(smug_url, default=None):
    """Split data added by smuggle_url() off *smug_url*.

    Returns (url, data).  When the URL carries no smuggled data it is
    returned unchanged together with *default*.
    """
    if '#__youtubedl_smuggle' in smug_url:
        plain_url, _, fragment = smug_url.rpartition('#')
        encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return plain_url, json.loads(encoded)
    return smug_url, default
1549
1550
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. '1.50KiB'.

    Returns 'N/A' for None.  A str argument is converted with float().
    The unit is clamped to the range B..YiB, so tiny fractions are shown in
    bytes and huge values in YiB; negative values keep their sign.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)

    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        # abs() avoids a math domain error for negative values; clamping
        # keeps the index inside the suffix table (a negative exponent
        # would otherwise pick a suffix from the end of the list)
        exponent = int(math.log(abs(bytes), 1024.0))
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1563
1564
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' at the start of *s* using *unit_table*.

    unit_table maps unit strings to multipliers.  The number may use ','
    or '.' as decimal separator.  Returns the product truncated to an int,
    or None when *s* does not start with a number followed by a known unit.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives
    found = re.match(pattern, s)
    if found is None:
        return None
    magnitude = float(found.group('num').replace(',', '.'))
    return int(magnitude * unit_table[found.group('unit')])
1574
1575
def parse_filesize(s):
    """Parse a human-readable file size such as '1.5 MiB' into bytes.

    Returns None for None input or when no number followed by a known unit
    is found at the start of the string.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    units = {'B': 1, 'b': 1, 'bytes': 1}
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal = 1000 ** power
        binary = 1024 ** power
        small = letter.lower()
        units[letter + 'iB'] = binary
        units[letter + 'B'] = decimal
        # Lower-case prefix with upper-case B is treated as binary
        units[small + 'B'] = binary
        units[letter + 'b'] = decimal
        units[small + 'b'] = decimal
        units[decimal_name + 'bytes'] = decimal
        units[binary_name + 'bytes'] = binary

    return lookup_unit_table(units, s)
1645
1646
def parse_count(s):
    """Parse a count such as '1,234' or '1.5M' into an int.

    Returns None for None input or when the text is neither a plain number
    nor a number followed by one of the supported suffixes.
    """
    if s is None:
        return None

    text = s.strip()

    # Only digits and separators: convert directly
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    thousand = 1000
    million = 1000 ** 2
    multipliers = {
        'k': thousand,
        'K': thousand,
        'm': million,
        'M': million,
        'kk': million,
        'KK': million,
    }
    return lookup_unit_table(multipliers, text)
1666
1667
def month_by_name(name, lang='en'):
    """ Return the number of a month by its (locale-independent) name

    Month names are looked up in MONTH_NAMES for *lang*, falling back to
    the English list.  Returns None when the name is not in the list. """

    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        position = names.index(name)
    except ValueError:
        return None
    return position + 1
1677
1678
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations; None if the abbreviation is unknown """

    # The first three letters of each English month name
    abbreviations = [full_name[:3] for full_name in ENGLISH_MONTH_NAMES]
    if abbrev in abbreviations:
        return abbreviations.index(abbrev) + 1
    return None
1687
1688
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity by '&amp;' in XML

    The five predefined XML entities and numeric character references with
    up to four digits are left alone."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.compile(bare_ampersand).sub('&amp;', xml_str)
1695
1696
def setproctitle(title):
    """Set the name of the current process via libc's prctl(), if possible.

    Does nothing on Jython, when libc.so.6 cannot be loaded, or when the
    loaded library has no prctl.  title must be a compat_str.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one byte longer than
    # the title, so the name handed to prctl is always NUL-terminated
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1721
1722
def remove_start(s, start):
    """Return *s* without the prefix *start*; None and non-matching input pass through."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1725
1726
def remove_end(s, end):
    """Return *s* without the suffix *end*.

    None and strings that do not end with *end* are returned unchanged, and
    so is any string when *end* is empty.
    """
    if s is not None and s.endswith(end):
        # Slice by absolute position: s[:-len(end)] would turn into s[:0]
        # (an empty string) for an empty suffix
        return s[:len(s) - len(end)]
    return s
1729
1730
def remove_quotes(s):
    """Strip one pair of matching single or double quotes around *s*.

    None, strings shorter than two characters and strings without matching
    outer quotes are returned unchanged.
    """
    if s is None or len(s) < 2:
        return s
    opening, closing = s[0], s[-1]
    if opening == closing and opening in ('"', "'"):
        return s[1:-1]
    return s
1738
1739
def url_basename(url):
    """Return the last path segment of *url*, ignoring surrounding slashes."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1743
1744
def base_url(url):
    """Return *url* up to and including the last '/' before any '?', '#' or '&'.

    Only http(s) URLs are matched; anything else fails on the missing match.
    """
    prefix = re.match(r'https?://[^?#&]+/', url)
    return prefix.group()
1747
1748
def urljoin(base, path):
    """Join *path* onto *base*, returning None for unusable input.

    A path that already starts with '//' or 'http(s)://' is returned as is.
    Otherwise base must be a text string starting the same way; if it is
    not, or if path is empty or not a text string, the result is None.
    """
    def has_netloc_prefix(candidate):
        return re.match(r'^(?:https?:)?//', candidate)

    if not (isinstance(path, compat_str) and path):
        return None
    if has_netloc_prefix(path):
        return path
    if not (isinstance(base, compat_str) and has_netloc_prefix(base)):
        return None
    return compat_urlparse.urljoin(base, path)
1757
1758
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose get_method() always reports 'HEAD'"""
    def get_method(self):
        return 'HEAD'
1762
1763
class PUTRequest(compat_urllib_request.Request):
    """Request subclass whose get_method() always reports 'PUT'"""
    def get_method(self):
        return 'PUT'
1767
1768
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, returning default when that is not possible.

    When get_attr is given and v is not None, the attribute of that name is
    read from v first (None if it is missing). An empty string counts as
    None. The converted value is multiplied by invscale and floor-divided by
    scale.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        number = int(v)
    except (ValueError, TypeError, OverflowError):
        # TypeError: v is of a non-numeric type (e.g. a list or dict)
        # OverflowError: v is a float infinity
        return default
    return number * invscale // scale
1781
1782
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None"""
    if v is None:
        return default
    return compat_str(v)
1785
1786
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are removed from the string before it is
    converted. None is passed through, and so is a value that already is an
    int. A ValueError is raised when the cleaned string is not a valid
    integer.
    """
    if int_str is None:
        return None
    if isinstance(int_str, int):
        # Already numeric; re.sub() below would raise TypeError on it
        return int_str
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
1793
1794
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, returning default when v is None or cannot be
    converted. The converted value is multiplied by invscale and divided by
    scale."""
    if v is None:
        return default
    try:
        number = float(v)
    except (ValueError, TypeError):
        # TypeError: v is of a non-numeric type (e.g. a list or dict)
        return default
    return number * invscale / scale
1802
1803
def strip_or_none(v):
    """Return v.strip(), or None when v is None"""
    if v is None:
        return None
    return v.strip()
1806
1807
def parse_duration(s):
    """Parse a textual duration into a number of seconds.

    Three notations are tried in turn: colon separated fields
    ([[[days:]hours:]mins:]secs[.fraction]), unit suffixed fields (days,
    hours, minutes, seconds; optionally prefixed with PT or T) and a single
    possibly fractional amount of hours or minutes. None is returned when s
    is not a string or matches none of them.
    """
    if not isinstance(s, compat_basestring):
        return None

    text = s.strip()

    days = hours = mins = secs = ms = None
    clock = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', text)
    if clock:
        days, hours, mins, secs, ms = clock.groups()
    else:
        with_units = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', text)
        if with_units:
            days, hours, mins, secs, ms = with_units.groups()
        else:
            single_unit = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', text)
            if not single_unit:
                return None
            hours, mins = single_unit.groups()

    # Each field is paired with the factors that turn it into seconds. The
    # factors are applied one by one, left to right, and the fields are
    # summed in this exact order.
    weighted_fields = (
        (secs, ()),
        (mins, (60,)),
        (hours, (60, 60)),
        (days, (24, 60, 60)),
        (ms, ()),
    )
    duration = 0
    for field, factors in weighted_fields:
        if not field:
            continue
        amount = float(field)
        for factor in factors:
            amount = amount * factor
        duration += amount
    return duration
1854
1855
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the existing extension of filename.

    When expected_real_ext is given and differs from the actual extension,
    ext is appended to the complete filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1862
1863
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of filename with ext.

    When expected_real_ext is given and differs from the actual extension,
    ext is appended to the complete filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
1869
1870
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        # Output is captured only so that it does not reach the terminal
        proc = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
1879
1880
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        proc = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = proc.communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1898
1899
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Return the first group of version_re found in output, or unrecognized
    when there is no match. version_re defaults to a pattern capturing the
    token that follows the word 'version'."""
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1909
1910
class PagedList(object):
    """Base class of paged lists; __len__ relies on a getslice() method that
    this class itself does not define"""
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1915
1916
class OnDemandPagedList(PagedList):
    """Paged list that requests pages one at a time through
    pagefunc(pagenum), optionally remembering pages it has already seen"""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list; an end
        of None means "until the pages run out" """
        collected = []
        size = self._pagesize
        for pagenum in itertools.count(start // size):
            page_first = pagenum * size
            next_page_first = page_first + size
            if start >= next_page_first:
                continue

            if self._use_cache:
                page = self._cache.get(pagenum)
                if page is None:
                    page = list(self._pagefunc(pagenum))
                self._cache[pagenum] = page
            else:
                page = list(self._pagefunc(pagenum))

            # Offsets inside this page at which the requested range begins
            # and ends (None: the range extends past this page)
            if page_first <= start < next_page_first:
                lower = start % size
            else:
                lower = 0

            if end is not None and page_first <= end <= next_page_first:
                upper = ((end - 1) % size) + 1
            else:
                upper = None

            if lower != 0 or upper is not None:
                page = page[lower:upper]
            collected.extend(page)

            # A page that is not "full" is taken to be the last one, so no
            # further page is queried.
            if len(page) + lower < size:
                break

            # The page was full, but the requested range ends exactly at its
            # end: the next page is of no interest.
            if end == next_page_first:
                break
        return collected
1967
1968
class InAdvancePagedList(PagedList):
    """Paged list for which the number of pages is known beforehand"""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list; an end
        of None means "up to the last page" """
        collected = []
        first_page = start // self._pagesize
        # Entries to drop from the front of the first fetched page
        to_skip = start - first_page * self._pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start
        for pagenum in range(first_page, stop_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested range
                    collected.extend(page[:remaining])
                    break
                remaining -= len(page)
            collected.extend(page)
        return collected
1996
1997
def uppercase_escape(s):
    """Decode every \\UXXXXXXXX escape sequence (eight hex digits) in s"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
2004
2005
def lowercase_escape(s):
    """Decode every \\uXXXX escape sequence (four hex digits) in s"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
2012
2013
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 text is turned into UTF-8 bytes before it is quoted
    on_python2 = sys.version_info < (3, 0)
    if on_python2 and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2019
2020
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host name is IDNA encoded; every other component is percent-escaped
    escaped = parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment),
    )
    return escaped.geturl()
2031
2032
def read_batch_urls(batch_fd):
    """Read URLs, one per line, from the file-like object batch_fd.

    batch_fd is closed afterwards. Lines may be bytes or text. A leading
    byte order mark and surrounding whitespace are removed; empty lines and
    lines starting with '#', ';' or ']' are skipped.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # '\xef\xbb\xbf' is how the UTF-8 BOM looks after the raw bytes were
        # decoded with a single-byte charset; '\ufeff' is the same mark once
        # it has been decoded as UTF-8 (str.strip() does not remove it).
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2047
2048
def urlencode_postdata(*args, **kargs):
    """URL-encode the given arguments with compat_urllib_parse_urlencode and
    return the result as ASCII bytes"""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2051
2052
def update_url_query(url, query):
    """Return url with the entries of query merged into its query string;
    entries of query replace parameters of the same name. url is returned
    unchanged when query is empty."""
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parts.query)
    merged.update(query)
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2061
2062
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request from req with the given parts replaced.

    headers are merged over those of req, data and url take precedence over
    the ones of req when truthy, and query is merged into the URL. HEAD and
    PUT requests keep their method; a timeout attribute, if present on req,
    is copied.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    body = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)
    method = req.get_method()
    request_classes = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }
    req_type = request_classes.get(method, compat_urllib_request.Request)
    new_req = req_type(
        target_url, data=body, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2081
2082
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable one of several keys, in d.

    With a list or tuple of keys the value of the first key that is present
    and not None (and, if skip_false_values is set, not falsy) is returned.
    default is returned when nothing qualifies.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
2091
2092
def try_get(src, getter, expected_type=None):
    """Return getter(src), or None when the call raises AttributeError,
    KeyError, TypeError or IndexError, or when the result is not an instance
    of expected_type (if one is given)"""
    try:
        value = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is None or isinstance(value, expected_type):
        return value
    return None
2101
2102
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as a compat_str, decoding it with encoding and errors
    when it is not one already"""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2105
2106
# Age limits that parse_age_limit() reports for these rating labels
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
2114
2115
# Age limits that parse_age_limit() reports for these "TV-*" labels, which
# are consulted after US_RATINGS
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2124
2125
def parse_age_limit(s):
    """Turn an age limit given as int, 'NN'/'NN+' string or rating label
    into a number; None is returned when it is not recognized.

    Plain ints are accepted only in the range 0..21.
    """
    if type(s) is int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]
    return TV_PARENTAL_GUIDELINES.get(s)
2137
2138
def strip_jsonp(code):
    """Remove a JSONP wrapper "callback(...);" (optionally followed by //
    comments) from code and return what was between the parentheses. Input
    that does not have this shape is returned unchanged."""
    jsonp_re = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_re, r'\1', code)
2142
2143
def js_to_json(code):
    """Convert a JavaScript object/array literal into JSON text.

    Single-quoted strings and bare identifiers become double-quoted strings,
    comments and trailing commas are dropped, and unquoted hexadecimal or
    octal integers are rewritten in decimal (as a quoted key when followed
    by a colon). The result is returned as a string; it is not parsed here.
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v.startswith('//') or v == ',':
            return ""

        if v[0] in ("'", '"'):
            # Re-escape the contents for a double-quoted JSON string
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            # Only unquoted tokens may be integer literals. The contents of
            # a string such as "0x10" or "010" must stay a string and not be
            # converted to a number.
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
2183
2184
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in quality_ids, or -1 for an unknown id
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
2193
2194
# %-style template with the named fields title, id and ext
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2196
2197
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    # The ellipses count towards the allowed length
    return s[:length - len(ELLIPSES)] + ELLIPSES
2206
2207
def version_tuple(v):
    """Split v at '.' and '-' and return the pieces as a tuple of ints;
    int() raises ValueError for a piece that is not numeric"""
    pieces = re.split(r'[-.]', v)
    return tuple(map(int, pieces))
2210
2211
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit. For a missing or
    unparsable version the answer is the opposite of assume_new."""
    undecidable = not assume_new
    if not version:
        return undecidable
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return undecidable
2219
2220
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded by a zipimporter or when sys has a
    # 'frozen' attribute
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
2226
2227
def args_to_str(args):
    """Return args as one space-separated string, each item shell-quoted"""
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2231
2232
def error_to_compat_str(err):
    """Return str(err) as text"""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2240
2241
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    A few complete types are special-cased. Otherwise the subtype (the part
    after the last '/', with any ';' parameters removed, lower-cased) is
    translated through a table and returned as is when the table does not
    know it. None is returned for None.
    """
    if mt is None:
        return None

    full_type_map = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }
    if mt in full_type_map:
        return full_type_map[mt]

    subtype_map = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m': 'f4m',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
    }
    subtype = mt.rpartition('/')[2]
    subtype = subtype.split(';')[0].strip().lower()
    return subtype_map.get(subtype, subtype)
2278
2279
def parse_codecs(codecs_str):
    """Split a comma-separated codecs string into video and audio codec.

    Returns a dict with the keys 'vcodec' and 'acodec'. The first codec of a
    recognized video/audio family fills the matching key; a key for which
    nothing was recognized is set to 'none'. When no codec is recognized but
    exactly two are listed, they are taken as video and audio codec in that
    order. An empty dict is returned when nothing can be determined. A
    warning is written to stderr for every unrecognized codec.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    splited_codecs = [
        item for item in (
            part.strip() for part in codecs_str.strip().strip(',').split(','))
        if item]
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
            if not acodec:
                acodec = full_codec
        else:
            # Terminate the line so that consecutive warnings do not run
            # into each other
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    if len(splited_codecs) == 2:
        # vcodec and acodec are both unset on this path, so the listed
        # entries themselves are used (video first, audio second).
        return {
            'vcodec': splited_codecs[0],
            'acodec': splited_codecs[1],
        }
    # A single unrecognized codec gives no hint whether it is audio or video
    return {}
2314
2315
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of url_handle: from the
    filename of a Content-Disposition attachment if there is a usable one,
    otherwise from the Content-Type"""
    header = url_handle.headers.get

    disposition = header('Content-Disposition')
    if disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(header('Content-Type'))
2328
2329
def encode_data_uri(data, mime_type):
    """Return a base64 'data:' URI carrying the bytes data as mime_type"""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2332
2333
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # Nothing is blocked when no limit is set (age_limit is None) or when
    # the content is available for everyone (content_limit is None).
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2342
2343
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # The 4-byte UTF-32 marks come before the UTF-16 ones: the UTF-32-LE
    # mark starts with the UTF-16-LE mark.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    encoding, bom_length = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, bom_length = enc, len(bom)
            break
    text = first_bytes[bom_length:].decode(encoding, 'replace')

    # Truthy (a match object) when the first non-blank character is '<'
    return re.match(r'^\s*<', text)
2362
2363
def determine_protocol(info_dict):
    """Return the protocol for info_dict: its 'protocol' entry when set,
    otherwise a guess based on the start of its 'url', then on the URL's
    extension, and finally the URL scheme"""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    return compat_urllib_parse_urlparse(url).scheme
2384
2385
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*table)]
    # Every column but the last is left-aligned and padded to its widest
    # cell plus one; the last one is written as is.
    padded_columns = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    format_str = ' '.join(padded_columns) + '%s'
    return '\n'.join(format_str % tuple(row) for row in table)
2392
2393
def _match_one(filter_part, dct):
    """Evaluate one filter expression against the dict dct.

    Two forms are tried in turn: "key OP value" with one of the comparison
    operators below (a '?' after the operator decides the outcome when dct
    has no value for key), and the unary forms "key" / "!key" that test
    whether dct has a non-None value for key. A ValueError is raised for an
    ordering operator used with a string value, for an unparsable number
    and for a filter_part matching neither form.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if (m.group('quotedstrval') is not None or
            m.group('strval') is not None or
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/rg3/youtube-dl/issues/11082).
            actual_value is not None and m.group('intval') is not None and
                isinstance(actual_value, compat_str)):
            # String comparison: only equality and inequality make sense
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
            quote = m.group('quote')
            if quote is not None:
                # Unescape the quote character inside a quoted value
                comparison_value = comparison_value.replace(r'\%s' % quote, quote)
        else:
            # Numeric comparison: a plain integer, or else a file size with
            # a unit suffix (retried with a 'B' appended)
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        if actual_value is None:
            # Missing value: the result is the text matched after the
            # operator ('?' with optional whitespace), or None without one
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
2462
2463
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The parts are separated by '&' and all of them have to match;
    # evaluation stops at the first part that does not.
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2469
2470
def match_filter_func(filter_str):
    """Return a function that checks an info dict against filter_str and
    returns None when it passes, or else a message saying it is skipped"""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        # Name the video by its title, else its id, else just 'video'
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2479
2480
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP time expression into seconds (float).

    Understood are a plain offset in seconds with an optional 's' suffix
    and the clock form H:MM:SS with an optional fraction after '.' or ':'.
    None is returned for empty input and for anything else.
    """
    if not time_expr:
        return

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
2492
2493
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode 'HH:MM:SS,mmm'"""
    # Round to whole milliseconds once, then split with integer arithmetic.
    # Taking the fractional part of a float and truncating it loses a
    # millisecond for values such as 1.001, whose binary representation lies
    # just below the decimal one.
    total_msecs = int(round(seconds * 1000))
    total_secs, msecs = divmod(total_msecs, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, mins = divmod(total_mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, msecs)
2496
2497
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitles, given as text, to SRT text.

    A ValueError is raised when the document has no paragraph elements.
    Paragraphs without a begin time, or with neither an end time nor a
    duration, are left out.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        """Parser target collecting the text of one paragraph; line break
        elements become newlines"""

        def __init__(self):
            self.out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Serialize the element and run it through the collecting target
        parser = xml.etree.ElementTree.XMLParser(target=TTMLPElementParser())
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))

    # Try the known namespaces in turn, then un-namespaced paragraphs
    for namespaced_query in ('.//ttml:p', './/ttaf1:p', './/ttaf1_0604:p'):
        paras = dfxp.findall(_x(namespaced_query))
        if paras:
            break
    else:
        paras = dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    out = []
    # The counter also advances for paragraphs that end up being skipped
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2551
2552
def cli_option(params, command_option, param):
    """Return [command_option, value] for the entry param of params, or an
    empty list when that entry is missing or None. A truthy value is
    converted with compat_str."""
    value = params.get(param)
    if value is None:
        return []
    if value:
        value = compat_str(value)
    return [command_option, value]
2558
2559
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render the boolean entry param of params as a command line option.

    The result is [command_option, value], or the single item
    command_option + separator + value when a separator is given; value is
    true_value or false_value. The entry must be a bool.
    """
    flag = params.get(param)
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
2566
2567
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when the entry param of params equals
    expected_value, otherwise an empty list"""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2571
2572
def cli_configuration_args(params, param, default=[]):
    """Return the list stored under param in params, or default when that
    entry is missing or None. The entry, if present, must be a list."""
    ex_args = params.get(param)
    if ex_args is None:
        # Hand out a copy of a list default. The default of the signature is
        # a single list object shared by all calls, so returning it directly
        # would let one caller's in-place changes leak into later calls.
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2579
2580
2581 class ISO639Utils(object):
2582     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2583     _lang_map = {
2584         'aa': 'aar',
2585         'ab': 'abk',
2586         'ae': 'ave',
2587         'af': 'afr',
2588         'ak': 'aka',
2589         'am': 'amh',
2590         'an': 'arg',
2591         'ar': 'ara',
2592         'as': 'asm',
2593         'av': 'ava',
2594         'ay': 'aym',
2595         'az': 'aze',
2596         'ba': 'bak',
2597         'be': 'bel',
2598         'bg': 'bul',
2599         'bh': 'bih',
2600         'bi': 'bis',
2601         'bm': 'bam',
2602         'bn': 'ben',
2603         'bo': 'bod',
2604         'br': 'bre',
2605         'bs': 'bos',
2606         'ca': 'cat',
2607         'ce': 'che',
2608         'ch': 'cha',
2609         'co': 'cos',
2610         'cr': 'cre',
2611         'cs': 'ces',
2612         'cu': 'chu',
2613         'cv': 'chv',
2614         'cy': 'cym',
2615         'da': 'dan',
2616         'de': 'deu',
2617         'dv': 'div',
2618         'dz': 'dzo',
2619         'ee': 'ewe',
2620         'el': 'ell',
2621         'en': 'eng',
2622         'eo': 'epo',
2623         'es': 'spa',
2624         'et': 'est',
2625         'eu': 'eus',
2626         'fa': 'fas',
2627         'ff': 'ful',
2628         'fi': 'fin',
2629         'fj': 'fij',
2630         'fo': 'fao',
2631         'fr': 'fra',
2632         'fy': 'fry',
2633         'ga': 'gle',
2634         'gd': 'gla',
2635         'gl': 'glg',
2636         'gn': 'grn',
2637         'gu': 'guj',
2638         'gv': 'glv',
2639         'ha': 'hau',
2640         'he': 'heb',
2641         'hi': 'hin',
2642         'ho': 'hmo',
2643         'hr': 'hrv',
2644         'ht': 'hat',
2645         'hu': 'hun',
2646         'hy': 'hye',
2647         'hz': 'her',
2648         'ia': 'ina',
2649         'id': 'ind',
2650         'ie': 'ile',
2651         'ig': 'ibo',
2652         'ii': 'iii',
2653         'ik': 'ipk',
2654         'io': 'ido',
2655         'is': 'isl',
2656         'it': 'ita',
2657         'iu': 'iku',
2658         'ja': 'jpn',
2659         'jv': 'jav',
2660         'ka': 'kat',
2661         'kg': 'kon',
2662         'ki': 'kik',
2663         'kj': 'kua',
2664         'kk': 'kaz',
2665         'kl': 'kal',
2666         'km': 'khm',
2667         'kn': 'kan',
2668         'ko': 'kor',
2669         'kr': 'kau',
2670         'ks': 'kas',
2671         'ku': 'kur',
2672         'kv': 'kom',
2673         'kw': 'cor',
2674         'ky': 'kir',
2675         'la': 'lat',
2676         'lb': 'ltz',
2677         'lg': 'lug',
2678         'li': 'lim',
2679         'ln': 'lin',
2680         'lo': 'lao',
2681         'lt': 'lit',
2682         'lu': 'lub',
2683         'lv': 'lav',
2684         'mg': 'mlg',
2685         'mh': 'mah',
2686         'mi': 'mri',
2687         'mk': 'mkd',
2688         'ml': 'mal',
2689         'mn': 'mon',
2690         'mr': 'mar',
2691         'ms': 'msa',
2692         'mt': 'mlt',
2693         'my': 'mya',
2694         'na': 'nau',
2695         'nb': 'nob',
2696         'nd': 'nde',
2697         'ne': 'nep',
2698         'ng': 'ndo',
2699         'nl': 'nld',
2700         'nn': 'nno',
2701         'no': 'nor',
2702         'nr': 'nbl',
2703         'nv': 'nav',
2704         'ny': 'nya',
2705         'oc': 'oci',
2706         'oj': 'oji',
2707         'om': 'orm',
2708         'or': 'ori',
2709         'os': 'oss',
2710         'pa': 'pan',
2711         'pi': 'pli',
2712         'pl': 'pol',
2713         'ps': 'pus',
2714         'pt': 'por',
2715         'qu': 'que',
2716         'rm': 'roh',
2717         'rn': 'run',
2718         'ro': 'ron',
2719         'ru': 'rus',
2720         'rw': 'kin',
2721         'sa': 'san',
2722         'sc': 'srd',
2723         'sd': 'snd',
2724         'se': 'sme',
2725         'sg': 'sag',
2726         'si': 'sin',
2727         'sk': 'slk',
2728         'sl': 'slv',
2729         'sm': 'smo',
2730         'sn': 'sna',
2731         'so': 'som',
2732         'sq': 'sqi',
2733         'sr': 'srp',
2734         'ss': 'ssw',
2735         'st': 'sot',
2736         'su': 'sun',
2737         'sv': 'swe',
2738         'sw': 'swa',
2739         'ta': 'tam',
2740         'te': 'tel',
2741         'tg': 'tgk',
2742         'th': 'tha',
2743         'ti': 'tir',
2744         'tk': 'tuk',
2745         'tl': 'tgl',
2746         'tn': 'tsn',
2747         'to': 'ton',
2748         'tr': 'tur',
2749         'ts': 'tso',
2750         'tt': 'tat',
2751         'tw': 'twi',
2752         'ty': 'tah',
2753         'ug': 'uig',
2754         'uk': 'ukr',
2755         'ur': 'urd',
2756         'uz': 'uzb',
2757         've': 'ven',
2758         'vi': 'vie',
2759         'vo': 'vol',
2760         'wa': 'wln',
2761         'wo': 'wol',
2762         'xh': 'xho',
2763         'yi': 'yid',
2764         'yo': 'yor',
2765         'za': 'zha',
2766         'zh': 'zho',
2767         'zu': 'zul',
2768     }
2769
2770     @classmethod
2771     def short2long(cls, code):
2772         """Convert language code from ISO 639-1 to ISO 639-2/T"""
2773         return cls._lang_map.get(code[:2])
2774
2775     @classmethod
2776     def long2short(cls, code):
2777         """Convert language code from ISO 639-2/T to ISO 639-1"""
2778         for short_name, long_name in cls._lang_map.items():
2779             if long_name == code:
2780                 return short_name
2781
2782
class ISO3166Utils(object):
    """Lookup helper mapping two-letter country codes to country names."""

    # From http://data.okfn.org/data/core/country-list
    # Keys are upper-case two-letter codes, values are the full names.
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Return the full country name for a two-letter country code.

        The lookup is case-insensitive (``code`` is upper-cased first).
        Returns None when the code is not present in ``_country_map``.
        """
        normalized_code = code.upper()
        return cls._country_map.get(normalized_code)
3041
3042
class GeoUtils(object):
    """Helper for picking a random IPv4 address from a per-country block."""

    # Major IPv4 address blocks per country
    # Each value is a single block in "address/prefix-length" notation.
    _country_ip_map = {
        'AD': '85.94.160.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '159.117.192.0/21',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '84.112.0.0/13',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AZ': '5.191.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '129.45.128.0/17',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '192.131.134.0/24',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '152.240.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '196.32.200.0/21',
        'CG': '197.214.128.0/17',
        'CH': '85.0.0.0/13',
        'CI': '154.232.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '165.210.0.0/15',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '46.198.0.0/15',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '45.208.0.0/14',
        'GI': '85.115.128.0/19',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '126.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.32.32.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '192.147.231.0/24',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '41.86.0.0/19',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '105.234.0.0/16',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '139.26.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '178.220.0.0/14',
        'RU': '5.136.0.0/13',
        'RW': '105.178.0.0/15',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '154.96.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '152.56.0.0/14',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '197.215.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '197.220.64.0/19',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '93.72.0.0/13',
        'UG': '154.224.0.0/13',
        'US': '3.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '82.215.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '24.92.144.0/20',
        'VE': '186.88.0.0/13',
        'VG': '172.103.64.0/18',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '165.56.0.0/13',
        'ZW': '41.85.192.0/19',
    }

    @classmethod
    def random_ipv4(cls, code):
        """Return a random dotted-quad IPv4 address from a country's block.

        ``code`` is looked up case-insensitively in ``_country_ip_map``.
        Returns None when no block is registered for it. Otherwise an
        address is drawn uniformly (via ``random.randint``) from the inclusive
        range spanned by the block's base address and prefix length.
        """
        cidr_block = cls._country_ip_map.get(code.upper())
        if not cidr_block:
            return None

        base_address, prefix_length = cidr_block.split('/')
        lowest = compat_struct_unpack('!L', socket.inet_aton(base_address))[0]
        # Setting every host bit yields the highest address in the block
        host_bits_mask = 0xffffffff >> int(prefix_length)
        highest = lowest | host_bits_mask

        chosen = random.randint(lowest, highest)
        packed = compat_struct_pack('!L', chosen)
        return compat_str(socket.inet_ntoa(packed))
3295
3296
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler whose proxy can be overridden per request.

    A request may carry a 'Ytdl-request-proxy' header naming the proxy to use
    for that request only; the header is removed before the request proceeds.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        # These are installed before the base initializer runs.
        for scheme in ('http', 'https'):
            default_open = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, default_open)
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Apply ``proxy`` (or the per-request override) to ``req``.

        Returns None when no proxy is to be applied or when the proxy is a
        SOCKS one (in which case it is recorded in the 'Ytdl-socks-proxy'
        header); otherwise delegates to ``ProxyHandler.proxy_open``.
        """
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = per_request_proxy

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # youtube-dl's http/https handlers do wrapping the socket with socks
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
3320
3321
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt a single block using OHDave's RSA scheme.
    See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The bytes are reversed before being read as one big hexadecimal number
    reversed_hex = binascii.hexlify(data[::-1])
    plain_number = int(reversed_hex, 16)
    cipher_number = pow(plain_number, exponent, modulus)
    return '%x' % cipher_number
3337
3338
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer ``num`` as a string in base ``n``.

    ``table`` supplies the digit characters (index i is the digit for value
    i); when omitted or empty, the first ``n`` characters of
    ``0-9a-zA-Z`` are used.

    Raises ValueError when ``n`` exceeds the table length, when ``num`` is
    negative, or when ``n`` is below 2 for a non-zero ``num``. The last two
    cases previously never terminated: floor division of a negative number
    settles at -1, and dividing by 1 never shrinks the value.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)
    if n < 2:
        raise ValueError('base %d is too small to encode %d' % (n, num))

    # Collect digits least-significant first, then reverse once at the end
    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
3355
3356
def decode_packed_codes(code):
    """Expand packed code by substituting its symbol table back in.

    PACKED_CODES_RE (defined elsewhere) is searched for in ``code`` and must
    yield four groups: the obfuscated code, the numeric base, the symbol
    count and a '|'-separated symbol list. Every word in the obfuscated code
    is then replaced by the symbol whose index, written in that base, equals
    the word; an empty symbol stands for the word itself.

    Raises AttributeError when the pattern does not match, and KeyError when
    the obfuscated code contains a word missing from the symbol table.
    """
    match = re.search(PACKED_CODES_RE, code)
    packed_source, radix, remaining, words = match.groups()
    radix = int(radix)
    remaining = int(remaining)
    words = words.split('|')

    replacements = {}
    while remaining:
        remaining -= 1
        token = encode_base_n(remaining, radix)
        # An empty entry means the token is kept unchanged
        replacements[token] = words[remaining] or token

    def _substitute(word_match):
        return replacements[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', _substitute, packed_source)
3373
3374
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated ``KEY=value`` attribute list into a dict.

    Keys consist of upper-case letters, digits and hyphens. A value is either
    a non-empty double-quoted string (returned without its quotes, and
    allowed to contain commas) or a run of characters free of commas and
    double quotes. When a key occurs more than once the last value wins.
    """
    attributes = {}
    pattern = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    for match in re.finditer(pattern, attrib):
        name, raw_value = match.group('key', 'val')
        is_quoted = raw_value.startswith('"')
        attributes[name] = raw_value[1:-1] if is_quoted else raw_value
    return attributes
3382
3383
def urshift(val, n):
    """Right-shift ``val`` by ``n`` bits, adding 2**32 to negative values first."""
    if val < 0:
        val += 0x100000000
    return val >> n
3386
3387
3388 # Based on png2str() written by @gdkchan and improved by @yokrysty
3389 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG bytes into ``(width, height, pixels)``.

    ``pixels`` is a list of ``height`` rows, each a flat list of
    ``width * 3`` integer values after PNG filtering has been undone.
    The image data is always treated as 3 bytes per pixel; only the width
    and height fields of the IHDR chunk are read.

    Raises IOError when the signature or leading IHDR chunk is missing, or
    when the file holds no IDAT data.
    """
    # Reference: https://www.w3.org/TR/PNG/
    signature, remaining = png_data[:8], png_data[8:]

    if signature != b'\x89PNG\x0d\x0a\x1a\x0a' or remaining[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    formats_by_size = {1: '>B', 2: '>H', 4: '>I'}

    def read_uint(raw):
        return compat_struct_unpack(formats_by_size[len(raw)], raw)[0]

    # Each chunk is laid out as: length (4), type (4), data (length), CRC (4)
    chunks = []
    while remaining:
        data_length = read_uint(remaining[:4])
        chunk_type = remaining[4:8]
        chunk_data = remaining[8:8 + data_length]
        remaining = remaining[8 + data_length + 4:]  # Skip CRC
        chunks.append((chunk_type, chunk_data))

    ihdr = chunks[0][1]
    width = read_uint(ihdr[:4])
    height = read_uint(ihdr[4:8])

    idat = b''.join(
        chunk_data for chunk_type, chunk_data in chunks
        if chunk_type == b'IDAT')

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    for y in range(height):
        # Every scanline is preceded by one byte naming its filter type
        row_start = y * (1 + stride)
        filter_type = decompressed_data[row_start]

        previous_row = pixels[y - 1] if y > 0 else None
        current_row = []
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + row_start + x]

            # Neighbours are the same channel of the pixel to the left
            # (3 bytes back) and of the pixel directly above
            left = current_row[x - 3] if x > 2 else 0
            up = previous_row[x] if y > 0 else 0

            if filter_type == 1:  # Sub
                predictor = left
            elif filter_type == 2:  # Up
                predictor = up
            elif filter_type == 3:  # Average
                predictor = (left + up) >> 1
            elif filter_type == 4:  # Paeth
                upper_left = previous_row[x - 3] if x > 2 and y > 0 else 0

                estimate = left + up - upper_left
                dist_left = abs(estimate - left)
                dist_up = abs(estimate - up)
                dist_upper_left = abs(estimate - upper_left)

                if dist_left <= dist_up and dist_left <= dist_upper_left:
                    predictor = left
                elif dist_up <= dist_upper_left:
                    predictor = up
                else:
                    predictor = upper_left
            else:  # No filter (or an unrecognised type): value kept as is
                predictor = 0

            current_row.append((color + predictor) & 0xff)

    return width, height, pixels
3493
3494
def write_xattr(path, key, value):
    """Set the extended attribute ``key`` to ``value`` on the file ``path``.

    Backends are tried in this order: an importable ``xattr`` module
    (pyxattr or xattr); NTFS Alternate Data Streams when compat_os_name is
    'nt'; the ``setfattr`` or ``xattr`` command line tools. On the command
    line path ``value`` is decoded as UTF-8 before being passed on.

    Raises XAttrUnavailableError when no usable backend exists (including a
    too-old pyxattr), and XAttrMetadataError when the write itself fails.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if not hasattr(xattr, 'set'):  # xattr
            setxattr = xattr.setxattr
        else:  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            installed_version = version_tuple(xattr.__version__)
            if installed_version < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set

        try:
            setxattr(path, key, value)
        except EnvironmentError as err:
            raise XAttrMetadataError(err.errno, err.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            stream_path = path + ':' + key
            try:
                with open(stream_path, 'wb') as stream:
                    stream.write(value)
            except EnvironmentError as err:
                raise XAttrMetadataError(err.errno, err.strerror)
        else:
            have_setfattr = check_executable('setfattr', ['--version'])
            have_xattr = check_executable('xattr', ['-h'])

            if not (have_setfattr or have_xattr):
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")

            value = value.decode('utf-8')
            if have_setfattr:
                executable = 'setfattr'
                opts = ['-n', key, '-v', value]
            elif have_xattr:
                executable = 'xattr'
                opts = ['-w', key, value]

            cmd = [encodeFilename(executable, True)]
            cmd += [encodeArgument(opt) for opt in opts]
            cmd += [encodeFilename(path, True)]

            try:
                proc = subprocess.Popen(
                    cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                    stdin=subprocess.PIPE)
            except EnvironmentError as err:
                raise XAttrMetadataError(err.errno, err.strerror)
            _, stderr = proc.communicate()
            stderr = stderr.decode('utf-8', 'replace')
            if proc.returncode != 0:
                raise XAttrMetadataError(proc.returncode, stderr)