_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import contextlib
  11 import ctypes
  12 import datetime
  13 import email.utils
  14 import email.header
  15 import errno
  16 import functools
  17 import gzip
  18 import io
  19 import itertools
  20 import json
  21 import locale
  22 import math
  23 import operator
  24 import os
  25 import platform
  26 import random
  27 import re
  28 import socket
  29 import ssl
  30 import subprocess
  31 import sys
  32 import tempfile
  33 import traceback
  34 import xml.etree.ElementTree
  35 import zlib
  36
  37 from .compat import (
  38     compat_HTMLParseError,
  39     compat_HTMLParser,
  40     compat_basestring,
  41     compat_chr,
  42     compat_ctypes_WINFUNCTYPE,
  43     compat_etree_fromstring,
  44     compat_expanduser,
  45     compat_html_entities,
  46     compat_html_entities_html5,
  47     compat_http_client,
  48     compat_kwargs,
  49     compat_os_name,
  50     compat_parse_qs,
  51     compat_shlex_quote,
  52     compat_socket_create_connection,
  53     compat_str,
  54     compat_struct_pack,
  55     compat_struct_unpack,
  56     compat_urllib_error,
  57     compat_urllib_parse,
  58     compat_urllib_parse_urlencode,
  59     compat_urllib_parse_urlparse,
  60     compat_urllib_parse_unquote_plus,
  61     compat_urllib_request,
  62     compat_urlparse,
  63     compat_xpath,
  64 )
  65
  66 from .socks import (
  67     ProxyType,
  68     sockssocket,
  69 )
  70
  71
  72 def register_socks_protocols():
  73     # "Register" SOCKS protocols
  74     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  75     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  76     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  77         if scheme not in compat_urlparse.uses_netloc:
  78             compat_urlparse.uses_netloc.append(scheme)
  79
  80
# This is not clearly defined otherwise
# (the type object of a compiled regular expression)
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers.
# NOTE(review): presumably the "standard headers" that YoutubeDLHandler adds
# to every request - confirm against the code that consumes this dict.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Alternative User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Sentinel for "no default supplied"; lets callers pass None as a real default
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names (January first) keyed by language code
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# File extensions (without the leading dot), grouped by container family
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement; a few replacements
# are two characters long ('AE', 'OE', 'ss', 'ae', 'oe'), hence the chain()
# of strings and one-element lists.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
 130
 131 DATE_FORMATS = (
 132     '%d %B %Y',
 133     '%d %b %Y',
 134     '%B %d %Y',
 135     '%B %dst %Y',
 136     '%B %dnd %Y',
 137     '%B %dth %Y',
 138     '%b %d %Y',
 139     '%b %dst %Y',
 140     '%b %dnd %Y',
 141     '%b %dth %Y',
 142     '%b %dst %Y %I:%M',
 143     '%b %dnd %Y %I:%M',
 144     '%b %dth %Y %I:%M',
 145     '%Y %m %d',
 146     '%Y-%m-%d',
 147     '%Y/%m/%d',
 148     '%Y/%m/%d %H:%M',
 149     '%Y/%m/%d %H:%M:%S',
 150     '%Y-%m-%d %H:%M',
 151     '%Y-%m-%d %H:%M:%S',
 152     '%Y-%m-%d %H:%M:%S.%f',
 153     '%d.%m.%Y %H:%M',
 154     '%d.%m.%Y %H.%M',
 155     '%Y-%m-%dT%H:%M:%SZ',
 156     '%Y-%m-%dT%H:%M:%S.%fZ',
 157     '%Y-%m-%dT%H:%M:%S.%f0Z',
 158     '%Y-%m-%dT%H:%M:%S',
 159     '%Y-%m-%dT%H:%M:%S.%f',
 160     '%Y-%m-%dT%H:%M',
 161     '%b %d %Y at %H:%M',
 162     '%b %d %Y at %H:%M:%S',
 163     '%B %d %Y at %H:%M',
 164     '%B %d %Y at %H:%M:%S',
 165 )
 166
 167 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 168 DATE_FORMATS_DAY_FIRST.extend([
 169     '%d-%m-%Y',
 170     '%d.%m.%Y',
 171     '%d.%m.%y',
 172     '%d/%m/%Y',
 173     '%d/%m/%y',
 174     '%d/%m/%Y %H:%M:%S',
 175 ])
 176
 177 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 178 DATE_FORMATS_MONTH_FIRST.extend([
 179     '%m-%d-%Y',
 180     '%m.%d.%Y',
 181     '%m/%d/%Y',
 182     '%m/%d/%y',
 183     '%m/%d/%Y %H:%M:%S',
 184 ])
 185
 186 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
 187
 188
 189 def preferredencoding():
 190     """Get preferred encoding.
 191
 192     Returns the best encoding scheme for the system, based on
 193     locale.getpreferredencoding() and some further tweaks.
 194     """
 195     try:
 196         pref = locale.getpreferredencoding()
 197         'TEST'.encode(pref)
 198     except Exception:
 199         pref = 'UTF-8'
 200
 201     return pref
 202
 203
 204 def write_json_file(obj, fn):
 205     """ Encode obj as JSON and write it to fn, atomically if possible """
 206
 207     fn = encodeFilename(fn)
 208     if sys.version_info < (3, 0) and sys.platform != 'win32':
 209         encoding = get_filesystem_encoding()
 210         # os.path.basename returns a bytes object, but NamedTemporaryFile
 211         # will fail if the filename contains non ascii characters unless we
 212         # use a unicode object
 213         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 214         # the same for os.path.dirname
 215         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 216     else:
 217         path_basename = os.path.basename
 218         path_dirname = os.path.dirname
 219
 220     args = {
 221         'suffix': '.tmp',
 222         'prefix': path_basename(fn) + '.',
 223         'dir': path_dirname(fn),
 224         'delete': False,
 225     }
 226
 227     # In Python 2.x, json.dump expects a bytestream.
 228     # In Python 3.x, it writes to a character stream
 229     if sys.version_info < (3, 0):
 230         args['mode'] = 'wb'
 231     else:
 232         args.update({
 233             'mode': 'w',
 234             'encoding': 'utf-8',
 235         })
 236
 237     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 238
 239     try:
 240         with tf:
 241             json.dump(obj, tf)
 242         if sys.platform == 'win32':
 243             # Need to remove existing file on Windows, else os.rename raises
 244             # WindowsError or FileExistsError.
 245             try:
 246                 os.unlink(fn)
 247             except OSError:
 248                 pass
 249         os.rename(tf.name, fn)
 250     except Exception:
 251         try:
 252             os.remove(tf.name)
 253         except OSError:
 254             pass
 255         raise
 256
 257
 258 if sys.version_info >= (2, 7):
 259     def find_xpath_attr(node, xpath, key, val=None):
 260         """ Find the xpath xpath[@key=val] """
 261         assert re.match(r'^[a-zA-Z_-]+$', key)
 262         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 263         return node.find(expr)
 264 else:
 265     def find_xpath_attr(node, xpath, key, val=None):
 266         for f in node.findall(compat_xpath(xpath)):
 267             if key not in f.attrib:
 268                 continue
 269             if val is None or f.attrib.get(key) == val:
 270                 return f
 271         return None
 272
 273 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 274 # the namespace parameter
 275
 276
 277 def xpath_with_ns(path, ns_map):
 278     components = [c.split(':') for c in path.split('/')]
 279     replaced = []
 280     for c in components:
 281         if len(c) == 1:
 282             replaced.append(c[0])
 283         else:
 284             ns, tag = c
 285             replaced.append('{%s}%s' % (ns_map[ns], tag))
 286     return '/'.join(replaced)
 287
 288
 289 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 290     def _find_xpath(xpath):
 291         return node.find(compat_xpath(xpath))
 292
 293     if isinstance(xpath, (str, compat_str)):
 294         n = _find_xpath(xpath)
 295     else:
 296         for xp in xpath:
 297             n = _find_xpath(xp)
 298             if n is not None:
 299                 break
 300
 301     if n is None:
 302         if default is not NO_DEFAULT:
 303             return default
 304         elif fatal:
 305             name = xpath if name is None else name
 306             raise ExtractorError('Could not find XML element %s' % name)
 307         else:
 308             return None
 309     return n
 310
 311
 312 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 313     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 314     if n is None or n == default:
 315         return n
 316     if n.text is None:
 317         if default is not NO_DEFAULT:
 318             return default
 319         elif fatal:
 320             name = xpath if name is None else name
 321             raise ExtractorError('Could not find XML element\'s text %s' % name)
 322         else:
 323             return None
 324     return n.text
 325
 326
 327 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 328     n = find_xpath_attr(node, xpath, key)
 329     if n is None:
 330         if default is not NO_DEFAULT:
 331             return default
 332         elif fatal:
 333             name = '%s[@%s]' % (xpath, key) if name is None else name
 334             raise ExtractorError('Could not find XML attribute %s' % name)
 335         else:
 336             return None
 337     return n.attrib[key]
 338
 339
 340 def get_element_by_id(id, html):
 341     """Return the content of the tag with the specified ID in the passed HTML document"""
 342     return get_element_by_attribute('id', id, html)
 343
 344
 345 def get_element_by_class(class_name, html):
 346     """Return the content of the first tag with the specified class in the passed HTML document"""
 347     retval = get_elements_by_class(class_name, html)
 348     return retval[0] if retval else None
 349
 350
 351 def get_element_by_attribute(attribute, value, html, escape_value=True):
 352     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 353     return retval[0] if retval else None
 354
 355
 356 def get_elements_by_class(class_name, html):
 357     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 358     return get_elements_by_attribute(
 359         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 360         html, escape_value=False)
 361
 362
 363 def get_elements_by_attribute(attribute, value, html, escape_value=True):
 364     """Return the content of the tag with the specified attribute in the passed HTML document"""
 365
 366     value = re.escape(value) if escape_value else value
 367
 368     retlist = []
 369     for m in re.finditer(r'''(?xs)
 370         <([a-zA-Z0-9:._-]+)
 371          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
 372          \s+%s=['"]?%s['"]?
 373          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
 374         \s*>
 375         (?P<content>.*?)
 376         </\1>
 377     ''' % (re.escape(attribute), value), html):
 378         res = m.group('content')
 379
 380         if res.startswith('"') or res.startswith("'"):
 381             res = res[1:-1]
 382
 383         retlist.append(unescapeHTML(res))
 384
 385     return retlist
 386
 387
 388 class HTMLAttributeParser(compat_HTMLParser):
 389     """Trivial HTML parser to gather the attributes for a single element"""
 390     def __init__(self):
 391         self.attrs = {}
 392         compat_HTMLParser.__init__(self)
 393
 394     def handle_starttag(self, tag, attrs):
 395         self.attrs = dict(attrs)
 396
 397
 398 def extract_attributes(html_element):
 399     """Given a string for an HTML element such as
 400     <el
 401          a="foo" B="bar" c="&98;az" d=boz
 402          empty= noval entity="&amp;"
 403          sq='"' dq="'"
 404     >
 405     Decode and return a dictionary of attributes.
 406     {
 407         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 408         'empty': '', 'noval': None, 'entity': '&',
 409         'sq': '"', 'dq': '\''
 410     }.
 411     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 412     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 413     """
 414     parser = HTMLAttributeParser()
 415     try:
 416         parser.feed(html_element)
 417         parser.close()
 418     # Older Python may throw HTMLParseError in case of malformed HTML
 419     except compat_HTMLParseError:
 420         pass
 421     return parser.attrs
 422
 423
 424 def clean_html(html):
 425     """Clean an HTML snippet into a readable string"""
 426
 427     if html is None:  # Convenience for sanitizing descriptions etc.
 428         return html
 429
 430     # Newline vs <br />
 431     html = html.replace('\n', ' ')
 432     html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
 433     html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 434     # Strip html tags
 435     html = re.sub('<.*?>', '', html)
 436     # Replace html entities
 437     html = unescapeHTML(html)
 438     return html.strip()
 439
 440
 441 def sanitize_open(filename, open_mode):
 442     """Try to open the given filename, and slightly tweak it if this fails.
 443
 444     Attempts to open the given filename. If this fails, it tries to change
 445     the filename slightly, step by step, until it's either able to open it
 446     or it fails and raises a final exception, like the standard open()
 447     function.
 448
 449     It returns the tuple (stream, definitive_file_name).
 450     """
 451     try:
 452         if filename == '-':
 453             if sys.platform == 'win32':
 454                 import msvcrt
 455                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 456             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 457         stream = open(encodeFilename(filename), open_mode)
 458         return (stream, filename)
 459     except (IOError, OSError) as err:
 460         if err.errno in (errno.EACCES,):
 461             raise
 462
 463         # In case of error, try to remove win32 forbidden chars
 464         alt_filename = sanitize_path(filename)
 465         if alt_filename == filename:
 466             raise
 467         else:
 468             # An exception here should be caught in the caller
 469             stream = open(encodeFilename(alt_filename), open_mode)
 470             return (stream, alt_filename)
 471
 472
 473 def timeconvert(timestr):
 474     """Convert RFC 2822 defined time string into system timestamp"""
 475     timestamp = None
 476     timetuple = email.utils.parsedate_tz(timestr)
 477     if timetuple is not None:
 478         timestamp = email.utils.mktime_tz(timetuple)
 479     return timestamp
 480
 481
 482 def sanitize_filename(s, restricted=False, is_id=False):
 483     """Sanitizes a string so it could be used as part of a filename.
 484     If restricted is set, use a stricter subset of allowed characters.
 485     Set is_id if this is not an arbitrary string, but an ID that should be kept
 486     if possible.
 487     """
 488     def replace_insane(char):
 489         if restricted and char in ACCENT_CHARS:
 490             return ACCENT_CHARS[char]
 491         if char == '?' or ord(char) < 32 or ord(char) == 127:
 492             return ''
 493         elif char == '"':
 494             return '' if restricted else '\''
 495         elif char == ':':
 496             return '_-' if restricted else ' -'
 497         elif char in '\\/|*<>':
 498             return '_'
 499         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 500             return '_'
 501         if restricted and ord(char) > 127:
 502             return '_'
 503         return char
 504
 505     # Handle timestamps
 506     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 507     result = ''.join(map(replace_insane, s))
 508     if not is_id:
 509         while '__' in result:
 510             result = result.replace('__', '_')
 511         result = result.strip('_')
 512         # Common case of "Foreign band name - English song title"
 513         if restricted and result.startswith('-_'):
 514             result = result[2:]
 515         if result.startswith('-'):
 516             result = '_' + result[len('-'):]
 517         result = result.lstrip('.')
 518         if not result:
 519             result = '_'
 520     return result
 521
 522
 523 def sanitize_path(s):
 524     """Sanitizes and normalizes path on Windows"""
 525     if sys.platform != 'win32':
 526         return s
 527     drive_or_unc, _ = os.path.splitdrive(s)
 528     if sys.version_info < (2, 7) and not drive_or_unc:
 529         drive_or_unc, _ = os.path.splitunc(s)
 530     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 531     if drive_or_unc:
 532         norm_path.pop(0)
 533     sanitized_path = [
 534         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 535         for path_part in norm_path]
 536     if drive_or_unc:
 537         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 538     return os.path.join(*sanitized_path)
 539
 540
 541 def sanitize_url(url):
 542     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 543     # the number of unwanted failures due to missing protocol
 544     if url.startswith('//'):
 545         return 'http:%s' % url
 546     # Fix some common typos seen so far
 547     COMMON_TYPOS = (
 548         # https://github.com/rg3/youtube-dl/issues/15649
 549         (r'^httpss://', r'https://'),
 550         # https://bx1.be/lives/direct-tv/
 551         (r'^rmtp([es]?)://', r'rtmp\1://'),
 552     )
 553     for mistake, fixup in COMMON_TYPOS:
 554         if re.match(mistake, url):
 555             return re.sub(mistake, fixup, url)
 556     return url
 557
 558
 559 def sanitized_Request(url, *args, **kwargs):
 560     return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
 561
 562
 563 def expand_path(s):
 564     """Expand shell variables and ~"""
 565     return os.path.expandvars(compat_expanduser(s))
 566
 567
 568 def orderedSet(iterable):
 569     """ Remove all duplicates from the input iterable """
 570     res = []
 571     for el in iterable:
 572         if el not in res:
 573             res.append(el)
 574     return res
 575
 576
 577 def _htmlentity_transform(entity_with_semicolon):
 578     """Transforms an HTML entity to a character."""
 579     entity = entity_with_semicolon[:-1]
 580
 581     # Known non-numeric HTML entity
 582     if entity in compat_html_entities.name2codepoint:
 583         return compat_chr(compat_html_entities.name2codepoint[entity])
 584
 585     # TODO: HTML5 allows entities without a semicolon. For example,
 586     # '&Eacuteric' should be decoded as 'Éric'.
 587     if entity_with_semicolon in compat_html_entities_html5:
 588         return compat_html_entities_html5[entity_with_semicolon]
 589
 590     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 591     if mobj is not None:
 592         numstr = mobj.group(1)
 593         if numstr.startswith('x'):
 594             base = 16
 595             numstr = '0%s' % numstr
 596         else:
 597             base = 10
 598         # See https://github.com/rg3/youtube-dl/issues/7518
 599         try:
 600             return compat_chr(int(numstr, base))
 601         except ValueError:
 602             pass
 603
 604     # Unknown entity in name, return its literal representation
 605     return '&%s;' % entity
 606
 607
 608 def unescapeHTML(s):
 609     if s is None:
 610         return None
 611     assert type(s) == compat_str
 612
 613     return re.sub(
 614         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 615
 616
 617 def get_subprocess_encoding():
 618     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 619         # For subprocess calls, encode with locale encoding
 620         # Refer to http://stackoverflow.com/a/9951851/35070
 621         encoding = preferredencoding()
 622     else:
 623         encoding = sys.getfilesystemencoding()
 624     if encoding is None:
 625         encoding = 'utf-8'
 626     return encoding
 627
 628
 629 def encodeFilename(s, for_subprocess=False):
 630     """
 631     @param s The name of the file
 632     """
 633
 634     assert type(s) == compat_str
 635
 636     # Python 3 has a Unicode API
 637     if sys.version_info >= (3, 0):
 638         return s
 639
 640     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 641     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 642     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 643     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 644         return s
 645
 646     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 647     if sys.platform.startswith('java'):
 648         return s
 649
 650     return s.encode(get_subprocess_encoding(), 'ignore')
 651
 652
 653 def decodeFilename(b, for_subprocess=False):
 654
 655     if sys.version_info >= (3, 0):
 656         return b
 657
 658     if not isinstance(b, bytes):
 659         return b
 660
 661     return b.decode(get_subprocess_encoding(), 'ignore')
 662
 663
 664 def encodeArgument(s):
 665     if not isinstance(s, compat_str):
 666         # Legacy code that uses byte strings
 667         # Uncomment the following line after fixing all post processors
 668         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 669         s = s.decode('ascii')
 670     return encodeFilename(s, True)
 671
 672
 673 def decodeArgument(b):
 674     return decodeFilename(b, True)
 675
 676
 677 def decodeOption(optval):
 678     if optval is None:
 679         return optval
 680     if isinstance(optval, bytes):
 681         optval = optval.decode(preferredencoding())
 682
 683     assert isinstance(optval, compat_str)
 684     return optval
 685
 686
 687 def formatSeconds(secs):
 688     if secs > 3600:
 689         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 690     elif secs > 60:
 691         return '%d:%02d' % (secs // 60, secs % 60)
 692     else:
 693         return '%d' % secs
 694
 695
 696 def make_HTTPS_handler(params, **kwargs):
 697     opts_no_check_certificate = params.get('nocheckcertificate', False)
 698     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 699         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 700         if opts_no_check_certificate:
 701             context.check_hostname = False
 702             context.verify_mode = ssl.CERT_NONE
 703         try:
 704             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 705         except TypeError:
 706             # Python 2.7.8
 707             # (create_default_context present but HTTPSHandler has no context=)
 708             pass
 709
 710     if sys.version_info < (3, 2):
 711         return YoutubeDLHTTPSHandler(params, **kwargs)
 712     else:  # Python < 3.4
 713         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 714         context.verify_mode = (ssl.CERT_NONE
 715                                if opts_no_check_certificate
 716                                else ssl.CERT_REQUIRED)
 717         context.set_default_verify_paths()
 718         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 719
 720
 721 def bug_reports_message():
 722     if ytdl_is_updateable():
 723         update_cmd = 'type  youtube-dl -U  to update'
 724     else:
 725         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 726     msg = '; please report this issue on https://yt-dl.org/bug .'
 727     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 728     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 729     return msg
 730
 731
 732 class YoutubeDLError(Exception):
 733     """Base exception for YoutubeDL errors."""
 734     pass
 735
 736
 737 class ExtractorError(YoutubeDLError):
 738     """Error during info extraction."""
 739
 740     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 741         """ tb, if given, is the original traceback (so that it can be printed out).
 742         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 743         """
 744
 745         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 746             expected = True
 747         if video_id is not None:
 748             msg = video_id + ': ' + msg
 749         if cause:
 750             msg += ' (caused by %r)' % cause
 751         if not expected:
 752             msg += bug_reports_message()
 753         super(ExtractorError, self).__init__(msg)
 754
 755         self.traceback = tb
 756         self.exc_info = sys.exc_info()  # preserve original exception
 757         self.cause = cause
 758         self.video_id = video_id
 759
 760     def format_traceback(self):
 761         if self.traceback is None:
 762             return None
 763         return ''.join(traceback.format_tb(self.traceback))
 764
 765
 766 class UnsupportedError(ExtractorError):
 767     def __init__(self, url):
 768         super(UnsupportedError, self).__init__(
 769             'Unsupported URL: %s' % url, expected=True)
 770         self.url = url
 771
 772
 773 class RegexNotFoundError(ExtractorError):
 774     """Error when a regex didn't match"""
 775     pass
 776
 777
 778 class GeoRestrictedError(ExtractorError):
 779     """Geographic restriction Error exception.
 780
 781     This exception may be thrown when a video is not available from your
 782     geographic location due to geographic restrictions imposed by a website.
 783     """
 784     def __init__(self, msg, countries=None):
 785         super(GeoRestrictedError, self).__init__(msg, expected=True)
 786         self.msg = msg
 787         self.countries = countries
 788
 789
 790 class DownloadError(YoutubeDLError):
 791     """Download Error exception.
 792
 793     This exception may be thrown by FileDownloader objects if they are not
 794     configured to continue on errors. They will contain the appropriate
 795     error message.
 796     """
 797
 798     def __init__(self, msg, exc_info=None):
 799         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 800         super(DownloadError, self).__init__(msg)
 801         self.exc_info = exc_info
 802
 803
 804 class SameFileError(YoutubeDLError):
 805     """Same File exception.
 806
 807     This exception will be thrown by FileDownloader objects if they detect
 808     multiple files would have to be downloaded to the same file on disk.
 809     """
 810     pass
 811
 812
 813 class PostProcessingError(YoutubeDLError):
 814     """Post Processing exception.
 815
 816     This exception may be raised by PostProcessor's .run() method to
 817     indicate an error in the postprocessing task.
 818     """
 819
 820     def __init__(self, msg):
 821         super(PostProcessingError, self).__init__(msg)
 822         self.msg = msg
 823
 824
 825 class MaxDownloadsReached(YoutubeDLError):
 826     """ --max-downloads limit has been reached. """
 827     pass
 828
 829
 830 class UnavailableVideoError(YoutubeDLError):
 831     """Unavailable Format exception.
 832
 833     This exception will be thrown when a video is requested
 834     in a format that is not available for that video.
 835     """
 836     pass
 837
 838
 839 class ContentTooShortError(YoutubeDLError):
 840     """Content Too Short exception.
 841
 842     This exception may be raised by FileDownloader objects when a file they
 843     download is too small for what the server announced first, indicating
 844     the connection was probably interrupted.
 845     """
 846
 847     def __init__(self, downloaded, expected):
 848         super(ContentTooShortError, self).__init__(
 849             'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
 850         )
 851         # Both in bytes
 852         self.downloaded = downloaded
 853         self.expected = expected
 854
 855
 856 class XAttrMetadataError(YoutubeDLError):
 857     def __init__(self, code=None, msg='Unknown error'):
 858         super(XAttrMetadataError, self).__init__(msg)
 859         self.code = code
 860         self.msg = msg
 861
 862         # Parsing code and msg
 863         if (self.code in (errno.ENOSPC, errno.EDQUOT) or
 864                 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
 865             self.reason = 'NO_SPACE'
 866         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
 867             self.reason = 'VALUE_TOO_LONG'
 868         else:
 869             self.reason = 'NOT_SUPPORTED'
 870
 871
 872 class XAttrUnavailableError(YoutubeDLError):
 873     pass
 874
 875
 876 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 877     # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
 878     # expected HTTP responses to meet HTTP/1.0 or later (see also
 879     # https://github.com/rg3/youtube-dl/issues/6727)
 880     if sys.version_info < (3, 0):
 881         kwargs['strict'] = True
 882     hc = http_class(*args, **compat_kwargs(kwargs))
 883     source_address = ydl_handler._params.get('source_address')
 884     if source_address is not None:
 885         sa = (source_address, 0)
 886         if hasattr(hc, 'source_address'):  # Python 2.7+
 887             hc.source_address = sa
 888         else:  # Python 2.6
 889             def _hc_connect(self, *args, **kwargs):
 890                 sock = compat_socket_create_connection(
 891                     (self.host, self.port), self.timeout, sa)
 892                 if is_https:
 893                     self.sock = ssl.wrap_socket(
 894                         sock, self.key_file, self.cert_file,
 895                         ssl_version=ssl.PROTOCOL_TLSv1)
 896                 else:
 897                     self.sock = sock
 898             hc.connect = functools.partial(_hc_connect, hc)
 899
 900     return hc
 901
 902
 903 def handle_youtubedl_headers(headers):
 904     filtered_headers = headers
 905
 906     if 'Youtubedl-no-compression' in filtered_headers:
 907         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 908         del filtered_headers['Youtubedl-no-compression']
 909
 910     return filtered_headers
 911
 912
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Store params on the handler; remaining arguments go to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # Read later by _create_http_connection (the 'source_address' key)
        self._params = params

    def http_open(self, req):
        """Open req over plain HTTP, optionally through a SOCKS proxy.

        A truthy "Ytdl-socks-proxy" request header selects the proxy; the
        header is removed from the request before the connection is opened.
        """
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        # is_https=False: this handler method only serves plain HTTP
        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, with or without a zlib header."""
        try:
            # Negative wbits: raw deflate stream without zlib header/trailer
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to a regular zlib-wrapped stream
            return zlib.decompress(data)

    def http_request(self, req):
        """Pre-process an outgoing request and return the request to send.

        Percent-encodes the URL, adds every header from std_headers that the
        request does not already carry and applies handle_youtubedl_headers().
        """
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 does not handle URL fragments properly, so cut
            # everything from '#' onwards out of the request's private fields
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Post-process a response: decompress the body and fix redirects.

        Bodies with Content-encoding gzip or deflate are decompressed and
        wrapped in a new addinfourl object (the Content-encoding header is
        removed). For 3xx responses the Location header is percent-encoded.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; if none of the
                # attempts succeeds, re-raise the very first error
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    # Undo the iso-8859-1 decoding to recover the raw bytes,
                    # then interpret them as UTF-8
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets exactly the same request/response processing
    https_request = http_request
    https_response = http_response
1034
1035
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of base_class that connects through a SOCKS proxy.

    base_class  -- HTTPConnection or HTTPSConnection (or a subclass of them)
    socks_proxy -- proxy URL; the scheme selects the protocol ('socks5',
                   'socks4a', or 'socks'/'socks4'), the port defaults to 1080
                   and username/password, if present, are URL-unquoted

    Raises ValueError for any other scheme.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Previously socks_type was simply left unbound here, which surfaced
        # as an opaque UnboundLocalError further down. Only the scheme is
        # reported so that proxy credentials never end up in the message.
        raise ValueError(
            'Unsupported SOCKS proxy scheme: %s' % url_components.scheme)

    def unquote_if_non_empty(s):
        # urlparse yields None for absent credentials; pass that through
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            # Only plain numeric timeouts are forwarded to the socket
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1077
1078
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler supporting a custom connection class and SOCKS proxies."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """Remember params and the connection class (HTTPSConnection if unset)."""
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        if https_conn_class:
            self._https_conn_class = https_conn_class
        else:
            self._https_conn_class = compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req over HTTPS, through a SOCKS proxy when one is requested.

        A truthy "Ytdl-socks-proxy" request header selects the proxy; the
        header is removed from the request before the connection is opened.
        """
        # Forward only what this Python version's HTTPSHandler provides
        open_kwargs = {}
        if hasattr(self, '_context'):  # python > 2.6
            open_kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            open_kwargs['check_hostname'] = self._check_hostname

        connection_class = self._https_conn_class
        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            connection_class = make_socks_conn_class(connection_class, proxy_url)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, connection_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
1102
1103
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor that applies the same hooks to HTTP and HTTPS."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        """Delegate to HTTPCookieProcessor.http_response unchanged."""
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # NOTE: the workaround below is commented out, i.e. currently disabled.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Reuse the HTTP hooks for HTTPS traffic
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1126
1127
def extract_timezone(date_str):
    """Split a trailing UTC designator off a date string.

    Recognizes a final "Z" or a numeric offset such as "+0100" / "-05:30"
    (optionally preceded by one space) that follows at least 8 characters.

    Returns (offset, remainder): offset is a datetime.timedelta (zero for
    "Z" or when nothing is recognized) and remainder is date_str without the
    recognized suffix.
    """
    tz_match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if tz_match is None:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(tz_match.group('tz'))]
    sign_char = tz_match.group('sign')
    if not sign_char:
        # Bare "Z" suffix, i.e. UTC
        return datetime.timedelta(), remainder

    direction = -1 if sign_char == '-' else 1
    offset = datetime.timedelta(
        hours=direction * int(tz_match.group('hours')),
        minutes=direction * int(tz_match.group('minutes')))
    return offset, remainder
1144
1145
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """Return a UNIX timestamp for an ISO 8601 like date string.

    date_str  -- "YYYY-MM-DD<delimiter>HH:MM:SS", optionally with fractional
                 seconds (which are discarded) and a UTC offset
    delimiter -- separator between the date and the time part
    timezone  -- datetime.timedelta offset to subtract; when None it is
                 taken from the string via extract_timezone()

    Returns None if date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Fractional seconds are not part of the strptime format below
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        parsed = datetime.datetime.strptime(
            date_str, '%Y-%m-%d{0}%H:%M:%S'.format(delimiter))
        return calendar.timegm((parsed - timezone).timetuple())
    except ValueError:
        return None
1163
1164
def date_formats(day_first=True):
    """Return the day-first or the month-first list of date formats."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1167
1168
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None if date_str is None or matches no known format.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # Every format is tried; when several match, the last one wins
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Last resort: RFC 2822 style dates
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return compat_str(result)
1195
1196
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    Returns None if date_str is None or matches no known format. Note that
    12 hours are added whenever "PM" occurs anywhere in the string.
    """
    if date_str is None:
        return None

    text = re.sub(r'[,|]', '', date_str)

    pm_hours = 12 if re.search(r'(?i)PM', text) else 0
    pm_shift = datetime.timedelta(hours=pm_hours)
    utc_offset, text = extract_timezone(text)

    # Remove AM/PM + timezone
    text = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', text)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    trailing_tz = re.search(
        r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', text)
    if trailing_tz is not None:
        text = text[:-len(trailing_tz.group('tz'))]

    # The first matching format wins
    for fmt in date_formats(day_first):
        try:
            moment = datetime.datetime.strptime(text, fmt) - utc_offset + pm_shift
            return calendar.timegm(moment.timetuple())
        except ValueError:
            continue

    # Last resort: RFC 2822 style dates
    parsed = email.utils.parsedate_tz(text)
    if parsed:
        return calendar.timegm(parsed) + pm_hours * 3600
    return None
1223
1224
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL.

    Looks at whatever follows the last "." before the query string. Returns
    default_ext if url is None or no plausible extension is found.
    """
    if url is None:
        return default_ext

    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1236
1237
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the last extension of filename with "<sub_lang>.<sub_format>"."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1240
1241
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    'yesterday' or (now|today)[+-][0-9](day|week|month|year)(s)?

    Raises ValueError (from strptime) for anything else."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    relative = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()

    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount

    unit = relative.group('unit')
    # timedelta knows neither months nor years, so they are approximated
    # as 30 and 365 days respectively
    approximate_days = {'month': 30, 'year': 365}
    if unit in approximate_days:
        amount *= approximate_days[unit]
        unit = 'day'
    return today + datetime.timedelta(**{unit + 's': amount})
1269
1270
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Any other string is returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1279
1280
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in a format accepted by
        date_from_str(); a missing bound means "unbounded" on that side.

        Raises ValueError if start lies after end."""
        if start is None:
            self.start = datetime.datetime.min.date()
        else:
            self.start = date_from_str(start)
        if end is None:
            self.end = datetime.datetime.max.date()
        else:
            self.end = date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a datetime.date or a date string) is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return ' - '.join((self.start.isoformat(), self.end.isoformat()))
1310
1311
def platform_name():
    """ Returns platform.platform() as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte strings are decoded with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1320
1321
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The "special method" is the Win32 WriteConsoleW call; it is only used
    when out is stdout/stderr (file descriptor 1 or 2) attached to a console.
    Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> GetStdHandle() argument
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True unless the handle is a character device for which
        # GetConsoleMode succeeds
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call and stop right before the
        # next non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which takes
        # two UTF-16 code units
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1395
1396
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    s        -- text to write; must be exactly of type compat_str
    out      -- target stream
    encoding -- codec used when the stream takes bytes; characters that
                cannot be encoded are dropped
    """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    # On Windows try the console API first, unless an explicit encoding
    # was requested
    try_console = (
        sys.platform == 'win32' and encoding is None and hasattr(stream, 'fileno'))
    if try_console and _windows_write_string(s, stream):
        return

    # Python 2 lies about mode of sys.stderr
    takes_bytes = 'b' in getattr(stream, 'mode', '') or sys.version_info[0] < 3
    if takes_bytes:
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        codec = encoding or getattr(stream, 'encoding', None) or preferredencoding()
        stream.buffer.write(s.encode(codec, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
1417
1418
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Python 2: indexing a byte string yields 1-character strings
        return [ord(ch) for ch in bs]
    # Python 3: iterating bytes already yields ints
    return list(bs)
1426
1427
def intlist_to_bytes(xs):
    """Pack a sequence of integers (one unsigned byte each) into bytes."""
    if xs:
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
1432
1433
# Cross-platform file locking
#
# Defines _lock_file(f, exclusive) and _unlock_file(f) in one of three ways:
# LockFileEx/UnlockFileEx on Windows, fcntl.flock where fcntl is available,
# and stubs that raise IOError everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure expected by LockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f from offset 0; raise OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file() needs it again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file(); raise OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock() lock on f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock() lock on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            """Always raise IOError: no locking mechanism is available."""
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            """Always raise IOError: no locking mechanism is available."""
            raise IOError(UNSUPPORTED_MSG)
1507
1508
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened right away in __init__; the lock is taken on entering
    the "with" block (shared for mode 'r', exclusive otherwise) and released,
    together with the file, on leaving it.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        wants_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, wants_exclusive)
        except IOError:
            # Do not leak the open file when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking raised
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1538
1539
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1543
1544
def shell_quote(args):
    """Quote every argument for the shell and join them with spaces."""
    fs_encoding = get_filesystem_encoding()

    def _as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        if isinstance(arg, bytes):
            return arg.decode(fs_encoding)
        return arg

    return ' '.join(compat_shlex_quote(_as_text(arg)) for arg in args)
1554
1555
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded into a "__youtubedl_smuggle" URL fragment. If url
    already carries smuggled data, both are merged and the values already in
    the URL take precedence. The caller's data dict is not modified.
    """

    url, idata = unsmuggle_url(url, {})
    # Merge into a copy: updating `data` in place would leak values smuggled
    # in one URL into every later call that reuses the same dict
    merged = dict(data)
    merged.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
1564
1565
def unsmuggle_url(smug_url, default=None):
    """Split data added by smuggle_url() off a URL.

    Returns (url, data). If the URL carries no smuggled data it is returned
    unchanged together with default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        plain_url, _, fragment = smug_url.rpartition('#')
        encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return plain_url, json.loads(encoded)
    return smug_url, default
1573
1574
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. '1.50KiB'.

    bytes -- the size as a number or numeric string; None yields 'N/A'

    Values of 1024 YiB and more are expressed in YiB, values below one byte
    in B, and negative values keep their sign.
    """
    if bytes is None:
        return 'N/A'
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    # float() also accepts numeric strings of either string type
    value = float(bytes)
    # Find the unit by repeated division instead of math.log(): exact for
    # powers of 1024, defined for zero/negative input and always inside the
    # suffix table
    magnitude = abs(value)
    exponent = 0
    while magnitude >= 1024.0 and exponent < len(suffixes) - 1:
        magnitude /= 1024.0
        exponent += 1
    converted = value / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1587
1588
def lookup_unit_table(unit_table, s):
    """Parse "<number><unit>" at the start of s and scale the number.

    unit_table -- dict mapping unit names to multipliers
    s          -- text such as "1.5 MiB"; "," is accepted as decimal mark

    Returns int(number * multiplier), or None if s does not start with a
    number followed by a known unit.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives
    found = re.match(pattern, s)
    if found is None:
        return None
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1598
1599
def parse_filesize(s):
    """Parse a human-readable file size such as "1.5 MiB" into bytes.

    Returns an int, or None if s is None or cannot be parsed.
    """
    if s is None:
        return None

    # (prefix letter, decimal unit name, binary unit name), in rising order
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        lower = letter.lower()
        unit_table.update({
            letter + 'iB': binary,
            letter + 'B': decimal,
            lower + 'B': binary,
            letter + 'b': decimal,
            lower + 'b': decimal,
            decimal_name + 'bytes': decimal,
            binary_name + 'bytes': binary,
        })

    return lookup_unit_table(unit_table, s)
1669
1670
def parse_count(s):
    """Parse a count such as "1,234" or "1.5M" into an int.

    Returns None if s is None or cannot be parsed.
    """
    if s is None:
        return None

    text = s.strip()

    # Digits with separators only: no unit to apply
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    thousand = 1000
    million = 1000 ** 2
    multipliers = {
        'k': thousand,
        'K': thousand,
        'm': million,
        'M': million,
        'kk': million,
        'KK': million,
    }
    return lookup_unit_table(multipliers, text)
1690
1691
def parse_resolution(s):
    """Extract video dimensions from a string.

    Understands "<width>x<height>", "<height>p" / "<height>i" and "4k"/"8k"
    (taken as a height of 540 times the number). Returns a dict with
    'width' and/or 'height', or an empty dict if nothing is recognized.
    """
    if s is None:
        return {}

    width_height = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
    if width_height is not None:
        width, height = width_height.group('w', 'h')
        return {
            'width': int(width),
            'height': int(height),
        }

    scan_lines = re.search(r'\b(\d+)[pPiI]\b', s)
    if scan_lines is not None:
        return {'height': int(scan_lines.group(1))}

    k_class = re.search(r'\b([48])[kK]\b', s)
    if k_class is not None:
        return {'height': int(k_class.group(1)) * 540}

    return {}
1712
1713
def month_by_name(name, lang='en'):
    """ Return the number (1-12) of a month by its (locale-independent) name

    The names are looked up in MONTH_NAMES[lang], falling back to the English
    table for an unknown lang. Returns None for an unknown name. """

    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name not in names:
        return None
    return names.index(name) + 1
1723
1724
def month_by_abbreviation(abbrev):
    """ Return the number (1-12) of a month by its (locale-independent)
        three-letter English abbreviation, or None if it is unknown """

    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
1733
1734
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity by '&amp;' in XML

    The five predefined XML entities and numeric character references with
    up to four digits are left alone."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1741
1742
def setproctitle(title):
    """Set the process name via prctl(), on a best-effort basis.

    title -- new name as compat_str; it is passed on UTF-8 encoded

    Silently does nothing on Jython, when libc.so.6 cannot be loaded or when
    the loaded libc has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes object makes it one byte larger
    # than the title, so it is always NUL-terminated. A buffer of exactly
    # len(title_bytes) has no terminator and prctl() could read past its end.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1767
1768
def remove_start(s, start):
    """Return s without its leading start.

    s is returned unchanged when it is None or does not begin with start.
    """
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1771
1772
def remove_end(s, end):
    """Return s without its trailing end.

    s is returned unchanged when it is None, when end is empty, or when s
    does not finish with end.
    """
    # An empty suffix has to be special-cased: every string "ends with" '',
    # and s[:-0] is s[:0], which would wipe the whole string.
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
1775
1776
def remove_quotes(s):
    """Strip one pair of matching quotes that encloses s.

    Both single and double quotes are recognised.  None, strings shorter
    than two characters and strings whose first and last characters are
    not the same quote character are returned unchanged.
    """
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1784
1785
def url_basename(url):
    """Return the last segment of the path component of url.

    Leading and trailing slashes of the path are ignored, so a trailing
    '/' does not produce an empty result.
    """
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1789
1790
def base_url(url):
    """Return the directory part of an http(s) URL.

    That is the text from the scheme up to and including the last '/'
    found before any '?', '#' or '&'.  When url does not have that shape
    the failed match surfaces as an AttributeError.
    """
    directory = re.match(r'https?://[^?#&]+/', url)
    return directory.group()
1793
1794
def urljoin(base, path):
    """Join path onto base, tolerating bad input.

    Bytes arguments are decoded as UTF-8.  Returns
      * None when path is empty or not a string,
      * path itself when it is already absolute (http, https or
        protocol-relative),
      * None when base is not a string or is not such an absolute URL,
      * the joined URL otherwise.
    """
    absolute_url_re = r'^(?:https?:)?//'

    def to_text(value):
        return value.decode('utf-8') if isinstance(value, bytes) else value

    path = to_text(path)
    if not path or not isinstance(path, compat_str):
        return None
    if re.match(absolute_url_re, path):
        return path

    base = to_text(base)
    if not isinstance(base, compat_str):
        return None
    if not re.match(absolute_url_re, base):
        return None
    return compat_urlparse.urljoin(base, path)
1808
1809
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose get_method() always reports 'HEAD'."""

    def get_method(self):
        return 'HEAD'
1813
1814
class PUTRequest(compat_urllib_request.Request):
    """Request subclass whose get_method() always reports 'PUT'."""

    def get_method(self):
        return 'PUT'
1818
1819
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an integer, falling back to default.

    v: the value to convert.
    scale, invscale: the result is int(v) * invscale // scale.
    default: returned when v is None, '' or cannot be converted.
    get_attr: when set and v is not None, the attribute of that name is
        read from v first (a missing attribute counts as None).
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        # int() raises TypeError for unsupported types (lists, dicts, ...)
        # and OverflowError for infinities, not only ValueError.
        return default
1832
1833
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1836
1837
def str_to_int(int_str):
    """Parse an integer written with separators, e.g. '1,234' or '1.234'.

    Every ',', '.' and '+' is dropped before the conversion.  None is
    passed through; anything else that is left non-numeric makes int()
    raise ValueError.
    """
    if int_str is None:
        return None
    digits_only = re.sub(r'[,\.\+]', '', int_str)
    return int(digits_only)
1844
1845
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, falling back to default.

    v: the value to convert.
    scale, invscale: the result is float(v) * invscale / scale.
    default: returned when v is None or cannot be converted.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # float() raises TypeError (not ValueError) for unsupported types
        # such as lists or dicts.
        return default
1853
1854
def bool_or_none(v, default=None):
    """Return v when it is a real bool, otherwise default."""
    if isinstance(v, bool):
        return v
    return default
1857
1858
def strip_or_none(v):
    """Return v.strip(), passing None through unchanged."""
    if v is None:
        return None
    return v.strip()
1861
1862
def parse_duration(s):
    """Parse a textual duration and return it in seconds.

    Three notations are tried in this order:
      1. clock style: [[[days:]hours:]minutes:]seconds[.fraction]
      2. unit style: ISO 8601-like or spelled out, e.g. 'PT1H2M3S' or
         '3 hours 2 mins 1 sec'.  Year, month and week components are
         accepted by the pattern but are not captured, so they do not
         contribute to the result.
      3. a single (possibly decimal) amount of hours or minutes,
         e.g. '1.5 hours'

    Returns None when s is not a string or matches none of the notations.
    Otherwise returns the total; it is a float as soon as one component
    was captured and the integer 0 when none was.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days = hours = mins = secs = ms = None

    clock = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
    if clock:
        days, hours, mins, secs, ms = clock.groups()
    else:
        with_units = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if with_units:
            days, hours, mins, secs, ms = with_units.groups()
        else:
            single_amount = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if not single_amount:
                return None
            hours, mins = single_amount.groups()

    # Components are added in a fixed order (seconds first, fraction last).
    total = 0
    if secs:
        total += float(secs)
    if mins:
        total += float(mins) * 60
    if hours:
        total += float(hours) * 60 * 60
    if days:
        total += float(days) * 24 * 60 * 60
    if ms:
        # ms still carries its leading dot, e.g. '.5'
        total += float(ms)
    return total
1919
1920
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the existing extension of filename.

    'video.mp4' with ext 'temp' becomes 'video.temp.mp4'.  When
    expected_real_ext is given and the actual extension differs from it,
    ext is appended to the full filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1927
1928
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of filename with ext.

    When expected_real_ext is given and the actual extension differs from
    it, nothing is removed and ext is appended to the full filename.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
1934
1935
def check_executable(exe, args=[]):
    """Check whether the binary exe can be started.

    exe is run with args (a list, typically something producing short
    output such as -version) and its output is discarded.  Returns exe
    when the process could be launched and False when launching it raised
    OSError.
    """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1944
1945
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """Run exe with args and extract a version from what it prints.

    stdout and stderr are captured together and handed to
    detect_exe_version() along with version_re and unrecognized.
    Returns False when the executable could not be started (OSError).
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = process.communicate()
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1963
1964
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Find a version string in the textual output of a program.

    version_re must contain one capturing group; when omitted, the word
    following 'version' is looked for.  Returns the captured text, or
    unrecognized when the pattern is not found.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1974
1975
class PagedList(object):
    """Common base of the paged lists; getslice() is supplied by subclasses."""

    def __len__(self):
        # This is only useful for tests
        # (it materialises the complete list through getslice())
        return len(self.getslice())
1980
1981
class OnDemandPagedList(PagedList):
    """Paged list whose pages are fetched lazily.

    pagefunc: callable taking a page number (counted from 0) and returning
        an iterable with the entries of that page
    pagesize: number of entries on a full page
    use_cache: when true, every fetched page is kept so that it is fetched
        at most once
    """

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries with indices start <= i < end as a list.

        Pages are requested one after another, beginning with the page
        that contains start, until end is reached or a page comes back
        shorter than pagesize.

        NOTE(review): with start == end on a page boundary the end offset
        evaluates to a full page, so entries are returned for what looks
        like an empty range - confirm whether callers rely on this.
        """
        collected = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize
            if start >= nextfirstid:
                continue

            page = self._cache.get(pagenum) if self._use_cache else None
            if page is None:
                page = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page

            # Offsets of the requested range inside this page
            if firstid <= start < nextfirstid:
                startv = start % self._pagesize
            else:
                startv = 0

            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % self._pagesize) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                page = page[startv:endv]
            collected.extend(page)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return collected
2032
2033
class InAdvancePagedList(PagedList):
    """Paged list for which the number of pages is known up front.

    pagefunc: callable taking a page number (counted from 0) and returning
        an iterable with the entries of that page
    pagecount: total number of pages
    pagesize: number of entries per page
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices start <= i < end as a list."""
        first_page = start // self._pagesize
        to_skip = start - first_page * self._pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start

        collected = []
        for pagenum in range(first_page, stop_page):
            entries = list(self._pagefunc(pagenum))
            if to_skip:
                # Only the first fetched page is trimmed at the front
                entries = entries[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(entries) >= remaining:
                    collected.extend(entries[:remaining])
                    break
                remaining -= len(entries)
            collected.extend(entries)
        return collected
2061
2062
def uppercase_escape(s):
    """Decode the 8-hex-digit (capital U) unicode escapes contained in s.

    Only sequences consisting of a backslash, 'U' and exactly eight
    hexadecimal digits are replaced; the rest of s is left untouched.
    """
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
2069
2070
def lowercase_escape(s):
    """Decode the 4-hex-digit (small u) unicode escapes contained in s.

    Only sequences consisting of a backslash, 'u' and exactly four
    hexadecimal digits are replaced; the rest of s is left untouched.
    """
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
2077
2078
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 text is encoded to UTF-8 bytes before being quoted
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2084
2085
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    The host part is IDNA-encoded; path, params, query and fragment are
    passed through escape_rfc3986().
    """
    parts = compat_urllib_parse_urlparse(url)
    escaped_parts = parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped_parts.geturl()
2096
2097
def read_batch_urls(batch_fd):
    """Read the URLs listed in a batch file.

    batch_fd: an iterable of lines (text, or bytes which are decoded as
        UTF-8); it is closed before returning.

    Returns the list of non-empty lines, stripped of surrounding
    whitespace and of a leading byte order mark.  Lines starting with
    '#', ';' or ']' are skipped.
    """
    # A UTF-8 byte order mark shows up as U+FEFF when the line was decoded
    # as UTF-8, and as the three characters below when the raw bytes
    # EF BB BF were decoded one byte per character.
    BOMS = ('\ufeff', '\xef\xbb\xbf')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2112
2113
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return it as ASCII bytes.

    All arguments are forwarded to compat_urllib_parse_urlencode().
    """
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2116
2117
def update_url_query(url, query):
    """Return url with the parameters of query merged into its query string.

    Parameters already present in url are replaced by the ones given in
    query.  url is returned untouched when query is empty.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    params.update(query)
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2126
2127
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request from req with some parts replaced.

    url: replacement URL (defaults to the URL of req)
    data: replacement body (defaults to the body of req)
    headers: headers added on top of a copy of req's headers
    query: parameters merged into the query string of the URL

    The HEAD and PUT methods of req are preserved; origin_req_host,
    unverifiable and, when present, timeout are carried over.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    body = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)

    method = req.get_method()
    request_class = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(method, compat_urllib_request.Request)

    new_req = request_class(
        target_url, data=body, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2146
2147
def _multipart_encode_impl(data, boundary):
    """Encode data as multipart/form-data using the given boundary.

    Returns a tuple (body bytes, Content-Type header value).
    Raises ValueError when the boundary occurs inside one of the parts.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')
    part_delimiter = b'--' + boundary_bytes + b'\r\n'

    pieces = []
    for k, v in data.items():
        if isinstance(k, compat_str):
            k = k.encode('utf-8')
        if isinstance(v, compat_str):
            v = v.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        pieces.append(part_delimiter)
        pieces.append(content)

    pieces.append(b'--' + boundary_bytes + b'--\r\n')

    return b''.join(pieces), content_type
2168
2169
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple (encoded body, Content-Type header value).  A
    ValueError is raised when a specified boundary occurs in the data;
    randomly generated boundaries are retried until one fits.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # The caller's boundary is used as is; a clash propagates.
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            continue
2198
2199
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable of several keys, in d.

    key_or_keys: a single key (plain d.get() semantics) or a list/tuple of
        keys tried in order.
    skip_false_values: for a list/tuple of keys, also skip values that are
        falsy; values that are None are always skipped.

    Returns the first value found, or default.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)

    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
2208
2209
def try_get(src, getter, expected_type=None):
    """Apply getter(s) to src and return the first acceptable result.

    getter: a callable, or a list/tuple of callables tried in order.
    expected_type: when given, a result is only accepted if it is an
        instance of this type.

    A getter raising AttributeError, KeyError, TypeError or IndexError is
    skipped.  Returns None when no getter produced an acceptable value.
    """
    getters = getter if isinstance(getter, (list, tuple)) else [getter]
    for get in getters:
        try:
            value = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
    return None
2221
2222
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as a compat_str.

    A value that already is a compat_str is returned unchanged; anything
    else is converted with compat_str(string, encoding, errors).  Note that
    the default encoding is determined once, when the function is defined.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2225
2226
# Age limit (in years) associated with each US rating label; consulted by
# parse_age_limit().
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
2234
2235
# Age limit (in years) associated with each TV parental guideline label;
# consulted by parse_age_limit() after US_RATINGS.
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2244
2245
def parse_age_limit(s):
    """Turn an age rating into an age limit in years.

    Accepted input:
      * an int between 0 and 21 (returned as is),
      * a string of one or two digits, optionally followed by '+',
      * a label found in US_RATINGS or TV_PARENTAL_GUIDELINES.

    Returns None for everything else.
    """
    # Exact type comparison on purpose: bool is a subclass of int
    if type(s) == int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None

    plain_age = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if plain_age:
        return int(plain_age.group('age'))

    for ratings in (US_RATINGS, TV_PARENTAL_GUIDELINES):
        if s in ratings:
            return ratings[s]
    return None
2257
2258
def strip_jsonp(code):
    """Return the payload of a JSONP response.

    When code has the shape "callback(<data>)" - optionally prefixed with
    "window.", guarded as "cb && cb(...)", followed by ';' and/or trailing
    '//' comments - only <data> is returned.  Text of any other shape is
    returned unchanged.
    """
    jsonp_call = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_call, r'\g<callback_data>', code)
2267
2268
def js_to_json(code):
    """Rewrite a JavaScript object/array literal as JSON text.

    The conversion works token by token:
      * string literals are re-quoted with double quotes,
      * bare identifiers are quoted (true, false and null are kept),
      * hexadecimal and octal integers are rewritten in decimal,
      * integers used as keys are quoted,
      * comments and trailing commas are removed.
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )
    # Escape sequences that have to change inside a double-quoted JSON string
    ESCAPES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }

    def fix_escape(m):
        sequence = m.group(0)
        return ESCAPES.get(sequence, sequence)

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null'):
            return token
        if token.startswith(('/*', '//')) or token == ',':
            # comments and trailing commas are dropped
            return ""

        if token[0] in ("'", '"'):
            token = re.sub(r'(?s)\\.|"', fix_escape, token[1:-1])

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, token)
            if im:
                number = int(im.group(1), base)
                if token.endswith(':'):
                    return '"%d":' % number
                return '%d' % number

        return '"%s"' % token

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
2308
2309
def qualities(quality_ids):
    """Build a function that ranks quality ids.

    The returned function maps a quality id to its position in
    quality_ids, and to -1 when the id is not listed.
    """
    def q(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q
2318
2319
# Default output filename template: "<title>-<id>.<ext>"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2321
2322
def limit_length(s, length):
    """Shorten s to at most length characters, ending it with '...'.

    Strings that already fit and None are returned unchanged.
    """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    return s[:length - len(ELLIPSES)] + ELLIPSES
2331
2332
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints.

    Raises ValueError when a component is not an integer.
    """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2335
2336
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    When version is empty or one of the two cannot be parsed, the answer
    is derived from assume_new: such a version counts as outdated only if
    assume_new is false.
    """
    answer_when_unknown = not assume_new
    if not version:
        return answer_when_unknown
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return answer_when_unknown
2344
2345
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded through a zipimporter, or when the
    # interpreter carries a 'frozen' attribute on sys.
    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return loaded_from_zip or hasattr(sys, 'frozen')
2351
2352
def args_to_str(args):
    """Return a short, shell-quoted one-line rendering of a command line."""
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2356
2357
def error_to_compat_str(err):
    """Return the message of err as text."""
    err_str = str(err)
    if sys.version_info[0] >= 3:
        return err_str
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return err_str.decode(preferredencoding())
2365
2366
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    mt: a MIME type such as 'video/mp4', optionally followed by
        ';'-separated parameters.  None is passed through.

    Case and parameters are ignored.  A few complete types have a
    dedicated extension; otherwise the subtype is translated through a
    table and returned as is (lower-cased) when the table has no entry.
    """
    if mt is None:
        return None

    # Normalise once so that the full-type lookup below ignores case and
    # parameters just like the subtype lookup does
    # (e.g. 'audio/mp4; codecs="mp4a.40.2"').
    mime_type = mt.split(';')[0].strip().lower()

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mime_type)
    if ext is not None:
        return ext

    _, _, res = mime_type.rpartition('/')
    res = res.strip()

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
    }.get(res, res)
2402
2403
def parse_codecs(codecs_str):
    """Split a codecs string into video and audio codec.

    codecs_str: comma separated codec identifiers as used in the 'codecs'
        MIME parameter, e.g. 'avc1.77.30, mp4a.40.2'.

    Returns a dict with the keys 'vcodec' and 'acodec'.  When at least one
    entry is recognised, a missing kind is reported as 'none'.  When no
    entry is recognised, exactly two entries are assumed to be video and
    audio in that order; any other number yields an empty dict, as does an
    empty codecs_str.  A warning is written to stderr for every
    unrecognised entry.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = [
        entry.strip()
        for entry in codecs_str.strip().strip(',').split(',')
        if entry.strip()]
    vcodec, acodec = None, None
    for full_codec in split_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)

    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    if len(split_codecs) == 2:
        # Nothing was recognised, so vcodec/acodec are both None here; fall
        # back to the raw entries instead of returning those None values.
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    # A single unrecognised entry cannot be classified as audio or video.
    return {}
2438
2439
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of an opened URL.

    The filename of an attachment Content-Disposition header is preferred;
    otherwise the extension is derived from the Content-Type header.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        attachment = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if attachment:
            ext = determine_ext(attachment.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
2452
2453
def encode_data_uri(data, mime_type):
    """Return a base64 'data:' URI carrying the bytes data as mime_type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2456
2457
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No limit configured, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2466
2467
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # Order matters: the UTF-32 marks start with the UTF-16 ones
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    encoding, bom_length = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, bom_length = enc, len(bom)
            break

    text = first_bytes[bom_length:].decode(encoding, 'replace')
    # The result is a match object (truthy) or None
    return re.match(r'^\s*<', text)
2486
2487
def determine_protocol(info_dict):
    """Return the protocol to use for the format described by info_dict.

    An explicit 'protocol' entry wins.  Otherwise the protocol is derived
    from 'url': first from a known prefix (rtmp, mms, rtsp), then from a
    manifest extension (m3u8, f4m), and finally from the URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2508
2509
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Widest cell of every column
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*table)]
    # Every column but the last is left-aligned and padded to width + 1
    padded_columns = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded_columns) + '%s'
    return '\n'.join(row_format % tuple(row) for row in table)
2516
2517
def _match_one(filter_part, dct):
    """Evaluate a single filter expression against dct.

    filter_part is either a comparison "key OP value" (OP being one of
    <, <=, >, >=, =, != and optionally followed by '?') or a presence test
    ("key" or "!key").

    Returns the outcome of the comparison or presence test.  For a
    comparison whose key is missing from dct (or None), the matched '?'
    text is returned when present (truthy), None otherwise.

    Raises ValueError when filter_part cannot be parsed, when an ordering
    operator is used with a string value, or when a numeric value is
    invalid.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    # The value is one of: a number with an optional size suffix (intval),
    # a quoted string (quotedstrval) or a bare word (strval).
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if (m.group('quotedstrval') is not None or
            m.group('strval') is not None or
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/rg3/youtube-dl/issues/11082).
            actual_value is not None and m.group('intval') is not None and
                isinstance(actual_value, compat_str)):
            # String comparison: only equality and inequality make sense
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
            quote = m.group('quote')
            if quote is not None:
                # Unescape the quote character inside a quoted value
                comparison_value = comparison_value.replace(r'\%s' % quote, quote)
        else:
            # Numeric comparison: plain integer first, then a file size
            # (retried with a 'B' suffix appended)
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        if actual_value is None:
            # Missing field: passes only when the operator was followed by '?'
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    # Not a comparison: try a presence test ("key" or "!key")
    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
2586
2587
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The filter is a conjunction of '&'-separated parts; evaluation stops
    # at the first part that does not pass.
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2593
2594
def match_filter_func(filter_str):
    """Build a match filter callable from a filter string.

    The returned function takes an info dict and returns None when it
    passes filter_str, or a message explaining that it is skipped.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2603
2604
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP time expression and return it in seconds.

    Two forms are understood: an offset in seconds with an optional 's'
    suffix ('12.5s'), and a clock time 'H:MM:SS' whose seconds may carry a
    fraction separated by '.' or ':'.  Returns None for empty input and
    for anything else.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
2616
2617
def srt_subtitles_timecode(seconds):
    """Format a time offset given in seconds as an SRT timecode.

    The result has the form 'HH:MM:SS,mmm'.
    """
    # Round to whole milliseconds before splitting.  Formatting the float
    # parts directly truncates, so values such as 2.3 (stored as
    # 2.29999...) would be rendered with 299 instead of 300 milliseconds.
    total_ms = int(round(seconds * 1000))
    total_secs, ms = divmod(total_ms, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, mins = divmod(total_mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, ms)
2620
2621
def dfxp2srt(dfxp_data):
    '''
    Convert a DFXP/TTML subtitle document to SRT.

    Every <p> element with a parsable begin time and either an end time or
    a duration becomes one SRT cue. The supported tts:* styling attributes
    are rendered with <b>, <i>, <u> and <font> tags.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no <p> element
    '''
    # Old namespace URIs are rewritten to the current ones (first tuple item)
    # so that a single set of xpath expressions matches all of them.
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    # style id -> {property: value}, filled in from the <style> elements
    styles = {}
    # style inherited by every element, taken from <body>/<div>
    default_style = {}

    class TTMLPElementParser(object):
        # NOTE: these are class attributes. `_out` is rebound per instance by
        # `+=`, but the two lists are mutated in place and are therefore
        # shared by every parser created during one dfxp2srt() call, so style
        # state can carry over from one paragraph to the next. This is left
        # untouched on purpose: making the lists per-instance would change
        # the generated output.
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Precedence, lowest to highest: default style, referenced
                # style, inline tts:* attributes.
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect with this value.
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close the tags opened for this element, innermost first.
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Re-serialise the element and stream it through the target parser
        # above, which builds the SRT text for this paragraph.
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance. A <style> may reference a parent that is
    # declared later in the document, so the elements are scanned repeatedly
    # until every parent reference has been resolved.
    style_elements = dfxp.findall(_x('.//ttml:style'))
    ignore_missing_parents = False
    while True:
        unresolved = False
        known_styles = len(styles)
        for style in style_elements:
            style_id = style.get('id')
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id in styles:
                    styles[style_id] = styles[parent_style_id].copy()
                elif not ignore_missing_parents:
                    unresolved = True
                    continue
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if not unresolved:
            break
        if len(styles) == known_styles:
            # No new style was resolved in this pass, so the remaining
            # parents will never show up (unknown id, reference cycle, or a
            # parent without any supported property). Run one last pass that
            # treats such parents as empty instead of looping forever.
            ignore_missing_parents = True

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # Cue numbers follow the paragraph position, so skipped paragraphs leave
    # gaps in the numbering.
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2781
2782
def cli_option(params, command_option, param):
    """Build the command line fragment for an option that takes a value.

    Looks up `param` in `params` and returns [command_option, value] with
    the value converted to a string, or an empty list when the parameter is
    missing or None.
    """
    param = params.get(param)
    if param is None:
        return []
    # Convert every non-None value, including falsy ones such as 0, so that
    # the resulting argument list only ever contains strings.
    return [command_option, compat_str(param)]
2788
2789
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command line fragment for a boolean option.

    Returns an empty list when `param` is missing from `params` or None.
    Otherwise the boolean is rendered as `true_value` or `false_value` and
    returned either as a separate argument after `command_option` or, when
    a `separator` is given, joined into a single argument.
    """
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if not separator:
        return [command_option, rendered]
    return [command_option + separator + rendered]
2798
2799
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals `expected_value`.

    In every other case, including a missing parameter, an empty list is
    returned.
    """
    if params.get(param) == expected_value:
        return [command_option]
    return []
2803
2804
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra command line arguments stored in `params`.

    When `param` is missing or None, `default` is returned instead. A list
    default is copied first, so callers can modify the result without
    affecting later calls. The stored value is otherwise required to be a
    list.
    """
    ex_args = params.get(param)
    if ex_args is None:
        # `default=[]` in the signature is one shared list object. Handing
        # it out directly would let a caller that mutates the result change
        # what every later call returns.
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2811
2812
2813 class ISO639Utils(object):
2814     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2815     _lang_map = {
2816         'aa': 'aar',
2817         'ab': 'abk',
2818         'ae': 'ave',
2819         'af': 'afr',
2820         'ak': 'aka',
2821         'am': 'amh',
2822         'an': 'arg',
2823         'ar': 'ara',
2824         'as': 'asm',
2825         'av': 'ava',
2826         'ay': 'aym',
2827         'az': 'aze',
2828         'ba': 'bak',
2829         'be': 'bel',
2830         'bg': 'bul',
2831         'bh': 'bih',
2832         'bi': 'bis',
2833         'bm': 'bam',
2834         'bn': 'ben',
2835         'bo': 'bod',
2836         'br': 'bre',
2837         'bs': 'bos',
2838         'ca': 'cat',
2839         'ce': 'che',
2840         'ch': 'cha',
2841         'co': 'cos',
2842         'cr': 'cre',
2843         'cs': 'ces',
2844         'cu': 'chu',
2845         'cv': 'chv',
2846         'cy': 'cym',
2847         'da': 'dan',
2848         'de': 'deu',
2849         'dv': 'div',
2850         'dz': 'dzo',
2851         'ee': 'ewe',
2852         'el': 'ell',
2853         'en': 'eng',
2854         'eo': 'epo',
2855         'es': 'spa',
2856         'et': 'est',
2857         'eu': 'eus',
2858         'fa': 'fas',
2859         'ff': 'ful',
2860         'fi': 'fin',
2861         'fj': 'fij',
2862         'fo': 'fao',
2863         'fr': 'fra',
2864         'fy': 'fry',
2865         'ga': 'gle',
2866         'gd': 'gla',
2867         'gl': 'glg',
2868         'gn': 'grn',
2869         'gu': 'guj',
2870         'gv': 'glv',
2871         'ha': 'hau',
2872         'he': 'heb',
2873         'hi': 'hin',
2874         'ho': 'hmo',
2875         'hr': 'hrv',
2876         'ht': 'hat',
2877         'hu': 'hun',
2878         'hy': 'hye',
2879         'hz': 'her',
2880         'ia': 'ina',
2881         'id': 'ind',
2882         'ie': 'ile',
2883         'ig': 'ibo',
2884         'ii': 'iii',
2885         'ik': 'ipk',
2886         'io': 'ido',
2887         'is': 'isl',
2888         'it': 'ita',
2889         'iu': 'iku',
2890         'ja': 'jpn',
2891         'jv': 'jav',
2892         'ka': 'kat',
2893         'kg': 'kon',
2894         'ki': 'kik',
2895         'kj': 'kua',
2896         'kk': 'kaz',
2897         'kl': 'kal',
2898         'km': 'khm',
2899         'kn': 'kan',
2900         'ko': 'kor',
2901         'kr': 'kau',
2902         'ks': 'kas',
2903         'ku': 'kur',
2904         'kv': 'kom',
2905         'kw': 'cor',
2906         'ky': 'kir',
2907         'la': 'lat',
2908         'lb': 'ltz',
2909         'lg': 'lug',
2910         'li': 'lim',
2911         'ln': 'lin',
2912         'lo': 'lao',
2913         'lt': 'lit',
2914         'lu': 'lub',
2915         'lv': 'lav',
2916         'mg': 'mlg',
2917         'mh': 'mah',
2918         'mi': 'mri',
2919         'mk': 'mkd',
2920         'ml': 'mal',
2921         'mn': 'mon',
2922         'mr': 'mar',
2923         'ms': 'msa',
2924         'mt': 'mlt',
2925         'my': 'mya',
2926         'na': 'nau',
2927         'nb': 'nob',
2928         'nd': 'nde',
2929         'ne': 'nep',
2930         'ng': 'ndo',
2931         'nl': 'nld',
2932         'nn': 'nno',
2933         'no': 'nor',
2934         'nr': 'nbl',
2935         'nv': 'nav',
2936         'ny': 'nya',
2937         'oc': 'oci',
2938         'oj': 'oji',
2939         'om': 'orm',
2940         'or': 'ori',
2941         'os': 'oss',
2942         'pa': 'pan',
2943         'pi': 'pli',
2944         'pl': 'pol',
2945         'ps': 'pus',
2946         'pt': 'por',
2947         'qu': 'que',
2948         'rm': 'roh',
2949         'rn': 'run',
2950         'ro': 'ron',
2951         'ru': 'rus',
2952         'rw': 'kin',
2953         'sa': 'san',
2954         'sc': 'srd',
2955         'sd': 'snd',
2956         'se': 'sme',
2957         'sg': 'sag',
2958         'si': 'sin',
2959         'sk': 'slk',
2960         'sl': 'slv',
2961         'sm': 'smo',
2962         'sn': 'sna',
2963         'so': 'som',
2964         'sq': 'sqi',
2965         'sr': 'srp',
2966         'ss': 'ssw',
2967         'st': 'sot',
2968         'su': 'sun',
2969         'sv': 'swe',
2970         'sw': 'swa',
2971         'ta': 'tam',
2972         'te': 'tel',
2973         'tg': 'tgk',
2974         'th': 'tha',
2975         'ti': 'tir',
2976         'tk': 'tuk',
2977         'tl': 'tgl',
2978         'tn': 'tsn',
2979         'to': 'ton',
2980         'tr': 'tur',
2981         'ts': 'tso',
2982         'tt': 'tat',
2983         'tw': 'twi',
2984         'ty': 'tah',
2985         'ug': 'uig',
2986         'uk': 'ukr',
2987         'ur': 'urd',
2988         'uz': 'uzb',
2989         've': 'ven',
2990         'vi': 'vie',
2991         'vo': 'vol',
2992         'wa': 'wln',
2993         'wo': 'wol',
2994         'xh': 'xho',
2995         'yi': 'yid',
2996         'yo': 'yor',
2997         'za': 'zha',
2998         'zh': 'zho',
2999         'zu': 'zul',
3000     }
3001
3002     @classmethod
3003     def short2long(cls, code):
3004         """Convert language code from ISO 639-1 to ISO 639-2/T"""
3005         return cls._lang_map.get(code[:2])
3006
3007     @classmethod
3008     def long2short(cls, code):
3009         """Convert language code from ISO 639-2/T to ISO 639-1"""
3010         for short_name, long_name in cls._lang_map.items():
3011             if long_name == code:
3012                 return short_name
3013
3014
3015 class ISO3166Utils(object):
3016     # From http://data.okfn.org/data/core/country-list
3017     _country_map = {
3018         'AF': 'Afghanistan',
3019         'AX': 'Åland Islands',
3020         'AL': 'Albania',
3021         'DZ': 'Algeria',
3022         'AS': 'American Samoa',
3023         'AD': 'Andorra',
3024         'AO': 'Angola',
3025         'AI': 'Anguilla',
3026         'AQ': 'Antarctica',
3027         'AG': 'Antigua and Barbuda',
3028         'AR': 'Argentina',
3029         'AM': 'Armenia',
3030         'AW': 'Aruba',
3031         'AU': 'Australia',
3032         'AT': 'Austria',
3033         'AZ': 'Azerbaijan',
3034         'BS': 'Bahamas',
3035         'BH': 'Bahrain',
3036         'BD': 'Bangladesh',
3037         'BB': 'Barbados',
3038         'BY': 'Belarus',
3039         'BE': 'Belgium',
3040         'BZ': 'Belize',
3041         'BJ': 'Benin',
3042         'BM': 'Bermuda',
3043         'BT': 'Bhutan',
3044         'BO': 'Bolivia, Plurinational State of',
3045         'BQ': 'Bonaire, Sint Eustatius and Saba',
3046         'BA': 'Bosnia and Herzegovina',
3047         'BW': 'Botswana',
3048         'BV': 'Bouvet Island',
3049         'BR': 'Brazil',
3050         'IO': 'British Indian Ocean Territory',
3051         'BN': 'Brunei Darussalam',
3052         'BG': 'Bulgaria',
3053         'BF': 'Burkina Faso',
3054         'BI': 'Burundi',
3055         'KH': 'Cambodia',
3056         'CM': 'Cameroon',
3057         'CA': 'Canada',
3058         'CV': 'Cape Verde',
3059         'KY': 'Cayman Islands',
3060         'CF': 'Central African Republic',
3061         'TD': 'Chad',
3062         'CL': 'Chile',
3063         'CN': 'China',
3064         'CX': 'Christmas Island',
3065         'CC': 'Cocos (Keeling) Islands',
3066         'CO': 'Colombia',
3067         'KM': 'Comoros',
3068         'CG': 'Congo',
3069         'CD': 'Congo, the Democratic Republic of the',
3070         'CK': 'Cook Islands',
3071         'CR': 'Costa Rica',
3072         'CI': 'Côte d\'Ivoire',
3073         'HR': 'Croatia',
3074         'CU': 'Cuba',
3075         'CW': 'Curaçao',
3076         'CY': 'Cyprus',
3077         'CZ': 'Czech Republic',
3078         'DK': 'Denmark',
3079         'DJ': 'Djibouti',
3080         'DM': 'Dominica',
3081         'DO': 'Dominican Republic',
3082         'EC': 'Ecuador',
3083         'EG': 'Egypt',
3084         'SV': 'El Salvador',
3085         'GQ': 'Equatorial Guinea',
3086         'ER': 'Eritrea',
3087         'EE': 'Estonia',
3088         'ET': 'Ethiopia',
3089         'FK': 'Falkland Islands (Malvinas)',
3090         'FO': 'Faroe Islands',
3091         'FJ': 'Fiji',
3092         'FI': 'Finland',
3093         'FR': 'France',
3094         'GF': 'French Guiana',
3095         'PF': 'French Polynesia',
3096         'TF': 'French Southern Territories',
3097         'GA': 'Gabon',
3098         'GM': 'Gambia',
3099         'GE': 'Georgia',
3100         'DE': 'Germany',
3101         'GH': 'Ghana',
3102         'GI': 'Gibraltar',
3103         'GR': 'Greece',
3104         'GL': 'Greenland',
3105         'GD': 'Grenada',
3106         'GP': 'Guadeloupe',
3107         'GU': 'Guam',
3108         'GT': 'Guatemala',
3109         'GG': 'Guernsey',
3110         'GN': 'Guinea',
3111         'GW': 'Guinea-Bissau',
3112         'GY': 'Guyana',
3113         'HT': 'Haiti',
3114         'HM': 'Heard Island and McDonald Islands',
3115         'VA': 'Holy See (Vatican City State)',
3116         'HN': 'Honduras',
3117         'HK': 'Hong Kong',
3118         'HU': 'Hungary',
3119         'IS': 'Iceland',
3120         'IN': 'India',
3121         'ID': 'Indonesia',
3122         'IR': 'Iran, Islamic Republic of',
3123         'IQ': 'Iraq',
3124         'IE': 'Ireland',
3125         'IM': 'Isle of Man',
3126         'IL': 'Israel',
3127         'IT': 'Italy',
3128         'JM': 'Jamaica',
3129         'JP': 'Japan',
3130         'JE': 'Jersey',
3131         'JO': 'Jordan',
3132         'KZ': 'Kazakhstan',
3133         'KE': 'Kenya',
3134         'KI': 'Kiribati',
3135         'KP': 'Korea, Democratic People\'s Republic of',
3136         'KR': 'Korea, Republic of',
3137         'KW': 'Kuwait',
3138         'KG': 'Kyrgyzstan',
3139         'LA': 'Lao People\'s Democratic Republic',
3140         'LV': 'Latvia',
3141         'LB': 'Lebanon',
3142         'LS': 'Lesotho',
3143         'LR': 'Liberia',
3144         'LY': 'Libya',
3145         'LI': 'Liechtenstein',
3146         'LT': 'Lithuania',
3147         'LU': 'Luxembourg',
3148         'MO': 'Macao',
3149         'MK': 'Macedonia, the Former Yugoslav Republic of',
3150         'MG': 'Madagascar',
3151         'MW': 'Malawi',
3152         'MY': 'Malaysia',
3153         'MV': 'Maldives',
3154         'ML': 'Mali',
3155         'MT': 'Malta',
3156         'MH': 'Marshall Islands',
3157         'MQ': 'Martinique',
3158         'MR': 'Mauritania',
3159         'MU': 'Mauritius',
3160         'YT': 'Mayotte',
3161         'MX': 'Mexico',
3162         'FM': 'Micronesia, Federated States of',
3163         'MD': 'Moldova, Republic of',
3164         'MC': 'Monaco',
3165         'MN': 'Mongolia',
3166         'ME': 'Montenegro',
3167         'MS': 'Montserrat',
3168         'MA': 'Morocco',
3169         'MZ': 'Mozambique',
3170         'MM': 'Myanmar',
3171         'NA': 'Namibia',
3172         'NR': 'Nauru',
3173         'NP': 'Nepal',
3174         'NL': 'Netherlands',
3175         'NC': 'New Caledonia',
3176         'NZ': 'New Zealand',
3177         'NI': 'Nicaragua',
3178         'NE': 'Niger',
3179         'NG': 'Nigeria',
3180         'NU': 'Niue',
3181         'NF': 'Norfolk Island',
3182         'MP': 'Northern Mariana Islands',
3183         'NO': 'Norway',
3184         'OM': 'Oman',
3185         'PK': 'Pakistan',
3186         'PW': 'Palau',
3187         'PS': 'Palestine, State of',
3188         'PA': 'Panama',
3189         'PG': 'Papua New Guinea',
3190         'PY': 'Paraguay',
3191         'PE': 'Peru',
3192         'PH': 'Philippines',
3193         'PN': 'Pitcairn',
3194         'PL': 'Poland',
3195         'PT': 'Portugal',
3196         'PR': 'Puerto Rico',
3197         'QA': 'Qatar',
3198         'RE': 'Réunion',
3199         'RO': 'Romania',
3200         'RU': 'Russian Federation',
3201         'RW': 'Rwanda',
3202         'BL': 'Saint Barthélemy',
3203         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3204         'KN': 'Saint Kitts and Nevis',
3205         'LC': 'Saint Lucia',
3206         'MF': 'Saint Martin (French part)',
3207         'PM': 'Saint Pierre and Miquelon',
3208         'VC': 'Saint Vincent and the Grenadines',
3209         'WS': 'Samoa',
3210         'SM': 'San Marino',
3211         'ST': 'Sao Tome and Principe',
3212         'SA': 'Saudi Arabia',
3213         'SN': 'Senegal',
3214         'RS': 'Serbia',
3215         'SC': 'Seychelles',
3216         'SL': 'Sierra Leone',
3217         'SG': 'Singapore',
3218         'SX': 'Sint Maarten (Dutch part)',
3219         'SK': 'Slovakia',
3220         'SI': 'Slovenia',
3221         'SB': 'Solomon Islands',
3222         'SO': 'Somalia',
3223         'ZA': 'South Africa',
3224         'GS': 'South Georgia and the South Sandwich Islands',
3225         'SS': 'South Sudan',
3226         'ES': 'Spain',
3227         'LK': 'Sri Lanka',
3228         'SD': 'Sudan',
3229         'SR': 'Suriname',
3230         'SJ': 'Svalbard and Jan Mayen',
3231         'SZ': 'Swaziland',
3232         'SE': 'Sweden',
3233         'CH': 'Switzerland',
3234         'SY': 'Syrian Arab Republic',
3235         'TW': 'Taiwan, Province of China',
3236         'TJ': 'Tajikistan',
3237         'TZ': 'Tanzania, United Republic of',
3238         'TH': 'Thailand',
3239         'TL': 'Timor-Leste',
3240         'TG': 'Togo',
3241         'TK': 'Tokelau',
3242         'TO': 'Tonga',
3243         'TT': 'Trinidad and Tobago',
3244         'TN': 'Tunisia',
3245         'TR': 'Turkey',
3246         'TM': 'Turkmenistan',
3247         'TC': 'Turks and Caicos Islands',
3248         'TV': 'Tuvalu',
3249         'UG': 'Uganda',
3250         'UA': 'Ukraine',
3251         'AE': 'United Arab Emirates',
3252         'GB': 'United Kingdom',
3253         'US': 'United States',
3254         'UM': 'United States Minor Outlying Islands',
3255         'UY': 'Uruguay',
3256         'UZ': 'Uzbekistan',
3257         'VU': 'Vanuatu',
3258         'VE': 'Venezuela, Bolivarian Republic of',
3259         'VN': 'Viet Nam',
3260         'VG': 'Virgin Islands, British',
3261         'VI': 'Virgin Islands, U.S.',
3262         'WF': 'Wallis and Futuna',
3263         'EH': 'Western Sahara',
3264         'YE': 'Yemen',
3265         'ZM': 'Zambia',
3266         'ZW': 'Zimbabwe',
3267     }
3268
3269     @classmethod
3270     def short2full(cls, code):
3271         """Convert an ISO 3166-2 country code to the corresponding full name"""
3272         return cls._country_map.get(code.upper())
3273
3274
3275 class GeoUtils(object):
3276     # Major IPv4 address blocks per country
3277     _country_ip_map = {
3278         'AD': '85.94.160.0/19',
3279         'AE': '94.200.0.0/13',
3280         'AF': '149.54.0.0/17',
3281         'AG': '209.59.64.0/18',
3282         'AI': '204.14.248.0/21',
3283         'AL': '46.99.0.0/16',
3284         'AM': '46.70.0.0/15',
3285         'AO': '105.168.0.0/13',
3286         'AP': '159.117.192.0/21',
3287         'AR': '181.0.0.0/12',
3288         'AS': '202.70.112.0/20',
3289         'AT': '84.112.0.0/13',
3290         'AU': '1.128.0.0/11',
3291         'AW': '181.41.0.0/18',
3292         'AZ': '5.191.0.0/16',
3293         'BA': '31.176.128.0/17',
3294         'BB': '65.48.128.0/17',
3295         'BD': '114.130.0.0/16',
3296         'BE': '57.0.0.0/8',
3297         'BF': '129.45.128.0/17',
3298         'BG': '95.42.0.0/15',
3299         'BH': '37.131.0.0/17',
3300         'BI': '154.117.192.0/18',
3301         'BJ': '137.255.0.0/16',
3302         'BL': '192.131.134.0/24',
3303         'BM': '196.12.64.0/18',
3304         'BN': '156.31.0.0/16',
3305         'BO': '161.56.0.0/16',
3306         'BQ': '161.0.80.0/20',
3307         'BR': '152.240.0.0/12',
3308         'BS': '24.51.64.0/18',
3309         'BT': '119.2.96.0/19',
3310         'BW': '168.167.0.0/16',
3311         'BY': '178.120.0.0/13',
3312         'BZ': '179.42.192.0/18',
3313         'CA': '99.224.0.0/11',
3314         'CD': '41.243.0.0/16',
3315         'CF': '196.32.200.0/21',
3316         'CG': '197.214.128.0/17',
3317         'CH': '85.0.0.0/13',
3318         'CI': '154.232.0.0/14',
3319         'CK': '202.65.32.0/19',
3320         'CL': '152.172.0.0/14',
3321         'CM': '165.210.0.0/15',
3322         'CN': '36.128.0.0/10',
3323         'CO': '181.240.0.0/12',
3324         'CR': '201.192.0.0/12',
3325         'CU': '152.206.0.0/15',
3326         'CV': '165.90.96.0/19',
3327         'CW': '190.88.128.0/17',
3328         'CY': '46.198.0.0/15',
3329         'CZ': '88.100.0.0/14',
3330         'DE': '53.0.0.0/8',
3331         'DJ': '197.241.0.0/17',
3332         'DK': '87.48.0.0/12',
3333         'DM': '192.243.48.0/20',
3334         'DO': '152.166.0.0/15',
3335         'DZ': '41.96.0.0/12',
3336         'EC': '186.68.0.0/15',
3337         'EE': '90.190.0.0/15',
3338         'EG': '156.160.0.0/11',
3339         'ER': '196.200.96.0/20',
3340         'ES': '88.0.0.0/11',
3341         'ET': '196.188.0.0/14',
3342         'EU': '2.16.0.0/13',
3343         'FI': '91.152.0.0/13',
3344         'FJ': '144.120.0.0/16',
3345         'FM': '119.252.112.0/20',
3346         'FO': '88.85.32.0/19',
3347         'FR': '90.0.0.0/9',
3348         'GA': '41.158.0.0/15',
3349         'GB': '25.0.0.0/8',
3350         'GD': '74.122.88.0/21',
3351         'GE': '31.146.0.0/16',
3352         'GF': '161.22.64.0/18',
3353         'GG': '62.68.160.0/19',
3354         'GH': '45.208.0.0/14',
3355         'GI': '85.115.128.0/19',
3356         'GL': '88.83.0.0/19',
3357         'GM': '160.182.0.0/15',
3358         'GN': '197.149.192.0/18',
3359         'GP': '104.250.0.0/19',
3360         'GQ': '105.235.224.0/20',
3361         'GR': '94.64.0.0/13',
3362         'GT': '168.234.0.0/16',
3363         'GU': '168.123.0.0/16',
3364         'GW': '197.214.80.0/20',
3365         'GY': '181.41.64.0/18',
3366         'HK': '113.252.0.0/14',
3367         'HN': '181.210.0.0/16',
3368         'HR': '93.136.0.0/13',
3369         'HT': '148.102.128.0/17',
3370         'HU': '84.0.0.0/14',
3371         'ID': '39.192.0.0/10',
3372         'IE': '87.32.0.0/12',
3373         'IL': '79.176.0.0/13',
3374         'IM': '5.62.80.0/20',
3375         'IN': '117.192.0.0/10',
3376         'IO': '203.83.48.0/21',
3377         'IQ': '37.236.0.0/14',
3378         'IR': '2.176.0.0/12',
3379         'IS': '82.221.0.0/16',
3380         'IT': '79.0.0.0/10',
3381         'JE': '87.244.64.0/18',
3382         'JM': '72.27.0.0/17',
3383         'JO': '176.29.0.0/16',
3384         'JP': '126.0.0.0/8',
3385         'KE': '105.48.0.0/12',
3386         'KG': '158.181.128.0/17',
3387         'KH': '36.37.128.0/17',
3388         'KI': '103.25.140.0/22',
3389         'KM': '197.255.224.0/20',
3390         'KN': '198.32.32.0/19',
3391         'KP': '175.45.176.0/22',
3392         'KR': '175.192.0.0/10',
3393         'KW': '37.36.0.0/14',
3394         'KY': '64.96.0.0/15',
3395         'KZ': '2.72.0.0/13',
3396         'LA': '115.84.64.0/18',
3397         'LB': '178.135.0.0/16',
3398         'LC': '192.147.231.0/24',
3399         'LI': '82.117.0.0/19',
3400         'LK': '112.134.0.0/15',
3401         'LR': '41.86.0.0/19',
3402         'LS': '129.232.0.0/17',
3403         'LT': '78.56.0.0/13',
3404         'LU': '188.42.0.0/16',
3405         'LV': '46.109.0.0/16',
3406         'LY': '41.252.0.0/14',
3407         'MA': '105.128.0.0/11',
3408         'MC': '88.209.64.0/18',
3409         'MD': '37.246.0.0/16',
3410         'ME': '178.175.0.0/17',
3411         'MF': '74.112.232.0/21',
3412         'MG': '154.126.0.0/17',
3413         'MH': '117.103.88.0/21',
3414         'MK': '77.28.0.0/15',
3415         'ML': '154.118.128.0/18',
3416         'MM': '37.111.0.0/17',
3417         'MN': '49.0.128.0/17',
3418         'MO': '60.246.0.0/16',
3419         'MP': '202.88.64.0/20',
3420         'MQ': '109.203.224.0/19',
3421         'MR': '41.188.64.0/18',
3422         'MS': '208.90.112.0/22',
3423         'MT': '46.11.0.0/16',
3424         'MU': '105.16.0.0/12',
3425         'MV': '27.114.128.0/18',
3426         'MW': '105.234.0.0/16',
3427         'MX': '187.192.0.0/11',
3428         'MY': '175.136.0.0/13',
3429         'MZ': '197.218.0.0/15',
3430         'NA': '41.182.0.0/16',
3431         'NC': '101.101.0.0/18',
3432         'NE': '197.214.0.0/18',
3433         'NF': '203.17.240.0/22',
3434         'NG': '105.112.0.0/12',
3435         'NI': '186.76.0.0/15',
3436         'NL': '145.96.0.0/11',
3437         'NO': '84.208.0.0/13',
3438         'NP': '36.252.0.0/15',
3439         'NR': '203.98.224.0/19',
3440         'NU': '49.156.48.0/22',
3441         'NZ': '49.224.0.0/14',
3442         'OM': '5.36.0.0/15',
3443         'PA': '186.72.0.0/15',
3444         'PE': '186.160.0.0/14',
3445         'PF': '123.50.64.0/18',
3446         'PG': '124.240.192.0/19',
3447         'PH': '49.144.0.0/13',
3448         'PK': '39.32.0.0/11',
3449         'PL': '83.0.0.0/11',
3450         'PM': '70.36.0.0/20',
3451         'PR': '66.50.0.0/16',
3452         'PS': '188.161.0.0/16',
3453         'PT': '85.240.0.0/13',
3454         'PW': '202.124.224.0/20',
3455         'PY': '181.120.0.0/14',
3456         'QA': '37.210.0.0/15',
3457         'RE': '139.26.0.0/16',
3458         'RO': '79.112.0.0/13',
3459         'RS': '178.220.0.0/14',
3460         'RU': '5.136.0.0/13',
3461         'RW': '105.178.0.0/15',
3462         'SA': '188.48.0.0/13',
3463         'SB': '202.1.160.0/19',
3464         'SC': '154.192.0.0/11',
3465         'SD': '154.96.0.0/13',
3466         'SE': '78.64.0.0/12',
3467         'SG': '152.56.0.0/14',
3468         'SI': '188.196.0.0/14',
3469         'SK': '78.98.0.0/15',
3470         'SL': '197.215.0.0/17',
3471         'SM': '89.186.32.0/19',
3472         'SN': '41.82.0.0/15',
3473         'SO': '197.220.64.0/19',
3474         'SR': '186.179.128.0/17',
3475         'SS': '105.235.208.0/21',
3476         'ST': '197.159.160.0/19',
3477         'SV': '168.243.0.0/16',
3478         'SX': '190.102.0.0/20',
3479         'SY': '5.0.0.0/16',
3480         'SZ': '41.84.224.0/19',
3481         'TC': '65.255.48.0/20',
3482         'TD': '154.68.128.0/19',
3483         'TG': '196.168.0.0/14',
3484         'TH': '171.96.0.0/13',
3485         'TJ': '85.9.128.0/18',
3486         'TK': '27.96.24.0/21',
3487         'TL': '180.189.160.0/20',
3488         'TM': '95.85.96.0/19',
3489         'TN': '197.0.0.0/11',
3490         'TO': '175.176.144.0/21',
3491         'TR': '78.160.0.0/11',
3492         'TT': '186.44.0.0/15',
3493         'TV': '202.2.96.0/19',
3494         'TW': '120.96.0.0/11',
3495         'TZ': '156.156.0.0/14',
3496         'UA': '93.72.0.0/13',
3497         'UG': '154.224.0.0/13',
3498         'US': '3.0.0.0/8',
3499         'UY': '167.56.0.0/13',
3500         'UZ': '82.215.64.0/18',
3501         'VA': '212.77.0.0/19',
3502         'VC': '24.92.144.0/20',
3503         'VE': '186.88.0.0/13',
3504         'VG': '172.103.64.0/18',
3505         'VI': '146.226.0.0/16',
3506         'VN': '14.160.0.0/11',
3507         'VU': '202.80.32.0/20',
3508         'WF': '117.20.32.0/21',
3509         'WS': '202.4.32.0/19',
3510         'YE': '134.35.0.0/16',
3511         'YT': '41.242.116.0/22',
3512         'ZA': '41.0.0.0/11',
3513         'ZM': '165.56.0.0/13',
3514         'ZW': '41.85.192.0/19',
3515     }
3516
3517     @classmethod
3518     def random_ipv4(cls, code):
3519         block = cls._country_ip_map.get(code.upper())
3520         if not block:
3521             return None
3522         addr, preflen = block.split('/')
3523         addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
3524         addr_max = addr_min | (0xffffffff >> int(preflen))
3525         return compat_str(socket.inet_ntoa(
3526             compat_struct_pack('!L', random.randint(addr_min, addr_max))))
3527
3528
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets an individual request select its own proxy.

    A request may carry a 'Ytdl-request-proxy' header; its value replaces
    whatever proxy is configured on the handler for that request.
    """

    def __init__(self, proxies=None):
        # Install http_open/https_open fallbacks first, so that proxy_open()
        # still runs (and can honour a per-request proxy) for schemes that
        # have no configured proxy. The base class replaces these for the
        # schemes present in `proxies`.
        for scheme in ('http', 'https'):
            setattr(
                self, scheme + '_open',
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            # The header only carries the choice; remove it from the request.
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # SOCKS proxies are not handled here: the proxy is recorded in a
            # header and youtube-dl's http/https handlers wrap the socket.
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
3552
3553
3554 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3555 # released into Public Domain
3556 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3557
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    Values that are zero or negative yield a single zero byte.

    If optional blocksize is given and greater than zero, the front of the
    byte string is padded with binary zeros so that its length is a multiple
    of blocksize.
    """
    n = int(n)
    words = []
    # Emit 32 bits at a time, least significant word first.
    while n > 0:
        words.append(compat_struct_pack('>I', n & 0xffffffff))
        n >>= 32
    # Drop the zero bytes in front of the most significant word; nothing is
    # left only when no word was produced at all.
    s = b''.join(reversed(words)).lstrip(b'\000') or b'\000'
    if blocksize > 0:
        remainder = len(s) % blocksize
        if remainder:
            s = b'\000' * (blocksize - remainder) + s
    return s
3586
3587
def bytes_to_long(s):
    """Convert a big-endian byte string to an integer.

    This is (essentially) the inverse of long_to_bytes().  An empty string
    yields 0.
    """
    # Left-pad with zero bytes so the input splits into whole 32-bit words.
    missing = -len(s) % 4
    if missing:
        s = b'\000' * missing + s

    acc = 0
    for offset in range(0, len(s), 4):
        (word,) = compat_struct_unpack('>I', s[offset:offset + 4])
        acc = (acc << 32) | word
    return acc
3603
3604
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt with OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The payload is the data read as a little-endian number: reverse the
    # bytes, then parse their hex form as one big integer.
    reversed_hex = binascii.hexlify(data[::-1])
    return '%x' % pow(int(reversed_hex, 16), exponent, modulus)
3620
3621
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout [0, 2] + padding + [0] + data, where the
    padding consists of random non-zero bytes.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data leaves room for fewer than 8 padding
                               bytes (len(data) > length - 11)
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # Padding bytes must never be zero: the first zero byte after the
    # [0, 2] prefix marks where the padding ends and the data begins.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
3635
3636
def encode_base_n(num, n, table=None):
    """Return the integer num written in base n as a string.

    Digits are taken from table; when table is not given, the first n
    characters of 0-9, a-z, A-Z are used.  A negative num is encoded as '-'
    followed by the encoding of its magnitude.

    Raises ValueError if n exceeds the number of available digits, or if
    n is smaller than 2 and num is non-zero.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    if n < 2:
        # Dividing by such a base would never bring num down to zero.
        raise ValueError('base %d is too small to encode %d' % (n, num))

    sign = ''
    if num < 0:
        # Floor division never takes a negative number to zero, so encode
        # the magnitude and put the sign back in front.
        sign, num = '-', -num

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    # Digits were produced least significant first.
    return sign + ''.join(reversed(digits))
3653
3654
def decode_packed_codes(code):
    """Expand the packed code found in `code` and return it as a string.

    PACKED_CODES_RE must capture four groups: the obfuscated payload, the
    numeric base, the symbol count and the '|'-separated symbol list.  Each
    index below the count, written in that base, is a token standing for the
    symbol at that index; an empty symbol means the token stands for itself.
    Words of the payload that are not tokens are left unchanged.

    Raises AttributeError when `code` does not match PACKED_CODES_RE.
    """
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    for index in range(count):
        token = encode_base_n(index, base)
        symbol_table[token] = symbols[index] or token

    return re.sub(
        r'\b(\w+)\b',
        # A word without a table entry is not a packed token: keep it as is
        # rather than failing the whole decode with a KeyError.
        lambda m: symbol_table.get(m.group(0), m.group(0)),
        obfuscated_code)
3671
3672
def parse_m3u8_attributes(attrib):
    """Parse an m3u8 attribute list (KEY=value,KEY="quoted value",...).

    Returns a dict mapping attribute names to their values as strings, with
    the surrounding double quotes removed from quoted values.
    """
    attribute_re = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    info = {}
    for mobj in re.finditer(attribute_re, attrib):
        key, val = mobj.group('key', 'val')
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
3680
3681
def urshift(val, n):
    """Shift val right by n bits, treating a negative val as unsigned 32-bit.

    A negative val has 2**32 added to it before the shift is applied.
    """
    if val < 0:
        val += 0x100000000
    return val >> n
3684
3685
3686 # Based on png2str() written by @gdkchan and improved by @yokrysty
3687 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a PNG image into rows of unfiltered sample values.

    Every scanline is read as width * 3 one-byte samples (presumably 8-bit
    RGB, non-interlaced; the bit depth and colour type stored in IHDR are
    not inspected).

    Returns a (width, height, pixels) tuple, where pixels is a list of
    height rows and every row is a flat list of width * 3 integers.

    Raises IOError if the PNG signature or the leading IHDR chunk is
    missing, or if the file contains no IDAT data.
    """
    # Reference: https://www.w3.org/TR/PNG/
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or png_data[12:16] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}

    def unpack_integer(raw):
        return compat_struct_unpack(int_map[len(raw)], raw)[0]

    def paeth_predictor(a, b, c):
        # Pick whichever of left (a), up (b), upper-left (c) is closest to
        # a + b - c, preferring them in that order on ties.
        p = a + b - c
        pa = abs(p - a)
        pb = abs(p - b)
        pc = abs(p - c)
        if pa <= pb and pa <= pc:
            return a
        if pb <= pc:
            return b
        return c

    # Walk the chunk sequence: 4-byte length, 4-byte type, data, 4-byte CRC.
    chunks = []
    end = len(png_data)
    pos = 8
    while pos < end:
        length = unpack_integer(png_data[pos:pos + 4])
        pos += 4
        chunk_type = png_data[pos:pos + 4]
        pos += 4
        chunk_data = png_data[pos:pos + length]
        pos += length + 4  # the extra 4 bytes skip the CRC
        chunks.append((chunk_type, chunk_data))

    ihdr = chunks[0][1]
    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # The compressed image stream is the concatenation of all IDAT chunks.
    idat = b''.join(data for kind, data in chunks if kind == b'IDAT')
    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    for y in range(height):
        # Every scanline is preceded by one byte naming its filter type.
        row_start = y * (1 + stride)
        filter_type = decompressed_data[row_start]

        current_row = []
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[row_start + 1 + x]

            # Neighbouring samples that are already decoded; positions
            # outside the image count as 0.
            left = current_row[x - 3] if x > 2 else 0
            up = pixels[y - 1][x] if y > 0 else 0

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                upper_left = pixels[y - 1][x - 3] if x > 2 and y > 0 else 0
                color = (color + paeth_predictor(left, up, upper_left)) & 0xff

            current_row.append(color)

    return width, height, pixels
3791
3792
def write_xattr(path, key, value):
    """Set the extended attribute `key` of the file at `path` to `value`.

    `value` is a byte string.  The first usable backend is taken:
      1. the Python `xattr` module (either the pyxattr or the xattr flavour);
      2. an NTFS Alternate Data Stream, when compat_os_name is 'nt';
      3. the `setfattr` command line tool, or failing that `xattr`.

    Raises XAttrUnavailableError if the installed pyxattr is too old or if
    no backend can be found, and XAttrMetadataError if writing fails.
    """
    try:
        import xattr

        if not hasattr(xattr, 'set'):  # xattr
            setxattr = xattr.setxattr
        else:  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        # No xattr module available: use what the platform offers instead.
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            stream_path = path + ':' + key
            try:
                with open(stream_path, 'wb') as stream:
                    stream.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
            return

        user_has_setfattr = check_executable('setfattr', ['--version'])
        user_has_xattr = check_executable('xattr', ['-h'])

        if not (user_has_setfattr or user_has_xattr):
            # On Unix, and can't find pyxattr, setfattr, or xattr.
            if sys.platform.startswith('linux'):
                raise XAttrUnavailableError(
                    "Couldn't find a tool to set the xattrs. "
                    "Install either the python 'pyxattr' or 'xattr' "
                    "modules, or the GNU 'attr' package "
                    "(which contains the 'setfattr' tool).")
            raise XAttrUnavailableError(
                "Couldn't find a tool to set the xattrs. "
                "Install either the python 'xattr' module, "
                "or the 'xattr' binary.")

        # The command line tools take the value as a text argument.
        value = value.decode('utf-8')
        if user_has_setfattr:
            executable, opts = 'setfattr', ['-n', key, '-v', value]
        else:
            executable, opts = 'xattr', ['-w', key, value]

        cmd = [encodeFilename(executable, True)]
        cmd.extend(encodeArgument(o) for o in opts)
        cmd.append(encodeFilename(path, True))

        try:
            p = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        _, stderr = p.communicate()
        stderr = stderr.decode('utf-8', 'replace')
        if p.returncode != 0:
            raise XAttrMetadataError(p.returncode, stderr)
3875
3876
def random_birthday(year_field, month_field, day_field):
    """Return a random birth date as a dict of form field values.

    The date is a real calendar date between 1950-01-01 and 1995-12-31
    (inclusive).  The returned dict maps year_field, month_field and
    day_field to the year, month and day of that date, each as a string.
    """
    start_date = datetime.date(1950, 1, 1)
    end_date = datetime.date(1995, 12, 31)
    # Draw one day out of the whole range instead of three independent
    # numbers, so that impossible dates such as February 31 cannot occur.
    offset = random.randint(0, (end_date - start_date).days)
    random_date = start_date + datetime.timedelta(offset)
    return {
        year_field: str(random_date.year),
        month_field: str(random_date.month),
        day_field: str(random_date.day),
    }