_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import contextlib
  11 import ctypes
  12 import datetime
  13 import email.utils
  14 import email.header
  15 import errno
  16 import functools
  17 import gzip
  18 import io
  19 import itertools
  20 import json
  21 import locale
  22 import math
  23 import operator
  24 import os
  25 import platform
  26 import random
  27 import re
  28 import socket
  29 import ssl
  30 import subprocess
  31 import sys
  32 import tempfile
  33 import traceback
  34 import xml.etree.ElementTree
  35 import zlib
  36
  37 from .compat import (
  38     compat_HTMLParseError,
  39     compat_HTMLParser,
  40     compat_basestring,
  41     compat_chr,
  42     compat_ctypes_WINFUNCTYPE,
  43     compat_etree_fromstring,
  44     compat_expanduser,
  45     compat_html_entities,
  46     compat_html_entities_html5,
  47     compat_http_client,
  48     compat_kwargs,
  49     compat_os_name,
  50     compat_parse_qs,
  51     compat_shlex_quote,
  52     compat_socket_create_connection,
  53     compat_str,
  54     compat_struct_pack,
  55     compat_struct_unpack,
  56     compat_urllib_error,
  57     compat_urllib_parse,
  58     compat_urllib_parse_urlencode,
  59     compat_urllib_parse_urlparse,
  60     compat_urllib_parse_unquote_plus,
  61     compat_urllib_request,
  62     compat_urlparse,
  63     compat_xpath,
  64 )
  65
  66 from .socks import (
  67     ProxyType,
  68     sockssocket,
  69 )
  70
  71
  72 def register_socks_protocols():
  73     # "Register" SOCKS protocols
  74     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  75     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  76     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  77         if scheme not in compat_urlparse.uses_netloc:
  78             compat_urlparse.uses_netloc.append(scheme)
  79
  80
# This is not clearly defined otherwise: the class of compiled regular
# expression objects, taken from a throwaway pattern
compiled_regex_type = type(re.compile(''))

# Browser-like HTTP request headers
# (presumably the "standard headers" that YoutubeDLHandler adds to every
# request; confirm against the handler code)
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Alternative User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Sentinel for "no default supplied", so that None can be passed as a real
# default value (see the default= parameter of the xpath_* helpers)
NO_DEFAULT = object()
  99
# Full English month names, in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Full month names in calendar order, keyed by language code
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}
 110
# Media-related file extensions, written without the leading dot and grouped
# one container/format family per line
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')
 125
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement.  The two arguments
# of zip() are paired position by position, so they must stay the same
# length; the bracketed one-element lists supply the multi-letter
# replacements ('AE', 'OE', 'ss', 'ae', 'oe').
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
 130
 131 DATE_FORMATS = (
 132     '%d %B %Y',
 133     '%d %b %Y',
 134     '%B %d %Y',
 135     '%B %dst %Y',
 136     '%B %dnd %Y',
 137     '%B %dth %Y',
 138     '%b %d %Y',
 139     '%b %dst %Y',
 140     '%b %dnd %Y',
 141     '%b %dth %Y',
 142     '%b %dst %Y %I:%M',
 143     '%b %dnd %Y %I:%M',
 144     '%b %dth %Y %I:%M',
 145     '%Y %m %d',
 146     '%Y-%m-%d',
 147     '%Y/%m/%d',
 148     '%Y/%m/%d %H:%M',
 149     '%Y/%m/%d %H:%M:%S',
 150     '%Y-%m-%d %H:%M',
 151     '%Y-%m-%d %H:%M:%S',
 152     '%Y-%m-%d %H:%M:%S.%f',
 153     '%d.%m.%Y %H:%M',
 154     '%d.%m.%Y %H.%M',
 155     '%Y-%m-%dT%H:%M:%SZ',
 156     '%Y-%m-%dT%H:%M:%S.%fZ',
 157     '%Y-%m-%dT%H:%M:%S.%f0Z',
 158     '%Y-%m-%dT%H:%M:%S',
 159     '%Y-%m-%dT%H:%M:%S.%f',
 160     '%Y-%m-%dT%H:%M',
 161     '%b %d %Y at %H:%M',
 162     '%b %d %Y at %H:%M:%S',
 163     '%B %d %Y at %H:%M',
 164     '%B %d %Y at %H:%M:%S',
 165 )
 166
 167 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 168 DATE_FORMATS_DAY_FIRST.extend([
 169     '%d-%m-%Y',
 170     '%d.%m.%Y',
 171     '%d.%m.%y',
 172     '%d/%m/%Y',
 173     '%d/%m/%y',
 174     '%d/%m/%Y %H:%M:%S',
 175 ])
 176
 177 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 178 DATE_FORMATS_MONTH_FIRST.extend([
 179     '%m-%d-%Y',
 180     '%m.%d.%Y',
 181     '%m/%d/%Y',
 182     '%m/%d/%y',
 183     '%m/%d/%Y %H:%M:%S',
 184 ])
 185
# Matches the argument list of a call shaped like
#   }('<payload>',<int>,<int>,'<a|b|c>'.split('|')
# capturing the payload, the two integers and the '|'-joined word list
# (presumably the tail of "packed" JavaScript; confirm against the callers)
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
 187
 188
 189 def preferredencoding():
 190     """Get preferred encoding.
 191
 192     Returns the best encoding scheme for the system, based on
 193     locale.getpreferredencoding() and some further tweaks.
 194     """
 195     try:
 196         pref = locale.getpreferredencoding()
 197         'TEST'.encode(pref)
 198     except Exception:
 199         pref = 'UTF-8'
 200
 201     return pref
 202
 203
 204 def write_json_file(obj, fn):
 205     """ Encode obj as JSON and write it to fn, atomically if possible """
 206
 207     fn = encodeFilename(fn)
 208     if sys.version_info < (3, 0) and sys.platform != 'win32':
 209         encoding = get_filesystem_encoding()
 210         # os.path.basename returns a bytes object, but NamedTemporaryFile
 211         # will fail if the filename contains non ascii characters unless we
 212         # use a unicode object
 213         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 214         # the same for os.path.dirname
 215         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 216     else:
 217         path_basename = os.path.basename
 218         path_dirname = os.path.dirname
 219
 220     args = {
 221         'suffix': '.tmp',
 222         'prefix': path_basename(fn) + '.',
 223         'dir': path_dirname(fn),
 224         'delete': False,
 225     }
 226
 227     # In Python 2.x, json.dump expects a bytestream.
 228     # In Python 3.x, it writes to a character stream
 229     if sys.version_info < (3, 0):
 230         args['mode'] = 'wb'
 231     else:
 232         args.update({
 233             'mode': 'w',
 234             'encoding': 'utf-8',
 235         })
 236
 237     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 238
 239     try:
 240         with tf:
 241             json.dump(obj, tf)
 242         if sys.platform == 'win32':
 243             # Need to remove existing file on Windows, else os.rename raises
 244             # WindowsError or FileExistsError.
 245             try:
 246                 os.unlink(fn)
 247             except OSError:
 248                 pass
 249         os.rename(tf.name, fn)
 250     except Exception:
 251         try:
 252             os.remove(tf.name)
 253         except OSError:
 254             pass
 255         raise
 256
 257
 258 if sys.version_info >= (2, 7):
 259     def find_xpath_attr(node, xpath, key, val=None):
 260         """ Find the xpath xpath[@key=val] """
 261         assert re.match(r'^[a-zA-Z_-]+$', key)
 262         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 263         return node.find(expr)
 264 else:
 265     def find_xpath_attr(node, xpath, key, val=None):
 266         for f in node.findall(compat_xpath(xpath)):
 267             if key not in f.attrib:
 268                 continue
 269             if val is None or f.attrib.get(key) == val:
 270                 return f
 271         return None
 272
 273 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 274 # the namespace parameter
 275
 276
 277 def xpath_with_ns(path, ns_map):
 278     components = [c.split(':') for c in path.split('/')]
 279     replaced = []
 280     for c in components:
 281         if len(c) == 1:
 282             replaced.append(c[0])
 283         else:
 284             ns, tag = c
 285             replaced.append('{%s}%s' % (ns_map[ns], tag))
 286     return '/'.join(replaced)
 287
 288
 289 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 290     def _find_xpath(xpath):
 291         return node.find(compat_xpath(xpath))
 292
 293     if isinstance(xpath, (str, compat_str)):
 294         n = _find_xpath(xpath)
 295     else:
 296         for xp in xpath:
 297             n = _find_xpath(xp)
 298             if n is not None:
 299                 break
 300
 301     if n is None:
 302         if default is not NO_DEFAULT:
 303             return default
 304         elif fatal:
 305             name = xpath if name is None else name
 306             raise ExtractorError('Could not find XML element %s' % name)
 307         else:
 308             return None
 309     return n
 310
 311
 312 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 313     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 314     if n is None or n == default:
 315         return n
 316     if n.text is None:
 317         if default is not NO_DEFAULT:
 318             return default
 319         elif fatal:
 320             name = xpath if name is None else name
 321             raise ExtractorError('Could not find XML element\'s text %s' % name)
 322         else:
 323             return None
 324     return n.text
 325
 326
 327 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 328     n = find_xpath_attr(node, xpath, key)
 329     if n is None:
 330         if default is not NO_DEFAULT:
 331             return default
 332         elif fatal:
 333             name = '%s[@%s]' % (xpath, key) if name is None else name
 334             raise ExtractorError('Could not find XML attribute %s' % name)
 335         else:
 336             return None
 337     return n.attrib[key]
 338
 339
 340 def get_element_by_id(id, html):
 341     """Return the content of the tag with the specified ID in the passed HTML document"""
 342     return get_element_by_attribute('id', id, html)
 343
 344
 345 def get_element_by_class(class_name, html):
 346     """Return the content of the first tag with the specified class in the passed HTML document"""
 347     retval = get_elements_by_class(class_name, html)
 348     return retval[0] if retval else None
 349
 350
 351 def get_element_by_attribute(attribute, value, html, escape_value=True):
 352     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 353     return retval[0] if retval else None
 354
 355
 356 def get_elements_by_class(class_name, html):
 357     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 358     return get_elements_by_attribute(
 359         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 360         html, escape_value=False)
 361
 362
 363 def get_elements_by_attribute(attribute, value, html, escape_value=True):
 364     """Return the content of the tag with the specified attribute in the passed HTML document"""
 365
 366     value = re.escape(value) if escape_value else value
 367
 368     retlist = []
 369     for m in re.finditer(r'''(?xs)
 370         <([a-zA-Z0-9:._-]+)
 371          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
 372          \s+%s=['"]?%s['"]?
 373          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
 374         \s*>
 375         (?P<content>.*?)
 376         </\1>
 377     ''' % (re.escape(attribute), value), html):
 378         res = m.group('content')
 379
 380         if res.startswith('"') or res.startswith("'"):
 381             res = res[1:-1]
 382
 383         retlist.append(unescapeHTML(res))
 384
 385     return retlist
 386
 387
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element

    After markup has been fed, ``attrs`` holds the attributes of the most
    recently handled start tag as a dict (empty if none was seen).
    """
    def __init__(self):
        # Result slot, initialised before the base class sets itself up
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # attrs arrives as (name, value) pairs; every start tag overwrites
        # the result of the previous one
        self.attrs = dict(attrs)
 396
 397
 398 def extract_attributes(html_element):
 399     """Given a string for an HTML element such as
 400     <el
 401          a="foo" B="bar" c="&98;az" d=boz
 402          empty= noval entity="&amp;"
 403          sq='"' dq="'"
 404     >
 405     Decode and return a dictionary of attributes.
 406     {
 407         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 408         'empty': '', 'noval': None, 'entity': '&',
 409         'sq': '"', 'dq': '\''
 410     }.
 411     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 412     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 413     """
 414     parser = HTMLAttributeParser()
 415     try:
 416         parser.feed(html_element)
 417         parser.close()
 418     # Older Python may throw HTMLParseError in case of malformed HTML
 419     except compat_HTMLParseError:
 420         pass
 421     return parser.attrs
 422
 423
 424 def clean_html(html):
 425     """Clean an HTML snippet into a readable string"""
 426
 427     if html is None:  # Convenience for sanitizing descriptions etc.
 428         return html
 429
 430     # Newline vs <br />
 431     html = html.replace('\n', ' ')
 432     html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
 433     html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 434     # Strip html tags
 435     html = re.sub('<.*?>', '', html)
 436     # Replace html entities
 437     html = unescapeHTML(html)
 438     return html.strip()
 439
 440
 441 def sanitize_open(filename, open_mode):
 442     """Try to open the given filename, and slightly tweak it if this fails.
 443
 444     Attempts to open the given filename. If this fails, it tries to change
 445     the filename slightly, step by step, until it's either able to open it
 446     or it fails and raises a final exception, like the standard open()
 447     function.
 448
 449     It returns the tuple (stream, definitive_file_name).
 450     """
 451     try:
 452         if filename == '-':
 453             if sys.platform == 'win32':
 454                 import msvcrt
 455                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 456             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 457         stream = open(encodeFilename(filename), open_mode)
 458         return (stream, filename)
 459     except (IOError, OSError) as err:
 460         if err.errno in (errno.EACCES,):
 461             raise
 462
 463         # In case of error, try to remove win32 forbidden chars
 464         alt_filename = sanitize_path(filename)
 465         if alt_filename == filename:
 466             raise
 467         else:
 468             # An exception here should be caught in the caller
 469             stream = open(encodeFilename(alt_filename), open_mode)
 470             return (stream, alt_filename)
 471
 472
 473 def timeconvert(timestr):
 474     """Convert RFC 2822 defined time string into system timestamp"""
 475     timestamp = None
 476     timetuple = email.utils.parsedate_tz(timestr)
 477     if timetuple is not None:
 478         timestamp = email.utils.mktime_tz(timetuple)
 479     return timestamp
 480
 481
 482 def sanitize_filename(s, restricted=False, is_id=False):
 483     """Sanitizes a string so it could be used as part of a filename.
 484     If restricted is set, use a stricter subset of allowed characters.
 485     Set is_id if this is not an arbitrary string, but an ID that should be kept
 486     if possible.
 487     """
 488     def replace_insane(char):
 489         if restricted and char in ACCENT_CHARS:
 490             return ACCENT_CHARS[char]
 491         if char == '?' or ord(char) < 32 or ord(char) == 127:
 492             return ''
 493         elif char == '"':
 494             return '' if restricted else '\''
 495         elif char == ':':
 496             return '_-' if restricted else ' -'
 497         elif char in '\\/|*<>':
 498             return '_'
 499         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 500             return '_'
 501         if restricted and ord(char) > 127:
 502             return '_'
 503         return char
 504
 505     # Handle timestamps
 506     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 507     result = ''.join(map(replace_insane, s))
 508     if not is_id:
 509         while '__' in result:
 510             result = result.replace('__', '_')
 511         result = result.strip('_')
 512         # Common case of "Foreign band name - English song title"
 513         if restricted and result.startswith('-_'):
 514             result = result[2:]
 515         if result.startswith('-'):
 516             result = '_' + result[len('-'):]
 517         result = result.lstrip('.')
 518         if not result:
 519             result = '_'
 520     return result
 521
 522
 523 def sanitize_path(s):
 524     """Sanitizes and normalizes path on Windows"""
 525     if sys.platform != 'win32':
 526         return s
 527     drive_or_unc, _ = os.path.splitdrive(s)
 528     if sys.version_info < (2, 7) and not drive_or_unc:
 529         drive_or_unc, _ = os.path.splitunc(s)
 530     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 531     if drive_or_unc:
 532         norm_path.pop(0)
 533     sanitized_path = [
 534         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 535         for path_part in norm_path]
 536     if drive_or_unc:
 537         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 538     return os.path.join(*sanitized_path)
 539
 540
 541 def sanitize_url(url):
 542     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 543     # the number of unwanted failures due to missing protocol
 544     if url.startswith('//'):
 545         return 'http:%s' % url
 546     # Fix some common typos seen so far
 547     COMMON_TYPOS = (
 548         # https://github.com/rg3/youtube-dl/issues/15649
 549         (r'^httpss://', r'https://'),
 550         # https://bx1.be/lives/direct-tv/
 551         (r'^rmtp([es]?)://', r'rtmp\1://'),
 552     )
 553     for mistake, fixup in COMMON_TYPOS:
 554         if re.match(mistake, url):
 555             return re.sub(mistake, fixup, url)
 556     return url
 557
 558
 559 def sanitized_Request(url, *args, **kwargs):
 560     return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
 561
 562
 563 def expand_path(s):
 564     """Expand shell variables and ~"""
 565     return os.path.expandvars(compat_expanduser(s))
 566
 567
 568 def orderedSet(iterable):
 569     """ Remove all duplicates from the input iterable """
 570     res = []
 571     for el in iterable:
 572         if el not in res:
 573             res.append(el)
 574     return res
 575
 576
 577 def _htmlentity_transform(entity_with_semicolon):
 578     """Transforms an HTML entity to a character."""
 579     entity = entity_with_semicolon[:-1]
 580
 581     # Known non-numeric HTML entity
 582     if entity in compat_html_entities.name2codepoint:
 583         return compat_chr(compat_html_entities.name2codepoint[entity])
 584
 585     # TODO: HTML5 allows entities without a semicolon. For example,
 586     # '&Eacuteric' should be decoded as 'Éric'.
 587     if entity_with_semicolon in compat_html_entities_html5:
 588         return compat_html_entities_html5[entity_with_semicolon]
 589
 590     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 591     if mobj is not None:
 592         numstr = mobj.group(1)
 593         if numstr.startswith('x'):
 594             base = 16
 595             numstr = '0%s' % numstr
 596         else:
 597             base = 10
 598         # See https://github.com/rg3/youtube-dl/issues/7518
 599         try:
 600             return compat_chr(int(numstr, base))
 601         except ValueError:
 602             pass
 603
 604     # Unknown entity in name, return its literal representation
 605     return '&%s;' % entity
 606
 607
 608 def unescapeHTML(s):
 609     if s is None:
 610         return None
 611     assert type(s) == compat_str
 612
 613     return re.sub(
 614         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 615
 616
 617 def get_subprocess_encoding():
 618     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 619         # For subprocess calls, encode with locale encoding
 620         # Refer to http://stackoverflow.com/a/9951851/35070
 621         encoding = preferredencoding()
 622     else:
 623         encoding = sys.getfilesystemencoding()
 624     if encoding is None:
 625         encoding = 'utf-8'
 626     return encoding
 627
 628
 629 def encodeFilename(s, for_subprocess=False):
 630     """
 631     @param s The name of the file
 632     """
 633
 634     assert type(s) == compat_str
 635
 636     # Python 3 has a Unicode API
 637     if sys.version_info >= (3, 0):
 638         return s
 639
 640     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 641     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 642     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 643     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 644         return s
 645
 646     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 647     if sys.platform.startswith('java'):
 648         return s
 649
 650     return s.encode(get_subprocess_encoding(), 'ignore')
 651
 652
 653 def decodeFilename(b, for_subprocess=False):
 654
 655     if sys.version_info >= (3, 0):
 656         return b
 657
 658     if not isinstance(b, bytes):
 659         return b
 660
 661     return b.decode(get_subprocess_encoding(), 'ignore')
 662
 663
 664 def encodeArgument(s):
 665     if not isinstance(s, compat_str):
 666         # Legacy code that uses byte strings
 667         # Uncomment the following line after fixing all post processors
 668         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 669         s = s.decode('ascii')
 670     return encodeFilename(s, True)
 671
 672
 673 def decodeArgument(b):
 674     return decodeFilename(b, True)
 675
 676
 677 def decodeOption(optval):
 678     if optval is None:
 679         return optval
 680     if isinstance(optval, bytes):
 681         optval = optval.decode(preferredencoding())
 682
 683     assert isinstance(optval, compat_str)
 684     return optval
 685
 686
 687 def formatSeconds(secs):
 688     if secs > 3600:
 689         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 690     elif secs > 60:
 691         return '%d:%02d' % (secs // 60, secs % 60)
 692     else:
 693         return '%d' % secs
 694
 695
 696 def make_HTTPS_handler(params, **kwargs):
 697     opts_no_check_certificate = params.get('nocheckcertificate', False)
 698     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 699         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 700         if opts_no_check_certificate:
 701             context.check_hostname = False
 702             context.verify_mode = ssl.CERT_NONE
 703         try:
 704             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 705         except TypeError:
 706             # Python 2.7.8
 707             # (create_default_context present but HTTPSHandler has no context=)
 708             pass
 709
 710     if sys.version_info < (3, 2):
 711         return YoutubeDLHTTPSHandler(params, **kwargs)
 712     else:  # Python < 3.4
 713         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 714         context.verify_mode = (ssl.CERT_NONE
 715                                if opts_no_check_certificate
 716                                else ssl.CERT_REQUIRED)
 717         context.set_default_verify_paths()
 718         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 719
 720
 721 def bug_reports_message():
 722     if ytdl_is_updateable():
 723         update_cmd = 'type  youtube-dl -U  to update'
 724     else:
 725         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 726     msg = '; please report this issue on https://yt-dl.org/bug .'
 727     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 728     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 729     return msg
 730
 731
 732 class YoutubeDLError(Exception):
 733     """Base exception for YoutubeDL errors."""
 734     pass
 735
 736
 737 class ExtractorError(YoutubeDLError):
 738     """Error during info extraction."""
 739
 740     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 741         """ tb, if given, is the original traceback (so that it can be printed out).
 742         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 743         """
 744
 745         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 746             expected = True
 747         if video_id is not None:
 748             msg = video_id + ': ' + msg
 749         if cause:
 750             msg += ' (caused by %r)' % cause
 751         if not expected:
 752             msg += bug_reports_message()
 753         super(ExtractorError, self).__init__(msg)
 754
 755         self.traceback = tb
 756         self.exc_info = sys.exc_info()  # preserve original exception
 757         self.cause = cause
 758         self.video_id = video_id
 759
 760     def format_traceback(self):
 761         if self.traceback is None:
 762             return None
 763         return ''.join(traceback.format_tb(self.traceback))
 764
 765
 766 class UnsupportedError(ExtractorError):
 767     def __init__(self, url):
 768         super(UnsupportedError, self).__init__(
 769             'Unsupported URL: %s' % url, expected=True)
 770         self.url = url
 771
 772
 773 class RegexNotFoundError(ExtractorError):
 774     """Error when a regex didn't match"""
 775     pass
 776
 777
 778 class GeoRestrictedError(ExtractorError):
 779     """Geographic restriction Error exception.
 780
 781     This exception may be thrown when a video is not available from your
 782     geographic location due to geographic restrictions imposed by a website.
 783     """
 784     def __init__(self, msg, countries=None):
 785         super(GeoRestrictedError, self).__init__(msg, expected=True)
 786         self.msg = msg
 787         self.countries = countries
 788
 789
 790 class DownloadError(YoutubeDLError):
 791     """Download Error exception.
 792
 793     This exception may be thrown by FileDownloader objects if they are not
 794     configured to continue on errors. They will contain the appropriate
 795     error message.
 796     """
 797
 798     def __init__(self, msg, exc_info=None):
 799         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 800         super(DownloadError, self).__init__(msg)
 801         self.exc_info = exc_info
 802
 803
 804 class SameFileError(YoutubeDLError):
 805     """Same File exception.
 806
 807     This exception will be thrown by FileDownloader objects if they detect
 808     multiple files would have to be downloaded to the same file on disk.
 809     """
 810     pass
 811
 812
 813 class PostProcessingError(YoutubeDLError):
 814     """Post Processing exception.
 815
 816     This exception may be raised by PostProcessor's .run() method to
 817     indicate an error in the postprocessing task.
 818     """
 819
 820     def __init__(self, msg):
 821         super(PostProcessingError, self).__init__(msg)
 822         self.msg = msg
 823
 824
 825 class MaxDownloadsReached(YoutubeDLError):
 826     """ --max-downloads limit has been reached. """
 827     pass
 828
 829
 830 class UnavailableVideoError(YoutubeDLError):
 831     """Unavailable Format exception.
 832
 833     This exception will be thrown when a video is requested
 834     in a format that is not available for that video.
 835     """
 836     pass
 837
 838
 839 class ContentTooShortError(YoutubeDLError):
 840     """Content Too Short exception.
 841
 842     This exception may be raised by FileDownloader objects when a file they
 843     download is too small for what the server announced first, indicating
 844     the connection was probably interrupted.
 845     """
 846
 847     def __init__(self, downloaded, expected):
 848         super(ContentTooShortError, self).__init__(
 849             'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
 850         )
 851         # Both in bytes
 852         self.downloaded = downloaded
 853         self.expected = expected
 854
 855
 856 class XAttrMetadataError(YoutubeDLError):
 857     def __init__(self, code=None, msg='Unknown error'):
 858         super(XAttrMetadataError, self).__init__(msg)
 859         self.code = code
 860         self.msg = msg
 861
 862         # Parsing code and msg
 863         if (self.code in (errno.ENOSPC, errno.EDQUOT) or
 864                 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
 865             self.reason = 'NO_SPACE'
 866         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
 867             self.reason = 'VALUE_TOO_LONG'
 868         else:
 869             self.reason = 'NOT_SUPPORTED'
 870
 871
 872 class XAttrUnavailableError(YoutubeDLError):
 873     pass
 874
 875
 876 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 877     # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
 878     # expected HTTP responses to meet HTTP/1.0 or later (see also
 879     # https://github.com/rg3/youtube-dl/issues/6727)
 880     if sys.version_info < (3, 0):
 881         kwargs['strict'] = True
 882     hc = http_class(*args, **compat_kwargs(kwargs))
 883     source_address = ydl_handler._params.get('source_address')
 884     if source_address is not None:
 885         sa = (source_address, 0)
 886         if hasattr(hc, 'source_address'):  # Python 2.7+
 887             hc.source_address = sa
 888         else:  # Python 2.6
 889             def _hc_connect(self, *args, **kwargs):
 890                 sock = compat_socket_create_connection(
 891                     (self.host, self.port), self.timeout, sa)
 892                 if is_https:
 893                     self.sock = ssl.wrap_socket(
 894                         sock, self.key_file, self.cert_file,
 895                         ssl_version=ssl.PROTOCOL_TLSv1)
 896                 else:
 897                     self.sock = sock
 898             hc.connect = functools.partial(_hc_connect, hc)
 899
 900     return hc
 901
 902
 903 def handle_youtubedl_headers(headers):
 904     filtered_headers = headers
 905
 906     if 'Youtubedl-no-compression' in filtered_headers:
 907         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 908         del filtered_headers['Youtubedl-no-compression']
 909
 910     return filtered_headers
 911
 912
 913 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 914     """Handler for HTTP requests and responses.
 915
 916     This class, when installed with an OpenerDirector, automatically adds
 917     the standard headers to every HTTP request and handles gzipped and
 918     deflated responses from web servers. If compression is to be avoided in
 919     a particular request, the original request in the program code only has
 920     to include the HTTP header "Youtubedl-no-compression", which will be
 921     removed before making the real request.
 922
 923     Part of this code was copied from:
 924
 925     http://techknack.net/python-urllib2-handlers/
 926
 927     Andrew Rowls, the author of that code, agreed to release it to the
 928     public domain.
 929     """
 930
 931     def __init__(self, params, *args, **kwargs):
 932         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
 933         self._params = params
 934
 935     def http_open(self, req):
 936         conn_class = compat_http_client.HTTPConnection
 937
 938         socks_proxy = req.headers.get('Ytdl-socks-proxy')
 939         if socks_proxy:
 940             conn_class = make_socks_conn_class(conn_class, socks_proxy)
 941             del req.headers['Ytdl-socks-proxy']
 942
 943         return self.do_open(functools.partial(
 944             _create_http_connection, self, conn_class, False),
 945             req)
 946
 947     @staticmethod
 948     def deflate(data):
 949         try:
 950             return zlib.decompress(data, -zlib.MAX_WBITS)
 951         except zlib.error:
 952             return zlib.decompress(data)
 953
 954     def http_request(self, req):
 955         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
 956         # always respected by websites, some tend to give out URLs with non percent-encoded
 957         # non-ASCII characters (see telemb.py, ard.py [#3412])
 958         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
 959         # To work around aforementioned issue we will replace request's original URL with
 960         # percent-encoded one
 961         # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
 962         # the code of this workaround has been moved here from YoutubeDL.urlopen()
 963         url = req.get_full_url()
 964         url_escaped = escape_url(url)
 965
 966         # Substitute URL if any change after escaping
 967         if url != url_escaped:
 968             req = update_Request(req, url=url_escaped)
 969
 970         for h, v in std_headers.items():
 971             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
 972             # The dict keys are capitalized because of this bug by urllib
 973             if h.capitalize() not in req.headers:
 974                 req.add_header(h, v)
 975
 976         req.headers = handle_youtubedl_headers(req.headers)
 977
 978         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 979             # Python 2.6 is brain-dead when it comes to fragments
 980             req._Request__original = req._Request__original.partition('#')[0]
 981             req._Request__r_type = req._Request__r_type.partition('#')[0]
 982
 983         return req
 984
 985     def http_response(self, req, resp):
 986         old_resp = resp
 987         # gzip
 988         if resp.headers.get('Content-encoding', '') == 'gzip':
 989             content = resp.read()
 990             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 991             try:
 992                 uncompressed = io.BytesIO(gz.read())
 993             except IOError as original_ioerror:
 994                 # There may be junk add the end of the file
 995                 # See http://stackoverflow.com/q/4928560/35070 for details
 996                 for i in range(1, 1024):
 997                     try:
 998                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 999                         uncompressed = io.BytesIO(gz.read())
1000                     except IOError:
1001                         continue
1002                     break
1003                 else:
1004                     raise original_ioerror
1005             resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
1006             resp.msg = old_resp.msg
1007             del resp.headers['Content-encoding']
1008         # deflate
1009         if resp.headers.get('Content-encoding', '') == 'deflate':
1010             gz = io.BytesIO(self.deflate(resp.read()))
1011             resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
1012             resp.msg = old_resp.msg
1013             del resp.headers['Content-encoding']
1014         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1015         # https://github.com/rg3/youtube-dl/issues/6457).
1016         if 300 <= resp.code < 400:
1017             location = resp.headers.get('Location')
1018             if location:
1019                 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1020                 if sys.version_info >= (3, 0):
1021                     location = location.encode('iso-8859-1').decode('utf-8')
1022                 else:
1023                     location = location.decode('utf-8')
1024                 location_escaped = escape_url(location)
1025                 if location != location_escaped:
1026                     del resp.headers['Location']
1027                     if sys.version_info < (3, 0):
1028                         location_escaped = location_escaped.encode('utf-8')
1029                     resp.headers['Location'] = location_escaped
1030         return resp
1031
1032     https_request = http_request
1033     https_response = http_response
1034
1035
def make_socks_conn_class(base_class, socks_proxy):
    """Build a subclass of base_class that connects through a SOCKS proxy.

    base_class  -- compat_http_client.HTTPConnection or HTTPSConnection
                   (or a subclass of either)
    socks_proxy -- proxy URL whose scheme is one of socks, socks4, socks4a
                   or socks5; port defaults to 1080, and username and
                   password are taken from the URL when present

    Returns the new connection class.  Raises ValueError for any other
    scheme (this used to surface as an UnboundLocalError).
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        raise ValueError(
            'Unsupported SOCKS proxy scheme: %r' % url_components.scheme)

    def unquote_if_non_empty(s):
        # Credentials may be absent (None); only decode real values
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            # Only plain numeric timeouts are forwarded to the socket
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1077
1078
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler with source-address and SOCKS proxy support.

    params          -- options dict, stored as _params and read by
                       _create_http_connection
    https_conn_class -- connection class to use; defaults to
                       compat_http_client.HTTPSConnection
    """

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req, going through a SOCKS proxy when the request asks for one."""
        conn_class = self._https_conn_class

        # Forward only the TLS settings this Python version's handler has
        optional_settings = (
            ('context', '_context'),  # python > 2.6
            ('check_hostname', '_check_hostname'),  # python 3.x
        )
        open_kwargs = dict(
            (keyword, getattr(self, attribute))
            for keyword, attribute in optional_settings
            if hasattr(self, attribute))

        # The proxy to use travels in an internal header that must not be
        # sent to the server.
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
1102
1103
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also handles HTTPS requests and responses."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        """Delegate to HTTPCookieProcessor.http_response unchanged."""
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # NOTE: the workaround below is currently disabled.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        processed = compat_urllib_request.HTTPCookieProcessor.http_response(
            self, request, response)
        return processed

    # HTTPS traffic goes through the same cookie handling as HTTP
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1126
1127
def extract_timezone(date_str):
    """Split a trailing UTC designator or numeric offset off date_str.

    Recognises a final "Z" or "+HH:MM" / "-HHMM" style offset (optionally
    preceded by a space) after at least 8 other characters.

    Returns (offset, remainder): offset is a datetime.timedelta (zero when
    nothing was recognised or the suffix was "Z") and remainder is date_str
    without the recognised suffix.
    """
    tz_match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not tz_match:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(tz_match.group('tz'))]
    sign_char = tz_match.group('sign')
    if not sign_char:
        # Plain "Z": UTC, no offset to apply
        return datetime.timedelta(), remainder

    direction = 1 if sign_char == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(tz_match.group('hours')),
        minutes=direction * int(tz_match.group('minutes')))
    return offset, remainder
1144
1145
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    date_str  -- "YYYY-MM-DD<delimiter>HH:MM:SS", optionally with fractional
                 seconds (which are discarded); None yields None
    delimiter -- separator between the date and the time part
    timezone  -- datetime.timedelta offset to subtract; when None the offset
                 is extracted from date_str itself

    Returns None when date_str does not match the expected format.
    """
    if date_str is None:
        return None

    # Fractional seconds are not part of the strptime format below
    cleaned = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, cleaned = extract_timezone(cleaned)

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    try:
        parsed = datetime.datetime.strptime(cleaned, date_format)
    except ValueError:
        return None
    return calendar.timegm((parsed - timezone).timetuple())
1163
1164
def date_formats(day_first=True):
    """Return DATE_FORMATS_DAY_FIRST or, if day_first is false, DATE_FORMATS_MONTH_FIRST."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1167
1168
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    # Every format is tried; when several match, the last one wins
    upload_date = None
    for fmt in date_formats(day_first):
        try:
            parsed = datetime.datetime.strptime(cleaned, fmt)
            upload_date = parsed.strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Fall back to the e-mail (RFC 2822 style) date parser
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if upload_date is None:
        return None
    return compat_str(upload_date)
1195
1196
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    text = re.sub(r'[,|]', '', date_str)

    # Has to be determined before the AM/PM marker is stripped below
    pm_delta = 12 if re.search(r'(?i)PM', text) else 0
    timezone, text = extract_timezone(text)

    # Remove AM/PM + timezone
    text = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', text)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    tz_suffix = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', text)
    if tz_suffix:
        text = text[:-len(tz_suffix.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    with_micros = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', text)
    if with_micros:
        text = with_micros.group(1)

    pm_shift = datetime.timedelta(hours=pm_delta)
    for fmt in date_formats(day_first):
        try:
            moment = datetime.datetime.strptime(text, fmt) - timezone + pm_shift
            return calendar.timegm(moment.timetuple())
        except ValueError:
            continue

    # Fall back to the e-mail (RFC 2822 style) date parser
    timetuple = email.utils.parsedate_tz(text)
    if not timetuple:
        return None
    return calendar.timegm(timetuple) + pm_delta * 3600
1228
1229
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url, falling back to default_ext.

    The text after the last dot of the part before "?" is used when it is
    purely alphanumeric, or when (ignoring trailing slashes) it is one of
    KNOWN_EXTENSIONS.
    """
    if url is None:
        return default_ext

    before_query = url.partition('?')[0]
    candidate = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    return trimmed if trimmed in KNOWN_EXTENSIONS else default_ext
1241
1242
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the last extension of filename with "<sub_lang>.<sub_format>"."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1245
1246
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    "now", "today" and "yesterday" are accepted on their own as well.
    Months count as 30 days and years as 365 days.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    relative = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()

    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount

    unit = relative.group('unit')
    # A bad approximation?
    days_per_unit = {'month': 30, 'year': 365}
    if unit in days_per_unit:
        amount *= days_per_unit[unit]
        unit = 'day'

    return today + datetime.timedelta(**{unit + 's': amount})
1274
1275
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Any string that is not exactly eight digits is returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1284
1285
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str

        A missing start or end leaves that side of the range open (the
        earliest / latest representable date).  Raises ValueError when
        start lies after end.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed like start/end
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1315
1316
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Decode a byte string result so that text is always returned
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1325
1326
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    std_handle_ids = {
        1: -11,  # stdout: STD_OUTPUT_HANDLE
        2: -12,  # stderr: STD_ERROR_HANDLE
    }

    try:
        fileno = out.fileno()
    except (AttributeError, io.UnsupportedOperation):
        # Either the output stream doesn't have a fileno (it's virtual),
        # or it is some strange Windows pseudo file
        return False
    std_handle_id = std_handle_ids.get(fileno)
    if std_handle_id is None:
        return False

    wt = ctypes.wintypes
    k32 = ctypes.windll.kernel32

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        wt.HANDLE, wt.DWORD)(
        ('GetStdHandle', k32))
    console = GetStdHandle(std_handle_id)

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        wt.BOOL, wt.HANDLE, wt.LPWSTR,
        wt.DWORD, ctypes.POINTER(wt.DWORD),
        wt.LPVOID)(('WriteConsoleW', k32))
    chars_written = wt.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(wt.DWORD, wt.DWORD)(('GetFileType', k32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        wt.BOOL, wt.HANDLE,
        ctypes.POINTER(wt.DWORD))(
        ('GetConsoleMode', k32))
    INVALID_HANDLE_VALUE = wt.DWORD(-1).value

    def not_a_console(handle):
        if handle is None or handle == INVALID_HANDLE_VALUE:
            return True
        if (GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR:
            return True
        return GetConsoleMode(handle, ctypes.byref(wt.DWORD())) == 0

    if not_a_console(console):
        return False

    def next_nonbmp_pos(text):
        # Index of the first character above U+FFFF, or len(text) if none
        return next(
            (i for i, c in enumerate(text) if ord(c) > 0xffff), len(text))

    remaining = s
    while remaining:
        # Write at most 1024 characters, stopping before any non-BMP one
        bmp_run = min(next_nonbmp_pos(remaining), 1024)

        ok = WriteConsoleW(
            console, remaining, bmp_run or 2,
            ctypes.byref(chars_written), None)
        if ok == 0:
            raise OSError('Failed to write string')
        if bmp_run:
            assert chars_written.value > 0
            remaining = remaining[chars_written.value:]
        else:  # We just wrote a non-BMP character
            assert chars_written.value == 2
            remaining = remaining[1:]
    return True
1400
1401
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On Windows, when no encoding is given, the console API is tried first.
    Otherwise s is encoded (dropping unencodable characters) for binary
    streams and for streams exposing a .buffer, or written as text.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    use_console_api = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if use_console_api and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        text_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(text_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1422
1423
def bytes_to_intlist(bs):
    """Return the items of bs as a list of integers ([] for empty input)."""
    if not bs:
        return []
    # Python 3 bytes already yield ints; otherwise convert each character
    if isinstance(bs[0], int):
        return list(bs)
    return list(map(ord, bs))
1431
1432
def intlist_to_bytes(xs):
    """Pack a sequence of byte values into a byte string (b'' if empty)."""
    if xs:
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
1437
1438
# Cross-platform file locking: _lock_file(f, exclusive) / _unlock_file(f)
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure handed to LockFileEx / UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32

    LockFileEx = kernel32.LockFileEx
    LockFileEx.restype = ctypes.wintypes.BOOL
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]

    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]

    # Low / high halves of the byte count used to cover the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the whole file f; raises OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because unlocking needs the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        lock_flags = 0x2 if exclusive else 0x0  # 0x2: LOCKFILE_EXCLUSIVE_LOCK
        locked = LockFileEx(
            handle, lock_flags, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(
            handle, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl
    except ImportError:
        # Some platforms, such as Jython, are missing fcntl
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
    else:
        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock() lock on f."""
            operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            fcntl.flock(f, operation)

        def _unlock_file(f):
            """Release the flock() lock on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
1512
1513
class locked_file(object):
    """Context manager around a file that is locked while in use.

    The file is opened on construction; entering the context locks it
    (shared for mode 'r', exclusive otherwise) and leaving the context
    unlocks and closes it.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the open file when the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Close the file even if unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1543
1544
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1548
1549
def shell_quote(args):
    """Return args as one space-separated string with each item shell-quoted."""
    encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join(compat_shlex_quote(as_text(arg)) for arg in args)
1559
1560
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    Data already smuggled into url is merged into the data dict (in place,
    existing values taking precedence) and the result is JSON-encoded into
    the fragment of the returned URL.
    """
    plain_url, already_smuggled = unsmuggle_url(url, {})
    data.update(already_smuggled)
    fragment = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return plain_url + '#' + fragment
1569
1570
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    Returns (smug_url, default) when the URL carries no smuggled data.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default

    url, _, fragment = smug_url.rpartition('#')
    encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
1578
1579
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. '1.50KiB'.

    bytes -- a number, a str holding a number, or None

    Returns 'N/A' for None.  Values too large for the biggest suffix are
    expressed in YiB and positive values below one byte in B, instead of
    raising IndexError or picking a wrong suffix.  Negative values still
    raise ValueError (from math.log).
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)

    suffixes = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB')
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the exponent inside the suffix table: huge values would
        # index past its end and tiny fractions would index from its end
        exponent = max(0, min(exponent, len(suffixes) - 1))

    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1592
1593
def lookup_unit_table(unit_table, s):
    """Parse "<number><unit>" at the start of s using unit_table.

    unit_table -- mapping of unit name to multiplier
    s          -- text such as '1.5k' or '2,5 MiB' (comma or dot decimals)

    Returns int(number * multiplier), or None when s does not start with a
    number followed by one of the units.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives, s)
    if not found:
        return None

    number, unit = found.group('num', 'unit')
    return int(float(number.replace(',', '.')) * unit_table[unit])
1603
1604
def parse_filesize(s):
    """Parse a size such as '1.5 MiB' or '10kb' into a number of bytes.

    Returns None for None and for text that lookup_unit_table cannot parse.
    """
    if s is None:
        return None

    unit_table = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    # (prefix letter, decimal unit name, binary unit name) for each power
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal = 1000 ** power
        binary = 1024 ** power
        lower = letter.lower()
        # The lower-case forms are of course incorrect and unofficial,
        # but we support those too
        unit_table[letter + 'iB'] = binary
        unit_table[letter + 'B'] = decimal
        unit_table[lower + 'B'] = binary
        unit_table[letter + 'b'] = decimal
        unit_table[lower + 'b'] = decimal
        unit_table[decimal_name + 'bytes'] = decimal
        unit_table[binary_name + 'bytes'] = binary

    return lookup_unit_table(unit_table, s)
1674
1675
def parse_count(s):
    """Parse a count such as '1,234' or '1.5k' into an integer.

    Returns None for None and for text that cannot be parsed.
    """
    if s is None:
        return None

    text = s.strip()

    # Only digits and separators: no unit suffix to interpret
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    unit_table = {}
    multipliers = (
        (('k', 'K'), 1000),
        (('m', 'M', 'kk', 'KK'), 1000 ** 2),
    )
    for units, multiplier in multipliers:
        for unit in units:
            unit_table[unit] = multiplier

    return lookup_unit_table(unit_table, text)
1695
1696
def parse_resolution(s):
    """Extract video dimensions from text.

    Recognises, in this order, "<width>x<height>", "<height>p" /
    "<height>i" and "4k" / "8k".  Returns a dict with 'width' and 'height',
    with only 'height', or an empty dict when nothing matches or s is None.
    """
    if s is None:
        return {}

    # (pattern, function building the result from its match), by priority
    recognisers = (
        (r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b',
         lambda m: {
             'width': int(m.group('w')),
             'height': int(m.group('h')),
         }),
        (r'\b(\d+)[pPiI]\b',
         lambda m: {'height': int(m.group(1))}),
        (r'\b([48])[kK]\b',
         lambda m: {'height': int(m.group(1)) * 540}),
    )
    for pattern, build in recognisers:
        mobj = re.search(pattern, s)
        if mobj:
            return build(mobj)

    return {}
1717
1718
def month_by_name(name, lang='en'):
    """ Return the number of a month by its full name in the given language

    Month names come from MONTH_NAMES; an unknown lang falls back to 'en'.
    Returns None when the name is not found.
    """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        position = names.index(name)
    except ValueError:
        return None
    return position + 1
1728
1729
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations

    The abbreviation is the first three letters of the month name.
    Returns None when it is not found.
    """
    abbreviations = [full_name[:3] for full_name in ENGLISH_MONTH_NAMES]
    try:
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
1738
1739
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the five predefined entities or a
    numeric character reference are left alone."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1746
1747
def setproctitle(title):
    """ Best-effort attempt to rename the current process to title

    Uses prctl(PR_SET_NAME) from libc.so.6; silently does nothing when
    that library or function is not available (non-glibc platforms,
    Jython, Windows). title must be a text (compat_str) string. """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    # Initialising the buffer from the bytes makes it one item larger than
    # the data, so it is always NUL-terminated. A buffer of exactly
    # len(title) bytes carries no terminator and prctl() could read past it.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        # 15 is PR_SET_NAME (see linux/prctl.h)
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1772
1773
def remove_start(s, start):
    """ Return s without its leading start; s is returned unchanged when it
        is None or does not begin with start """
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1776
1777
def remove_end(s, end):
    """ Return s without its trailing end; s is returned unchanged when it
        is None, when end is empty, or when s does not finish with end """
    # An empty suffix must be special-cased: s[:-0] is s[:0], i.e. ''
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
1780
1781
def remove_quotes(s):
    """ Strip one pair of matching single or double quotes surrounding s

    Strings shorter than two characters, None, and strings whose first and
    last characters are not the same quote are returned as-is. """
    if s is None or len(s) < 2:
        return s
    opening, closing = s[0], s[-1]
    if opening == closing and opening in ('"', "'"):
        return s[1:-1]
    return s
1789
1790
def url_basename(url):
    """ Return the last segment of the URL's path, ignoring leading and
        trailing slashes ('' when the path has no segments) """
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1794
1795
def base_url(url):
    """ Return the http(s) URL up to and including the last '/' that comes
        before any '?', '#' or '&'

    Raises AttributeError when url does not match that shape. """
    matched = re.match(r'https?://[^?#&]+/', url)
    return matched.group()
1798
1799
def urljoin(base, path):
    """ Join path onto base, tolerating bad input

    bytes arguments are decoded as UTF-8. Returns None when path is empty
    or not a string, or when base is not an absolute or protocol-relative
    http(s) URL. A path that already is such a URL is returned unchanged. """
    url_start = r'^(?:https?:)?//'

    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not (isinstance(path, compat_str) and path):
        return None
    if re.match(url_start, path):
        return path

    if isinstance(base, bytes):
        base = base.decode('utf-8')
    if isinstance(base, compat_str) and re.match(url_start, base):
        return compat_urlparse.urljoin(base, path)
    return None
1813
1814
class HEADRequest(compat_urllib_request.Request):
    """ Request whose HTTP method is always reported as HEAD """
    def get_method(self):
        return 'HEAD'
1818
1819
class PUTRequest(compat_urllib_request.Request):
    """ Request whose HTTP method is always reported as PUT """
    def get_method(self):
        return 'PUT'
1823
1824
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to an int, returning default when that is not possible

    v:        value to convert; None and '' yield default
    get_attr: if set, the attribute of that name is read from v first
              (a missing attribute counts as None)
    invscale, scale: the result is int(v) * invscale // scale

    Values int() rejects, whether for their content (ValueError) or their
    type (TypeError, e.g. a list or dict), also yield default. """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        return default
1837
1838
def str_or_none(v, default=None):
    """ Return v converted with compat_str, or default when v is None """
    if v is None:
        return default
    return compat_str(v)
1841
1842
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
        are dropped before the conversion. None is passed through. """
    if int_str is None:
        return None
    separators = re.compile(r'[,\.\+]')
    digits = separators.sub('', int_str)
    return int(digits)
1849
1850
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to a float, returning default when that is not possible

    The result is float(v) * invscale / scale. None, values float() cannot
    parse (ValueError) and values of an unsupported type (TypeError, e.g.
    a list or dict) all yield default. """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
1858
1859
def bool_or_none(v, default=None):
    """ Return v if it is a real bool, otherwise default """
    if isinstance(v, bool):
        return v
    return default
1862
1863
def strip_or_none(v):
    """ Return v.strip(), passing None through untouched """
    if v is None:
        return None
    return v.strip()
1866
1867
def parse_duration(s):
    """ Parse a textual duration into a number of seconds

    Three notations are tried in order:
      * clock style: [[[days:]hours:]mins:]secs[.fraction]
      * unit style, optionally ISO 8601-like with P/T markers:
        "1d 2h 3m 4.5s", "PT1H30M", ... (year/month/week parts are
        accepted but do not contribute to the result)
      * a single, possibly fractional, "N hours" or "N mins" value
    A trailing 'Z' is tolerated. Returns None when s is not a string or
    matches none of the notations. """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    clock_re = r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$'
    units_re = r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$'''
    single_unit_re = r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$'

    found = None
    for pattern in (clock_re, units_re, single_unit_re):
        found = re.match(pattern, s)
        if found:
            break
    if not found:
        return None
    # The last notation only defines hours and mins; the others read as None
    parts = found.groupdict()

    # Each component is scaled by its factors one at a time, and the
    # components are summed in this fixed order
    total = 0
    for name, factors in (
            ('secs', ()),
            ('mins', (60,)),
            ('hours', (60, 60)),
            ('days', (24, 60, 60)),
            ('ms', ())):
        raw = parts.get(name)
        if not raw:
            continue
        amount = float(raw)
        for factor in factors:
            amount = amount * factor
        total += amount
    return total
1924
1925
def prepend_extension(filename, ext, expected_real_ext=None):
    """ Insert ext in front of filename's existing extension

    If expected_real_ext is given and the existing extension differs from
    it, ext is appended to the whole filename instead. """
    stem, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(stem, ext, real_ext)
1932
1933
def replace_extension(filename, ext, expected_real_ext=None):
    """ Swap filename's extension for ext

    If expected_real_ext is given and the existing extension differs from
    it, ext is appended to the whole filename instead. """
    stem, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        base = filename
    else:
        base = stem
    return '{0}.{1}'.format(base, ext)
1939
1940
def check_executable(exe, args=[]):
    """ Checks if the given binary can be launched, and returns its name
    (or False if starting it fails with an OSError).
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1949
1950
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Runs the executable with args and returns the version found in its
    combined stdout/stderr output (see detect_exe_version),
    or False if the executable cannot be started """
    try:
        # stdin is piped as well: on UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    # Raw pipe output is a byte string; non-ASCII bytes are dropped
    if isinstance(output, bytes):
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1968
1969
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Returns the first group of version_re found in output (by default
    the token following the word "version"), or unrecognized if the
    pattern does not occur """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1979
1980
class PagedList(object):
    """ Base class for lists whose items are fetched page by page.
    Subclasses are expected to provide getslice(start, end). """

    def __len__(self):
        # This is only useful for tests
        # (it materialises the whole list via getslice())
        return len(self.getslice())
1985
1986
class OnDemandPagedList(PagedList):
    """ Paged list of unknown length that fetches pages lazily

    pagefunc(pagenum) must return an iterable with the items of the given
    0-based page; every page except the last is assumed to hold exactly
    pagesize items. With use_cache, fetched pages are kept in memory. """

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            # page number -> list of that page's items
            self._cache = {}

    def getslice(self, start=0, end=None):
        """ Return the items with indices in [start, end) as a list; with
        end=None pages are requested until a short page is seen """
        res = []
        # Start directly at the page that contains index `start`
        for pagenum in itertools.count(start // self._pagesize):
            # Global indices of the first item of this page / the next page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset of `start` within this page (0 unless it falls here)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Exclusive offset of `end` within this page, or None when the
            # slice does not finish on this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            # (this also fires when the page was cut short by endv above)
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
2037
2038
class InAdvancePagedList(PagedList):
    """ Paged list whose number of pages is known up front

    pagefunc(pagenum) must return an iterable with the items of the given
    0-based page; pagecount is the total number of pages and pagesize the
    number of items per page. Pages are not cached. """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the items with indices in [start, end) as a list """
        res = []
        start_page = start // self._pagesize
        # NOTE(review): when end is given, end_page is not clamped to
        # self._pagecount, so pagefunc may be asked for pages past the
        # last one - confirm pagefunc tolerates that
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Number of leading items to drop from the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Number of items still wanted (None = no upper bound)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the slice
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
2066
2067
def uppercase_escape(s):
    """ Replace every literal \\UXXXXXXXX escape sequence (8 hex digits) in
        s with the character it denotes """
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
2074
2075
def lowercase_escape(s):
    """ Replace every literal \\uXXXX escape sequence (4 hex digits) in s
        with the character it denotes """
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _expand, s)
2082
2083
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 the text is converted to UTF-8 bytes before quoting
    is_py2_text = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if is_py2_text:
        s = s.encode('utf-8')
    # Characters in the safe set (reserved URL characters and '%') are kept
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2089
2090
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    The host part is IDNA-encoded; path, params, query and fragment each
    go through escape_rfc3986()."""
    parts = compat_urllib_parse_urlparse(url)
    ascii_netloc = parts.netloc.encode('idna').decode('ascii')
    safe_path = escape_rfc3986(parts.path)
    safe_params = escape_rfc3986(parts.params)
    safe_query = escape_rfc3986(parts.query)
    safe_fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        netloc=ascii_netloc,
        path=safe_path,
        params=safe_params,
        query=safe_query,
        fragment=safe_fragment)
    return escaped.geturl()
2101
2102
def read_batch_urls(batch_fd):
    """ Read the URLs listed in batch_fd, one per line

    Byte lines are decoded as UTF-8. A leading byte order mark and the
    surrounding whitespace are removed; empty lines and lines starting
    with '#', ';' or ']' are skipped. batch_fd is closed afterwards.
    Returns the list of remaining URLs. """
    # A UTF-8 BOM shows up as U+FEFF when the line was decoded as UTF-8
    # (which is also what fixup() does for byte lines), and as the three
    # characters '\xef\xbb\xbf' when the raw bytes were decoded as Latin-1.
    # str.strip() removes neither, so both are handled explicitly.
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2117
2118
def urlencode_postdata(*args, **kargs):
    """ URL-encode the arguments (same signature as urlencode) and return
        the result as ASCII bytes, ready to be used as a request body """
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2121
2122
def update_url_query(url, query):
    """ Return url with the parameters from the query dict merged into its
        query string (existing parameters of the same name are replaced);
        url is returned untouched when query is empty """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    params.update(query)
    # doseq=True: list values become repeated parameters
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2131
2132
def update_Request(req, url=None, data=None, headers={}, query={}):
    """ Build a new request from req with some parts replaced

    url and data override the originals when truthy, headers are merged
    over the original headers, and query is merged into the URL's query
    string. HEAD and PUT requests keep their method; the timeout attribute
    is carried over when req has one. """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    body = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)
    method = req.get_method()
    request_class = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(method, compat_urllib_request.Request)
    replacement = request_class(
        target_url, data=body, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        replacement.timeout = req.timeout
    return replacement
2151
2152
def _multipart_encode_impl(data, boundary):
    """ Encode data as multipart/form-data using the given boundary

    Returns (body_bytes, content_type). Raises ValueError when the
    boundary occurs inside one of the encoded parts. """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')
    delimiter = b'--' + boundary_bytes + b'\r\n'

    pieces = []
    for name, value in data.items():
        if isinstance(name, compat_str):
            name = name.encode('utf-8')
        if isinstance(value, compat_str):
            value = value.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        part = b'Content-Disposition: form-data; name="' + name + b'"\r\n\r\n' + value + b'\r\n'
        if boundary_bytes in part:
            raise ValueError('Boundary overlaps with data')
        pieces.append(delimiter)
        pieces.append(part)

    pieces.append(b'--' + boundary_bytes + b'--\r\n')

    return b''.join(pieces), content_type
2173
2174
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a (body, content_type) tuple. A caller-supplied boundary that
    collides with the data raises ValueError; a generated one is simply
    replaced by a new random boundary.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    caller_supplied = boundary is not None

    while True:
        if caller_supplied:
            candidate = boundary
        else:
            candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))

        try:
            out, content_type = _multipart_encode_impl(data, candidate)
        except ValueError:
            if caller_supplied:
                raise
            continue
        return out, content_type
2203
2204
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """ Look up one key, or the first usable key out of several, in d

    With a single key this is d.get(key, default). With a list or tuple of
    keys, the value of the first key that is present, not None and (when
    skip_false_values is set) truthy is returned; default otherwise. """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)

    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
2213
2214
def try_get(src, getter, expected_type=None):
    """ Apply getter (a callable, or a list/tuple of callables tried in
        order) to src and return the first result obtained

    A getter that raises AttributeError, KeyError, TypeError or IndexError
    is skipped, as is a result that is not an instance of expected_type
    (when given). Returns None if no getter succeeds. """
    getters = getter if isinstance(getter, (list, tuple)) else [getter]
    for candidate in getters:
        try:
            value = candidate(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
    return None
2226
2227
def merge_dicts(*dicts):
    """ Merge the given dicts into a new one, earlier dicts taking priority

    None values are ignored. A key that is already set is only overwritten
    when its current value is an empty string and the new value is a
    non-empty string. """
    merged = {}
    for source in dicts:
        for key, value in source.items():
            if value is None:
                continue
            if key in merged:
                current = merged[key]
                fills_blank = (
                    isinstance(value, compat_str) and value and
                    isinstance(current, compat_str) and
                    not current)
                if not fills_blank:
                    continue
            merged[key] = value
    return merged
2240
2241
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """ Return string as text: compat_str instances are passed through,
        anything else is converted with compat_str(string, encoding, errors).
        The default encoding is determined once, when the module is loaded. """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2244
2245
# US rating label -> age limit; consulted by parse_age_limit()
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
2253
2254
# "TV-*" rating label -> age limit; consulted by parse_age_limit(), which
# also accepts the labels written as "TV_xx" or "TVxx".
# NOTE(review): TV-PG maps to 0, the same as TV-G - confirm this is intended
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2263
2264
def parse_age_limit(s):
    """ Turn an age rating into a numeric age limit

    Accepts a plain int (valid from 0 to 21), a string such as "18" or
    "18+", a key of US_RATINGS, or a TV_PARENTAL_GUIDELINES label.
    Returns None for anything else. """
    if type(s) is int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    if s in US_RATINGS:
        return US_RATINGS[s]

    # Match on the part of each label that follows the "TV-" prefix
    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, s)
    if tv_rating:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
    return None
2279
2280
def strip_jsonp(code):
    """ Remove a JSONP wrapper, i.e. turn "callback(<data>);" into "<data>"

    An optional "window." prefix, a "name && name(...)" guard and trailing
    // comments are tolerated. Input that does not look like JSONP is
    returned unchanged. """
    jsonp_re = re.compile(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''')
    return jsonp_re.sub(r'\g<callback_data>', code)
2289
2290
def js_to_json(code):
    """ Convert a JavaScript object/array literal into JSON text

    The conversion is token based: strings are re-quoted with double
    quotes, bare identifiers are quoted, hexadecimal and octal integers are
    rewritten in decimal, numeric keys are quoted, and comments as well as
    trailing commas are dropped. Everything else is copied through. """
    comment_re = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    skip_re = r'\s*(?:{comment})?\s*'.format(comment=comment_re)
    integer_forms = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=skip_re), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=skip_re), 8),
    )
    # JS escape sequence (or bare double quote) -> JSON equivalent;
    # sequences not listed here are kept as they are
    string_fixes = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }

    def fix_escape(esc):
        sequence = esc.group(0)
        return string_fixes.get(sequence, sequence)

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null'):
            return token
        if token.startswith(('/*', '//')) or token == ',':
            return ""

        if token[0] in ("'", '"'):
            # Drop the surrounding quotes and normalise the contents
            token = re.sub(r'(?s)\\.|"', fix_escape, token[1:-1])

        # Applies to unquoted string contents as well
        for regex, base in integer_forms:
            number = re.match(regex, token)
            if number:
                value = int(number.group(1), base)
                if token.endswith(':'):
                    return '"%d":' % value
                return '%d' % value

        return '"%s"' % token

    token_re = r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)
        '''.format(comment=comment_re, skip=skip_re)
    return re.sub(token_re, fix_kv, code)
2330
2331
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its position in quality_ids
    (a higher index ranks higher), or -1 for an unknown id. """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
2340
2341
# Default output template; %-style placeholders for title, id and extension
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2343
2344
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than length are cut so that, with the trailing '...',
    they are exactly length characters long. None is passed through. """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ellipses = '...'
    kept = s[:length - len(ellipses)]
    return kept + ellipses
2353
2354
def version_tuple(v):
    """ Split a version string on '.' and '-' into a tuple of ints

    Raises ValueError when a component is not an integer. """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2357
2358
def is_outdated_version(version, limit, assume_new=True):
    """ Return whether version is older than limit

    When version is empty or either string cannot be parsed, the answer is
    "outdated" only if assume_new is false. """
    when_unknown = not assume_new
    if not version:
        return when_unknown
    try:
        current = version_tuple(version)
        minimum = version_tuple(limit)
    except ValueError:
        return when_unknown
    return current < minimum
2366
2367
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U, i.e. whether this
        module was loaded by a zipimporter or the interpreter is frozen """
    from zipimport import zipimporter

    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
2373
2374
def args_to_str(args):
    """ Get a short string representation for a subprocess command: every
        argument shell-quoted, joined with single spaces """
    quoted = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
2378
2379
def error_to_compat_str(err):
    """ Return the message of err as text """
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2387
2388
def mimetype2ext(mt):
    """ Map a MIME type to a file extension

    A few complete types are special-cased; otherwise the subtype (the
    part after the last '/', without parameters, lower-cased) is looked up
    in a table and returned unchanged when it has no entry there.
    Returns None for None. """
    if mt is None:
        return None

    by_full_type = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }
    if mt in by_full_type:
        return by_full_type[mt]

    subtype = mt.rpartition('/')[2]
    subtype = subtype.split(';')[0].strip().lower()

    by_subtype = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
    }
    return by_subtype.get(subtype, subtype)
2424
2425
def parse_codecs(codecs_str):
    """ Split an RFC 6381 "codecs" parameter into video and audio codecs

    Returns a dict with 'vcodec' and 'acodec', or an empty dict when
    codecs_str is empty. The first recognised video codec and the first
    recognised audio codec are reported in full (e.g. 'avc1.77.30'), with
    'none' for a missing kind. Unrecognised codecs trigger a warning on
    stderr; if exactly two codecs are given and neither is recognised,
    they are assumed to be in video, audio order. """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = [
        codec for codec in (
            part.strip() for part in codecs_str.strip().strip(',').split(','))
        if codec]
    video_ids = (
        'avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
        'h263', 'h264', 'mp4v', 'hvc1')
    audio_ids = (
        'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3',
        'dtsc', 'dtse', 'dtsh', 'dtsl')

    vcodec, acodec = None, None
    for full_codec in split_codecs:
        # Only the identifier before the first dot selects the codec kind
        codec = full_codec.split('.')[0]
        if codec in video_ids:
            if not vcodec:
                vcodec = full_codec
        elif codec in audio_ids:
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)

    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    if len(split_codecs) == 2:
        # Nothing was recognised, so fall back on the conventional order.
        # (Previously this returned vcodec/acodec, which are both None on
        # this path.)
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    if len(split_codecs) == 1:
        # NOTE(review): result kept exactly as before for compatibility -
        # a lone unrecognised codec is reported as having no video and an
        # unknown (None) audio codec; confirm whether this is intended
        return {
            'vcodec': 'none',
            'acodec': None,
        }
    return {}
2460
2461
def urlhandle_detect_ext(url_handle):
    """ Guess a file extension from the headers of an opened URL

    The filename of a Content-Disposition attachment is preferred; if it
    is absent or has no usable extension, the Content-Type is mapped with
    mimetype2ext(). """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        attachment = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if attachment:
            ext = determine_ext(attachment.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
2474
2475
def encode_data_uri(data, mime_type):
    """ Return a base64 "data:" URI embedding the bytes data with the
        given MIME type """
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2478
2479
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked when no limit is set (age_limit is None) or when
    # the content is available for everyone (content_limit is None)
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2488
2489
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    The bytes are decoded according to their byte order mark (UTF-8 when
    there is none). The result is a match object if the first
    non-whitespace character is '<', and None otherwise. """

    # The UTF-32 marks must be tested before the UTF-16 ones: the UTF-32-LE
    # mark starts with the UTF-16-LE one
    known_boms = (
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    )
    encoding, payload_start = 'utf-8', 0
    for bom, bom_encoding in known_boms:
        if first_bytes.startswith(bom):
            encoding, payload_start = bom_encoding, len(bom)
            break

    text = first_bytes[payload_start:].decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
2508
2509
def determine_protocol(info_dict):
    """ Work out the download protocol of a format

    An explicit 'protocol' entry wins. Otherwise the URL decides: an
    rtmp/mms/rtsp prefix, then an m3u8 or f4m extension, and finally the
    URL scheme. """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for streaming_prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(streaming_prefix):
            return streaming_prefix

    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    return compat_urllib_parse_urlparse(url).scheme
2530
2531
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values

    Every column except the last is left-aligned and padded to the width
    of its longest cell plus one, and columns are separated by one space.
    Returns the rows (header first) joined by newlines. """
    rows = [header_row] + data
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*rows)]
    padded_columns = [
        '%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    # The last column follows directly and is not padded
    row_format = ' '.join(padded_columns) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
2538
2539
def _match_one(filter_part, dct):
    """ Evaluate a single filter expression against the dict dct

    filter_part is either
      * a comparison "key OP value", with OP one of < <= > >= = != and
        value a number (optionally with a size suffix such as "10M"), a
        quoted string or a bare alphanumeric string; a '?' right after OP
        makes the expression pass when the key is missing, or
      * a unary test "key" or "!key".
    Returns the result of the test. For a comparison whose key is missing
    (or None) in dct, the matched '?' text is returned, or None if there
    was no '?'. Raises ValueError for unparsable expressions and for
    ordering operators applied to string values. """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if (m.group('quotedstrval') is not None or
            m.group('strval') is not None or
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/rg3/youtube-dl/issues/11082).
            actual_value is not None and m.group('intval') is not None and
                isinstance(actual_value, compat_str)):
            # String comparison: only (in)equality makes sense
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
            quote = m.group('quote')
            if quote is not None:
                # Unescape the quote character inside a quoted value
                comparison_value = comparison_value.replace(r'\%s' % quote, quote)
        else:
            # Numeric comparison: plain integer first, then a file size
            # (with or without the trailing "B")
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        if actual_value is None:
            # Missing field: passes only if the '?' marker was given
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    # For booleans the unary tests check the actual truth value; for any
    # other type they only check whether the field is set
    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
2608
2609
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false

    filter_str is a list of expressions separated by '&'; all of them have
    to hold, and evaluation stops at the first one that does not. """

    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2615
2616
def match_filter_func(filter_str):
    """ Build a filter callback from filter_str

    The returned function takes an info dict and returns None when it
    passes the filter, or a message explaining why it is skipped. """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
2625
2626
def parse_dfxp_time_expr(time_expr):
    """ Convert a DFXP/TTML time expression to seconds (float)

    Supported are offsets in seconds ("12.5" or "12.5s") and clock times
    "HH:MM:SS", optionally followed by a fraction introduced by '.' or
    ':'. Returns None for empty or unsupported input. """
    if not time_expr:
        return

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
2638
2639
def srt_subtitles_timecode(seconds):
    """ Format a number of seconds as an SRT timecode "HH:MM:SS,mmm"
        (each field is truncated, not rounded, by the %d conversion) """
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    whole_seconds = seconds % 60
    milliseconds = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, whole_seconds, milliseconds)
2642
2643
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT, translating the supported tts:*
    styling attributes into <b>/<i>/<u>/<font> markup.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no <p> element
    '''
    # Older namespace URIs are rewritten to the current ones before parsing
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    # style id -> resolved (inheritance applied) supported properties
    styles = {}
    # properties inherited from the styles referenced by <body>/<div>
    default_style = {}

    class TTMLPElementParser(object):
        """XMLParser target rendering a single <p> element as SRT text."""

        def __init__(self):
            # All state is per instance: every cue is rendered independently,
            # so nothing may leak from one paragraph into the next.
            self._out = ''
            # One (opened SRT tags, style pushed?) entry per open element
            self._unclosed_elements = []
            # Effective style of each open element that carried styling
            self._applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
                return
            unclosed_elements = []
            style = {}
            element_style_id = attrib.get('style')
            if default_style:
                style.update(default_style)
            if element_style_id:
                style.update(styles.get(element_style_id, {}))
            for prop in SUPPORTED_STYLING:
                prop_val = attrib.get(_x('tts:' + prop))
                if prop_val:
                    style[prop] = prop_val
            if style:
                font = ''
                for k, v in sorted(style.items()):
                    # Skip properties already in effect from an enclosing element
                    if self._applied_styles and self._applied_styles[-1].get(k) == v:
                        continue
                    if k == 'color':
                        font += ' color="%s"' % v
                    elif k == 'fontSize':
                        font += ' size="%s"' % v
                    elif k == 'fontFamily':
                        font += ' face="%s"' % v
                    elif k == 'fontWeight' and v == 'bold':
                        self._out += '<b>'
                        unclosed_elements.append('b')
                    elif k == 'fontStyle' and v == 'italic':
                        self._out += '<i>'
                        unclosed_elements.append('i')
                    elif k == 'textDecoration' and v == 'underline':
                        self._out += '<u>'
                        unclosed_elements.append('u')
                if font:
                    self._out += '<font' + font + '>'
                    unclosed_elements.append('font')
                applied_style = {}
                if self._applied_styles:
                    applied_style.update(self._applied_styles[-1])
                applied_style.update(style)
                self._applied_styles.append(applied_style)
            # Remember whether a style was pushed so that end() pops exactly
            # what start() pushed, even when no SRT tag had to be emitted.
            self._unclosed_elements.append((unclosed_elements, bool(style)))

        def end(self, tag):
            if tag in (_x('ttml:br'), 'br'):
                return
            unclosed_elements, style_pushed = self._unclosed_elements.pop()
            for element in reversed(unclosed_elements):
                self._out += '</%s>' % element
            if style_pushed:
                self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Re-serialize the <p> subtree and stream it through the target above
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance. A style may reference a parent that is
    # declared later in the document, hence the repeated passes.
    style_elements = dfxp.findall(_x('.//ttml:style'))
    ignore_missing_parents = False
    while True:
        unresolved = False
        resolved_before = len(styles)
        for style in style_elements:
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id in styles:
                    styles[style_id] = styles[parent_style_id].copy()
                elif not ignore_missing_parents:
                    unresolved = True
                    continue
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if not unresolved:
            break
        if len(styles) == resolved_before:
            # No progress: the remaining parents are missing, cyclic or have
            # no supported property. Do one last pass that keeps only the
            # styles' own properties instead of looping forever.
            ignore_missing_parents = True

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # The index counts every paragraph, including the ones skipped below
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2806
2807
def cli_option(params, command_option, param):
    """ Build the argv fragment for a command line option that takes a value.

    Returns [command_option, value] with the value converted to a string, or
    an empty list when params has no entry (or a None entry) for param.
    """
    value = params.get(param)
    if value is None:
        return []
    # Falsy values such as 0 are converted as well, so that the result is
    # always a list of strings that can be handed to a subprocess
    return [command_option, compat_str(value)]
2813
2814
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """ Build the argv fragment for a boolean command line option.

    Returns an empty list when params has no (or a None) entry for param.
    Otherwise the entry must be a bool; it is rendered as true_value or
    false_value and emitted either as a separate argument or, when a
    non-empty separator is given, joined to command_option with it.
    """
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if not separator:
        return [command_option, rendered]
    return [command_option + separator + rendered]
2823
2824
def cli_valueless_option(params, command_option, param, expected_value=True):
    """ Build the argv fragment for a command line switch without a value.

    Returns [command_option] when the entry for param in params equals
    expected_value, and an empty list otherwise.
    """
    if params.get(param) == expected_value:
        return [command_option]
    return []
2828
2829
def cli_configuration_args(params, param, default=None):
    """ Fetch a list of extra command line arguments from params.

    Returns the list stored under param. When there is no (or a None) entry,
    returns default, or a new empty list if no default was given. A fresh
    list is created on every call so that a caller extending the result
    cannot affect later calls.
    """
    ex_args = params.get(param)
    if ex_args is None:
        return [] if default is None else default
    assert isinstance(ex_args, list)
    return ex_args
2836
2837
class ISO639Utils(object):
    """Conversion between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Maps two-letter codes to their three-letter counterparts
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked up, so anything
        after them (e.g. the '-US' of 'en-US') is ignored. Returns None for
        an unknown code.
        """
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        Returns None when no two-letter code maps to the given code.
        """
        return next(
            (short_name for short_name, long_name in cls._lang_map.items()
             if long_name == code),
            None)
3038
3039
class ISO3166Utils(object):
    """Lookup of country names by their two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    # Maps upper-case two-letter country codes to English country names
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter (ISO 3166-1 alpha-2) country code to the corresponding full name

        The lookup is case-insensitive. Returns None for an unknown code.
        """
        normalized_code = code.upper()
        return cls._country_map.get(normalized_code)
3298
3299
class GeoUtils(object):
    """Generation of random IPv4 addresses inside a country's address block."""

    # Major IPv4 address blocks per country
    # (two-letter code -> one block in CIDR notation)
    _country_ip_map = {
        'AD': '85.94.160.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '159.117.192.0/21',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '84.112.0.0/13',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AZ': '5.191.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '129.45.128.0/17',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '192.131.134.0/24',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '152.240.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '196.32.200.0/21',
        'CG': '197.214.128.0/17',
        'CH': '85.0.0.0/13',
        'CI': '154.232.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '165.210.0.0/15',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '46.198.0.0/15',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '45.208.0.0/14',
        'GI': '85.115.128.0/19',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '126.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.32.32.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '192.147.231.0/24',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '41.86.0.0/19',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '105.234.0.0/16',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '139.26.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '178.220.0.0/14',
        'RU': '5.136.0.0/13',
        'RW': '105.178.0.0/15',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '154.96.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '152.56.0.0/14',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '197.215.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '197.220.64.0/19',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '93.72.0.0/13',
        'UG': '154.224.0.0/13',
        'US': '3.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '82.215.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '24.92.144.0/20',
        'VE': '186.88.0.0/13',
        'VG': '172.103.64.0/18',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '165.56.0.0/13',
        'ZW': '41.85.192.0/19',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Pick a random IPv4 address from a country's block or a given block

        code_or_block is either a two-character country code (looked up
        case-insensitively in _country_ip_map) or an 'address/prefixlen'
        block. Returns the address as a dotted string, or None when the
        country code is unknown.
        """
        block = code_or_block
        if len(code_or_block) == 2:
            # Anything of length two is treated as a country code
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        network, prefix_length = block.split('/')
        lowest = compat_struct_unpack('!L', socket.inet_aton(network))[0]
        # Setting all host bits yields the highest address of the block
        host_mask = 0xffffffff >> int(prefix_length)
        highest = lowest | host_mask
        chosen = random.randint(lowest, highest)
        return compat_str(socket.inet_ntoa(compat_struct_pack('!L', chosen)))
3555
3556
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets an individual request select its own proxy.

    A request chooses its proxy through the 'Ytdl-request-proxy' header,
    which proxy_open() consumes; the special value '__noproxy__' disables
    proxying for that request.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        # http_open/https_open are installed as instance attributes that call
        # proxy_open() with the '__noproxy__' sentinel. type and meth are bound
        # as lambda defaults so that each lambda keeps its own scheme instead
        # of the loop variable's final value.
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        # NOTE(review): presumably the base __init__ then replaces these
        # defaults for every scheme present in proxies - confirm against urllib
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Decide how req is proxied.

        A 'Ytdl-request-proxy' header on the request overrides proxy and is
        removed from the request. Returns None both when no proxy is to be
        used and for SOCKS proxies (which are only recorded in the
        'Ytdl-socks-proxy' header); otherwise returns the result of the base
        class' proxy_open().
        """
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers take care of wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
3580
3581
3582 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3583 # released into Public Domain
3584 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3585
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    Zero (and any non-positive value) is encoded as a single zero byte.
    If optional blocksize is given and greater than zero, the result is
    left-padded with zero bytes so that its length is a multiple of
    blocksize.
    """
    n = int(n)

    # Peel off 32-bit words, least significant first
    words = []
    while n > 0:
        words.append(compat_struct_pack('>I', n & 0xffffffff))
        n >>= 32
    words.reverse()

    # Drop the leading zero bytes of the most significant word; an empty
    # result (nothing was packed) becomes a single zero byte
    s = b''.join(words).lstrip(b'\000') or b'\000'

    if blocksize > 0:
        overhang = len(s) % blocksize
        if overhang:
            s = b'\000' * (blocksize - overhang) + s
    return s
3614
3615
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes(). An empty string
    yields 0.
    """
    # Left-pad with zero bytes so the input splits into whole 32-bit words
    overhang = len(s) % 4
    if overhang:
        s = b'\000' * (4 - overhang) + s

    result = 0
    for offset in range(0, len(s), 4):
        word = compat_struct_unpack('>I', s[offset:offset + 4])[0]
        result = (result << 32) + word
    return result
3631
3632
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Encrypt data with OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # The byte order is reversed before the data is read as one integer
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    return '%x' % pow(payload, exponent, modulus)
3648
3649
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme (v1.5, block type 2)

    The result has the layout [0, 2] + PS + [0] + data, where PS is a run of
    random NONZERO bytes. PS must not contain a zero because the first zero
    after the [0, 2] header marks where the data starts when the block is
    unpadded.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data does not leave room for the 11 bytes
                               of mandatory overhead (header, separator and
                               at least 8 padding bytes)
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # NOTE: `random` is not a cryptographically secure source
    padding = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + padding + [0] + data
3663
3664
def encode_base_n(num, n, table=None):
    """
    Encode the non-negative integer num in base n.

    @param num      integer to encode, must be >= 0
    @param n        base; must not exceed the length of the digit table
    @param table    optional string of digit characters; defaults to the
                    first n characters of 0-9, a-z, A-Z
    @returns        the encoded string
    @raises ValueError  if n exceeds the table length, if num is negative,
                        or if n is smaller than 2 while num is non-zero
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Floor division never brings a negative number (or any number in
    # base 1) to zero, so the digit loop below would never terminate
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)
    if n < 2:
        raise ValueError('base %d is too small to encode %d' % (n, num))

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    # Digits were produced least significant first
    return ''.join(reversed(digits))
3681
3682
def decode_packed_codes(code):
    """
    Expand code that matches PACKED_CODES_RE.

    The match supplies the packed source, a base, a symbol count and a
    '|'-separated symbol list. Every word in the packed source is replaced
    by the symbol whose index, encoded in the given base, equals that word;
    an empty symbol means the word stands for itself.
    """
    match = re.search(PACKED_CODES_RE, code)
    packed_source, base, count, symbols = match.groups()
    base, remaining = int(base), int(count)
    words = symbols.split('|')

    symbol_table = {}
    while remaining:
        remaining -= 1
        token = encode_base_n(remaining, base)
        symbol_table[token] = words[remaining] or token

    def substitute(word_match):
        return symbol_table[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', substitute, packed_source)
3699
3700
def parse_m3u8_attributes(attrib):
    """
    Parse a comma-separated KEY=VALUE attribute list into a dict.

    Values may be bare or double-quoted; the surrounding quotes of a quoted
    value are removed. When a key occurs more than once the last value wins.
    """
    pairs = re.findall(
        r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return dict(
        (name, raw[1:-1] if raw.startswith('"') else raw)
        for name, raw in pairs)
3708
3709
def urshift(val, n):
    """Shift val right by n bits, treating a negative val as unsigned 32-bit."""
    if val < 0:
        # Map the negative value onto its 32-bit two's complement pattern
        val += 0x100000000
    return val >> n
3712
3713
3714 # Based on png2str() written by @gdkchan and improved by @yokrysty
3715 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """
    Decode a PNG image into its raw colour samples.

    The decoder works with 3 bytes per pixel (each row holds width * 3
    samples and the filters look 3 bytes back for the left neighbour), so it
    is only meaningful for images stored that way.

    @param png_data     the complete PNG file contents as bytes
    @returns            (width, height, pixels) where pixels is a list of
                        rows, each a flat list of width * 3 integers
    @raises IOError     if the signature or leading IHDR chunk is missing,
                        or if there is no IDAT data
    """
    # Reference: https://www.w3.org/TR/PNG/
    body = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or body[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    formats_by_size = {1: '>B', 2: '>H', 4: '>I'}

    def unpack_integer(raw):
        return compat_struct_unpack(formats_by_size[len(raw)], raw)[0]

    # Walk the chunk sequence: 4-byte length, 4-byte type, data, 4-byte CRC
    chunks = []
    offset = 0
    body_size = len(body)
    while offset < body_size:
        length = unpack_integer(body[offset:offset + 4])
        chunk_type = body[offset + 4:offset + 8]
        data_start = offset + 8
        chunks.append((chunk_type, body[data_start:data_start + length]))
        offset = data_start + length + 4  # Skip CRC

    # The validation above guarantees the first chunk is IHDR
    ihdr = chunks[0][1]
    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''.join(
        data for chunk_type, data in chunks if chunk_type == b'IDAT')
    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    for y in range(height):
        # Every scanline is one filter-type byte followed by stride samples
        row_start = y * (1 + stride)
        filter_type = decompressed_data[row_start]

        previous_row = pixels[y - 1] if y > 0 else None
        row = []
        pixels.append(row)

        for x in range(stride):
            color = decompressed_data[1 + row_start + x]

            has_left = x > 2
            has_up = y > 0
            left = row[x - 3] if has_left else 0
            up = previous_row[x] if has_up else 0

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                upper_left = previous_row[x - 3] if has_left and has_up else 0

                estimate = left + up - upper_left
                dist_left = abs(estimate - left)
                dist_up = abs(estimate - up)
                dist_upper_left = abs(estimate - upper_left)

                # Use the neighbour closest to the estimate; ties prefer
                # left, then up, then upper-left
                if dist_left <= dist_up and dist_left <= dist_upper_left:
                    predictor = left
                elif dist_up <= dist_upper_left:
                    predictor = up
                else:
                    predictor = upper_left
                color = (color + predictor) & 0xff

            row.append(color)

    return width, height, pixels
3819
3820
def write_xattr(path, key, value):
    """
    Set the extended attribute key to value on the file at path.

    Uses the Python 'xattr' module when it can be imported (either the
    pyxattr or the xattr flavour). Otherwise falls back to an NTFS Alternate
    Data Stream on Windows, or to the 'setfattr' / 'xattr' command line
    tools elsewhere.

    @param value    attribute value as bytes
    @raises XAttrUnavailableError   if pyxattr is too old or no usable tool
                                    is found
    @raises XAttrMetadataError      if writing the attribute fails
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if not hasattr(xattr, 'set'):  # xattr
            setxattr = xattr.setxattr
        else:  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
            return

        user_has_setfattr = check_executable('setfattr', ['--version'])
        user_has_xattr = check_executable('xattr', ['-h'])

        if not (user_has_setfattr or user_has_xattr):
            # On Unix, and can't find pyxattr, setfattr, or xattr.
            if sys.platform.startswith('linux'):
                raise XAttrUnavailableError(
                    "Couldn't find a tool to set the xattrs. "
                    "Install either the python 'pyxattr' or 'xattr' "
                    "modules, or the GNU 'attr' package "
                    "(which contains the 'setfattr' tool).")
            raise XAttrUnavailableError(
                "Couldn't find a tool to set the xattrs. "
                "Install either the python 'xattr' module, "
                "or the 'xattr' binary.")

        # The command line tools take the value as a text argument
        value = value.decode('utf-8')
        if user_has_setfattr:
            executable = 'setfattr'
            opts = ['-n', key, '-v', value]
        else:
            executable = 'xattr'
            opts = ['-w', key, value]

        cmd = ([encodeFilename(executable, True)] +
               [encodeArgument(o) for o in opts] +
               [encodeFilename(path, True)])

        try:
            p = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        stdout, stderr = p.communicate()
        stderr = stderr.decode('utf-8', 'replace')
        if p.returncode != 0:
            raise XAttrMetadataError(p.returncode, stderr)
3903
3904
def random_birthday(year_field, month_field, day_field):
    """
    Build form fields holding a random, valid date of birth.

    The date is drawn uniformly from 1950-01-01 to 1995-12-31 inclusive.
    Picking a real calendar date (rather than independent month and day
    numbers) guarantees that impossible dates such as February 31 are never
    produced.

    @param year_field, month_field, day_field   keys to use in the result
    @returns    dict mapping the three keys to the year, month and day as
                strings (no zero padding)
    """
    start_date = datetime.date(1950, 1, 1)
    end_date = datetime.date(1995, 12, 31)
    offset = random.randint(0, (end_date - start_date).days)
    random_date = start_date + datetime.timedelta(offset)
    return {
        year_field: str(random_date.year),
        month_field: str(random_date.month),
        day_field: str(random_date.day),
    }