_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import contextlib
  11 import ctypes
  12 import datetime
  13 import email.utils
  14 import email.header
  15 import errno
  16 import functools
  17 import gzip
  18 import io
  19 import itertools
  20 import json
  21 import locale
  22 import math
  23 import operator
  24 import os
  25 import platform
  26 import random
  27 import re
  28 import socket
  29 import ssl
  30 import subprocess
  31 import sys
  32 import tempfile
  33 import traceback
  34 import xml.etree.ElementTree
  35 import zlib
  36
  37 from .compat import (
  38     compat_HTMLParseError,
  39     compat_HTMLParser,
  40     compat_basestring,
  41     compat_chr,
  42     compat_ctypes_WINFUNCTYPE,
  43     compat_etree_fromstring,
  44     compat_expanduser,
  45     compat_html_entities,
  46     compat_html_entities_html5,
  47     compat_http_client,
  48     compat_kwargs,
  49     compat_os_name,
  50     compat_parse_qs,
  51     compat_shlex_quote,
  52     compat_socket_create_connection,
  53     compat_str,
  54     compat_struct_pack,
  55     compat_struct_unpack,
  56     compat_urllib_error,
  57     compat_urllib_parse,
  58     compat_urllib_parse_urlencode,
  59     compat_urllib_parse_urlparse,
  60     compat_urllib_parse_unquote_plus,
  61     compat_urllib_request,
  62     compat_urlparse,
  63     compat_xpath,
  64 )
  65
  66 from .socks import (
  67     ProxyType,
  68     sockssocket,
  69 )
  70
  71
  72 def register_socks_protocols():
  73     # "Register" SOCKS protocols
  74     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  75     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  76     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  77         if scheme not in compat_urlparse.uses_netloc:
  78             compat_urlparse.uses_netloc.append(scheme)
  79
  80
# This is not clearly defined otherwise
# The class of compiled pattern objects, obtained by compiling an empty
# pattern and taking its type.
# NOTE(review): presumably used for isinstance() checks elsewhere in the
# file -- no use is visible in this part.
compiled_regex_type = type(re.compile(''))
  83
# Default HTTP request headers.
# NOTE(review): presumably these are the "standard headers" that the
# YoutubeDLHandler docstring says are added to every request -- the code that
# applies them is not visible in this part of the file.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Alternative User-Agent strings, keyed by browser name.
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
  96
  97
# Unique sentinel meaning "no default value was supplied".  It is compared by
# identity (e.g. in xpath_element / xpath_text / xpath_attr), which lets None
# be passed as a real default.
NO_DEFAULT = object()

# Full English month names, January first.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names per language code; every list is ordered January..December.
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}
 110
# Media / manifest file extensions (without the leading dot).
# NOTE(review): presumably consulted when guessing an extension from a URL or
# filename -- the consumer is not visible in this part of the file.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement.  The two zipped
# sequences are positional: the n-th character of the first string maps to the
# n-th item of the chain, where the bracketed entries (['AE'], ['OE'], ['ss'],
# ['ae'], ['oe']) are the multi-letter replacements.  Keep both sides in step
# when editing.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
 130
# Date/time layouts written with strftime/strptime-style directives.
# NOTE(review): presumably tried one after another by a date parser defined
# elsewhere in the file, so the order may matter -- confirm before reordering.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
)

# The common formats plus numeric layouts that put the day before the month.
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# The common formats plus numeric layouts that put the month before the day.
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
 185
# Matches a call tail of the form }('<payload>',<int>,<int>,'<a|b|c>'.split('|')
# and captures the payload, the two integers and the '|'-separated word list.
# NOTE(review): looks like the argument list of packed ("p,a,c,k,e,d")
# JavaScript -- confirm against the code that uses it.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element (case-insensitive,
# '.' spans newlines) and captures its body in the named group 'json_ld'.
JSON_LD_RE = r'(?is)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
 188
 189
 190 def preferredencoding():
 191     """Get preferred encoding.
 192
 193     Returns the best encoding scheme for the system, based on
 194     locale.getpreferredencoding() and some further tweaks.
 195     """
 196     try:
 197         pref = locale.getpreferredencoding()
 198         'TEST'.encode(pref)
 199     except Exception:
 200         pref = 'UTF-8'
 201
 202     return pref
 203
 204
 205 def write_json_file(obj, fn):
 206     """ Encode obj as JSON and write it to fn, atomically if possible """
 207
 208     fn = encodeFilename(fn)
 209     if sys.version_info < (3, 0) and sys.platform != 'win32':
 210         encoding = get_filesystem_encoding()
 211         # os.path.basename returns a bytes object, but NamedTemporaryFile
 212         # will fail if the filename contains non ascii characters unless we
 213         # use a unicode object
 214         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 215         # the same for os.path.dirname
 216         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 217     else:
 218         path_basename = os.path.basename
 219         path_dirname = os.path.dirname
 220
 221     args = {
 222         'suffix': '.tmp',
 223         'prefix': path_basename(fn) + '.',
 224         'dir': path_dirname(fn),
 225         'delete': False,
 226     }
 227
 228     # In Python 2.x, json.dump expects a bytestream.
 229     # In Python 3.x, it writes to a character stream
 230     if sys.version_info < (3, 0):
 231         args['mode'] = 'wb'
 232     else:
 233         args.update({
 234             'mode': 'w',
 235             'encoding': 'utf-8',
 236         })
 237
 238     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 239
 240     try:
 241         with tf:
 242             json.dump(obj, tf)
 243         if sys.platform == 'win32':
 244             # Need to remove existing file on Windows, else os.rename raises
 245             # WindowsError or FileExistsError.
 246             try:
 247                 os.unlink(fn)
 248             except OSError:
 249                 pass
 250         os.rename(tf.name, fn)
 251     except Exception:
 252         try:
 253             os.remove(tf.name)
 254         except OSError:
 255             pass
 256         raise
 257
 258
 259 if sys.version_info >= (2, 7):
 260     def find_xpath_attr(node, xpath, key, val=None):
 261         """ Find the xpath xpath[@key=val] """
 262         assert re.match(r'^[a-zA-Z_-]+$', key)
 263         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 264         return node.find(expr)
 265 else:
 266     def find_xpath_attr(node, xpath, key, val=None):
 267         for f in node.findall(compat_xpath(xpath)):
 268             if key not in f.attrib:
 269                 continue
 270             if val is None or f.attrib.get(key) == val:
 271                 return f
 272         return None
 273
 274 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 275 # the namespace parameter
 276
 277
 278 def xpath_with_ns(path, ns_map):
 279     components = [c.split(':') for c in path.split('/')]
 280     replaced = []
 281     for c in components:
 282         if len(c) == 1:
 283             replaced.append(c[0])
 284         else:
 285             ns, tag = c
 286             replaced.append('{%s}%s' % (ns_map[ns], tag))
 287     return '/'.join(replaced)
 288
 289
 290 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 291     def _find_xpath(xpath):
 292         return node.find(compat_xpath(xpath))
 293
 294     if isinstance(xpath, (str, compat_str)):
 295         n = _find_xpath(xpath)
 296     else:
 297         for xp in xpath:
 298             n = _find_xpath(xp)
 299             if n is not None:
 300                 break
 301
 302     if n is None:
 303         if default is not NO_DEFAULT:
 304             return default
 305         elif fatal:
 306             name = xpath if name is None else name
 307             raise ExtractorError('Could not find XML element %s' % name)
 308         else:
 309             return None
 310     return n
 311
 312
 313 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 314     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 315     if n is None or n == default:
 316         return n
 317     if n.text is None:
 318         if default is not NO_DEFAULT:
 319             return default
 320         elif fatal:
 321             name = xpath if name is None else name
 322             raise ExtractorError('Could not find XML element\'s text %s' % name)
 323         else:
 324             return None
 325     return n.text
 326
 327
 328 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 329     n = find_xpath_attr(node, xpath, key)
 330     if n is None:
 331         if default is not NO_DEFAULT:
 332             return default
 333         elif fatal:
 334             name = '%s[@%s]' % (xpath, key) if name is None else name
 335             raise ExtractorError('Could not find XML attribute %s' % name)
 336         else:
 337             return None
 338     return n.attrib[key]
 339
 340
 341 def get_element_by_id(id, html):
 342     """Return the content of the tag with the specified ID in the passed HTML document"""
 343     return get_element_by_attribute('id', id, html)
 344
 345
 346 def get_element_by_class(class_name, html):
 347     """Return the content of the first tag with the specified class in the passed HTML document"""
 348     retval = get_elements_by_class(class_name, html)
 349     return retval[0] if retval else None
 350
 351
 352 def get_element_by_attribute(attribute, value, html, escape_value=True):
 353     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 354     return retval[0] if retval else None
 355
 356
 357 def get_elements_by_class(class_name, html):
 358     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 359     return get_elements_by_attribute(
 360         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 361         html, escape_value=False)
 362
 363
 364 def get_elements_by_attribute(attribute, value, html, escape_value=True):
 365     """Return the content of the tag with the specified attribute in the passed HTML document"""
 366
 367     value = re.escape(value) if escape_value else value
 368
 369     retlist = []
 370     for m in re.finditer(r'''(?xs)
 371         <([a-zA-Z0-9:._-]+)
 372          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
 373          \s+%s=['"]?%s['"]?
 374          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
 375         \s*>
 376         (?P<content>.*?)
 377         </\1>
 378     ''' % (re.escape(attribute), value), html):
 379         res = m.group('content')
 380
 381         if res.startswith('"') or res.startswith("'"):
 382             res = res[1:-1]
 383
 384         retlist.append(unescapeHTML(res))
 385
 386     return retlist
 387
 388
 389 class HTMLAttributeParser(compat_HTMLParser):
 390     """Trivial HTML parser to gather the attributes for a single element"""
 391     def __init__(self):
 392         self.attrs = {}
 393         compat_HTMLParser.__init__(self)
 394
 395     def handle_starttag(self, tag, attrs):
 396         self.attrs = dict(attrs)
 397
 398
 399 def extract_attributes(html_element):
 400     """Given a string for an HTML element such as
 401     <el
 402          a="foo" B="bar" c="&98;az" d=boz
 403          empty= noval entity="&amp;"
 404          sq='"' dq="'"
 405     >
 406     Decode and return a dictionary of attributes.
 407     {
 408         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 409         'empty': '', 'noval': None, 'entity': '&',
 410         'sq': '"', 'dq': '\''
 411     }.
 412     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 413     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 414     """
 415     parser = HTMLAttributeParser()
 416     try:
 417         parser.feed(html_element)
 418         parser.close()
 419     # Older Python may throw HTMLParseError in case of malformed HTML
 420     except compat_HTMLParseError:
 421         pass
 422     return parser.attrs
 423
 424
 425 def clean_html(html):
 426     """Clean an HTML snippet into a readable string"""
 427
 428     if html is None:  # Convenience for sanitizing descriptions etc.
 429         return html
 430
 431     # Newline vs <br />
 432     html = html.replace('\n', ' ')
 433     html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
 434     html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 435     # Strip html tags
 436     html = re.sub('<.*?>', '', html)
 437     # Replace html entities
 438     html = unescapeHTML(html)
 439     return html.strip()
 440
 441
 442 def sanitize_open(filename, open_mode):
 443     """Try to open the given filename, and slightly tweak it if this fails.
 444
 445     Attempts to open the given filename. If this fails, it tries to change
 446     the filename slightly, step by step, until it's either able to open it
 447     or it fails and raises a final exception, like the standard open()
 448     function.
 449
 450     It returns the tuple (stream, definitive_file_name).
 451     """
 452     try:
 453         if filename == '-':
 454             if sys.platform == 'win32':
 455                 import msvcrt
 456                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 457             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 458         stream = open(encodeFilename(filename), open_mode)
 459         return (stream, filename)
 460     except (IOError, OSError) as err:
 461         if err.errno in (errno.EACCES,):
 462             raise
 463
 464         # In case of error, try to remove win32 forbidden chars
 465         alt_filename = sanitize_path(filename)
 466         if alt_filename == filename:
 467             raise
 468         else:
 469             # An exception here should be caught in the caller
 470             stream = open(encodeFilename(alt_filename), open_mode)
 471             return (stream, alt_filename)
 472
 473
 474 def timeconvert(timestr):
 475     """Convert RFC 2822 defined time string into system timestamp"""
 476     timestamp = None
 477     timetuple = email.utils.parsedate_tz(timestr)
 478     if timetuple is not None:
 479         timestamp = email.utils.mktime_tz(timetuple)
 480     return timestamp
 481
 482
 483 def sanitize_filename(s, restricted=False, is_id=False):
 484     """Sanitizes a string so it could be used as part of a filename.
 485     If restricted is set, use a stricter subset of allowed characters.
 486     Set is_id if this is not an arbitrary string, but an ID that should be kept
 487     if possible.
 488     """
 489     def replace_insane(char):
 490         if restricted and char in ACCENT_CHARS:
 491             return ACCENT_CHARS[char]
 492         if char == '?' or ord(char) < 32 or ord(char) == 127:
 493             return ''
 494         elif char == '"':
 495             return '' if restricted else '\''
 496         elif char == ':':
 497             return '_-' if restricted else ' -'
 498         elif char in '\\/|*<>':
 499             return '_'
 500         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 501             return '_'
 502         if restricted and ord(char) > 127:
 503             return '_'
 504         return char
 505
 506     # Handle timestamps
 507     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 508     result = ''.join(map(replace_insane, s))
 509     if not is_id:
 510         while '__' in result:
 511             result = result.replace('__', '_')
 512         result = result.strip('_')
 513         # Common case of "Foreign band name - English song title"
 514         if restricted and result.startswith('-_'):
 515             result = result[2:]
 516         if result.startswith('-'):
 517             result = '_' + result[len('-'):]
 518         result = result.lstrip('.')
 519         if not result:
 520             result = '_'
 521     return result
 522
 523
 524 def sanitize_path(s):
 525     """Sanitizes and normalizes path on Windows"""
 526     if sys.platform != 'win32':
 527         return s
 528     drive_or_unc, _ = os.path.splitdrive(s)
 529     if sys.version_info < (2, 7) and not drive_or_unc:
 530         drive_or_unc, _ = os.path.splitunc(s)
 531     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 532     if drive_or_unc:
 533         norm_path.pop(0)
 534     sanitized_path = [
 535         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 536         for path_part in norm_path]
 537     if drive_or_unc:
 538         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 539     return os.path.join(*sanitized_path)
 540
 541
 542 def sanitize_url(url):
 543     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 544     # the number of unwanted failures due to missing protocol
 545     if url.startswith('//'):
 546         return 'http:%s' % url
 547     # Fix some common typos seen so far
 548     COMMON_TYPOS = (
 549         # https://github.com/rg3/youtube-dl/issues/15649
 550         (r'^httpss://', r'https://'),
 551         # https://bx1.be/lives/direct-tv/
 552         (r'^rmtp([es]?)://', r'rtmp\1://'),
 553     )
 554     for mistake, fixup in COMMON_TYPOS:
 555         if re.match(mistake, url):
 556             return re.sub(mistake, fixup, url)
 557     return url
 558
 559
 560 def sanitized_Request(url, *args, **kwargs):
 561     return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
 562
 563
 564 def expand_path(s):
 565     """Expand shell variables and ~"""
 566     return os.path.expandvars(compat_expanduser(s))
 567
 568
 569 def orderedSet(iterable):
 570     """ Remove all duplicates from the input iterable """
 571     res = []
 572     for el in iterable:
 573         if el not in res:
 574             res.append(el)
 575     return res
 576
 577
 578 def _htmlentity_transform(entity_with_semicolon):
 579     """Transforms an HTML entity to a character."""
 580     entity = entity_with_semicolon[:-1]
 581
 582     # Known non-numeric HTML entity
 583     if entity in compat_html_entities.name2codepoint:
 584         return compat_chr(compat_html_entities.name2codepoint[entity])
 585
 586     # TODO: HTML5 allows entities without a semicolon. For example,
 587     # '&Eacuteric' should be decoded as 'Éric'.
 588     if entity_with_semicolon in compat_html_entities_html5:
 589         return compat_html_entities_html5[entity_with_semicolon]
 590
 591     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 592     if mobj is not None:
 593         numstr = mobj.group(1)
 594         if numstr.startswith('x'):
 595             base = 16
 596             numstr = '0%s' % numstr
 597         else:
 598             base = 10
 599         # See https://github.com/rg3/youtube-dl/issues/7518
 600         try:
 601             return compat_chr(int(numstr, base))
 602         except ValueError:
 603             pass
 604
 605     # Unknown entity in name, return its literal representation
 606     return '&%s;' % entity
 607
 608
 609 def unescapeHTML(s):
 610     if s is None:
 611         return None
 612     assert type(s) == compat_str
 613
 614     return re.sub(
 615         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 616
 617
 618 def get_subprocess_encoding():
 619     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 620         # For subprocess calls, encode with locale encoding
 621         # Refer to http://stackoverflow.com/a/9951851/35070
 622         encoding = preferredencoding()
 623     else:
 624         encoding = sys.getfilesystemencoding()
 625     if encoding is None:
 626         encoding = 'utf-8'
 627     return encoding
 628
 629
 630 def encodeFilename(s, for_subprocess=False):
 631     """
 632     @param s The name of the file
 633     """
 634
 635     assert type(s) == compat_str
 636
 637     # Python 3 has a Unicode API
 638     if sys.version_info >= (3, 0):
 639         return s
 640
 641     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 642     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 643     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 644     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 645         return s
 646
 647     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 648     if sys.platform.startswith('java'):
 649         return s
 650
 651     return s.encode(get_subprocess_encoding(), 'ignore')
 652
 653
 654 def decodeFilename(b, for_subprocess=False):
 655
 656     if sys.version_info >= (3, 0):
 657         return b
 658
 659     if not isinstance(b, bytes):
 660         return b
 661
 662     return b.decode(get_subprocess_encoding(), 'ignore')
 663
 664
 665 def encodeArgument(s):
 666     if not isinstance(s, compat_str):
 667         # Legacy code that uses byte strings
 668         # Uncomment the following line after fixing all post processors
 669         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 670         s = s.decode('ascii')
 671     return encodeFilename(s, True)
 672
 673
 674 def decodeArgument(b):
 675     return decodeFilename(b, True)
 676
 677
 678 def decodeOption(optval):
 679     if optval is None:
 680         return optval
 681     if isinstance(optval, bytes):
 682         optval = optval.decode(preferredencoding())
 683
 684     assert isinstance(optval, compat_str)
 685     return optval
 686
 687
 688 def formatSeconds(secs):
 689     if secs > 3600:
 690         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 691     elif secs > 60:
 692         return '%d:%02d' % (secs // 60, secs % 60)
 693     else:
 694         return '%d' % secs
 695
 696
 697 def make_HTTPS_handler(params, **kwargs):
 698     opts_no_check_certificate = params.get('nocheckcertificate', False)
 699     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 700         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 701         if opts_no_check_certificate:
 702             context.check_hostname = False
 703             context.verify_mode = ssl.CERT_NONE
 704         try:
 705             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 706         except TypeError:
 707             # Python 2.7.8
 708             # (create_default_context present but HTTPSHandler has no context=)
 709             pass
 710
 711     if sys.version_info < (3, 2):
 712         return YoutubeDLHTTPSHandler(params, **kwargs)
 713     else:  # Python < 3.4
 714         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 715         context.verify_mode = (ssl.CERT_NONE
 716                                if opts_no_check_certificate
 717                                else ssl.CERT_REQUIRED)
 718         context.set_default_verify_paths()
 719         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 720
 721
 722 def bug_reports_message():
 723     if ytdl_is_updateable():
 724         update_cmd = 'type  youtube-dl -U  to update'
 725     else:
 726         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 727     msg = '; please report this issue on https://yt-dl.org/bug .'
 728     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 729     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 730     return msg
 731
 732
 733 class YoutubeDLError(Exception):
 734     """Base exception for YoutubeDL errors."""
 735     pass
 736
 737
 738 class ExtractorError(YoutubeDLError):
 739     """Error during info extraction."""
 740
 741     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 742         """ tb, if given, is the original traceback (so that it can be printed out).
 743         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 744         """
 745
 746         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 747             expected = True
 748         if video_id is not None:
 749             msg = video_id + ': ' + msg
 750         if cause:
 751             msg += ' (caused by %r)' % cause
 752         if not expected:
 753             msg += bug_reports_message()
 754         super(ExtractorError, self).__init__(msg)
 755
 756         self.traceback = tb
 757         self.exc_info = sys.exc_info()  # preserve original exception
 758         self.cause = cause
 759         self.video_id = video_id
 760
 761     def format_traceback(self):
 762         if self.traceback is None:
 763             return None
 764         return ''.join(traceback.format_tb(self.traceback))
 765
 766
 767 class UnsupportedError(ExtractorError):
 768     def __init__(self, url):
 769         super(UnsupportedError, self).__init__(
 770             'Unsupported URL: %s' % url, expected=True)
 771         self.url = url
 772
 773
 774 class RegexNotFoundError(ExtractorError):
 775     """Error when a regex didn't match"""
 776     pass
 777
 778
 779 class GeoRestrictedError(ExtractorError):
 780     """Geographic restriction Error exception.
 781
 782     This exception may be thrown when a video is not available from your
 783     geographic location due to geographic restrictions imposed by a website.
 784     """
 785     def __init__(self, msg, countries=None):
 786         super(GeoRestrictedError, self).__init__(msg, expected=True)
 787         self.msg = msg
 788         self.countries = countries
 789
 790
 791 class DownloadError(YoutubeDLError):
 792     """Download Error exception.
 793
 794     This exception may be thrown by FileDownloader objects if they are not
 795     configured to continue on errors. They will contain the appropriate
 796     error message.
 797     """
 798
 799     def __init__(self, msg, exc_info=None):
 800         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 801         super(DownloadError, self).__init__(msg)
 802         self.exc_info = exc_info
 803
 804
 805 class SameFileError(YoutubeDLError):
 806     """Same File exception.
 807
 808     This exception will be thrown by FileDownloader objects if they detect
 809     multiple files would have to be downloaded to the same file on disk.
 810     """
 811     pass
 812
 813
 814 class PostProcessingError(YoutubeDLError):
 815     """Post Processing exception.
 816
 817     This exception may be raised by PostProcessor's .run() method to
 818     indicate an error in the postprocessing task.
 819     """
 820
 821     def __init__(self, msg):
 822         super(PostProcessingError, self).__init__(msg)
 823         self.msg = msg
 824
 825
 826 class MaxDownloadsReached(YoutubeDLError):
 827     """ --max-downloads limit has been reached. """
 828     pass
 829
 830
 831 class UnavailableVideoError(YoutubeDLError):
 832     """Unavailable Format exception.
 833
 834     This exception will be thrown when a video is requested
 835     in a format that is not available for that video.
 836     """
 837     pass
 838
 839
 840 class ContentTooShortError(YoutubeDLError):
 841     """Content Too Short exception.
 842
 843     This exception may be raised by FileDownloader objects when a file they
 844     download is too small for what the server announced first, indicating
 845     the connection was probably interrupted.
 846     """
 847
 848     def __init__(self, downloaded, expected):
 849         super(ContentTooShortError, self).__init__(
 850             'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
 851         )
 852         # Both in bytes
 853         self.downloaded = downloaded
 854         self.expected = expected
 855
 856
 857 class XAttrMetadataError(YoutubeDLError):
 858     def __init__(self, code=None, msg='Unknown error'):
 859         super(XAttrMetadataError, self).__init__(msg)
 860         self.code = code
 861         self.msg = msg
 862
 863         # Parsing code and msg
 864         if (self.code in (errno.ENOSPC, errno.EDQUOT) or
 865                 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
 866             self.reason = 'NO_SPACE'
 867         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
 868             self.reason = 'VALUE_TOO_LONG'
 869         else:
 870             self.reason = 'NOT_SUPPORTED'
 871
 872
 873 class XAttrUnavailableError(YoutubeDLError):
 874     pass
 875
 876
 877 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 878     # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
 879     # expected HTTP responses to meet HTTP/1.0 or later (see also
 880     # https://github.com/rg3/youtube-dl/issues/6727)
 881     if sys.version_info < (3, 0):
 882         kwargs['strict'] = True
 883     hc = http_class(*args, **compat_kwargs(kwargs))
 884     source_address = ydl_handler._params.get('source_address')
 885     if source_address is not None:
 886         sa = (source_address, 0)
 887         if hasattr(hc, 'source_address'):  # Python 2.7+
 888             hc.source_address = sa
 889         else:  # Python 2.6
 890             def _hc_connect(self, *args, **kwargs):
 891                 sock = compat_socket_create_connection(
 892                     (self.host, self.port), self.timeout, sa)
 893                 if is_https:
 894                     self.sock = ssl.wrap_socket(
 895                         sock, self.key_file, self.cert_file,
 896                         ssl_version=ssl.PROTOCOL_TLSv1)
 897                 else:
 898                     self.sock = sock
 899             hc.connect = functools.partial(_hc_connect, hc)
 900
 901     return hc
 902
 903
 904 def handle_youtubedl_headers(headers):
 905     filtered_headers = headers
 906
 907     if 'Youtubedl-no-compression' in filtered_headers:
 908         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 909         del filtered_headers['Youtubedl-no-compression']
 910
 911     return filtered_headers
 912
 913
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Store params (read later via self._params) and initialise the base handler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req, switching to a SOCKS connection class when requested.

        The internal 'Ytdl-socks-proxy' header selects the proxy and is
        removed from the request before it is sent.
        """
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress data, trying raw deflate first and zlib-wrapped second."""
        try:
            # Negative wbits: raw deflate stream without zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        """Pre-process an outgoing request and return the (possibly new) request.

        Escapes the URL, adds missing standard headers and applies the
        'Youtubedl-no-compression' marker.
        """
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments:
            # strip the fragment from the name-mangled private attributes
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Post-process a response: decompress the body and escape redirects.

        Returns either resp itself or a replacement addinfourl object that
        wraps the decompressed body.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes removed; if no attempt
                # succeeds, the first error is re-raised.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    # Python 2 header values are byte strings
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic goes through the same request/response hooks
    https_request = http_request
    https_response = http_response
1035
1036
def make_socks_conn_class(base_class, socks_proxy):
    """Build a subclass of base_class that connects through a SOCKS proxy.

    socks_proxy is a proxy URL whose scheme selects the protocol
    ('socks5', 'socks4a', or 'socks'/'socks4'). Username and password, if
    present in the URL, are percent-decoded. The port defaults to 1080.

    Raises ValueError for any other scheme (previously this surfaced as an
    UnboundLocalError further down).
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        raise ValueError(
            'Unsupported SOCKS proxy scheme: %s' % url_components.scheme)

    def unquote_if_non_empty(s):
        # None and '' are passed through untouched
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # Wrap the tunnelled socket in TLS for HTTPS connections
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1078
1079
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that opens connections via _create_http_connection."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._params = params
        if https_conn_class:
            self._https_conn_class = https_conn_class
        else:
            self._https_conn_class = compat_http_client.HTTPSConnection

    def https_open(self, req):
        """Open req over HTTPS, honouring the internal 'Ytdl-socks-proxy' header."""
        conn_class = self._https_conn_class

        # Forward whichever TLS settings the base handler provides
        open_kwargs = {}
        for attr_name, kwarg_name in (
                ('_context', 'context'),  # python > 2.6
                ('_check_hostname', 'check_hostname')):  # python 3.x
            if hasattr(self, attr_name):
                open_kwargs[kwarg_name] = getattr(self, attr_name)

        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            conn_class = make_socks_conn_class(conn_class, proxy_url)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
1103
1104
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor whose HTTP hooks are also used for HTTPS.

    http_response currently only delegates to the base class; the
    Set-Cookie escaping workaround below is commented out.
    """

    def __init__(self, cookiejar=None):
        # Explicit base-class call rather than super()
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        """Delegate response handling to HTTPCookieProcessor.http_response."""
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # HTTPS reuses the HTTP request/response hooks
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1127
1128
def extract_timezone(date_str):
    """Split a trailing 'Z' or +HH[:]MM / -HH[:]MM offset off date_str.

    Returns (offset, remainder), where offset is a datetime.timedelta
    (zero when nothing was recognised or for 'Z') and remainder is
    date_str without the timezone suffix.
    """
    tz_match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not tz_match:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(tz_match.group('tz'))]
    sign_text = tz_match.group('sign')
    if not sign_text:
        # Plain 'Z' suffix: UTC
        return datetime.timedelta(), remainder

    factor = 1 if sign_text == '+' else -1
    offset = datetime.timedelta(
        hours=factor * int(tz_match.group('hours')),
        minutes=factor * int(tz_match.group('minutes')))
    return offset, remainder
1145
1146
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    Fractional seconds are dropped. When timezone is None the offset is
    taken from date_str itself; unparsable input yields None.
    """
    if date_str is None:
        return None

    without_fraction = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, without_fraction = extract_timezone(without_fraction)

    try:
        layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        utc_dt = datetime.datetime.strptime(without_fraction, layout) - timezone
        return calendar.timegm(utc_dt.timetuple())
    except ValueError:
        return None
1164
1165
def date_formats(day_first=True):
    """Return the day-first or month-first collection of date formats."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1168
1169
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # Every format is tried; the last one that parses wins
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to RFC 2822 style parsing
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return compat_str(result)
1196
1197
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from date_str, or None on failure."""
    if date_str is None:
        return None

    text = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', text) else 0
    timezone, text = extract_timezone(text)

    # Remove AM/PM + timezone
    text = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', text)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    tz_tail = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', text)
    if tz_tail:
        text = text[:-len(tz_tail.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    nanos = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', text)
    if nanos:
        text = nanos.group(1)

    pm_offset = datetime.timedelta(hours=pm_delta)
    for fmt in date_formats(day_first):
        try:
            moment = datetime.datetime.strptime(text, fmt) - timezone + pm_offset
            return calendar.timegm(moment.timetuple())
        except ValueError:
            continue

    # Fall back to RFC 2822 style parsing
    parsed = email.utils.parsedate_tz(text)
    if parsed:
        return calendar.timegm(parsed) + pm_delta * 3600
    return None
1229
1230
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url, falling back to default_ext."""
    if url is None or '.' not in url:
        return default_ext

    before_query = url.partition('?')[0]
    candidate = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1242
1243
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the last extension of filename with '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1246
1247
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    relative = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()

    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount

    unit = relative.group('unit')
    # A bad approximation? Months and years become fixed day counts.
    days_per_unit = {'month': 30, 'year': 365}
    if unit in days_per_unit:
        amount *= days_per_unit[unit]
        unit = 'day'
    return today + datetime.timedelta(**{unit + 's': amount})
1275
1276
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        # Not in the expected format: hand it back unchanged
        return date_str
    return '-'.join(parts.groups())
1285
1286
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # A missing bound defaults to the earliest/latest representable date
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
1316
1317
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte strings are decoded with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1326
1327
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps file descriptors 1 and 2 to the values passed to GetStdHandle
    # (-11 and -12; STD_OUTPUT_HANDLE / STD_ERROR_HANDLE in the Win32 API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        # Only descriptors 1 and 2 are handled here
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is invalid, is not a character device
        # (ignoring the REMOTE flag), or GetConsoleMode fails on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters, stopping before any non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written as 2 units
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever was actually written
            s = s[written.value:]
    return True
1401
1402
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    Byte-oriented streams receive s encoded with encoding, the stream's
    own encoding or the preferred encoding; unencodable characters are
    dropped.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        target_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1423
1424
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of ints."""
    if not bs:
        return []
    # Python 3 yields ints when indexing bytes, Python 2 yields 1-char strings
    if isinstance(bs[0], int):
        return list(bs)
    return [ord(ch) for ch in bs]
1432
1433
def intlist_to_bytes(xs):
    """Pack a sequence of byte values into a byte string."""
    if xs:
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
1438
1439
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx on win32, fcntl.flock elsewhere, and
# stubs that raise IOError when fcntl cannot be imported.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure passed by pointer as the last argument of
        # LockFileEx / UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high DWORDs of the byte count passed to LockFileEx/UnlockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f via LockFileEx; raises OSError when the call fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so _unlock_file can pass the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags is 0x2 for an exclusive lock, 0x0 otherwise
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Unlock f via UnlockFileEx; f must have been locked by _lock_file."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock on f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock held on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            """Stub: always raises IOError because fcntl is unavailable."""
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            """Stub: always raises IOError because fcntl is unavailable."""
            raise IOError(UNSUPPORTED_MSG)
1513
1514
class locked_file(object):
    """Context manager around a text file that is locked while in use.

    The file is opened in the constructor. Entering the context takes a
    shared lock for mode 'r' and an exclusive lock for 'a'/'w'; leaving it
    unlocks and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is not an IOError
            # on Python 2; catch both so the handle is never leaked.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Always close the file, even if unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1544
1545
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1549
1550
def shell_quote(args):
    """Quote each argument for the shell and join them with spaces."""
    encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join(compat_shlex_quote(as_text(arg)) for arg in args)
1560
1561
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Data already smuggled in the URL is merged into (and overrides) data
    plain_url, existing = unsmuggle_url(url, {})
    data.update(existing)
    payload = json.dumps(data)
    fragment = compat_urllib_parse_urlencode({'__youtubedl_smuggle': payload})
    return plain_url + '#' + fragment
1570
1571
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data); (smug_url, default) if none."""
    if '#__youtubedl_smuggle' in smug_url:
        plain_url, _sep, encoded = smug_url.rpartition('#')
        payload = compat_parse_qs(encoded)['__youtubedl_smuggle'][0]
        return plain_url, json.loads(payload)
    return smug_url, default
1579
1580
def format_bytes(bytes):
    """Format a byte count as a string with a binary (1024-based) suffix.

    Returns 'N/A' for None. A str argument is converted with float().
    The suffix is clamped to the supported range 'B'..'YiB', so values of
    1024**9 and above are expressed in YiB and positive values below 1 in
    B (previously these raised IndexError or picked a wrong suffix).
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the index inside the suffix table
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1593
1594
def lookup_unit_table(unit_table, s):
    """Parse '<number> <unit>' at the start of s using unit_table.

    Returns int(number * unit_table[unit]), or None when s does not start
    with a number followed by one of the table's units. A comma is
    accepted as decimal separator.
    """
    alternatives = '|'.join(re.escape(unit) for unit in unit_table)
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives, s)
    if found is None:
        return None
    number = float(found.group('num').replace(',', '.'))
    multiplier = unit_table[found.group('unit')]
    return int(number * multiplier)
1604
1605
def parse_filesize(s):
    """Parse a human-readable file size such as '1.5 MiB' into bytes.

    Returns None for None input or when nothing could be parsed.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    # (prefix letter, decimal word prefix, binary word prefix), in
    # ascending order of magnitude
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )
    for power, (letter, decimal_word, binary_word) in enumerate(prefixes, 1):
        decimal = 1000 ** power
        binary = 1024 ** power
        lower = letter.lower()
        unit_table.update({
            letter + 'iB': binary,
            letter + 'B': decimal,
            lower + 'B': binary,
            letter + 'b': decimal,
            lower + 'b': decimal,
            decimal_word + 'bytes': decimal,
            binary_word + 'bytes': binary,
        })

    return lookup_unit_table(unit_table, s)
1675
1676
def parse_count(s):
    """Parse a count such as '1,234' or '1.5k' into an int (None for None)."""
    if s is None:
        return None

    stripped = s.strip()

    # Plain digits with separators need no unit handling
    if re.match(r'^[\d,.]+$', stripped):
        return str_to_int(stripped)

    multipliers = {}
    for suffix, factor in (('k', 1000), ('m', 1000 ** 2), ('kk', 1000 ** 2)):
        multipliers[suffix] = factor
        multipliers[suffix.upper()] = factor

    return lookup_unit_table(multipliers, stripped)
1696
1697
def parse_resolution(s):
    """Extract 'width'/'height' from text like '1920x1080', '720p' or '4k'.

    Returns a dict with the keys that could be determined ({} if none).
    """
    if s is None:
        return {}

    dimensions = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
    if dimensions:
        return {
            'width': int(dimensions.group('w')),
            'height': int(dimensions.group('h')),
        }

    lines = re.search(r'\b(\d+)[pPiI]\b', s)
    if lines:
        return {'height': int(lines.group(1))}

    # '4k' / '8k' shorthand: height is the number times 540
    shorthand = re.search(r'\b([48])[kK]\b', s)
    if shorthand:
        return {'height': int(shorthand.group(1)) * 540}

    return {}
1718
1719
def month_by_name(name, lang='en'):
    """ Return the number of a month by its name in the given language

    Unknown languages fall back to the English names; an unknown month
    name yields None.
    """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name in names:
        return names.index(name) + 1
    return None
1729
1730
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    if abbrev in abbreviations:
        return abbreviations.index(abbrev) + 1
    return None
1739
1740
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Ampersands that already start a known entity or a short numeric
    # character reference are left alone
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1747
1748
def setproctitle(title):
    """Best-effort attempt to set the process name through libc's prctl.

    title must be a compat_str.  The function silently does nothing on
    Jython, when 'libc.so.6' cannot be loaded, or when the loaded libc
    has no prctl symbol.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return

    # Initializing the buffer from the bytes makes ctypes allocate one
    # extra byte for the terminating NUL.  A buffer sized exactly to the
    # title has no terminator, so prctl could read past its end.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1773
1774
def remove_start(s, start):
    """Return s without its leading start; s is returned unchanged when it
    is None or does not begin with start."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1777
1778
def remove_end(s, end):
    """Return s without its trailing end; s is returned unchanged when it
    is None, when end is empty, or when s does not finish with end.

    The empty-end guard matters: every string ends with '', and slicing
    with s[:-0] would wipe the whole string instead of keeping it.
    """
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
1781
1782
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s.

    None and strings shorter than two characters are returned as is, as
    are strings whose first and last characters are not the same quote.
    """
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1790
1791
def url_basename(url):
    """Return the last segment of the URL's path, ignoring leading and
    trailing slashes ('' when the path has no segment)."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1795
1796
def base_url(url):
    """Return the http(s) URL up to and including the last '/' that comes
    before any '?', '#' or '&'.

    Raises AttributeError when url does not have that shape.
    """
    prefix = re.match(r'https?://[^?#&]+/', url)
    return prefix.group()
1799
1800
def urljoin(base, path):
    """Join path onto base, returning None instead of a bogus URL.

    Byte strings are decoded as UTF-8.  None is returned when path is
    empty or not a string, or when base is not an http(s) or
    protocol-relative URL.  A path that is itself such a URL is returned
    unchanged without looking at base.
    """
    absolute_re = r'^(?:https?:)?//'

    def _as_text(value):
        return value.decode('utf-8') if isinstance(value, bytes) else value

    path = _as_text(path)
    if not (isinstance(path, compat_str) and path):
        return None
    if re.match(absolute_re, path):
        return path

    base = _as_text(base)
    if not (isinstance(base, compat_str) and re.match(absolute_re, base)):
        return None
    return compat_urlparse.urljoin(base, path)
1814
1815
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose HTTP method is always HEAD."""

    def get_method(self):
        # Overrides the base class's method selection with a fixed verb.
        return 'HEAD'
1819
1820
class PUTRequest(compat_urllib_request.Request):
    """Request subclass whose HTTP method is always PUT."""

    def get_method(self):
        # Overrides the base class's method selection with a fixed verb.
        return 'PUT'
1824
1825
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, returning default when that is not possible.

    v:        value to convert; None and '' yield default.
    scale:    the result is floor-divided by this.
    default:  returned for missing or unconvertible values.
    get_attr: if set, the attribute of that name is read from v first
              (a missing attribute counts as None).
    invscale: the result is multiplied by this before dividing by scale.

    Values int() rejects with ValueError, TypeError (e.g. lists, dicts)
    or OverflowError (infinite floats) also yield default.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
1838
1839
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1842
1843
def str_to_int(int_str):
    """ Convert a string to int after dropping every ',', '.' and '+'
        character; None is passed through unchanged """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1850
1851
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, returning default when that is not possible.

    The converted value is multiplied by invscale and divided by scale.
    None, and values float() rejects with ValueError or TypeError
    (e.g. lists or dicts), yield default.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
1859
1860
def bool_or_none(v, default=None):
    """Return v if it is a real bool (True or False), otherwise default."""
    if isinstance(v, bool):
        return v
    return default
1863
1864
def strip_or_none(v):
    """Return v.strip(), passing None through unchanged."""
    if v is None:
        return None
    return v.strip()
1867
1868
def parse_duration(s):
    """Parse a textual duration and return its length in seconds.

    Three notations are tried in turn: clock style
    ('[[[days:]hours:]mins:]secs[.fraction]'), unit style (an optional
    'P...T' date part followed by hour / minute / second amounts with
    unit names), and a single decimal amount of hours or minutes.
    Returns None when s is not a string or matches none of them.
    """
    if not isinstance(s, compat_basestring):
        return None

    text = s.strip()
    days = hours = mins = secs = ms = None

    clock = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', text)
    if clock:
        days, hours, mins, secs, ms = clock.groups()
    else:
        # Year, month and week amounts are accepted but carry no value.
        worded = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', text)
        if worded:
            days, hours, mins, secs, ms = worded.groups()
        else:
            single = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', text)
            if not single:
                return None
            hours, mins = single.groups()

    # Each present component converted to seconds, in summation order.
    terms = (
        float(secs) if secs else None,
        float(mins) * 60 if mins else None,
        float(hours) * 60 * 60 if hours else None,
        float(days) * 24 * 60 * 60 if days else None,
        float(ms) if ms else None,
    )
    total = 0
    for term in terms:
        if term is not None:
            total += term
    return total
1925
1926
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of filename's real extension.

    If expected_real_ext is given and the real extension differs from it,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1933
1934
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace filename's real extension with ext.

    If expected_real_ext is given and the real extension differs from it,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
1940
1941
def check_executable(exe, args=[]):
    """ Run exe with args and wait for it to finish, discarding its output.
    Returns exe if the process could be started, False if starting it
    raised OSError. args should make the program exit quickly (like
    -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1950
1951
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Run exe with args and return the version detect_exe_version finds
    in its combined stdout/stderr, or False if the executable could not
    be started """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    # The pipe yields a byte string; detect_exe_version wants text.
    if isinstance(output, bytes):
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1969
1970
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Return the first capture group of version_re found in output.

    version_re defaults to a pattern matching 'version <something>'.
    When nothing matches, unrecognized is returned.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1980
1981
class PagedList(object):
    """Base class for paged lists; subclasses supply getslice()."""

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1986
1987
class OnDemandPagedList(PagedList):
    """Paged list that requests pages one at a time, only as far as needed.

    pagefunc is called with a zero-based page number and must return an
    iterable of that page's entries; pagesize is the number of entries a
    full page holds.  With use_cache, fetched pages are remembered per
    page number so later slices do not call pagefunc for them again.
    """

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            # page number -> list of that page's entries
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list.

        end=None means "until the last page"; the last page is detected
        by it holding fewer than pagesize entries.
        """
        res = []
        # Begin with the page that contains index `start`.
        for pagenum in itertools.count(start // self._pagesize):
            # Global indices of this page's first entry and of the next
            # page's first entry.
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset inside this page where the slice starts (non-zero
            # only on the page containing `start`).
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset inside this page where the slice stops, or None when
            # the slice continues past this page.
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
2038
2039
class InAdvancePagedList(PagedList):
    """Paged list for sources whose number of pages is known up front.

    pagefunc is called with a zero-based page number and must return an
    iterable of that page's entries; pagecount is the total number of
    pages and pagesize the number of entries per page.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list;
        end=None means up to the last page."""
        res = []
        start_page = start // self._pagesize
        # Exclusive upper bound of the pages that can contribute entries.
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Entries to drop from the front of the first fetched page.
        skip_elems = start - start_page * self._pagesize
        # Number of entries still wanted, or None for "no limit".
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                # Only the first page is trimmed at the front.
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the slice: keep what is still
                    # needed and stop fetching.
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
2067
2068
def uppercase_escape(s):
    r"""Replace every literal '\UXXXXXXXX' escape (eight hex digits) in s
    by the character it denotes; the rest of s is left as is."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
2075
2076
def lowercase_escape(s):
    r"""Replace every literal '\uXXXX' escape (four hex digits) in s by
    the character it denotes; the rest of s is left as is."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _expand, s)
2083
2084
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 the text is UTF-8 encoded before it is passed to quote().
    must_encode = sys.version_info < (3, 0) and isinstance(s, compat_str)
    quotable = s.encode('utf-8') if must_encode else s
    return compat_urllib_parse.quote(quotable, b"%/;:@&=+$,!~*'()?#[]")
2090
2091
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host goes through IDNA; every other component is percent-escaped.
    ascii_netloc = parts.netloc.encode('idna').decode('ascii')
    escaped = parts._replace(
        netloc=ascii_netloc,
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
2102
2103
def read_batch_urls(batch_fd):
    """Read URLs, one per line, from batch_fd and close it afterwards.

    Byte lines are decoded as UTF-8 (undecodable bytes are replaced).
    A leading byte order mark and surrounding whitespace are removed;
    empty lines and lines starting with '#', ';' or ']' are dropped.
    Returns the remaining URLs as a list.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM appears as U+FEFF once the line has been decoded as
        # UTF-8, or as the three characters '\xef\xbb\xbf' when the raw
        # bytes were mapped one-to-one to characters.
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2118
2119
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments with compat_urllib_parse_urlencode and
    return the result as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2122
2123
def update_url_query(url, query):
    """Return url with the entries of the query mapping merged into its
    query string (existing parameters of the same name are replaced).
    An empty query returns url unchanged."""
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    params.update(query)
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2132
2133
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request from req with selected parts overridden.

    headers are merged over a copy of req's headers, data and url
    replace req's when truthy, and query is merged into the URL's query
    string.  The HTTP method (HEAD / PUT / default), origin_req_host,
    unverifiable and, if present, timeout are carried over.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)

    # Pick the request class that reports the same method as req.
    method = req.get_method()
    if method == 'HEAD':
        request_cls = HEADRequest
    elif method == 'PUT':
        request_cls = PUTRequest
    else:
        request_cls = compat_urllib_request.Request

    clone = request_cls(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        clone.timeout = req.timeout
    return clone
2152
2153
def _multipart_encode_impl(data, boundary):
    """Encode the data mapping as multipart/form-data using boundary.

    Returns (body_bytes, content_type).  Raises ValueError when the
    boundary occurs inside any encoded field.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    delimiter = boundary.encode('ascii')

    pieces = []
    for name, value in data.items():
        if isinstance(name, compat_str):
            name = name.encode('utf-8')
        if isinstance(value, compat_str):
            value = value.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        part = b'Content-Disposition: form-data; name="' + name + b'"\r\n\r\n' + value + b'\r\n'
        if delimiter in part:
            raise ValueError('Boundary overlaps with data')
        pieces.extend((b'--', delimiter, b'\r\n', part))

    pieces.extend((b'--', delimiter, b'--\r\n'))

    return b''.join(pieces), content_type
2174
2175
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a (body, content_type) tuple. A caller-supplied boundary that
    occurs in the data raises ValueError; a generated one is simply
    replaced by a new random boundary.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            # Collision with the payload: draw another boundary.
            continue
2204
2205
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key of a list/tuple, in d.

    With a list or tuple, keys are tried in order; missing keys and None
    values are skipped, as are falsy values unless skip_false_values is
    False.  default is returned when no key qualifies.  With a single
    key this is plain d.get(key, default).
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)

    for key in key_or_keys:
        value = d.get(key)
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
2214
2215
def try_get(src, getter, expected_type=None):
    """Apply getter (a callable, or a list/tuple of callables tried in
    order) to src and return the first result obtained.

    A getter raising AttributeError, KeyError, TypeError or IndexError is
    skipped, as is a result that is not an instance of expected_type
    (when given).  Returns None if no getter yields a usable value.
    """
    getters = getter if isinstance(getter, (list, tuple)) else [getter]
    for get in getters:
        try:
            value = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
    return None
2227
2228
def merge_dicts(*dicts):
    """Merge dicts into a new dict, earlier dicts taking precedence.

    None values are ignored.  A key keeps the first value seen for it,
    except that an empty string already stored is replaced by a later
    non-empty string.
    """
    merged = {}
    for source in dicts:
        for key, value in source.items():
            if value is None:
                continue
            if key not in merged:
                merged[key] = value
                continue
            current = merged[key]
            if (isinstance(value, compat_str) and value and
                    isinstance(current, compat_str) and not current):
                merged[key] = value
    return merged
2241
2242
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as a compat_str, decoding it with encoding and
    errors when it is not one already."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2245
2246
# Age limit for each rating label; parse_age_limit returns these values
# for inputs that match a key exactly.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
2254
2255
# Age limit for each 'TV-*' rating label.  parse_age_limit builds its
# pattern from the part of each key after 'TV-' and also accepts the
# 'TV_' and bare 'TV' prefixes.
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2264
2265
def parse_age_limit(s):
    """Turn an age rating into a numeric age limit.

    Accepts a plain int (valid only within 0..21), a one- or two-digit
    string with an optional trailing '+', a US_RATINGS label, or a
    TV_PARENTAL_GUIDELINES label written with 'TV-', 'TV_' or 'TV'.
    Returns None for everything else.
    """
    # Exact type check on purpose: bool is a subclass of int.
    if type(s) is int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    if s in US_RATINGS:
        return US_RATINGS[s]

    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, s)
    if tv:
        return TV_PARENTAL_GUIDELINES['TV-' + tv.group(1)]
    return None
2280
2281
def strip_jsonp(code):
    """Return the argument text of a JSONP call such as 'cb({...});'.

    Also understands a 'window.' prefix, the 'cb && cb(...)' guard form
    and trailing '//' comments.  Text that is not a JSONP call is
    returned unchanged.
    """
    jsonp_re = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_re, r'\g<callback_data>', code)
2290
2291
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Handles single-quoted strings, unquoted identifiers, comments,
    trailing commas before ']' or '}', hexadecimal and octal integer
    literals, and integer object keys.  Returns the converted text;
    nothing is parsed or validated here.
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    # (pattern, base) pairs for integer literals JSON cannot express; the
    # optional trailing ':' marks a literal used as an object key.
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        # Rewrites one token matched by the regex at the bottom.
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v.startswith('//') or v == ',':
            # Comments and trailing commas are dropped.
            return ""

        if v[0] in ("'", '"'):
            # Quoted string: re-escape its contents for a double-quoted
            # JSON string.
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            # Only bare tokens may be integer literals.  The contents of
            # a quoted string must stay a string even when they look
            # like one ("0123" must not turn into 83).
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
2331
2332
def qualities(quality_ids):
    """ Return a function mapping a quality id to its position in
    quality_ids, or to -1 when the id is not listed """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
2341
2342
# %-style template with named fields title, id and ext; presumably the
# default output filename template (confirm against where it is used).
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2344
2345
def limit_length(s, length):
    """ Shorten s to at most length characters, ending it with '...' when
    it had to be cut; None is passed through """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
2354
2355
def version_tuple(v):
    """Split a version string on '.' and '-' and return its parts as a
    tuple of ints.  Raises ValueError for a non-numeric part."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
2358
2359
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is lower than limit.

    When version is empty or either string cannot be parsed by
    version_tuple, the answer is 'not assume_new'.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
    return outdated
2367
2368
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U: true when this
    module was loaded by a zipimporter or the interpreter is frozen """
    from zipimport import zipimporter

    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
2374
2375
def args_to_str(args):
    """Return a short, shell-quoted, space-separated string for a
    subprocess argument list."""
    quoted = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
2379
2380
def error_to_compat_str(err):
    """Return str(err) as text."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2388
2389
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    A few complete types are special-cased; otherwise the subtype
    (lowercased, parameters after ';' removed) is looked up in a table
    and returned unchanged when it is not listed.  None yields None.
    """
    if mt is None:
        return None

    full_type_map = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }
    if mt in full_type_map:
        return full_type_map[mt]

    subtype = mt.rpartition('/')[2].split(';')[0].strip().lower()

    subtype_map = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
    }
    return subtype_map.get(subtype, subtype)
2425
2426
def parse_codecs(codecs_str):
    """Split a codecs string into video and audio codec names.

    Returns {} for an empty string.  When at least one codec is
    recognized, returns {'vcodec': ..., 'acodec': ...} with 'none' for a
    missing kind; only the first codec of each kind is kept.  When none
    is recognized and exactly two codecs were given, they are taken as
    (video, audio) in that order; any other unrecognized input yields
    {}.  A warning is written to stderr for each unknown codec.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = [
        part.strip()
        for part in codecs_str.strip().strip(',').split(',')
        if part.strip()]
    vcodec, acodec = None, None
    for full_codec in split_codecs:
        # The part before the first '.' identifies the codec family.
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    if len(split_codecs) == 2:
        # Nothing recognized: fall back on the positional convention.
        # vcodec and acodec are both None here, so the names must come
        # from the parsed list itself.
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
2461
2462
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a response's headers.

    The filename of an 'attachment' Content-Disposition header is
    preferred; otherwise the Content-Type header is mapped with
    mimetype2ext.
    """
    header = url_handle.headers.get

    disposition = header('Content-Disposition')
    if disposition:
        found = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if found:
            ext = determine_ext(found.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(header('Content-Type'))
2475
2476
def encode_data_uri(data, mime_type):
    """Return a base64 'data:' URI carrying the bytes data with the given
    MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2479
2480
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked, i.e. both limits
    are set and age_limit is lower than content_limit """

    # No limit configured, or content available for everyone.
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2489
2490
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    The bytes are decoded according to a leading byte order mark (UTF-8
    when there is none); the result is truthy when the first
    non-whitespace character is '<'. """

    # UTF-32 marks come before UTF-16 ones because the UTF-32-LE mark
    # starts with the UTF-16-LE mark.
    known_boms = (
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    )
    encoding, offset = 'utf-8', 0
    for marker, name in known_boms:
        if first_bytes.startswith(marker):
            encoding, offset = name, len(marker)
            break
    text = first_bytes[offset:].decode(encoding, 'replace')

    return re.match(r'^\s*<', text)
2509
2510
def determine_protocol(info_dict):
    """Return the protocol for the format described by info_dict.

    An explicit 'protocol' entry wins.  Otherwise it is derived from
    'url': a leading 'rtmp', 'mms' or 'rtsp', then an 'm3u8' or 'f4m'
    extension, and finally the URL scheme.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    return compat_urllib_parse_urlparse(url).scheme
2531
2532
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values, as text with
    left-aligned columns and the header as first line """
    rows = [header_row] + data
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*rows)]
    # Every column except the last is padded to its width plus one.
    padded = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
2539
2540
def _match_one(filter_part, dct):
    """Evaluate one filter expression against the dict dct.

    Two forms are understood:
      * 'key OP value', where OP is <, <=, >, >=, = or != and value is a
        number (optionally with a size suffix), a quoted string or a
        bare word.  A '?' after OP makes the expression pass when the
        key is missing or None.
      * 'key' / '!key', testing for presence / absence (for bool values:
        for being True / False).

    Returns the (truthy or falsy) result; raises ValueError for a part
    that cannot be parsed or that applies an ordering operator to a
    string.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if (m.group('quotedstrval') is not None or
            m.group('strval') is not None or
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/rg3/youtube-dl/issues/11082).
            actual_value is not None and m.group('intval') is not None and
                isinstance(actual_value, compat_str)):
            # String comparison: only equality and inequality make sense.
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
            quote = m.group('quote')
            if quote is not None:
                # Unescape the quote character inside a quoted value.
                comparison_value = comparison_value.replace(r'\%s' % quote, quote)
        else:
            # Numeric comparison: a plain integer, else a file size, with
            # a 'B' appended as a last attempt.
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        if actual_value is None:
            # Missing field: passes only when the '?' marker was given
            # (the matched marker text is truthy, None otherwise).
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
2609
2610
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax: filter_str is a
    list of '&'-separated parts that must all match.
    Returns True (=passes filter) or false """

    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2616
2617
def match_filter_func(filter_str):
    """Return a function that checks an info dict against filter_str.

    The returned function yields None when the dict passes the filter
    and a human-readable skip message otherwise.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
2626
2627
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP time expression to seconds as a float.

    Accepts an offset such as '12.5' or '12.5s' and clock time
    'H:MM:SS' with an optional fraction after '.' or ':'.  Returns None
    for empty input or anything else.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
    return None
2639
2640
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode 'HH:MM:SS,mmm'."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2643
2644
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitle data to SRT.

    Every <p> element becomes one SRT cue. The styling properties listed in
    SUPPORTED_STYLING are rendered as <font>/<b>/<i>/<u> markup, other
    styling is ignored. Paragraphs without a parsable begin time, or with
    neither an end time nor a duration, are skipped.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no <p> elements
    '''
    # Older namespace URIs are rewritten to the current ones up front so the
    # lookups below only have to deal with a single set of namespaces
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    # style id -> resolved properties (inherited ones included)
    styles = {}
    # properties taken from the style referenced by <body>/<div>
    default_style = {}

    class TTMLPElementParser(object):
        """XMLParser target rendering one <p> element as SRT text."""

        def __init__(self):
            # All state lives on the instance. Class-level lists would be
            # shared by the parsers of every paragraph.
            self._out = ''
            # One entry per open element: the SRT tags it opened
            self._unclosed_elements = []
            # One entry per open element: the style in effect inside it
            self._applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
                return

            inherited_style = self._applied_styles[-1] if self._applied_styles else {}
            unclosed_elements = []

            # Precedence, lowest to highest: document default, referenced
            # style, inline tts:* attributes
            style = {}
            element_style_id = attrib.get('style')
            if default_style:
                style.update(default_style)
            if element_style_id:
                style.update(styles.get(element_style_id, {}))
            for prop in SUPPORTED_STYLING:
                prop_val = attrib.get(_x('tts:' + prop))
                if prop_val:
                    style[prop] = prop_val

            font = ''
            for k, v in sorted(style.items()):
                # Only emit markup for what differs from the enclosing element
                if inherited_style.get(k) == v:
                    continue
                if k == 'color':
                    font += ' color="%s"' % v
                elif k == 'fontSize':
                    font += ' size="%s"' % v
                elif k == 'fontFamily':
                    font += ' face="%s"' % v
                elif k == 'fontWeight' and v == 'bold':
                    self._out += '<b>'
                    unclosed_elements.append('b')
                elif k == 'fontStyle' and v == 'italic':
                    self._out += '<i>'
                    unclosed_elements.append('i')
                elif k == 'textDecoration' and v == 'underline':
                    self._out += '<u>'
                    unclosed_elements.append('u')
            if font:
                self._out += '<font' + font + '>'
                unclosed_elements.append('font')

            # Push exactly one entry on both stacks for every element, so
            # that end() can pop unconditionally and the stacks always mirror
            # the element nesting
            applied_style = dict(inherited_style)
            applied_style.update(style)
            self._applied_styles.append(applied_style)
            self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag in (_x('ttml:br'), 'br'):
                return
            for element in reversed(self._unclosed_elements.pop()):
                self._out += '</%s>' % element
            self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Re-serialise the paragraph and stream it through the target above
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance. A style can only be resolved once its parent
    # is, and parents may be declared after their children, so keep making
    # passes over the <style> elements until nothing is left waiting.
    style_elements = dfxp.findall(_x('.//ttml:style'))
    lenient = False
    while True:
        unresolved = False
        resolved_before = len(styles)
        for style in style_elements:
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id in styles:
                    styles[style_id] = styles[parent_style_id].copy()
                elif not lenient:
                    unresolved = True
                    continue
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if not unresolved:
            break
        if len(styles) == resolved_before:
            # No progress: the remaining parents are missing, cyclic or carry
            # no supported property and will never show up. Do one last pass
            # that ignores them instead of looping forever.
            lenient = True

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # index counts every paragraph, including the skipped ones
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2807
2808
def cli_option(params, command_option, param):
    """Build the argument pair for a command line option that takes a value.

    Looks up param in params and returns [command_option, value] with the
    value converted to a text string, or an empty list when the parameter is
    missing or None.
    """
    value = params.get(param)
    if value is None:
        return []
    # Convert unconditionally: falsy values such as 0 used to skip the
    # conversion and ended up as non-string entries in the argument list
    return [command_option, compat_str(value)]
2814
2815
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the arguments for a command line option taking a boolean value.

    Returns an empty list when param is missing from params or None.
    Otherwise the stored value must be a bool; it is rendered as true_value
    or false_value and returned either as a separate argument or, when a
    separator is given, joined to command_option as a single argument.
    """
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if not separator:
        return [command_option, rendered]
    return [command_option + separator + rendered]
2824
2825
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if params[param] equals expected_value.

    A missing parameter is looked up as None. In every non-matching case an
    empty list is returned.
    """
    if params.get(param) == expected_value:
        return [command_option]
    return []
2829
2830
def cli_configuration_args(params, param, default=None):
    """Return the list of extra command line arguments stored under param.

    Falls back to default when the parameter is missing or None. If no
    default is supplied, a new empty list is returned on every call, so a
    caller extending the result in place cannot affect later calls (a shared
    mutable default list used to be handed out here).
    """
    ex_args = params.get(param)
    if ex_args is None:
        return [] if default is None else default
    assert isinstance(ex_args, list)
    return ex_args
2837
2838
class ISO639Utils(object):
    """Conversion between two-letter and three-letter language codes."""

    # Two-letter code -> three-letter code
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked at. Returns None
        for codes that are not in the table.
        """
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        Returns None when no entry of the table maps to code.
        """
        matches = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matches, None)
3039
3040
class ISO3166Utils(object):
    """Lookup of country names by two-letter country code."""

    # Two-letter country code -> English country name
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name

        The lookup is case-insensitive. Returns None for unknown codes.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
3299
3300
class GeoUtils(object):
    """Pick random IPv4 addresses from per-country address blocks."""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '85.94.160.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '159.117.192.0/21',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '84.112.0.0/13',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AZ': '5.191.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '129.45.128.0/17',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '192.131.134.0/24',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '152.240.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '196.32.200.0/21',
        'CG': '197.214.128.0/17',
        'CH': '85.0.0.0/13',
        'CI': '154.232.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '165.210.0.0/15',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '46.198.0.0/15',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '45.208.0.0/14',
        'GI': '85.115.128.0/19',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '126.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.32.32.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '192.147.231.0/24',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '41.86.0.0/19',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '105.234.0.0/16',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '139.26.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '178.220.0.0/14',
        'RU': '5.136.0.0/13',
        'RW': '105.178.0.0/15',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '154.96.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '152.56.0.0/14',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '197.215.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '197.220.64.0/19',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '93.72.0.0/13',
        'UG': '154.224.0.0/13',
        'US': '3.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '82.215.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '24.92.144.0/20',
        'VE': '186.88.0.0/13',
        'VG': '172.103.64.0/18',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '165.56.0.0/13',
        'ZW': '41.85.192.0/19',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address from a country's block or a CIDR block.

        A two-character argument is treated as a country code and looked up
        (case-insensitively) in _country_ip_map; None is returned when the
        code is unknown. Anything else is taken to be an "address/prefixlen"
        block. The result is a dotted-quad string between the given address
        and that address with all host bits set, both inclusive.
        """
        block = code_or_block
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        network, prefix_len = block.split('/')
        lowest = compat_struct_unpack('!L', socket.inet_aton(network))[0]
        # Bits that are free to vary inside the block
        host_mask = 0xffffffff >> int(prefix_len)
        highest = lowest | host_mask
        picked = random.randint(lowest, highest)
        return compat_str(socket.inet_ntoa(compat_struct_pack('!L', picked)))
3556
3557
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets individual requests select their own proxy.

    A request may carry a 'Ytdl-request-proxy' header naming the proxy to
    use (or '__noproxy__' for a direct connection); the header is removed
    before the request goes any further.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        # (done before delegating to the base class initializer)
        for scheme in ('http', 'https'):
            default_handler = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
            setattr(self, scheme + '_open', default_handler)
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy overrides the handler-wide one
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = req_proxy

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # Only tag the request here: youtube-dl's http/https handlers
            # take care of wrapping the socket with socks
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
3581
3582
3583 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3584 # released into Public Domain
3585 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3586
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    The result carries no leading zero bytes, except that zero (and any
    non-positive n) is encoded as a single zero byte.  If optional blocksize
    is given and greater than zero, binary zeros are prepended so that the
    length becomes a multiple of blocksize.
    """
    n = int(n)
    # Emit 32-bit words, least significant first, then flip the order
    words = []
    while n > 0:
        words.append(compat_struct_pack('>I', n & 0xffffffff))
        n >>= 32
    words.reverse()

    # The most significant word may start with zero bytes; drop them
    packed = b''.join(words).lstrip(b'\000')
    if not packed:
        # only happens when nothing was emitted, i.e. n <= 0
        packed = b'\000'

    if blocksize > 0:
        overhang = len(packed) % blocksize
        if overhang:
            packed = (blocksize - overhang) * b'\000' + packed
    return packed
3615
3616
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().  An empty string
    converts to 0.
    """
    # Left-pad with zero bytes so the input splits into whole 32-bit words
    overhang = len(s) % 4
    if overhang:
        s = b'\000' * (4 - overhang) + s

    total = 0
    for offset in range(0, len(s), 4):
        word = compat_struct_unpack('>I', s[offset:offset + 4])[0]
        total = (total << 32) + word
    return total
3632
3633
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt one block with OHDave's RSA scheme. See http://www.ohdave.com/rsa/

    The bytes of data are reversed before being read as a hexadecimal number,
    i.e. data is interpreted as a little-endian integer.

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    reversed_hex = binascii.hexlify(data[::-1])
    message = int(reversed_hex, 16)
    return '%x' % pow(message, exponent, modulus)
3649
3650
def pkcs1pad(data, length):
    """
    Pad input data with the PKCS#1 v1.5 encryption scheme.

    The result has the layout [0, 2] + PS + [0] + data, where PS is a run of
    random *nonzero* octets.  A zero inside PS would be read as the separator
    when the block is unpadded, so zeros must never be generated there.

    @param data    input data as a list of integer octets
    @param length  target length of the padded block
    @returns       padded data as a list of `length` integer octets
    @raises ValueError  if data leaves fewer than 8 octets for PS
                        (i.e. len(data) > length - 11)
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    padding_length = length - len(data) - 3
    pseudo_random = [random.randint(1, 255) for _ in range(padding_length)]
    return [0, 2] + pseudo_random + [0] + data
3664
3665
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer num as a string in base n.

    table supplies the digit characters (index i is the digit for value i);
    when omitted, the first n characters of 0-9, a-z, A-Z are used.

    Raises ValueError when n exceeds the number of available digits, when
    num is negative, or when a non-zero num is requested in a base below 2
    (the digit loop could never terminate for those inputs).
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num < 0:
        # Floor division of a negative number never reaches zero
        raise ValueError('cannot encode negative number %d' % num)

    if num == 0:
        return table[0]

    if n < 2:
        # A base below 2 can only represent zero, which was handled above
        raise ValueError('base %d is too small to encode %d' % (n, num))

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    # Digits were produced least significant first
    return ''.join(reversed(digits))
3682
3683
def decode_packed_codes(code):
    """Expand packed code found in `code` by PACKED_CODES_RE.

    The match yields four groups: the obfuscated payload, the numeric base,
    the symbol count and a '|'-separated symbol list.  Each word of the
    payload is the base-`base` encoding of an index into the symbol list and
    is replaced by that symbol; an empty symbol means the word stands for
    itself.  Words that have no table entry are left untouched instead of
    aborting the whole decode with a KeyError.
    """
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    for index in range(count):
        encoded_index = encode_base_n(index, base)
        symbol_table[encoded_index] = symbols[index] or encoded_index

    return re.sub(
        r'\b(\w+)\b',
        lambda word: symbol_table.get(word.group(0), word.group(0)),
        obfuscated_code)
3700
3701
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=value,KEY="quoted value"') into a dict.

    Surrounding double quotes are stripped from quoted values.  When a key
    occurs more than once, the last occurrence wins.
    """
    pairs = re.findall(
        r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return dict(
        (name, raw[1:-1] if raw.startswith('"') else raw)
        for name, raw in pairs)
3709
3710
def urshift(val, n):
    """Shift val right by n bits, treating a negative val as unsigned 32-bit.

    Negative values are first offset by 2**32, so the vacated high bits are
    filled with zeros rather than with the sign.
    """
    if val < 0:
        val += 0x100000000
    return val >> n
3713
3714
3715 # Based on png2str() written by @gdkchan and improved by @yokrysty
3716 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a PNG image into rows of unfiltered byte values.

    Returns a (width, height, pixels) tuple where pixels is a list of
    `height` rows, each a list of `width * 3` integers.  Every scanline is
    read as 3 bytes per pixel; the colour type and bit depth fields of IHDR
    are not inspected.

    Raises IOError when the signature or leading IHDR chunk is missing, or
    when the file has no IDAT data.
    """
    # Reference: https://www.w3.org/TR/PNG/
    signature, remaining = png_data[:8], png_data[8:]

    if signature != b'\x89PNG\x0d\x0a\x1a\x0a' or remaining[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    formats_by_size = {1: '>B', 2: '>H', 4: '>I'}

    def read_uint(raw):
        return compat_struct_unpack(formats_by_size[len(raw)], raw)[0]

    # Each chunk is: 4-byte length, 4-byte type, payload, 4-byte CRC
    chunks = []
    while remaining:
        size = read_uint(remaining[:4])
        chunk_type = remaining[4:8]
        payload = remaining[8:8 + size]
        remaining = remaining[12 + size:]  # the CRC is skipped, not checked
        chunks.append((chunk_type, payload))

    ihdr = chunks[0][1]
    width = read_uint(ihdr[:4])
    height = read_uint(ihdr[4:8])

    idat = b''.join(
        payload for chunk_type, payload in chunks if chunk_type == b'IDAT')
    if not idat:
        raise IOError('Unable to read PNG data.')

    scanlines = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    for row_index in range(height):
        # Every scanline is prefixed with one filter-type byte
        row_start = row_index * (1 + stride)
        filter_type = scanlines[row_start]

        row_above = pixels[-1] if pixels else None
        row = []
        pixels.append(row)

        for col in range(stride):
            value = scanlines[1 + row_start + col]

            # Neighbours are the same channel of the pixel to the left
            # (3 bytes back) and of the pixel directly above
            has_left = col > 2
            has_up = row_index > 0
            left = row[col - 3] if has_left else 0
            up = row_above[col] if has_up else 0

            if filter_type == 1:  # Sub
                value = (value + left) & 0xff
            elif filter_type == 2:  # Up
                value = (value + up) & 0xff
            elif filter_type == 3:  # Average
                value = (value + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                upper_left = row_above[col - 3] if has_left and has_up else 0

                estimate = left + up - upper_left
                dist_left = abs(estimate - left)
                dist_up = abs(estimate - up)
                dist_upper_left = abs(estimate - upper_left)

                if dist_left <= dist_up and dist_left <= dist_upper_left:
                    predictor = left
                elif dist_up <= dist_upper_left:
                    predictor = up
                else:
                    predictor = upper_left
                value = (value + predictor) & 0xff

            row.append(value)

    return width, height, pixels
3820
3821
def write_xattr(path, key, value):
    """Set the extended attribute `key` to `value` on the file at `path`.

    Backends are tried in this order:
      1. the Python `xattr` module (pyxattr if it exposes `set`, otherwise
         the module's `setxattr`);
      2. if that import fails and compat_os_name is 'nt': an NTFS Alternate
         Data Stream named `path:key`;
      3. if that import fails elsewhere: the `setfattr` or `xattr`
         command-line tool, whichever check_executable() finds (setfattr is
         preferred).

    `value` is expected to be a byte string: it is written to a binary
    stream in the ADS branch and decoded as UTF-8 in the CLI branch.

    Raises XAttrUnavailableError when pyxattr is older than 0.5.0 or when no
    backend is available, and XAttrMetadataError when the chosen backend
    fails to set the attribute.
    """
    # This mess below finds the best xattr tool for the job
    # NOTE: the ImportError handler further down covers this whole try body,
    # not just the import statement.
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                # Despite the wording of the message, no fallback happens
                # here: the error propagates to the caller.
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            # Normalise OS-level failures to the project's error type
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            # The stream is addressed as '<path>:<key>', hence no ':' in key
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            # Probe for a command-line tool that can set the attribute
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # The value is passed as a command-line argument, so it is
                # turned into text first
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                # A non-zero exit status is reported with the tool's stderr
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
3904
3905
def random_birthday(year_field, month_field, day_field):
    """Return a random, valid date of birth as a dict of form fields.

    The date is drawn uniformly from the calendar days between 1950-01-01
    and 1995-12-31 inclusive, so impossible combinations such as
    February 31 can never be produced.

    year_field, month_field and day_field are the dict keys to use; the
    values are the year, month and day as decimal strings without zero
    padding.
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    offset = random.randint(0, (latest - earliest).days)
    birthday = earliest + datetime.timedelta(days=offset)
    return {
        year_field: str(birthday.year),
        month_field: str(birthday.month),
        day_field: str(birthday.day),
    }