git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import contextlib
  11 import ctypes
  12 import datetime
  13 import email.utils
  14 import email.header
  15 import errno
  16 import functools
  17 import gzip
  18 import io
  19 import itertools
  20 import json
  21 import locale
  22 import math
  23 import operator
  24 import os
  25 import platform
  26 import random
  27 import re
  28 import socket
  29 import ssl
  30 import subprocess
  31 import sys
  32 import tempfile
  33 import traceback
  34 import xml.etree.ElementTree
  35 import zlib
  36
  37 from .compat import (
  38     compat_HTMLParseError,
  39     compat_HTMLParser,
  40     compat_basestring,
  41     compat_chr,
  42     compat_cookiejar,
  43     compat_ctypes_WINFUNCTYPE,
  44     compat_etree_fromstring,
  45     compat_expanduser,
  46     compat_html_entities,
  47     compat_html_entities_html5,
  48     compat_http_client,
  49     compat_kwargs,
  50     compat_os_name,
  51     compat_parse_qs,
  52     compat_shlex_quote,
  53     compat_str,
  54     compat_struct_pack,
  55     compat_struct_unpack,
  56     compat_urllib_error,
  57     compat_urllib_parse,
  58     compat_urllib_parse_urlencode,
  59     compat_urllib_parse_urlparse,
  60     compat_urllib_parse_unquote_plus,
  61     compat_urllib_request,
  62     compat_urlparse,
  63     compat_xpath,
  64 )
  65
  66 from .socks import (
  67     ProxyType,
  68     sockssocket,
  69 )
  70
  71
  72 def register_socks_protocols():
  73     # "Register" SOCKS protocols
  74     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  75     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  76     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  77         if scheme not in compat_urlparse.uses_netloc:
  78             compat_urlparse.uses_netloc.append(scheme)
  79
  80
# The type of compiled regular expressions is not clearly defined otherwise,
# so it is taken from an actual compiled (empty) pattern.
compiled_regex_type = type(re.compile(''))
  83
# Standard HTTP header values; the User-Agent string is that of a desktop
# Firefox 59 on Linux.
# NOTE(review): presumably sent with outgoing requests - the code using this
# mapping is outside this section.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
  91
  92
# Alternative User-Agent strings, keyed by browser name.
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Unique sentinel used as the default of `default=` parameters, so that
# "no default supplied" can be told apart from an explicit default of None
# (compared by identity, e.g. `default is not NO_DEFAULT`).
NO_DEFAULT = object()
  99
# Full English month names in calendar order (index 0 is January).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names per language code, each list in calendar order.
# The French names are lower-case, as written here.
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}
 110
# File extensions (without the leading dot) recognised as known, grouped
# one container/codec family per line; the last line holds fragment and
# manifest/playlist formats.
# NOTE(review): the consumers of this tuple are outside this section.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')
 125
# needed for sanitizing filenames in restricted mode
# Maps each accented character of the first string to its ASCII replacement.
# The replacements are produced positionally by itertools.chain: plain strings
# contribute one replacement per character, while the one-element lists
# (['AE'], ['OE'], ['ss'], ...) contribute a single multi-character replacement.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
 130
 131 DATE_FORMATS = (
 132     '%d %B %Y',
 133     '%d %b %Y',
 134     '%B %d %Y',
 135     '%B %dst %Y',
 136     '%B %dnd %Y',
 137     '%B %dth %Y',
 138     '%b %d %Y',
 139     '%b %dst %Y',
 140     '%b %dnd %Y',
 141     '%b %dth %Y',
 142     '%b %dst %Y %I:%M',
 143     '%b %dnd %Y %I:%M',
 144     '%b %dth %Y %I:%M',
 145     '%Y %m %d',
 146     '%Y-%m-%d',
 147     '%Y/%m/%d',
 148     '%Y/%m/%d %H:%M',
 149     '%Y/%m/%d %H:%M:%S',
 150     '%Y-%m-%d %H:%M',
 151     '%Y-%m-%d %H:%M:%S',
 152     '%Y-%m-%d %H:%M:%S.%f',
 153     '%d.%m.%Y %H:%M',
 154     '%d.%m.%Y %H.%M',
 155     '%Y-%m-%dT%H:%M:%SZ',
 156     '%Y-%m-%dT%H:%M:%S.%fZ',
 157     '%Y-%m-%dT%H:%M:%S.%f0Z',
 158     '%Y-%m-%dT%H:%M:%S',
 159     '%Y-%m-%dT%H:%M:%S.%f',
 160     '%Y-%m-%dT%H:%M',
 161     '%b %d %Y at %H:%M',
 162     '%b %d %Y at %H:%M:%S',
 163     '%B %d %Y at %H:%M',
 164     '%B %d %Y at %H:%M:%S',
 165 )
 166
 167 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 168 DATE_FORMATS_DAY_FIRST.extend([
 169     '%d-%m-%Y',
 170     '%d.%m.%Y',
 171     '%d.%m.%y',
 172     '%d/%m/%Y',
 173     '%d/%m/%y',
 174     '%d/%m/%Y %H:%M:%S',
 175 ])
 176
 177 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 178 DATE_FORMATS_MONTH_FIRST.extend([
 179     '%m-%d-%Y',
 180     '%m.%d.%Y',
 181     '%m/%d/%Y',
 182     '%m/%d/%y',
 183     '%m/%d/%Y %H:%M:%S',
 184 ])
 185
# Matches text of the form }('<payload>',<int>,<int>,'<words>'.split('|')
# and captures the payload, the two integers and the '|'-separated word list.
# NOTE(review): this looks like the argument tail of packed/obfuscated
# JavaScript - confirm against the code that uses it.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script> element whose type attribute is application/ld+json
# (optionally quoted; case-insensitive, '.' also matches newlines) and
# captures the element body in the named group `json_ld`.
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
 188
 189
 190 def preferredencoding():
 191     """Get preferred encoding.
 192
 193     Returns the best encoding scheme for the system, based on
 194     locale.getpreferredencoding() and some further tweaks.
 195     """
 196     try:
 197         pref = locale.getpreferredencoding()
 198         'TEST'.encode(pref)
 199     except Exception:
 200         pref = 'UTF-8'
 201
 202     return pref
 203
 204
 205 def write_json_file(obj, fn):
 206     """ Encode obj as JSON and write it to fn, atomically if possible """
 207
 208     fn = encodeFilename(fn)
 209     if sys.version_info < (3, 0) and sys.platform != 'win32':
 210         encoding = get_filesystem_encoding()
 211         # os.path.basename returns a bytes object, but NamedTemporaryFile
 212         # will fail if the filename contains non ascii characters unless we
 213         # use a unicode object
 214         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 215         # the same for os.path.dirname
 216         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 217     else:
 218         path_basename = os.path.basename
 219         path_dirname = os.path.dirname
 220
 221     args = {
 222         'suffix': '.tmp',
 223         'prefix': path_basename(fn) + '.',
 224         'dir': path_dirname(fn),
 225         'delete': False,
 226     }
 227
 228     # In Python 2.x, json.dump expects a bytestream.
 229     # In Python 3.x, it writes to a character stream
 230     if sys.version_info < (3, 0):
 231         args['mode'] = 'wb'
 232     else:
 233         args.update({
 234             'mode': 'w',
 235             'encoding': 'utf-8',
 236         })
 237
 238     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 239
 240     try:
 241         with tf:
 242             json.dump(obj, tf)
 243         if sys.platform == 'win32':
 244             # Need to remove existing file on Windows, else os.rename raises
 245             # WindowsError or FileExistsError.
 246             try:
 247                 os.unlink(fn)
 248             except OSError:
 249                 pass
 250         os.rename(tf.name, fn)
 251     except Exception:
 252         try:
 253             os.remove(tf.name)
 254         except OSError:
 255             pass
 256         raise
 257
 258
 259 if sys.version_info >= (2, 7):
 260     def find_xpath_attr(node, xpath, key, val=None):
 261         """ Find the xpath xpath[@key=val] """
 262         assert re.match(r'^[a-zA-Z_-]+$', key)
 263         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 264         return node.find(expr)
 265 else:
 266     def find_xpath_attr(node, xpath, key, val=None):
 267         for f in node.findall(compat_xpath(xpath)):
 268             if key not in f.attrib:
 269                 continue
 270             if val is None or f.attrib.get(key) == val:
 271                 return f
 272         return None
 273
 274 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 275 # the namespace parameter
 276
 277
 278 def xpath_with_ns(path, ns_map):
 279     components = [c.split(':') for c in path.split('/')]
 280     replaced = []
 281     for c in components:
 282         if len(c) == 1:
 283             replaced.append(c[0])
 284         else:
 285             ns, tag = c
 286             replaced.append('{%s}%s' % (ns_map[ns], tag))
 287     return '/'.join(replaced)
 288
 289
 290 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 291     def _find_xpath(xpath):
 292         return node.find(compat_xpath(xpath))
 293
 294     if isinstance(xpath, (str, compat_str)):
 295         n = _find_xpath(xpath)
 296     else:
 297         for xp in xpath:
 298             n = _find_xpath(xp)
 299             if n is not None:
 300                 break
 301
 302     if n is None:
 303         if default is not NO_DEFAULT:
 304             return default
 305         elif fatal:
 306             name = xpath if name is None else name
 307             raise ExtractorError('Could not find XML element %s' % name)
 308         else:
 309             return None
 310     return n
 311
 312
 313 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 314     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 315     if n is None or n == default:
 316         return n
 317     if n.text is None:
 318         if default is not NO_DEFAULT:
 319             return default
 320         elif fatal:
 321             name = xpath if name is None else name
 322             raise ExtractorError('Could not find XML element\'s text %s' % name)
 323         else:
 324             return None
 325     return n.text
 326
 327
 328 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 329     n = find_xpath_attr(node, xpath, key)
 330     if n is None:
 331         if default is not NO_DEFAULT:
 332             return default
 333         elif fatal:
 334             name = '%s[@%s]' % (xpath, key) if name is None else name
 335             raise ExtractorError('Could not find XML attribute %s' % name)
 336         else:
 337             return None
 338     return n.attrib[key]
 339
 340
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document

    Delegates to get_element_by_attribute() with the attribute name 'id';
    None is returned when no tag matches.
    """
    return get_element_by_attribute('id', id, html)
 344
 345
 346 def get_element_by_class(class_name, html):
 347     """Return the content of the first tag with the specified class in the passed HTML document"""
 348     retval = get_elements_by_class(class_name, html)
 349     return retval[0] if retval else None
 350
 351
 352 def get_element_by_attribute(attribute, value, html, escape_value=True):
 353     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 354     return retval[0] if retval else None
 355
 356
 357 def get_elements_by_class(class_name, html):
 358     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 359     return get_elements_by_attribute(
 360         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 361         html, escape_value=False)
 362
 363
 364 def get_elements_by_attribute(attribute, value, html, escape_value=True):
 365     """Return the content of the tag with the specified attribute in the passed HTML document"""
 366
 367     value = re.escape(value) if escape_value else value
 368
 369     retlist = []
 370     for m in re.finditer(r'''(?xs)
 371         <([a-zA-Z0-9:._-]+)
 372          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
 373          \s+%s=['"]?%s['"]?
 374          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
 375         \s*>
 376         (?P<content>.*?)
 377         </\1>
 378     ''' % (re.escape(attribute), value), html):
 379         res = m.group('content')
 380
 381         if res.startswith('"') or res.startswith("'"):
 382             res = res[1:-1]
 383
 384         retlist.append(unescapeHTML(res))
 385
 386     return retlist
 387
 388
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element

    After feeding markup, `attrs` holds the attributes of the most recently
    seen start tag as a dict (empty if no start tag was seen).
    """
    def __init__(self):
        # Attributes of the last start tag handled so far
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # attrs arrives as a sequence of (name, value) pairs; each start tag
        # replaces whatever was stored before
        self.attrs = dict(attrs)
 397
 398
 399 def extract_attributes(html_element):
 400     """Given a string for an HTML element such as
 401     <el
 402          a="foo" B="bar" c="&98;az" d=boz
 403          empty= noval entity="&amp;"
 404          sq='"' dq="'"
 405     >
 406     Decode and return a dictionary of attributes.
 407     {
 408         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 409         'empty': '', 'noval': None, 'entity': '&',
 410         'sq': '"', 'dq': '\''
 411     }.
 412     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 413     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 414     """
 415     parser = HTMLAttributeParser()
 416     try:
 417         parser.feed(html_element)
 418         parser.close()
 419     # Older Python may throw HTMLParseError in case of malformed HTML
 420     except compat_HTMLParseError:
 421         pass
 422     return parser.attrs
 423
 424
 425 def clean_html(html):
 426     """Clean an HTML snippet into a readable string"""
 427
 428     if html is None:  # Convenience for sanitizing descriptions etc.
 429         return html
 430
 431     # Newline vs <br />
 432     html = html.replace('\n', ' ')
 433     html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
 434     html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 435     # Strip html tags
 436     html = re.sub('<.*?>', '', html)
 437     # Replace html entities
 438     html = unescapeHTML(html)
 439     return html.strip()
 440
 441
 442 def sanitize_open(filename, open_mode):
 443     """Try to open the given filename, and slightly tweak it if this fails.
 444
 445     Attempts to open the given filename. If this fails, it tries to change
 446     the filename slightly, step by step, until it's either able to open it
 447     or it fails and raises a final exception, like the standard open()
 448     function.
 449
 450     It returns the tuple (stream, definitive_file_name).
 451     """
 452     try:
 453         if filename == '-':
 454             if sys.platform == 'win32':
 455                 import msvcrt
 456                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 457             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 458         stream = open(encodeFilename(filename), open_mode)
 459         return (stream, filename)
 460     except (IOError, OSError) as err:
 461         if err.errno in (errno.EACCES,):
 462             raise
 463
 464         # In case of error, try to remove win32 forbidden chars
 465         alt_filename = sanitize_path(filename)
 466         if alt_filename == filename:
 467             raise
 468         else:
 469             # An exception here should be caught in the caller
 470             stream = open(encodeFilename(alt_filename), open_mode)
 471             return (stream, alt_filename)
 472
 473
 474 def timeconvert(timestr):
 475     """Convert RFC 2822 defined time string into system timestamp"""
 476     timestamp = None
 477     timetuple = email.utils.parsedate_tz(timestr)
 478     if timetuple is not None:
 479         timestamp = email.utils.mktime_tz(timetuple)
 480     return timestamp
 481
 482
 483 def sanitize_filename(s, restricted=False, is_id=False):
 484     """Sanitizes a string so it could be used as part of a filename.
 485     If restricted is set, use a stricter subset of allowed characters.
 486     Set is_id if this is not an arbitrary string, but an ID that should be kept
 487     if possible.
 488     """
 489     def replace_insane(char):
 490         if restricted and char in ACCENT_CHARS:
 491             return ACCENT_CHARS[char]
 492         if char == '?' or ord(char) < 32 or ord(char) == 127:
 493             return ''
 494         elif char == '"':
 495             return '' if restricted else '\''
 496         elif char == ':':
 497             return '_-' if restricted else ' -'
 498         elif char in '\\/|*<>':
 499             return '_'
 500         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 501             return '_'
 502         if restricted and ord(char) > 127:
 503             return '_'
 504         return char
 505
 506     # Handle timestamps
 507     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 508     result = ''.join(map(replace_insane, s))
 509     if not is_id:
 510         while '__' in result:
 511             result = result.replace('__', '_')
 512         result = result.strip('_')
 513         # Common case of "Foreign band name - English song title"
 514         if restricted and result.startswith('-_'):
 515             result = result[2:]
 516         if result.startswith('-'):
 517             result = '_' + result[len('-'):]
 518         result = result.lstrip('.')
 519         if not result:
 520             result = '_'
 521     return result
 522
 523
 524 def sanitize_path(s):
 525     """Sanitizes and normalizes path on Windows"""
 526     if sys.platform != 'win32':
 527         return s
 528     drive_or_unc, _ = os.path.splitdrive(s)
 529     if sys.version_info < (2, 7) and not drive_or_unc:
 530         drive_or_unc, _ = os.path.splitunc(s)
 531     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 532     if drive_or_unc:
 533         norm_path.pop(0)
 534     sanitized_path = [
 535         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 536         for path_part in norm_path]
 537     if drive_or_unc:
 538         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 539     return os.path.join(*sanitized_path)
 540
 541
 542 def sanitize_url(url):
 543     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 544     # the number of unwanted failures due to missing protocol
 545     if url.startswith('//'):
 546         return 'http:%s' % url
 547     # Fix some common typos seen so far
 548     COMMON_TYPOS = (
 549         # https://github.com/rg3/youtube-dl/issues/15649
 550         (r'^httpss://', r'https://'),
 551         # https://bx1.be/lives/direct-tv/
 552         (r'^rmtp([es]?)://', r'rtmp\1://'),
 553     )
 554     for mistake, fixup in COMMON_TYPOS:
 555         if re.match(mistake, url):
 556             return re.sub(mistake, fixup, url)
 557     return url
 558
 559
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request for url after passing it through sanitize_url()

    All further positional and keyword arguments are handed to the Request
    constructor unchanged.
    """
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
 562
 563
 564 def expand_path(s):
 565     """Expand shell variables and ~"""
 566     return os.path.expandvars(compat_expanduser(s))
 567
 568
 569 def orderedSet(iterable):
 570     """ Remove all duplicates from the input iterable """
 571     res = []
 572     for el in iterable:
 573         if el not in res:
 574             res.append(el)
 575     return res
 576
 577
 578 def _htmlentity_transform(entity_with_semicolon):
 579     """Transforms an HTML entity to a character."""
 580     entity = entity_with_semicolon[:-1]
 581
 582     # Known non-numeric HTML entity
 583     if entity in compat_html_entities.name2codepoint:
 584         return compat_chr(compat_html_entities.name2codepoint[entity])
 585
 586     # TODO: HTML5 allows entities without a semicolon. For example,
 587     # '&Eacuteric' should be decoded as 'Éric'.
 588     if entity_with_semicolon in compat_html_entities_html5:
 589         return compat_html_entities_html5[entity_with_semicolon]
 590
 591     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 592     if mobj is not None:
 593         numstr = mobj.group(1)
 594         if numstr.startswith('x'):
 595             base = 16
 596             numstr = '0%s' % numstr
 597         else:
 598             base = 10
 599         # See https://github.com/rg3/youtube-dl/issues/7518
 600         try:
 601             return compat_chr(int(numstr, base))
 602         except ValueError:
 603             pass
 604
 605     # Unknown entity in name, return its literal representation
 606     return '&%s;' % entity
 607
 608
def unescapeHTML(s):
    """Decode the HTML entities contained in s

    None is passed through; any other value must be exactly of type
    compat_str. Every '&...;' sequence (without '&' or ';' in between) is
    replaced with the result of _htmlentity_transform().
    """
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 616
 617
 618 def get_subprocess_encoding():
 619     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 620         # For subprocess calls, encode with locale encoding
 621         # Refer to http://stackoverflow.com/a/9951851/35070
 622         encoding = preferredencoding()
 623     else:
 624         encoding = sys.getfilesystemencoding()
 625     if encoding is None:
 626         encoding = 'utf-8'
 627     return encoding
 628
 629
 630 def encodeFilename(s, for_subprocess=False):
 631     """
 632     @param s The name of the file
 633     """
 634
 635     assert type(s) == compat_str
 636
 637     # Python 3 has a Unicode API
 638     if sys.version_info >= (3, 0):
 639         return s
 640
 641     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 642     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 643     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 644     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 645         return s
 646
 647     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 648     if sys.platform.startswith('java'):
 649         return s
 650
 651     return s.encode(get_subprocess_encoding(), 'ignore')
 652
 653
 654 def decodeFilename(b, for_subprocess=False):
 655
 656     if sys.version_info >= (3, 0):
 657         return b
 658
 659     if not isinstance(b, bytes):
 660         return b
 661
 662     return b.decode(get_subprocess_encoding(), 'ignore')
 663
 664
 665 def encodeArgument(s):
 666     if not isinstance(s, compat_str):
 667         # Legacy code that uses byte strings
 668         # Uncomment the following line after fixing all post processors
 669         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 670         s = s.decode('ascii')
 671     return encodeFilename(s, True)
 672
 673
def decodeArgument(b):
    """Decode a command line argument via decodeFilename(..., for_subprocess=True)"""
    return decodeFilename(b, True)
 676
 677
 678 def decodeOption(optval):
 679     if optval is None:
 680         return optval
 681     if isinstance(optval, bytes):
 682         optval = optval.decode(preferredencoding())
 683
 684     assert isinstance(optval, compat_str)
 685     return optval
 686
 687
 688 def formatSeconds(secs):
 689     if secs > 3600:
 690         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 691     elif secs > 60:
 692         return '%d:%02d' % (secs // 60, secs % 60)
 693     else:
 694         return '%d' % secs
 695
 696
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler configured according to params.

    When params['nocheckcertificate'] is true, certificate verification is
    switched off on the SSL context that is created. Which way the context
    is set up (or whether one is passed at all) depends on what the ssl
    module of the running Python provides. Extra keyword arguments are
    forwarded to the handler.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname has to be disabled before verify_mode may be
            # lowered to CERT_NONE
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # No usable context= support: fall back to a handler without context
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 720
 721
 722 def bug_reports_message():
 723     if ytdl_is_updateable():
 724         update_cmd = 'type  youtube-dl -U  to update'
 725     else:
 726         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 727     msg = '; please report this issue on https://yt-dl.org/bug .'
 728     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 729     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 730     return msg
 731
 732
 733 class YoutubeDLError(Exception):
 734     """Base exception for YoutubeDL errors."""
 735     pass
 736
 737
 738 class ExtractorError(YoutubeDLError):
 739     """Error during info extraction."""
 740
 741     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 742         """ tb, if given, is the original traceback (so that it can be printed out).
 743         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 744         """
 745
 746         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 747             expected = True
 748         if video_id is not None:
 749             msg = video_id + ': ' + msg
 750         if cause:
 751             msg += ' (caused by %r)' % cause
 752         if not expected:
 753             msg += bug_reports_message()
 754         super(ExtractorError, self).__init__(msg)
 755
 756         self.traceback = tb
 757         self.exc_info = sys.exc_info()  # preserve original exception
 758         self.cause = cause
 759         self.video_id = video_id
 760
 761     def format_traceback(self):
 762         if self.traceback is None:
 763             return None
 764         return ''.join(traceback.format_tb(self.traceback))
 765
 766
 767 class UnsupportedError(ExtractorError):
 768     def __init__(self, url):
 769         super(UnsupportedError, self).__init__(
 770             'Unsupported URL: %s' % url, expected=True)
 771         self.url = url
 772
 773
 774 class RegexNotFoundError(ExtractorError):
 775     """Error when a regex didn't match"""
 776     pass
 777
 778
 779 class GeoRestrictedError(ExtractorError):
 780     """Geographic restriction Error exception.
 781
 782     This exception may be thrown when a video is not available from your
 783     geographic location due to geographic restrictions imposed by a website.
 784     """
 785     def __init__(self, msg, countries=None):
 786         super(GeoRestrictedError, self).__init__(msg, expected=True)
 787         self.msg = msg
 788         self.countries = countries
 789
 790
 791 class DownloadError(YoutubeDLError):
 792     """Download Error exception.
 793
 794     This exception may be thrown by FileDownloader objects if they are not
 795     configured to continue on errors. They will contain the appropriate
 796     error message.
 797     """
 798
 799     def __init__(self, msg, exc_info=None):
 800         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 801         super(DownloadError, self).__init__(msg)
 802         self.exc_info = exc_info
 803
 804
 805 class SameFileError(YoutubeDLError):
 806     """Same File exception.
 807
 808     This exception will be thrown by FileDownloader objects if they detect
 809     multiple files would have to be downloaded to the same file on disk.
 810     """
 811     pass
 812
 813
 814 class PostProcessingError(YoutubeDLError):
 815     """Post Processing exception.
 816
 817     This exception may be raised by PostProcessor's .run() method to
 818     indicate an error in the postprocessing task.
 819     """
 820
 821     def __init__(self, msg):
 822         super(PostProcessingError, self).__init__(msg)
 823         self.msg = msg
 824
 825
 826 class MaxDownloadsReached(YoutubeDLError):
 827     """ --max-downloads limit has been reached. """
 828     pass
 829
 830
 831 class UnavailableVideoError(YoutubeDLError):
 832     """Unavailable Format exception.
 833
 834     This exception will be thrown when a video is requested
 835     in a format that is not available for that video.
 836     """
 837     pass
 838
 839
 840 class ContentTooShortError(YoutubeDLError):
 841     """Content Too Short exception.
 842
 843     This exception may be raised by FileDownloader objects when a file they
 844     download is too small for what the server announced first, indicating
 845     the connection was probably interrupted.
 846     """
 847
 848     def __init__(self, downloaded, expected):
 849         super(ContentTooShortError, self).__init__(
 850             'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
 851         )
 852         # Both in bytes
 853         self.downloaded = downloaded
 854         self.expected = expected
 855
 856
 857 class XAttrMetadataError(YoutubeDLError):
 858     def __init__(self, code=None, msg='Unknown error'):
 859         super(XAttrMetadataError, self).__init__(msg)
 860         self.code = code
 861         self.msg = msg
 862
 863         # Parsing code and msg
 864         if (self.code in (errno.ENOSPC, errno.EDQUOT) or
 865                 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
 866             self.reason = 'NO_SPACE'
 867         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
 868             self.reason = 'VALUE_TOO_LONG'
 869         else:
 870             self.reason = 'NOT_SUPPORTED'
 871
 872
 873 class XAttrUnavailableError(YoutubeDLError):
 874     pass
 875
 876
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and, if configured, pin it to a source address.

    ydl_handler -- handler object; only its _params dict is read (key
                   'source_address')
    http_class  -- connection class to instantiate with *args / **kwargs
    is_https    -- whether the Python 2.6 fallback connect() below should
                   wrap the socket with SSL

    Returns the (possibly patched) connection instance.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        #
        # Note that the source_address parameter below shadows the outer
        # variable: inside this helper it is indexed with [0] and passed to
        # bind(), i.e. it is expected to be an (ip, port) tuple such as the
        # `sa` built further down.
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # Pick the address family from the textual form of the source IP
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                # The remote host resolved, but not to the family we can bind to
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try every remaining candidate; the first successful connect wins
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    # Remember the last failure and release the half-open socket
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        # Only override the hook on connection objects that already have one
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        # Port 0 is used for the local end of the connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address attribute: replace connect() on this instance
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
 940
 941
 942 def handle_youtubedl_headers(headers):
 943     filtered_headers = headers
 944
 945     if 'Youtubedl-no-compression' in filtered_headers:
 946         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 947         del filtered_headers['Youtubedl-no-compression']
 948
 949     return filtered_headers
 950
 951
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Store params (read later via self._params); the rest goes to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req over plain HTTP, through SOCKS if 'Ytdl-socks-proxy' is set."""
        conn_class = compat_http_client.HTTPConnection

        # The internal proxy header selects a SOCKS-enabled connection class
        # and is removed from the request headers afterwards.
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying raw deflate first, then zlib-wrapped."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        """Pre-process an outgoing request and return the request to send.

        Percent-encodes the URL, adds missing standard headers and applies
        the 'Youtubedl-no-compression' marker.
        """
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            # (the name-mangled private attributes of Request are patched
            # directly to cut the fragment off)
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Post-process a response and return the response object to use.

        Transparently decodes gzip/deflate bodies and percent-encodes the
        Location header of 3xx responses.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; the first
                # truncation that decompresses cleanly is used.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the original failure
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            # The body is now decoded, so drop the header
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                # so the text is turned back into bytes and re-read as UTF-8
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets the same request/response processing
    https_request = http_request
    https_response = http_response
1073
1074
def make_socks_conn_class(base_class, socks_proxy):
    """Build a subclass of base_class whose connect() goes through a SOCKS proxy.

    base_class  -- compat_http_client.HTTPConnection or HTTPSConnection
                   (or a subclass of either)
    socks_proxy -- proxy URL; the scheme selects the protocol
                   ('socks5', 'socks4a', 'socks4' or 'socks', the latter
                   meaning SOCKS4), the port defaults to 1080 and optional
                   credentials are percent-decoded

    Returns the new connection class.
    Raises ValueError if the URL scheme is not one of the supported ones
    (previously this surfaced as an UnboundLocalError).
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        raise ValueError(
            'Unsupported SOCKS proxy scheme: %s' % url_components.scheme)

    def unquote_if_non_empty(s):
        # Leave None / '' untouched so that missing credentials stay missing
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            # Only plain numeric timeouts are applied to the socket
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS the proxied socket still has to be wrapped with TLS
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1116
1117
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that honours the 'Ytdl-socks-proxy' internal header and
    creates its connections through _create_http_connection()."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """Keep params and the connection class; the rest goes to HTTPSHandler."""
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req over HTTPS, optionally tunnelled through a SOCKS proxy."""
        conn_class = self._https_conn_class

        # Forward whichever SSL settings this handler instance carries:
        # _context exists on python > 2.6, _check_hostname on python 3.x
        extra = {}
        for attr_name, option in (
                ('_context', 'context'),
                ('_check_hostname', 'check_hostname')):
            if hasattr(self, attr_name):
                extra[option] = getattr(self, attr_name)

        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            conn_class = make_socks_conn_class(conn_class, proxy_url)
            # Internal header, must not be sent to the server
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, True)
        return self.do_open(connection_factory, req, **extra)
1141
1142
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """MozillaCookieJar that writes and reads session cookies as `expires=0`."""

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save the jar, writing session cookies with `expires` set to 0.

        Parameters are passed unchanged to MozillaCookieJar.save().
        """
        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        session_cookies = [cookie for cookie in self if cookie.expires is None]
        for cookie in session_cookies:
            cookie.expires = 0
        try:
            compat_cookiejar.MozillaCookieJar.save(self, filename, ignore_discard, ignore_expires)
        finally:
            # The 0 is only meant for the file. Leaving it on the in-memory
            # cookies would make them look expired (expires <= now) for the
            # rest of the session, so put the original value back.
            for cookie in session_cookies:
                cookie.expires = None

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load the jar, turning `expires=0` cookies back into session cookies.

        Parameters are passed unchanged to MozillaCookieJar.load().
        """
        compat_cookiejar.MozillaCookieJar.load(self, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1168
1169
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor variant that also handles HTTPS traffic.

    With the Set-Cookie workaround below commented out, http_response()
    currently just delegates to the base class.
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        """Delegate to HTTPCookieProcessor.http_response() unchanged."""
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Reuse the HTTP request/response hooks for HTTPS
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1192
1193
def extract_timezone(date_str):
    """Split a trailing UTC designator or numeric offset off date_str.

    Recognised suffixes (after at least 8 other characters) are 'Z' and
    '+HH:MM' / '-HHMM' style offsets, optionally preceded by one space.

    Returns a (datetime.timedelta, str) tuple: the offset from UTC (zero for
    'Z' or when nothing was recognised) and the string without the suffix.
    """
    tz_match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not tz_match:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(tz_match.group('tz'))]
    sign_char = tz_match.group('sign')
    if not sign_char:
        # Plain 'Z': UTC, nothing to shift
        return datetime.timedelta(), remainder

    direction = 1 if sign_char == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(tz_match.group('hours')),
        minutes=direction * int(tz_match.group('minutes')))
    return offset, remainder
1210
1211
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    date_str  -- 'YYYY-MM-DD<delimiter>HH:MM:SS', optionally with fractional
                 seconds (which are dropped); None yields None
    delimiter -- separator between the date and the time part
    timezone  -- datetime.timedelta offset from UTC; when None it is taken
                 from (and stripped off) date_str via extract_timezone()

    Returns None if the string does not match the expected format.
    """
    if date_str is None:
        return None

    # Fractional seconds are not part of the strptime format below
    trimmed = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, trimmed = extract_timezone(trimmed)

    try:
        layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        utc_moment = datetime.datetime.strptime(trimmed, layout) - timezone
        return calendar.timegm(utc_moment.timetuple())
    except ValueError:
        return None
1229
1230
def date_formats(day_first=True):
    """Return DATE_FORMATS_DAY_FIRST if day_first is true, else
    DATE_FORMATS_MONTH_FIRST."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1233
1234
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None for None input or when the date cannot be parsed.
    day_first selects which list of formats from date_formats() is tried.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    # Only the stripped string is needed here, the offset is discarded
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # Every format is tried; the last one that parses determines the result
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to the email (RFC 2822 style) date parser
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return compat_str(result)
1261
1262
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    date_str  -- the text to parse; None yields None
    day_first -- selects which list of formats from date_formats() is tried

    A numeric UTC offset found by extract_timezone() is applied to the
    result. Returns None when the string cannot be parsed.

    NOTE(review): 12 hours are added whenever 'PM' occurs in the input,
    including for '12:xx PM' - behavior kept as is.
    """
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # The numeric offset was already cut out of date_str above, so the
        # email parser cannot see it; apply it here exactly like the
        # strptime branch does. (timedelta.total_seconds() is avoided for
        # the sake of Python 2.6.)
        tz_seconds = timezone.days * 86400 + timezone.seconds
        return calendar.timegm(timetuple) + pm_delta * 3600 - tz_seconds
1294
1295
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from url.

    The part after the last '.' of the URL (query string excluded) is
    returned if it is purely alphanumeric, or - with trailing slashes
    removed - if it is listed in KNOWN_EXTENSIONS. Otherwise, and for None
    or dot-less input, default_ext is returned.
    """
    if url is None or '.' not in url:
        return default_ext

    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1307
1308
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with everything after its last '.' replaced by
    '<sub_lang>.<sub_format>' (appended if filename contains no '.')."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1311
1312
def date_from_str(date_str):
    """
    Return a datetime.date from a string in the format YYYYMMDD,
    'now', 'today', 'yesterday' or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError for strings in none of these formats.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    relative = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if relative is None:
        # Not a relative expression: must be an absolute YYYYMMDD date
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()

    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount

    unit = relative.group('unit')
    # timedelta knows neither months nor years, so express them in days
    # (a rough approximation)
    days_per_unit = {'month': 30, 'year': 365}
    if unit in days_per_unit:
        amount *= days_per_unit[unit]
        unit = 'day'

    # timedelta keyword names are plural: days / weeks
    return today + datetime.timedelta(**{unit + 's': amount})
1340
1341
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    year, month, day = parts.groups()
    return '-'.join((year, month, day))
1350
1351
class DateRange(object):
    """Represents a time interval between two dates (both ends inclusive)"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str(); a missing bound defaults to the earliest / latest
        representable date. Raises ValueError if start is after end."""
        if start is None:
            self.start = datetime.datetime.min.date()
        else:
            self.start = date_from_str(start)

        if end is None:
            self.end = datetime.datetime.max.date()
        else:
            self.end = date_from_str(end)

        if self.end < self.start:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(start=day, end=day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date object is parsed first
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return ' - '.join((self.start.isoformat(), self.end.isoformat()))
1381
1382
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # A byte string is decoded using the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1391
1392
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps the file descriptors 1 and 2 to the identifiers that are passed
    # to GetStdHandle() below (-11 / -12, the Win32 STD_OUTPUT_HANDLE /
    # STD_ERROR_HANDLE values)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True if the handle is invalid, is not a character device (the
        # "remote" flag is ignored) or has no console mode
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters that stop right before the
    # next non-BMP character; such a character is then written on its own
    # with a length of 2.
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part was actually written
            s = s[written.value:]
    return True
1466
1467
def write_string(s, out=None, encoding=None):
    """Write the text s to the stream out (sys.stderr by default) and flush.

    s        -- must be a compat_str
    encoding -- encoding used when bytes have to be written; defaults to the
                stream's own encoding (where applicable) or the preferred
                encoding. Unencodable characters are dropped.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # On Windows try the console API first, unless an encoding was forced
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    # Python 2 lies about mode of sys.stderr, so it always takes the bytes path
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream on top of a binary buffer: encode ourselves
        target_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1488
1489
def bytes_to_intlist(bs):
    """Return the items of bs as a list of ints.

    Items that are already ints (Python 3 bytes) are copied as is; otherwise
    each item is converted with ord(). Empty input yields [].
    """
    if not bs:
        return []
    if not isinstance(bs[0], int):
        return [ord(ch) for ch in bs]
    return list(bs)  # Python 3
1497
1498
def intlist_to_bytes(xs):
    """Pack a sequence of ints as unsigned bytes ('B' struct format);
    empty input yields b''."""
    if xs:
        layout = '%dB' % len(xs)
        return compat_struct_pack(layout, *xs)
    return b''
1503
1504
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx on Windows, fcntl.flock elsewhere, and
# stubs that raise IOError where fcntl cannot be imported.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure handed to LockFileEx / UnlockFileEx via a pointer
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low / high DWORD of the byte count passed to (Un)LockFileEx, starting
    # at offset 0
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the file object f; raises OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file() needs it again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 requests an exclusive lock, 0x0 a shared one
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file(f); raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock() lock on f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock() lock on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1578
1579
class locked_file(object):
    """Context manager wrapping a text file that is locked while in use.

    The file is opened on construction; entering the context takes a shared
    lock for mode 'r' and an exclusive lock for 'a' / 'w'; leaving it
    unlocks and closes the file. Iteration, write() and read() are forwarded
    to the underlying file object (available as ``.f``).
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows implementation of _lock_file() raises OSError,
            # which is not an IOError on Python 2; catch both so the file
            # handle is never leaked when locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1609
1610
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1614
1615
def shell_quote(args):
    """Return the arguments shell-quoted and joined by single spaces.

    Byte strings are decoded with the filesystem encoding first.
    """
    fs_encoding = get_filesystem_encoding()

    def _quote_one(arg):
        if isinstance(arg, bytes):
            # We may get a filename encoded with 'encodeFilename'
            arg = arg.decode(fs_encoding)
        return compat_shlex_quote(arg)

    return ' '.join([_quote_one(arg) for arg in args])
1625
1626
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded into a '#__youtubedl_smuggle=...' fragment appended
    to url. Data already smuggled in url is merged in and takes precedence
    over the keys of data. The dict passed by the caller is left unmodified.

    Returns the new URL.
    """

    url, idata = unsmuggle_url(url, {})
    # Merge into a copy so the caller's dict is not changed as a side effect
    merged = dict(data)
    merged.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
1635
1636
def unsmuggle_url(smug_url, default=None):
    """Undo smuggle_url(): return a (url, data) tuple.

    If smug_url carries no smuggled data, it is returned unchanged together
    with default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        plain_url, _, fragment = smug_url.rpartition('#')
        payload = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return plain_url, json.loads(payload)
    return smug_url, default
1644
1645
def format_bytes(bytes):
    """Format a byte count as a string with a binary suffix, e.g. '1.50KiB'.

    bytes -- a number, a str holding a number, or None (gives 'N/A')

    Values below one byte are shown in 'B' and values beyond the largest
    known suffix in 'YiB' (the exponent is clamped to the suffix table).
    Negative values still raise ValueError from math.log().
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)

    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Tiny fractions yield a negative exponent (which would silently
        # index the list from its end) and huge values one past the table.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1658
1659
def lookup_unit_table(unit_table, s):
    """Parse '<number><optional whitespace><unit>' at the start of s.

    unit_table -- dict mapping unit names to multipliers
    s          -- the text; the number may use ',' or '.' as decimal mark

    Returns int(number * multiplier), or None if s does not match.
    """
    alternatives = '|'.join([re.escape(unit) for unit in unit_table])
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives
    found = re.match(pattern, s)
    if found is None:
        return None
    quantity = float(found.group('num').replace(',', '.'))
    multiplier = unit_table[found.group('unit')]
    return int(quantity * multiplier)
1669
1670
def parse_filesize(s):
    """Parse a human readable file size such as '1.5 MiB' into bytes.

    Returns an int, or None if s is None or cannot be parsed by
    lookup_unit_table().
    """
    if s is None:
        return None

    # For every prefix letter X the table holds, in this order:
    #   XiB, xB and the "...bibytes" name   -> powers of 1024
    #   XB, Xb, xb and the decimal name      -> powers of 1000
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal = 1000 ** power
        binary = 1024 ** power
        lower = letter.lower()
        _UNIT_TABLE[letter + 'iB'] = binary
        _UNIT_TABLE[letter + 'B'] = decimal
        _UNIT_TABLE[lower + 'B'] = binary
        _UNIT_TABLE[letter + 'b'] = decimal
        _UNIT_TABLE[lower + 'b'] = decimal
        _UNIT_TABLE[decimal_name + 'bytes'] = decimal
        _UNIT_TABLE[binary_name + 'bytes'] = binary

    return lookup_unit_table(_UNIT_TABLE, s)
1740
1741
def parse_count(s):
    """Parse a count like '1,234' or '1.5k' into an int.

    Plain digit strings (with ',' or '.' separators) are handed to
    str_to_int; otherwise a k/m/kk suffix is resolved through
    lookup_unit_table. None is returned for None.
    """
    if s is None:
        return None

    s = s.strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    # Every suffix is accepted in lower and upper case
    _UNIT_TABLE = {}
    for suffix, multiplier in (('k', 1000), ('m', 1000 ** 2), ('kk', 1000 ** 2)):
        _UNIT_TABLE[suffix] = multiplier
        _UNIT_TABLE[suffix.upper()] = multiplier

    return lookup_unit_table(_UNIT_TABLE, s)
1761
1762
def parse_resolution(s):
    """Extract resolution information from a free-form string.

    Recognises, in this order, 'WIDTHxHEIGHT', 'HEIGHTp' / 'HEIGHTi' and
    '4k' / '8k'. Returns a dict with 'width' and/or 'height', or an empty
    dict when nothing matches or s is None.
    """
    if s is None:
        return {}

    dimensions = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
    if dimensions is not None:
        width, height = dimensions.group('w', 'h')
        return {
            'width': int(width),
            'height': int(height),
        }

    scan_lines = re.search(r'\b(\d+)[pPiI]\b', s)
    if scan_lines is not None:
        return {'height': int(scan_lines.group(1))}

    k_label = re.search(r'\b([48])[kK]\b', s)
    if k_label is not None:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(k_label.group(1)) * 540}

    return {}
1783
1784
def month_by_name(name, lang='en'):
    """ Return the number (1-12) of the month called name, or None

    Names are taken from MONTH_NAMES[lang]; a language without an entry
    falls back to the English names. """

    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        position = names.index(name)
    except ValueError:
        return None
    return position + 1
1794
1795
def month_by_abbreviation(abbrev):
    """ Return the number (1-12) of the month whose English name starts
        with the three-letter abbreviation abbrev, or None """

    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    try:
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
1804
1805
def fix_xml_ampersands(xml_str):
    """Replace bare '&' characters in XML by '&amp;'.

    Ampersands that already start one of the predefined entities
    (amp, lt, gt, apos, quot) or a numeric character reference of up to
    four digits are kept as they are.
    """
    bare_ampersand_re = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand_re, '&amp;', xml_str)
1812
1813
def setproctitle(title):
    """Set the name of the current process (best effort).

    Calls prctl(PR_SET_NAME) from libc.so.6. Returns silently without
    doing anything on Jython, when the library cannot be loaded or when
    it has no prctl function.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    # Initialising the buffer from the bytes makes it one byte longer than
    # the title, so the string handed to prctl is always NUL-terminated.
    # A buffer of exactly len(title) bytes would carry no terminator.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1838
1839
def remove_start(s, start):
    """Return s without the prefix start; s itself (or None) otherwise."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1842
1843
def remove_end(s, end):
    """Return s without the suffix end; s itself (or None) otherwise.

    An empty end leaves s untouched.
    """
    # The explicit emptiness check matters: s[:-len('')] is s[:0], which
    # would throw the whole string away
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
1846
1847
def remove_quotes(s):
    """Strip one pair of matching single or double quotes around s.

    Values shorter than two characters, None and strings without a
    matching pair of quotes are returned unchanged.
    """
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1855
1856
def url_basename(url):
    """Return the last segment of the path component of url."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1860
1861
def base_url(url):
    """Return the http(s) URL up to and including the last '/' before any
    '?', '#' or '&'.

    Raises AttributeError when url does not match that shape.
    """
    mobj = re.match(r'https?://[^?#&]+/', url)
    return mobj.group()
1864
1865
def urljoin(base, path):
    """Join base and path into a URL.

    bytes arguments are decoded as UTF-8. Returns path unchanged when it
    is already absolute or protocol-relative, None when path is empty or
    not a string or when base is not an http(s) or protocol-relative URL,
    and the joined URL otherwise.
    """
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not isinstance(path, compat_str) or not path:
        return None
    # '-' goes last in the character class so that it is a literal; written
    # as '+-.' it forms a range that also lets ',' through as scheme character
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+.-]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode('utf-8')
    if not isinstance(base, compat_str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
1879
1880
class HEADRequest(compat_urllib_request.Request):
    """Request that always reports 'HEAD' as its HTTP method."""

    def get_method(self):
        return 'HEAD'
1884
1885
class PUTRequest(compat_urllib_request.Request):
    """Request that always reports 'PUT' as its HTTP method."""

    def get_method(self):
        return 'PUT'
1889
1890
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, returning default when that is not possible.

    If get_attr is given, the attribute of that name is read from v first.
    The result is int(v) * invscale // scale. None, '' and any value that
    int() rejects (wrong type, malformed string, infinity) yield default.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        # TypeError: e.g. a list or dict; OverflowError: float infinity
        return default
1903
1904
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1907
1908
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Strings may contain ',', '.' and '+', which are dropped before the
    conversion. Values that are not strings are passed to int() as they
    are. Returns None for None and for anything that cannot be converted.
    """
    if int_str is None:
        return None
    # Only strings can be cleaned up; numbers would make re.sub fail
    if isinstance(int_str, compat_basestring):
        int_str = re.sub(r'[,\.\+]', '', int_str)
    try:
        return int(int_str)
    except (ValueError, TypeError):
        return None
1915
1916
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, returning default when that is not possible.

    The result is float(v) * invscale / scale. None and any value that
    float() rejects (wrong type or malformed string) yield default.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError: v is of a type float() cannot handle, e.g. a list
        return default
1924
1925
def bool_or_none(v, default=None):
    """Return v when it is a bool, default for everything else."""
    if isinstance(v, bool):
        return v
    return default
1928
1929
def strip_or_none(v):
    """Return v.strip(), or None when v is None."""
    if v is None:
        return None
    return v.strip()
1932
1933
def url_or_none(url):
    """Return the stripped url when it looks like a URL, None otherwise.

    Accepted are non-empty text strings that, after stripping, start with
    an optional 'scheme:' followed by '//'.
    """
    if not (url and isinstance(url, compat_str)):
        return None
    candidate = url.strip()
    if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', candidate):
        return candidate
    return None
1939
1940
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Three notations are tried in turn: colon-separated '[[[D:]H:]M:]S[.f]',
    unit-based forms such as 'PT1H2M3S' or '3 min 10 s', and a single
    (possibly fractional) amount of hours or minutes. Returns None for
    non-string input and for strings matching none of them.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    DURATION_RES = (
        r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$',
        # Years, months and weeks are accepted here but not counted
        r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''',
        r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$',
    )

    for duration_re in DURATION_RES:
        mobj = re.match(duration_re, s)
        if mobj:
            break
    else:
        return None

    # Groups a pattern does not define, or that did not take part in the
    # match, come out as None
    parts = mobj.groupdict()
    secs = parts.get('secs')
    mins = parts.get('mins')
    hours = parts.get('hours')
    days = parts.get('days')
    ms = parts.get('ms')

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        # ms still carries its leading '.', i.e. it is a fraction of a second
        duration += float(ms)
    return duration
1997
1998
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the extension of filename.

    If expected_real_ext is given and filename has a different extension,
    ext is appended to the complete filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
2005
2006
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of filename by ext.

    If expected_real_ext is given and filename has a different extension,
    ext is appended to the complete filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
2012
2013
def check_executable(exe, args=[]):
    """ Check whether the given binary can be started and return its name,
    or False if starting it fails with an OSError.
    args can be a list of arguments for a short output (like -version) """
    try:
        process = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
2022
2023
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Run the executable with args and return the version found in its
    output (see detect_exe_version), or False if it cannot be started """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = process.communicate()
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
2041
2042
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Return the first group of version_re found in output.

    Without version_re, the token following the word 'version' is looked
    for. Returns unrecognized when nothing matches.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    mobj = re.search(pattern, output)
    if not mobj:
        return unrecognized
    return mobj.group(1)
2052
2053
class PagedList(object):
    """Base class of paged lists; relies on a getslice() method."""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
2058
2059
class OnDemandPagedList(PagedList):
    """Paged list that requests pages from pagefunc one by one as needed.

    pagefunc(pagenum) returns an iterable with the entries of that page,
    pagesize is the number of entries on a full page. With use_cache,
    fetched pages are remembered per page number.
    """

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries with indices start (inclusive) to end
        (exclusive); end=None means up to the last page."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = (
                self._cache.get(pagenum) if self._use_cache else None)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                # The complete page is cached, before any slicing below
                self._cache[pagenum] = page_results

            # Offset of the first wanted entry within this page
            if firstid <= start < nextfirstid:
                startv = start % self._pagesize
            else:
                startv = 0

            # Offset just past the last wanted entry, if the slice ends here
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % self._pagesize) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            page_is_short = len(page_results) + startv < self._pagesize

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if page_is_short or end == nextfirstid:
                break
        return res
2110
2111
class InAdvancePagedList(PagedList):
    """Paged list whose number of pages is known in advance.

    pagefunc(pagenum) returns an iterable with the entries of that page,
    pagecount is the total number of pages and pagesize the number of
    entries per page.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices start (inclusive) to end
        (exclusive); end=None means up to the last page."""
        res = []
        start_page = start // self._pagesize
        if end is None:
            end_page = self._pagecount
        else:
            # Never request pages beyond the known page count, even when
            # end points past the last entry
            end_page = min(self._pagecount, end // self._pagesize + 1)
        # Entries to drop at the beginning of the first page
        skip_elems = start - start_page * self._pagesize
        # Entries still wanted; None means no limit
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
2139
2140
def uppercase_escape(s):
    r"""Replace every \UXXXXXXXX escape sequence in s by its character."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
2147
2148
def lowercase_escape(s):
    r"""Replace every \uXXXX escape sequence in s by its character."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _expand, s)
2155
2156
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    if sys.version_info < (3, 0):
        # On Python 2, text strings are UTF-8 encoded before quoting
        if isinstance(s, compat_str):
            s = s.encode('utf-8')
    # Characters that are passed through unquoted
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
2162
2163
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host name is IDNA-encoded, all other components are percent-escaped
    idna_netloc = parts.netloc.encode('idna').decode('ascii')
    escaped = parts._replace(
        netloc=idna_netloc,
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment)
    )
    return escaped.geturl()
2174
2175
def read_batch_urls(batch_fd):
    """Read the URLs from the batch file object batch_fd and close it.

    Every line is decoded as UTF-8 if necessary, freed from a leading byte
    order mark and stripped. Empty lines and lines starting with '#', ';'
    or ']' are dropped. Returns the list of remaining lines.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM decodes to the single character '\ufeff', which
        # strip() does not remove. The three-character form presumably
        # covers input whose BOM bytes were decoded one byte per character.
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2190
2191
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2194
2195
def update_url_query(url, query):
    """Return url with the parameters of the dict query added to its
    query string; parameters that already exist are replaced.

    An empty query returns url unchanged.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    params.update(query)
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2204
2205
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Return a new request based on req with selected parts replaced.

    headers are merged over those of req, data and url replace the
    originals when given, and query is merged into the URL's query string.
    The HEAD/PUT method of req and its timeout attribute, if present,
    are carried over.
    """
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    # Pick the request class that reproduces the original HTTP method
    req_type = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(req.get_method(), compat_urllib_request.Request)
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2224
2225
def _multipart_encode_impl(data, boundary):
    """Encode the dict data as multipart/form-data using boundary.

    Returns (body_bytes, content_type). Raises ValueError when the
    boundary occurs inside one of the encoded parts.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')

    chunks = []
    for k, v in data.items():
        if isinstance(k, compat_str):
            k = k.encode('utf-8')
        if isinstance(v, compat_str):
            v = v.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        chunks.extend((b'--', boundary_bytes, b'\r\n', content))

    chunks.extend((b'--', boundary_bytes, b'--\r\n'))

    return b''.join(chunks), content_type
2246
2247
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple (encoded_body, content_type). A ValueError caused by a
    specified boundary is propagated; generated boundaries are retried.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A caller-supplied boundary is used as is, errors included
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            # Try again with another random boundary
            continue
2276
2277
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up a key, or the first usable one of several keys, in d.

    For a list or tuple of keys, the value of the first key is returned
    that is present, not None and, with skip_false_values, not falsy.
    Returns default when there is no such key. A single key behaves like
    d.get(key, default).
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
2286
2287
def try_get(src, getter, expected_type=None):
    """Apply getter, or each of a list/tuple of getters, to src.

    Returns the first result that could be obtained without an
    AttributeError, KeyError, TypeError or IndexError and that is an
    instance of expected_type (if given). Returns None otherwise.
    """
    getters = getter if isinstance(getter, (list, tuple)) else [getter]
    for get in getters:
        try:
            value = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
    return None
2299
2300
def merge_dicts(*dicts):
    """Merge dicts into a new dict, earlier dicts taking precedence.

    None values are ignored. An existing value is only overridden when it
    is an empty string and the new value is a non-empty string.
    """
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if v is None:
                continue
            if k not in merged:
                merged[k] = v
                continue
            current = merged[k]
            if (isinstance(v, compat_str) and v
                    and isinstance(current, compat_str) and not current):
                merged[k] = v
    return merged
2313
2314
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as compat_str, decoding it with encoding and errors
    when it is not a compat_str already."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2317
2318
# Age limit assigned to each US film rating label (see parse_age_limit)
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
2326
2327
# Age limit assigned to each TV Parental Guidelines label
# (see parse_age_limit)
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2336
2337
def parse_age_limit(s):
    """Turn an age rating into a numeric age limit.

    Accepts an int between 0 and 21, a string like '18' or '18+', a key of
    US_RATINGS, or a TV Parental Guidelines label such as 'TV-14', 'TV_14'
    or 'TV14'. Returns None for everything else.
    """
    if type(s) is int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    if s in US_RATINGS:
        return US_RATINGS[s]

    # Match on the part of each label that follows the 'TV-' prefix
    tv_labels = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_labels, s)
    if tv_rating:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
    return None
2352
2353
def strip_jsonp(code):
    """Strip a JSONP wrapper 'callback(...)' and return its argument text.

    An optional 'window.' prefix, a 'name && name(...)' guard, a trailing
    ';' and trailing '//' comments are removed as well. Text that does not
    look like JSONP is returned unchanged.
    """
    JSONP_RE = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(JSONP_RE, r'\g<callback_data>', code)
2362
2363
def js_to_json(code):
    """Convert JavaScript object literal source into JSON text.

    Strings become double-quoted, bare identifiers are quoted, comments
    and trailing commas are dropped, and hexadecimal / octal integers are
    written in decimal.
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )
    # Rewrites applied inside string literals; sequences not listed here
    # are kept as they are
    ESCAPE_MAP = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }
    TOKEN_RE = r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)
        '''.format(comment=COMMENT_RE, skip=SKIP_RE)

    def fix_escape(mobj):
        sequence = mobj.group(0)
        return ESCAPE_MAP.get(sequence, sequence)

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null'):
            return token
        if token.startswith('/*') or token.startswith('//') or token == ',':
            # Comments and trailing commas are removed
            return ""

        if token[0] in ("'", '"'):
            token = re.sub(r'(?s)\\.|"', fix_escape, token[1:-1])

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, token)
            if im:
                number = int(im.group(1), base)
                if token.endswith(':'):
                    return '"%d":' % number
                return '%d' % number

        return '"%s"' % token

    return re.sub(TOKEN_RE, fix_kv, code)
2403
2404
def qualities(quality_ids):
    """ Return a function mapping a quality id to its position in
    quality_ids, or to -1 for ids that are not contained """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
2413
2414
# Default output filename template: "<title>-<id>.<ext>"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2416
2417
def limit_length(s, length):
    """ Shorten s to at most length characters, marking the cut with
    an ellipsis. None is passed through. """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    return s[:length - len(ELLIPSES)] + ELLIPSES
2426
2427
def version_tuple(v):
    """Split a version string at '.' and '-' into a tuple of ints."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2430
2431
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is older than limit.

    When version is empty or one of the versions cannot be parsed, the
    answer is 'not assume_new'.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
    return outdated
2439
2440
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U

    That is the case when this module was loaded by a zipimporter or when
    sys has a 'frozen' attribute. """
    from zipimport import zipimporter

    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
2446
2447
def args_to_str(args):
    """Return the command args as one string with shell-quoted arguments."""
    # Get a short string representation for a subprocess command
    return ' '.join(map(compat_shlex_quote, args))
2451
2452
def error_to_compat_str(err):
    """Return the message of the exception err as a text string."""
    err_str = str(err)
    if sys.version_info[0] >= 3:
        return err_str
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return err_str.decode(preferredencoding())
2460
2461
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    A few complete types are special-cased; otherwise the subtype (without
    parameters, lower-cased) is looked up in a table and used as is when
    the table has no entry for it. Returns None for None.
    """
    if mt is None:
        return None

    FULL_TYPE_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }
    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
    }

    if mt in FULL_TYPE_MAP:
        return FULL_TYPE_MAP[mt]

    subtype = mt.rpartition('/')[2]
    subtype = subtype.split(';')[0].strip().lower()
    return SUBTYPE_MAP.get(subtype, subtype)
2497
2498
def parse_codecs(codecs_str):
    """Split a codecs string (RFC 6381) into video and audio codec.

    Returns a dict with 'vcodec' and 'acodec'. When at least one codec is
    recognised, a missing one is reported as 'none'. When nothing is
    recognised but exactly two codecs are given, they are taken as video
    and audio codec in that order. In all other cases an empty dict is
    returned. Every unrecognised codec produces a warning on stderr.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = [
        codec for codec in (
            part.strip() for part in codecs_str.strip().strip(',').split(','))
        if codec]
    vcodec, acodec = None, None
    for full_codec in split_codecs:
        # Only the part before the first '.' identifies the codec
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    if len(split_codecs) == 2:
        # Nothing was recognised, so vcodec and acodec are both unset here;
        # the values have to come from the list itself
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    # A single unknown codec cannot be classified as audio or video
    return {}
2533
2534
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of url_handle.

    The filename of an attachment Content-Disposition header is preferred;
    otherwise the Content-Type header is mapped with mimetype2ext.
    """
    getheader = url_handle.headers.get

    disposition = getheader('Content-Disposition')
    if disposition:
        mobj = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
2547
2548
def encode_data_uri(data, mime_type):
    """Return a base64 'data:' URI for the bytes data of type mime_type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2551
2552
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2561
2562
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    The bytes are decoded according to a leading byte order mark (UTF-8
    otherwise); the result is a match object if the text starts with '<'
    after optional whitespace, and None if not. """

    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    payload, encoding = first_bytes, 'utf-8'
    for bom, bom_encoding in BOMS:
        if first_bytes.startswith(bom):
            payload, encoding = first_bytes[len(bom):], bom_encoding
            break
    text = payload.decode(encoding, 'replace')

    return re.match(r'^\s*<', text)
2581
2582
def determine_protocol(info_dict):
    """Return the download protocol for info_dict.

    An explicit 'protocol' entry wins. Otherwise the protocol is derived
    from the start of the URL (rtmp, mms, rtsp), from its extension
    (m3u8, f4m) or, failing that, from the URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    return compat_urllib_parse_urlparse(url).scheme
2603
2604
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values

    Every column except the last is left-aligned and padded to the width
    of its longest cell plus one; rows are joined by newlines. """
    table = [header_row] + data
    columns = zip(*table)
    max_lens = [max(len(compat_str(v)) for v in col) for col in columns]
    padded_specs = ['%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]]
    format_str = ' '.join(padded_specs) + '%s'
    rendered_rows = [format_str % tuple(row) for row in table]
    return '\n'.join(rendered_rows)
2611
2612
2613 def _match_one(filter_part, dct):
2614     COMPARISON_OPERATORS = {
2615         '<': operator.lt,
2616         '<=': operator.le,
2617         '>': operator.gt,
2618         '>=': operator.ge,
2619         '=': operator.eq,
2620         '!=': operator.ne,
2621     }
2622     operator_rex = re.compile(r'''(?x)\s*
2623         (?P<key>[a-z_]+)
2624         \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2625         (?:
2626             (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2627             (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
2628             (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2629         )
2630         \s*$
2631         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2632     m = operator_rex.search(filter_part)
2633     if m:
2634         op = COMPARISON_OPERATORS[m.group('op')]
2635         actual_value = dct.get(m.group('key'))
2636         if (m.group('quotedstrval') is not None or
2637             m.group('strval') is not None or
2638             # If the original field is a string and matching comparisonvalue is
2639             # a number we should respect the origin of the original field
2640             # and process comparison value as a string (see
2641             # https://github.com/rg3/youtube-dl/issues/11082).
2642             actual_value is not None and m.group('intval') is not None and
2643                 isinstance(actual_value, compat_str)):
2644             if m.group('op') not in ('=', '!='):
2645                 raise ValueError(
2646                     'Operator %s does not support string values!' % m.group('op'))
2647             comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
2648             quote = m.group('quote')
2649             if quote is not None:
2650                 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
2651         else:
2652             try:
2653                 comparison_value = int(m.group('intval'))
2654             except ValueError:
2655                 comparison_value = parse_filesize(m.group('intval'))
2656                 if comparison_value is None:
2657                     comparison_value = parse_filesize(m.group('intval') + 'B')
2658                 if comparison_value is None:
2659                     raise ValueError(
2660                         'Invalid integer value %r in filter part %r' % (
2661                             m.group('intval'), filter_part))
2662         if actual_value is None:
2663             return m.group('none_inclusive')
2664         return op(actual_value, comparison_value)
2665
2666     UNARY_OPERATORS = {
2667         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
2668         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
2669     }
2670     operator_rex = re.compile(r'''(?x)\s*
2671         (?P<op>%s)\s*(?P<key>[a-z_]+)
2672         \s*$
2673         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2674     m = operator_rex.search(filter_part)
2675     if m:
2676         op = UNARY_OPERATORS[m.group('op')]
2677         actual_value = dct.get(m.group('key'))
2678         return op(actual_value)
2679
2680     raise ValueError('Invalid filter part %r' % filter_part)
2681
2682
def match_str(filter_str, dct):
    """Check dct against a filter made of `&`-separated expressions.

    Returns True when every expression passes (see _match_one) and False as
    soon as one of them does not.
    """
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2688
2689
def match_filter_func(filter_str):
    """Build a callback that checks an info dict against filter_str.

    The returned function yields None when the info dict passes the filter
    and a human-readable skip message otherwise.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            # Prefer the title for the message, then the id
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
2698
2699
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP/TTML time expression to seconds (float).

    Accepts a plain offset in seconds (`12`, `12.5`, `12.5s`) or a clock
    value `H:MM:SS` whose seconds may carry a fraction introduced by either
    `.` or `:`. Returns None for empty or unrecognised input.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset is not None:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock is None:
        return None

    hours, minutes, seconds = clock.groups()
    # A trailing ":NNN" component is read as the fractional part of the seconds
    return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
2711
2712
def srt_subtitles_timecode(seconds):
    """Format a time offset given in seconds as an SRT timecode.

    @param seconds  Non-negative offset in seconds (int or float)
    @returns        A string of the form HH:MM:SS,mmm
    """
    # Round to whole milliseconds first. Deriving the millisecond field from
    # `(seconds % 1) * 1000` and letting %d truncate it turns values such as
    # 5.84 into ",839" because of binary floating point representation.
    total_msecs = int(round(seconds * 1000))
    total_secs, msecs = divmod(total_msecs, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, mins = divmod(total_mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, msecs)
2715
2716
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT.

    Every <p> element with a usable begin time and either an end time or a
    duration becomes one SRT cue. Supported styling (see SUPPORTED_STYLING)
    is rendered with <b>, <i>, <u> and <font> tags.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no <p> elements
    '''
    # (current namespace, [legacy namespaces rewritten to it before parsing])
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # tts:* attributes that are taken into account; everything else is ignored
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    # style id -> {property: value}, filled from the <style> elements below
    styles = {}
    # Style inherited by every paragraph, taken from <body>/<div>
    default_style = {}

    class TTMLPElementParser(object):
        # XMLParser target that turns one serialized <p> element into SRT text.
        #
        # NOTE: _unclosed_elements and _applied_styles are class attributes
        # that are only ever mutated in place, so all parser instances created
        # during one dfxp2srt() call share them. An entry pushed to
        # _applied_styles for an element that emitted no tags is not popped in
        # end(), so it can influence later elements and paragraphs. This is
        # kept as is because it affects the generated markup.
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Precedence, lowest to highest: default style, referenced
                # style, inline tts:* attributes
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect with the same value
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close the tags opened by the matching start(), innermost first
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Re-serialize the element and feed it through the SRT-producing target
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve <style> definitions. A style may reference a parent that is
    # defined later in the document, so passes are repeated until nothing is
    # left waiting for its parent.
    style_elements = dfxp.findall(_x('.//ttml:style'))
    while True:
        known_before = len(styles)
        waiting_ids = set()      # styles skipped in this pass
        missing_parents = set()  # the parents those styles are waiting for
        for style in style_elements:
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    waiting_ids.add(style_id)
                    missing_parents.add(parent_style_id)
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if not missing_parents:
            break
        if len(styles) > known_before:
            # Progress was made, another pass may resolve more styles
            continue
        # No progress: further passes would repeat this one forever. Parents
        # that are not themselves waiting can never show up in `styles`
        # (undefined id, or a style without any supported property); treat
        # them as empty so that their children keep their own properties.
        dead_ends = missing_parents - waiting_ids
        if not dead_ends:
            # Only circular references are left; leave them unresolved
            break
        for parent_style_id in dead_ends:
            styles[parent_style_id] = {}

    # Styles referenced by the first <body> and <div> apply to all paragraphs
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # NOTE: the cue index advances for skipped paragraphs too, so the emitted
    # numbering can have gaps
    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2879
2880
def cli_option(params, command_option, param):
    """Build the command line fragment for an option that takes a value.

    @param params          Dictionary of parameters
    @param command_option  Option name to emit, e.g. '--proxy'
    @param param           Key to look up in params
    @returns [command_option, value-as-text], or [] when the parameter is
             missing or None
    """
    value = params.get(param)
    if value is None:
        return []
    # Stringify every value that is present. Converting only truthy values
    # let falsy ones such as 0 through unconverted, which is not a valid
    # element of an argument list.
    return [command_option, compat_str(value)]
2886
2887
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command line fragment for a boolean option.

    Returns [] when the parameter is missing or None. Otherwise the boolean is
    rendered as true_value/false_value and emitted either as a separate
    argument or, when separator is given, joined to the option name.
    """
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if not separator:
        return [command_option, rendered]
    return [command_option + separator + rendered]
2896
2897
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit a bare flag when the parameter equals expected_value.

    Returns [command_option] on a match and [] otherwise.
    """
    if params.get(param) == expected_value:
        return [command_option]
    return []
2901
2902
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra command line arguments stored under param.

    @param params   Dictionary of parameters
    @param param    Key to look up in params
    @param default  Value used when the parameter is missing or None
    @returns The configured list, or the default (copied if it is a list)
    """
    ex_args = params.get(param)
    if ex_args is None:
        # Hand out a copy: the default may be the shared `[]` from the
        # signature, and a caller extending the returned list must not leak
        # arguments into later calls.
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2909
2910
class ISO639Utils(object):
    """Conversion between ISO 639-1 (two-letter) and ISO 639-2/T
    (three-letter) language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    # Two-letter codes kept in _lang_map only so that they can still be
    # looked up; each of them shares its three-letter code with a current one
    _superseded_codes = ('iw', 'in', 'ji')

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked up. Returns None
        for unknown codes.
        """
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        Returns None for unknown codes.
        """
        for short_name, long_name in cls._lang_map.items():
            # Skip superseded codes explicitly instead of relying on the
            # iteration order of the dictionary, which is not defined on
            # every supported interpreter
            if long_name == code and short_name not in cls._superseded_codes:
                return short_name
3114
3115
class ISO3166Utils(object):
    # Lookup of country names by two-letter country code.
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name

        The lookup is case-insensitive; None is returned for unknown codes.
        """
        return cls._country_map.get(code.upper())
3374
3375
3376 class GeoUtils(object):
3377     # Major IPv4 address blocks per country
3378     _country_ip_map = {
3379         'AD': '85.94.160.0/19',
3380         'AE': '94.200.0.0/13',
3381         'AF': '149.54.0.0/17',
3382         'AG': '209.59.64.0/18',
3383         'AI': '204.14.248.0/21',
3384         'AL': '46.99.0.0/16',
3385         'AM': '46.70.0.0/15',
3386         'AO': '105.168.0.0/13',
3387         'AP': '159.117.192.0/21',
3388         'AR': '181.0.0.0/12',
3389         'AS': '202.70.112.0/20',
3390         'AT': '84.112.0.0/13',
3391         'AU': '1.128.0.0/11',
3392         'AW': '181.41.0.0/18',
3393         'AZ': '5.191.0.0/16',
3394         'BA': '31.176.128.0/17',
3395         'BB': '65.48.128.0/17',
3396         'BD': '114.130.0.0/16',
3397         'BE': '57.0.0.0/8',
3398         'BF': '129.45.128.0/17',
3399         'BG': '95.42.0.0/15',
3400         'BH': '37.131.0.0/17',
3401         'BI': '154.117.192.0/18',
3402         'BJ': '137.255.0.0/16',
3403         'BL': '192.131.134.0/24',
3404         'BM': '196.12.64.0/18',
3405         'BN': '156.31.0.0/16',
3406         'BO': '161.56.0.0/16',
3407         'BQ': '161.0.80.0/20',
3408         'BR': '152.240.0.0/12',
3409         'BS': '24.51.64.0/18',
3410         'BT': '119.2.96.0/19',
3411         'BW': '168.167.0.0/16',
3412         'BY': '178.120.0.0/13',
3413         'BZ': '179.42.192.0/18',
3414         'CA': '99.224.0.0/11',
3415         'CD': '41.243.0.0/16',
3416         'CF': '196.32.200.0/21',
3417         'CG': '197.214.128.0/17',
3418         'CH': '85.0.0.0/13',
3419         'CI': '154.232.0.0/14',
3420         'CK': '202.65.32.0/19',
3421         'CL': '152.172.0.0/14',
3422         'CM': '165.210.0.0/15',
3423         'CN': '36.128.0.0/10',
3424         'CO': '181.240.0.0/12',
3425         'CR': '201.192.0.0/12',
3426         'CU': '152.206.0.0/15',
3427         'CV': '165.90.96.0/19',
3428         'CW': '190.88.128.0/17',
3429         'CY': '46.198.0.0/15',
3430         'CZ': '88.100.0.0/14',
3431         'DE': '53.0.0.0/8',
3432         'DJ': '197.241.0.0/17',
3433         'DK': '87.48.0.0/12',
3434         'DM': '192.243.48.0/20',
3435         'DO': '152.166.0.0/15',
3436         'DZ': '41.96.0.0/12',
3437         'EC': '186.68.0.0/15',
3438         'EE': '90.190.0.0/15',
3439         'EG': '156.160.0.0/11',
3440         'ER': '196.200.96.0/20',
3441         'ES': '88.0.0.0/11',
3442         'ET': '196.188.0.0/14',
3443         'EU': '2.16.0.0/13',
3444         'FI': '91.152.0.0/13',
3445         'FJ': '144.120.0.0/16',
3446         'FM': '119.252.112.0/20',
3447         'FO': '88.85.32.0/19',
3448         'FR': '90.0.0.0/9',
3449         'GA': '41.158.0.0/15',
3450         'GB': '25.0.0.0/8',
3451         'GD': '74.122.88.0/21',
3452         'GE': '31.146.0.0/16',
3453         'GF': '161.22.64.0/18',
3454         'GG': '62.68.160.0/19',
3455         'GH': '45.208.0.0/14',
3456         'GI': '85.115.128.0/19',
3457         'GL': '88.83.0.0/19',
3458         'GM': '160.182.0.0/15',
3459         'GN': '197.149.192.0/18',
3460         'GP': '104.250.0.0/19',
3461         'GQ': '105.235.224.0/20',
3462         'GR': '94.64.0.0/13',
3463         'GT': '168.234.0.0/16',
3464         'GU': '168.123.0.0/16',
3465         'GW': '197.214.80.0/20',
3466         'GY': '181.41.64.0/18',
3467         'HK': '113.252.0.0/14',
3468         'HN': '181.210.0.0/16',
3469         'HR': '93.136.0.0/13',
3470         'HT': '148.102.128.0/17',
3471         'HU': '84.0.0.0/14',
3472         'ID': '39.192.0.0/10',
3473         'IE': '87.32.0.0/12',
3474         'IL': '79.176.0.0/13',
3475         'IM': '5.62.80.0/20',
3476         'IN': '117.192.0.0/10',
3477         'IO': '203.83.48.0/21',
3478         'IQ': '37.236.0.0/14',
3479         'IR': '2.176.0.0/12',
3480         'IS': '82.221.0.0/16',
3481         'IT': '79.0.0.0/10',
3482         'JE': '87.244.64.0/18',
3483         'JM': '72.27.0.0/17',
3484         'JO': '176.29.0.0/16',
3485         'JP': '126.0.0.0/8',
3486         'KE': '105.48.0.0/12',
3487         'KG': '158.181.128.0/17',
3488         'KH': '36.37.128.0/17',
3489         'KI': '103.25.140.0/22',
3490         'KM': '197.255.224.0/20',
3491         'KN': '198.32.32.0/19',
3492         'KP': '175.45.176.0/22',
3493         'KR': '175.192.0.0/10',
3494         'KW': '37.36.0.0/14',
3495         'KY': '64.96.0.0/15',
3496         'KZ': '2.72.0.0/13',
3497         'LA': '115.84.64.0/18',
3498         'LB': '178.135.0.0/16',
3499         'LC': '192.147.231.0/24',
3500         'LI': '82.117.0.0/19',
3501         'LK': '112.134.0.0/15',
3502         'LR': '41.86.0.0/19',
3503         'LS': '129.232.0.0/17',
3504         'LT': '78.56.0.0/13',
3505         'LU': '188.42.0.0/16',
3506         'LV': '46.109.0.0/16',
3507         'LY': '41.252.0.0/14',
3508         'MA': '105.128.0.0/11',
3509         'MC': '88.209.64.0/18',
3510         'MD': '37.246.0.0/16',
3511         'ME': '178.175.0.0/17',
3512         'MF': '74.112.232.0/21',
3513         'MG': '154.126.0.0/17',
3514         'MH': '117.103.88.0/21',
3515         'MK': '77.28.0.0/15',
3516         'ML': '154.118.128.0/18',
3517         'MM': '37.111.0.0/17',
3518         'MN': '49.0.128.0/17',
3519         'MO': '60.246.0.0/16',
3520         'MP': '202.88.64.0/20',
3521         'MQ': '109.203.224.0/19',
3522         'MR': '41.188.64.0/18',
3523         'MS': '208.90.112.0/22',
3524         'MT': '46.11.0.0/16',
3525         'MU': '105.16.0.0/12',
3526         'MV': '27.114.128.0/18',
3527         'MW': '105.234.0.0/16',
3528         'MX': '187.192.0.0/11',
3529         'MY': '175.136.0.0/13',
3530         'MZ': '197.218.0.0/15',
3531         'NA': '41.182.0.0/16',
3532         'NC': '101.101.0.0/18',
3533         'NE': '197.214.0.0/18',
3534         'NF': '203.17.240.0/22',
3535         'NG': '105.112.0.0/12',
3536         'NI': '186.76.0.0/15',
3537         'NL': '145.96.0.0/11',
3538         'NO': '84.208.0.0/13',
3539         'NP': '36.252.0.0/15',
3540         'NR': '203.98.224.0/19',
3541         'NU': '49.156.48.0/22',
3542         'NZ': '49.224.0.0/14',
3543         'OM': '5.36.0.0/15',
3544         'PA': '186.72.0.0/15',
3545         'PE': '186.160.0.0/14',
3546         'PF': '123.50.64.0/18',
3547         'PG': '124.240.192.0/19',
3548         'PH': '49.144.0.0/13',
3549         'PK': '39.32.0.0/11',
3550         'PL': '83.0.0.0/11',
3551         'PM': '70.36.0.0/20',
3552         'PR': '66.50.0.0/16',
3553         'PS': '188.161.0.0/16',
3554         'PT': '85.240.0.0/13',
3555         'PW': '202.124.224.0/20',
3556         'PY': '181.120.0.0/14',
3557         'QA': '37.210.0.0/15',
3558         'RE': '139.26.0.0/16',
3559         'RO': '79.112.0.0/13',
3560         'RS': '178.220.0.0/14',
3561         'RU': '5.136.0.0/13',
3562         'RW': '105.178.0.0/15',
3563         'SA': '188.48.0.0/13',
3564         'SB': '202.1.160.0/19',
3565         'SC': '154.192.0.0/11',
3566         'SD': '154.96.0.0/13',
3567         'SE': '78.64.0.0/12',
3568         'SG': '152.56.0.0/14',
3569         'SI': '188.196.0.0/14',
3570         'SK': '78.98.0.0/15',
3571         'SL': '197.215.0.0/17',
3572         'SM': '89.186.32.0/19',
3573         'SN': '41.82.0.0/15',
3574         'SO': '197.220.64.0/19',
3575         'SR': '186.179.128.0/17',
3576         'SS': '105.235.208.0/21',
3577         'ST': '197.159.160.0/19',
3578         'SV': '168.243.0.0/16',
3579         'SX': '190.102.0.0/20',
3580         'SY': '5.0.0.0/16',
3581         'SZ': '41.84.224.0/19',
3582         'TC': '65.255.48.0/20',
3583         'TD': '154.68.128.0/19',
3584         'TG': '196.168.0.0/14',
3585         'TH': '171.96.0.0/13',
3586         'TJ': '85.9.128.0/18',
3587         'TK': '27.96.24.0/21',
3588         'TL': '180.189.160.0/20',
3589         'TM': '95.85.96.0/19',
3590         'TN': '197.0.0.0/11',
3591         'TO': '175.176.144.0/21',
3592         'TR': '78.160.0.0/11',
3593         'TT': '186.44.0.0/15',
3594         'TV': '202.2.96.0/19',
3595         'TW': '120.96.0.0/11',
3596         'TZ': '156.156.0.0/14',
3597         'UA': '93.72.0.0/13',
3598         'UG': '154.224.0.0/13',
3599         'US': '3.0.0.0/8',
3600         'UY': '167.56.0.0/13',
3601         'UZ': '82.215.64.0/18',
3602         'VA': '212.77.0.0/19',
3603         'VC': '24.92.144.0/20',
3604         'VE': '186.88.0.0/13',
3605         'VG': '172.103.64.0/18',
3606         'VI': '146.226.0.0/16',
3607         'VN': '14.160.0.0/11',
3608         'VU': '202.80.32.0/20',
3609         'WF': '117.20.32.0/21',
3610         'WS': '202.4.32.0/19',
3611         'YE': '134.35.0.0/16',
3612         'YT': '41.242.116.0/22',
3613         'ZA': '41.0.0.0/11',
3614         'ZM': '165.56.0.0/13',
3615         'ZW': '41.85.192.0/19',
3616     }
3617
3618     @classmethod
3619     def random_ipv4(cls, code_or_block):
3620         if len(code_or_block) == 2:
3621             block = cls._country_ip_map.get(code_or_block.upper())
3622             if not block:
3623                 return None
3624         else:
3625             block = code_or_block
3626         addr, preflen = block.split('/')
3627         addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
3628         addr_max = addr_min | (0xffffffff >> int(preflen))
3629         return compat_str(socket.inet_ntoa(
3630             compat_struct_pack('!L', random.randint(addr_min, addr_max))))
3631
3632
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """
    ProxyHandler whose proxy can be overridden for each individual request

    A request carrying a 'Ytdl-request-proxy' header uses that value as its
    proxy instead of the one configured for the scheme; the header is removed
    before the request is passed on.
    """

    def __init__(self, proxies=None):
        # Install default http/https openers first so that proxy_open is
        # consulted even for schemes that have no proxy configured
        for scheme in ('http', 'https'):
            default_open = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, default_open)
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # youtube-dl's http/https handlers do wrapping the socket with
            # socks, so only record which proxy they should use
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
3656
3657
3658 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3659 # released into Public Domain
3660 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3661
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    Leading zero bytes are removed; a value that is not positive yields a
    single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    words = []
    while n > 0:
        words.append(compat_struct_pack('>I', n & 0xffffffff))
        n >>= 32
    # The 32-bit words were collected least significant first
    packed = b''.join(reversed(words))
    # Drop leading zeros, keeping one zero byte when nothing else is left
    packed = packed.lstrip(b'\000') or b'\000'
    # Add back pad bytes up to the next multiple of blocksize
    if blocksize > 0 and len(packed) % blocksize:
        packed = (blocksize - len(packed) % blocksize) * b'\000' + packed
    return packed
3690
3691
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # Left-pad with zero bytes so the input splits into whole 32-bit words
    missing = -len(s) % 4
    if missing:
        s = b'\000' * missing + s
    total = 0
    for offset in range(0, len(s), 4):
        word = compat_struct_unpack('>I', s[offset:offset + 4])[0]
        total = (total << 32) + word
    return total
3707
3708
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt a single block with OHDave's RSA scheme.
    See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The byte order is reversed before the data is read as one integer
    reversed_hex = binascii.hexlify(data[::-1])
    plain_number = int(reversed_hex, 16)
    cipher_number = pow(plain_number, exponent, modulus)
    return '%x' % cipher_number
3724
3725
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme (v1.5, block type 2)

    The result is laid out as [0, 2] + PS + [0] + data, where PS is a run of
    at least 8 random non-zero octets.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         when data leaves no room for the 11 octets
                               of mandatory framing
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # Padding octets must never be zero: the first zero after the 0x02 marker
    # is the separator a receiver uses to find where the message starts, so a
    # zero inside the padding would corrupt the decoded data
    padding = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + padding + [0] + data
3739
3740
def encode_base_n(num, n, table=None):
    """
    Convert an integer to its string representation in base n

    @param num      integer to convert; negative values get a '-' prefix
    @param n        base to convert to
    @param table    digit characters to use, table[0] standing for zero;
                    defaults to the first n characters of 0-9a-zA-Z
    @returns        string of digits, most significant first
    @raises ValueError  when the table has fewer than n characters, or when
                        a non-zero num is requested in a base below 2
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Repeated division by a base below 2 never reaches zero
    if n < 2:
        raise ValueError('base %d is too small, must be at least 2' % n)

    # Floor division of a negative number never reaches zero either
    # (-1 // n == -1), so encode the magnitude and add the sign afterwards
    sign = ''
    if num < 0:
        sign = '-'
        num = -num

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return sign + ''.join(reversed(digits))
3757
3758
def decode_packed_codes(code):
    """
    Expand code matched by PACKED_CODES_RE back into its original words

    The match supplies the obfuscated text, a base, a symbol count and a
    '|'-separated symbol list. Each word of the obfuscated text is the base-n
    encoding of an index into that list; an empty symbol stands for the
    encoded word itself.
    """
    packed, radix, remaining, words = re.search(PACKED_CODES_RE, code).groups()
    radix = int(radix)
    remaining = int(remaining)
    words = words.split('|')

    replacements = {}
    while remaining:
        remaining -= 1
        token = encode_base_n(remaining, radix)
        replacements[token] = words[remaining] or token

    def _expand(match):
        return replacements[match.group(0)]

    return re.sub(r'\b(\w+)\b', _expand, packed)
3775
3776
def parse_m3u8_attributes(attrib):
    """
    Parse a comma-separated KEY=value attribute list into a dict

    Values may be bare or enclosed in double quotes; the surrounding quotes
    are removed from quoted values.
    """
    info = {}
    attribute_re = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    for match in re.finditer(attribute_re, attrib):
        name, raw_value = match.group('key', 'val')
        info[name] = raw_value[1:-1] if raw_value.startswith('"') else raw_value
    return info
3784
3785
def urshift(val, n):
    """
    Shift val right by n bits, first adding 2**32 to a negative val so that
    it is shifted as a non-negative number
    """
    if val < 0:
        val += 0x100000000
    return val >> n
3788
3789
3790 # Based on png2str() written by @gdkchan and improved by @yokrysty
3791 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """
    Decode PNG image data into rows of unfiltered byte values

    Returns a (width, height, pixels) tuple where pixels holds one list per
    scanline with width * 3 values each. Scanlines are read as 3 bytes per
    pixel and filter types 1 to 4 are undone; any other filter type leaves
    the bytes untouched.

    Raises IOError when the signature or leading IHDR chunk is missing, or
    when there is no IDAT data.
    """
    # Reference: https://www.w3.org/TR/PNG/
    signature, remaining = png_data[:8], png_data[8:]

    if signature != b'\x89PNG\x0d\x0a\x1a\x0a' or remaining[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    formats_by_size = {1: '>B', 2: '>H', 4: '>I'}

    def read_uint(raw):
        return compat_struct_unpack(formats_by_size[len(raw)], raw)[0]

    # Each chunk is: 4-byte length, 4-byte type, data, 4-byte CRC
    chunks = []
    while remaining:
        size = read_uint(remaining[:4])
        chunk_type = remaining[4:8]
        payload = remaining[8:8 + size]
        remaining = remaining[12 + size:]  # also steps over the CRC
        chunks.append((chunk_type, payload))

    ihdr = chunks[0][1]
    width = read_uint(ihdr[:4])
    height = read_uint(ihdr[4:8])

    idat = b''.join(
        payload for chunk_type, payload in chunks if chunk_type == b'IDAT')
    if not idat:
        raise IOError('Unable to read PNG data.')

    raw = bytearray(zlib.decompress(idat))

    def paeth_predictor(a, b, c):
        # Choose whichever of left (a), up (b) and upper-left (c) is closest
        # to a + b - c, preferring them in that order on ties
        estimate = a + b - c
        dist_a = abs(estimate - a)
        dist_b = abs(estimate - b)
        dist_c = abs(estimate - c)
        if dist_a <= dist_b and dist_a <= dist_c:
            return a
        if dist_b <= dist_c:
            return b
        return c

    stride = width * 3
    pixels = []

    for y in range(height):
        # Every scanline is preceded by one byte naming its filter type
        line_start = y * (1 + stride)
        filter_type = raw[line_start]

        row = []
        pixels.append(row)
        above = pixels[y - 1] if y > 0 else None

        for x in range(stride):
            color = raw[line_start + 1 + x]
            # Neighbours are the same channel of the pixel to the left
            # (3 bytes back) and of the pixel directly above
            left = row[x - 3] if x > 2 else 0
            up = above[x] if y > 0 else 0

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                upper_left = above[x - 3] if x > 2 and y > 0 else 0
                color = (color + paeth_predictor(left, up, upper_left)) & 0xff

            row.append(color)

    return width, height, pixels
3895
3896
def write_xattr(path, key, value):
    """
    Set extended attribute key to value on the file at path

    Uses the Python xattr/pyxattr module when it can be imported. Otherwise
    writes an NTFS Alternate Data Stream on Windows, or runs the setfattr or
    xattr command line tool elsewhere.

    Raises XAttrMetadataError when writing the attribute fails and
    XAttrUnavailableError when no usable implementation is found.
    """
    try:
        # try the pyxattr module...
        import xattr

        if not hasattr(xattr, 'set'):  # xattr
            setxattr = xattr.setxattr
        else:  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
            return

        user_has_setfattr = check_executable('setfattr', ['--version'])
        user_has_xattr = check_executable('xattr', ['-h'])

        if not (user_has_setfattr or user_has_xattr):
            # On Unix, and can't find pyxattr, setfattr, or xattr.
            if sys.platform.startswith('linux'):
                raise XAttrUnavailableError(
                    "Couldn't find a tool to set the xattrs. "
                    "Install either the python 'pyxattr' or 'xattr' "
                    "modules, or the GNU 'attr' package "
                    "(which contains the 'setfattr' tool).")
            raise XAttrUnavailableError(
                "Couldn't find a tool to set the xattrs. "
                "Install either the python 'xattr' module, "
                "or the 'xattr' binary.")

        # The value is passed to the tool as a command line argument
        value = value.decode('utf-8')
        if user_has_setfattr:
            executable, opts = 'setfattr', ['-n', key, '-v', value]
        else:
            executable, opts = 'xattr', ['-w', key, value]

        cmd = ([encodeFilename(executable, True)] +
               [encodeArgument(o) for o in opts] +
               [encodeFilename(path, True)])

        try:
            p = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        _, stderr = p.communicate()
        stderr = stderr.decode('utf-8', 'replace')
        if p.returncode != 0:
            raise XAttrMetadataError(p.returncode, stderr)
3979
3980
def random_birthday(year_field, month_field, day_field):
    """
    Generate a random date between 1950-01-01 and 1995-12-31 inclusive

    Returns a dict mapping the three given field names to the year, month
    and day of that date, each converted with str().
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, span_days))
    field_names = (year_field, month_field, day_field)
    field_values = (
        str(birthday.year), str(birthday.month), str(birthday.day))
    return dict(zip(field_names, field_values))