4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
40 compat_etree_fromstring,
42 compat_html_entities_html5,
48 compat_socket_create_connection,
54 compat_urllib_parse_urlencode,
55 compat_urllib_parse_urlparse,
56 compat_urllib_parse_unquote_plus,
57 compat_urllib_request,
def register_socks_protocols():
    """Make urlparse treat SOCKS schemes as netloc-carrying protocols."""
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904:
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly.
    known = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in known:
            known.append(scheme)
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# NOTE(review): the following entries belong to the std_headers dict
# (its opening line is not visible in this view) — default HTTP headers
# sent with every request.
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',

    # Alternate UA string (presumably part of a USER_AGENTS mapping — confirm)
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',

# English month names, used by date parsing helpers below
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# NOTE(review): entries of a MONTH_NAMES dict keyed by language code
    'en': ENGLISH_MONTH_NAMES,
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],

# NOTE(review): entries of the KNOWN_EXTENSIONS tuple (media file extensions)
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

# NOTE(review): entries of the DATE_FORMATS tuple (strptime patterns)
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%b %d %Y at %H:%M:%S',

# Variants of DATE_FORMATS biased toward day-first / month-first ordering
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([

# Matches the argument list of packer-obfuscated ("p.a.c.k.e.r") JS code
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding is actually usable; some platforms
        # report an encoding Python cannot encode to.
        'TEST'.encode(pref)
    except Exception:
        # Fall back to a safe universal default.
        pref = 'UTF-8'

    return pref
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
        # NOTE(review): on Python 3 (or win32) the plain helpers are used instead
        path_basename = os.path.basename
        path_dirname = os.path.dirname
        # NOTE(review): entries of the NamedTemporaryFile kwargs dict (`args`)
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
    # Write to a temp file in the same directory, then rename over fn for atomicity
    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
    if sys.platform == 'win32':
        # Need to remove existing file on Windows, else os.rename raises
        # WindowsError or FileExistsError.
        os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] (first matching element or None). """
        # Only plain attribute names are safe to splice into the expression.
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    # Python 2.6's ElementTree does not support [@attr='val'] predicates,
    # so filter the findall() results manually.
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' components of an xpath using ns_map.

    'media:title' with {'media': URI} becomes '{URI}title'; components
    without a prefix are kept as-is.
    """
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find an XML element by xpath; honor `default`/`fatal` on a miss."""
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    # A single xpath string is looked up directly; presumably an iterable of
    # candidate xpaths is tried in order otherwise (branch not shown here).
    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)

    # Miss handling: return the caller-supplied default if one was given …
    if default is not NO_DEFAULT:
        # … otherwise, with fatal set, report the missing element by name.
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element %s' % name)
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's text."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    # When the lookup missed (or yielded the passthrough default) …
    if n is None or n == default:
        # … return the default if one was given, else (fatal) raise below.
        if default is not NO_DEFAULT:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Find an element by xpath and return its `key` attribute value."""
    n = find_xpath_attr(node, xpath, key)
    # On a miss: default wins over fatal; fatal raises with a readable name.
    if default is not NO_DEFAULT:
        name = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper over the generic attribute matcher.
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag whose class attribute contains class_name."""
    # Build a pattern matching class_name as one whole token inside the
    # (possibly multi-valued) class attribute; pass it through unescaped.
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_element_by_attribute('class', class_pattern, html, escape_value=False)
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # `value` may be a pre-built regex fragment when escape_value is False
    value = re.escape(value) if escape_value else value

    m = re.search(r'''(?xs)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        ''' % (re.escape(attribute), value), html)

    res = m.group('content')

    # Strip surrounding quotes left by the capture, if any
    if res.startswith('"') or res.startswith("'"):

    return unescapeHTML(res)
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    # NOTE(review): the __init__ header is not visible in this view; it
    # presumably initializes self.attrs before delegating to the base class.
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Record the start tag's attributes as a plain dict.
        self.attrs = dict(attrs)
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    a="foo" B="bar" c="&98;az" d=boz
    empty= noval entity="&amp;"

    Decode and return a dictionary of attributes.
    'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
    'empty': '', 'noval': None, 'entity': '&',
    'sq': '"', 'dq': '\''

    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    # Feed the single element into the trivial parser and read back .attrs
    parser = HTMLAttributeParser()
    parser.feed(html_element)
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()

    It returns the tuple (stream, definitive_file_name).
    """
    # '-' means stdout; on Windows the fd must be switched to binary mode
    if sys.platform == 'win32':
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors are not recoverable by renaming — re-raise
        if err.errno in (errno.EACCES,):

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:

        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Transliterate accented characters in restricted mode
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        # Drop '?', control characters and DEL outright
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return '' if restricted else '\''
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
        # Restricted mode also replaces punctuation and whitespace
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
        # Restricted mode keeps ASCII only
        if restricted and ord(char) > 127:

    # Turn timestamps like 12:34:56 into 12_34_56 before per-char mapping
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    # Collapse runs of underscores and trim leading/trailing ones
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # On other platforms the path is returned unchanged (line not shown)
    if sys.platform != 'win32':
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        # splitunc() is the pre-2.7 way to detect UNC prefixes
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    # Replace win32-forbidden characters (and trailing dots/spaces) with '#'
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitize_url(url):
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request after normalizing scheme-less URLs."""
    safe_url = sanitize_url(url)
    return compat_urllib_request.Request(safe_url, *args, **kwargs)
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, preserving order. """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric character references: &#NNN; or &#xHHH;
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            # Rewrite 'x1f' to Python's hex-literal form '0x1f'
            numstr = '0%s' % numstr
        # See https://github.com/rg3/youtube-dl/issues/7518
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
    # NOTE(review): body of unescapeHTML (def header not visible in this view).
    assert type(s) == compat_str

    # Replace each '&name;' / '&#N;' entity via _htmlentity_transform
        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
    """Return the byte encoding used for subprocess arguments."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
        # Elsewhere the filesystem encoding is used
        encoding = sys.getfilesystemencoding()
def encodeFilename(s, for_subprocess=False):
    """Encode a unicode filename to bytes where the platform requires it.

    @param s The name of the file
    """
    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):

    return s.encode(get_subprocess_encoding(), 'ignore')
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename: bytes back to a unicode filename."""
    # Python 3 filenames are already str
    if sys.version_info >= (3, 0):

    if not isinstance(b, bytes):

    return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for passing to a subprocess."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
def decodeArgument(b):
    """Decode a subprocess argument back to text."""
    return decodeFilename(b, True)
def decodeOption(optval):
    """Decode a command-line option value to a unicode string."""
    # Bytes input is decoded with the preferred locale encoding
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS, M:SS or plain seconds."""
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
def make_HTTPS_handler(params, **kwargs):
    """Build an HTTPS handler honoring the 'nocheckcertificate' option."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # Disable both hostname and certificate verification
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)

    # (create_default_context present but HTTPSHandler has no context=)

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    # Python 3.2/3.3: build an SSLContext by hand
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    context.verify_mode = (ssl.CERT_NONE
                           if opts_no_check_certificate
                           else ssl.CERT_REQUIRED)
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    """Build the standard 'please report this issue' message suffix."""
    # Tailor the update hint to whether this build can self-update
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Network-level failures are treated as expected (not a bug)
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
        if video_id is not None:
            msg = video_id + ': ' + msg
            # When a causing exception was given, surface it in the message
            msg += ' (caused by %r)' % cause
            # Unexpected errors get the bug-report boilerplate appended
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.exc_info = sys.exc_info()  # preserve original exception
        self.video_id = video_id

    def format_traceback(self):
        # Render the stored traceback, when one was captured
        if self.traceback is None:
        return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Original (exc_type, exc_value, traceback) triple, or None
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Byte counts: what was actually received vs. the announced length
        self.downloaded = downloaded
        self.expected = expected
class XAttrMetadataError(Exception):
    """Raised when writing extended file attributes fails; classifies the cause."""

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota excedded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
            # Anything else is treated as lack of xattr support
            self.reason = 'NOT_SUPPORTED'
# Raised when the xattr facility itself is unavailable on this system
class XAttrUnavailableError(Exception):
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Create an HTTP(S) connection, applying source-address binding."""
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
            # Pre-2.7 fallback: monkey-patch connect() to bind manually
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
            hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, 'Accept-Encoding' (any case) is dropped as
    well so the request is made without compression; otherwise the headers
    are returned untouched.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict((k, v) for k, v in headers.items() if k.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        # Honor the internal Ytdl-socks-proxy pseudo-header by swapping in a
        # SOCKS-aware connection class, then drop the marker header.
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),

        # Raw-deflate first, then zlib-wrapped deflate as fallback
            return zlib.decompress(data, -zlib.MAX_WBITS)
            return zlib.decompress(data)

    def addinfourl_wrapper(stream, headers, url, code):
        # Newer urllib addinfourl takes the code directly
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

    def http_response(self, req, resp):
        # gzip-compressed responses are decompressed transparently
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate-compressed responses likewise
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped

    https_request = http_request
    https_response = http_response
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class that tunnels through the given SOCKS proxy."""
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    # Map URL scheme to the SOCKS protocol version
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        # Empty/None credentials are passed through unchanged
        return compat_urllib_parse_unquote_plus(s)

    # (host, port, type, ..., username, password) tuple for sockssocket.setproxy
        url_components.hostname, url_components.port or 1080,
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),

    class SocksConnection(base_class):
            # Replace the plain socket with a SOCKS-wrapped one
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, additionally wrap the tunneled socket in TLS
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler supporting custom connection classes and SOCKS proxies."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        conn_class = self._https_conn_class

        # Forward the SSL context / hostname-check settings when available
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        # Same Ytdl-socks-proxy handling as in YoutubeDLHandler.http_open
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor; kept as a hook for Set-Cookie workarounds."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
def extract_timezone(date_str):
    """Split a trailing timezone designator off a date string.

    Returns (timezone_delta, remaining_date_str); the delta is zero when no
    (or a 'Z'/unsigned) designator is found.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        # Strip the designator from the string before parsing the rest
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date, or None on failure. """

    if date_str is None:
        return None

    # Drop fractional seconds; strptime's %f handling varies across inputs
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        pass
def date_formats(day_first=True):
    """Pick the strptime format list matching the day/month ordering."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    # Try every known format, first match wins
    for expression in date_formats(day_first):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string."""
    if date_str is None:

    date_str = date_str.replace(',', ' ')

    # A trailing 'PM' means a 12-hour offset must be added after parsing
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for expression in date_formats(day_first):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
    # Fallback: RFC 2822 style dates
    timetuple = email.utils.parsedate_tz(date_str)
        return calendar.timegm(timetuple) + pm_delta * 3600
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to default_ext."""
    if url is None:
        return default_ext
    # Drop the query string, then take whatever follows the last dot
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation? timedelta has no month/year units, so
        # approximate them as 30/365 days.
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    other strings are returned unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the representable min/max dates
        if start is not None:
            self.start = date_from_str(start)
            self.start = datetime.datetime.min.date()
            self.end = date_from_str(end)
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        # Both endpoints are inclusive
        return self.start <= date <= self.end

        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    # Some Python 2 setups return bytes here; normalize to text
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Map C-level file descriptors to the Windows standard-handle IDs
    WIN_OUTPUT_IDS = {
        1: -11,  # STD_OUTPUT_HANDLE
        2: -12,  # STD_ERROR_HANDLE
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is invalid or does not refer to a real console
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
def write_string(s, out=None, encoding=None):
    """Write the text string s to out (default sys.stderr), coping with
    byte streams, Windows consoles and missing encodings."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        # Text stream wrapping a binary buffer: write encoded bytes directly
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a list of integer byte values back into a byte string."""
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)
1375 # Cross-platform file locking
1376 if sys.platform == 'win32':
1377 import ctypes.wintypes
1380 class OVERLAPPED(ctypes.Structure):
1382 ('Internal', ctypes.wintypes.LPVOID),
1383 ('InternalHigh', ctypes.wintypes.LPVOID),
1384 ('Offset', ctypes.wintypes.DWORD),
1385 ('OffsetHigh', ctypes.wintypes.DWORD),
1386 ('hEvent', ctypes.wintypes.HANDLE),
1389 kernel32 = ctypes.windll.kernel32
1390 LockFileEx = kernel32.LockFileEx
1391 LockFileEx.argtypes = [
1392 ctypes.wintypes.HANDLE, # hFile
1393 ctypes.wintypes.DWORD, # dwFlags
1394 ctypes.wintypes.DWORD, # dwReserved
1395 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1396 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1397 ctypes.POINTER(OVERLAPPED) # Overlapped
1399 LockFileEx.restype = ctypes.wintypes.BOOL
1400 UnlockFileEx = kernel32.UnlockFileEx
1401 UnlockFileEx.argtypes = [
1402 ctypes.wintypes.HANDLE, # hFile
1403 ctypes.wintypes.DWORD, # dwReserved
1404 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1405 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1406 ctypes.POINTER(OVERLAPPED) # Overlapped
1408 UnlockFileEx.restype = ctypes.wintypes.BOOL
1409 whole_low = 0xffffffff
1410 whole_high = 0x7fffffff
1412 def _lock_file(f, exclusive):
1413 overlapped = OVERLAPPED()
1414 overlapped.Offset = 0
1415 overlapped.OffsetHigh = 0
1416 overlapped.hEvent = 0
1417 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1418 handle = msvcrt.get_osfhandle(f.fileno())
1419 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1420 whole_low, whole_high, f._lock_file_overlapped_p):
1421 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1423 def _unlock_file(f):
1424 assert f._lock_file_overlapped_p
1425 handle = msvcrt.get_osfhandle(f.fileno())
1426 if not UnlockFileEx(handle, 0,
1427 whole_low, whole_high, f._lock_file_overlapped_p):
1428 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1431 # Some platforms, such as Jython, is missing fcntl
1435 def _lock_file(f, exclusive):
1436 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1438 def _unlock_file(f):
1439 fcntl.flock(f, fcntl.LOCK_UN)
1441 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1443 def _lock_file(f, exclusive):
1444 raise IOError(UNSUPPORTED_MSG)
1446 def _unlock_file(f):
1447 raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    """Context manager wrapping io.open() with an advisory file lock
    (shared for reads, exclusive for writes/appends)."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            # Could not take the lock: do not leak the file handle
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to utf-8 when unknown."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
def shell_quote(args):
    """Quote a list of arguments for safe display as a shell command line."""
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Merge any data already smuggled into the URL so it is not lost
    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata
def unsmuggle_url(smug_url, default=None):
    """Extract data previously attached with smuggle_url().

    Returns (url, data); data is `default` when nothing was smuggled."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
def format_bytes(bytes):
    """Format a byte count as a human-readable string (e.g. '1.00KiB')."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        # Avoid math.log(0) (domain error)
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' using unit_table ({unit: multiplier}).

    Returns the value in base units as an int, or None when s does not match."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    # Accept ',' as decimal separator as well
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
def parse_filesize(s):
    """Parse a human-readable file size ('5 MiB', '1.2GB', ...) into bytes."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
1618 if re.match(r'^[\d,.]+$', s):
1619 return str_to_int(s)
1630 return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        return month_names.index(name) + 1
    except ValueError:
        # Unknown month name: signal with None instead of raising
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        # Unknown abbreviation: signal with None instead of raising
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, leaving existing entities alone"""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    """Best-effort: set the process title via prctl(PR_SET_NAME) on Linux."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        # Not a glibc system: silently skip
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Strip `start` from the beginning of `s` when present; None passes through."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Strip `end` from the end of `s` when present; None passes through."""
    if s is not None and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes."""
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s
def url_basename(url):
    """Return the last path component of a URL (query/fragment excluded)."""
    url_path = compat_urlparse.urlparse(url).path
    stripped = url_path.strip('/')
    return stripped.split('/')[-1]
1706 return re.match(r'https?://[^?#&]+/', url).group()
def urljoin(base, path):
    """Join base and path into an absolute URL; return None when either
    part is unusable."""
    if not isinstance(path, compat_str) or not path:
        return None
    if re.match(r'^(?:https?:)?//', path):
        # path is already an absolute (possibly protocol-relative) URL
        return path
    if not isinstance(base, compat_str) or not re.match(r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
class HEADRequest(compat_urllib_request.Request):
    # Request subclass that issues HEAD instead of GET
    def get_method(self):
        return 'HEAD'
class PUTRequest(compat_urllib_request.Request):
    # Request subclass that issues PUT instead of GET/POST
    def get_method(self):
        return 'PUT'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v to int, scaled by invscale/scale; return default on failure."""
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        return default
def str_or_none(v, default=None):
    """Coerce v to compat_str, passing None through as `default`."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: strips ',', '.' and '+'. """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to float, scaled by invscale/scale; return default on failure."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
def strip_or_none(v):
    """Return v.strip(), passing None through unchanged."""
    if v is None:
        return None
    return v.strip()
def parse_duration(s):
    """Parse a duration string ('1:23:45', '2h30m', '90 min', ...) into
    seconds (float), or None when unparseable."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    # [[DD:]HH:]MM:SS[.ms]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        # ISO-8601-like / verbose forms: '1d 2h 3min 4.5s', 'PT1H2M'
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # Fractional '1.5 hours' / '90 min'
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the filename's extension; when the real extension
    does not match `expected_real_ext`, append `ext` after the whole name."""
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return '{0}.{1}{2}'.format(name, ext, real_ext)
    return '{0}.{1}'.format(filename, ext)
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's extension with `ext`; when the real extension
    does not match `expected_real_ext`, append `ext` to the whole name."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return '{0}.{1}'.format(name, ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: args is never mutated, so the mutable default is safe here.
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from command output using version_re;
    return `unrecognized` when no match is found."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized
class PagedList(object):
    """Base class for lazily-fetched paginated result lists; subclasses
    must implement getslice(start, end)."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages lazily via pagefunc(pagenum),
    optionally caching fetched pages."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page needs its leading items dropped
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escape sequences embedded in s."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode \\uXXXX escape sequences embedded in s."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, quote() cannot handle unicode with non-ASCII characters,
    # so encode to UTF-8 bytes first.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # The safe-characters set keeps all RFC 3986 reserved characters intact.
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        # Non-ASCII hostnames must go through IDNA, not percent-escaping
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, skipping comments and blanks;
    closes batch_fd when done."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # Strip a UTF-8 byte-order mark that survived decoding
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # '#', ';' and ']' start comment lines
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    # URL-encode POST data and return it as ASCII bytes, the type
    # urllib request objects expect for the data argument.
    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
def update_url_query(url, query):
    """Merge `query` (a dict) into the query string of `url`, overriding
    existing parameters with the same name."""
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Return a copy of `req` with url/data/headers/query updated, keeping
    the original HTTP method (HEAD/PUT/GET...)."""
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    # Preserve non-default methods by picking the matching Request subclass
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Like dict.get, but key_or_keys may be a list/tuple tried in order;
    None (and, optionally, falsy) values are skipped."""
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            if key not in d or d[key] is None or skip_false_values and not d[key]:
                continue
            return d[key]
        return default
    return d.get(key_or_keys, default)
def try_get(src, getter, expected_type=None):
    """Apply getter(src), swallowing common lookup errors; return the value
    only when it matches expected_type (when given), else None."""
    try:
        v = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        pass
    else:
        if expected_type is None or isinstance(v, expected_type):
            return v
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    # Coerce bytes (or other objects) to compat_str.  Note the default
    # encoding is evaluated once, at module import time.
    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2076 TV_PARENTAL_GUIDELINES = {
def parse_age_limit(s):
    """Normalize an age limit (int, 'NN+', US rating or TV guideline)
    into an integer age, or None."""
    if type(s) == int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]
    return TV_PARENTAL_GUIDELINES.get(s)
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback(...);) and return the inner JSON."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object literal into valid JSON text."""
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v == ',':
            # Drop comments and trailing commas
            return ""

        if v[0] in ("'", '"'):
            # Re-quote string content with JSON-safe escapes
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])

        # Hex and octal integers (optionally used as object keys)
        INTEGER_TABLE = (
            (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
            (r'^(0+[0-7]+)\s*:?$', 8),
        )

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality ranks below every known one
            return -1
    return q
2153 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(int(part) for part in parts)
def is_outdated_version(version, limit, assume_new=True):
    """Compare two version strings; when either is missing/unparseable,
    fall back to `assume_new`."""
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(compat_shlex_quote(a) for a in args)
def error_to_compat_str(err):
    """Stringify an exception, decoding byte messages on Python 2."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
def mimetype2ext(mt):
    """Map a MIME type to the usual file extension.

    NOTE(review): the mapping tables below may be abridged relative to
    upstream — verify against the full youtube-dl table."""
    if mt is None:
        return None

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mt)
    if ext is not None:
        return ext

    # Fall back to the subtype (with any ';charset=...' suffix removed)
    _, _, res = mt.rpartition('/')
    res = res.split(';')[0].strip().lower()

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
    }.get(res, res)
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs string into {'vcodec': ..., 'acodec': ...}."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    splited_codecs = list(filter(None, map(
        lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # NOTE(review): positional fallback for unrecognized codec names —
        # verify field set against upstream.
        if len(splited_codecs) == 2:
            return {
                'vcodec': splited_codecs[0],
                'acodec': splited_codecs[1],
            }
        elif len(splited_codecs) == 1:
            return {
                'vcodec': 'none',
                'acodec': splited_codecs[0],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from an HTTP response: first the
    Content-Disposition filename, then the Content-Type."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI from raw bytes and a MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        # No BOM: assume UTF-8
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Work out the download protocol for a format dict, from its explicit
    'protocol' field, the URL scheme, or the file extension."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Width of each column: the longest cell in that column
    max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
    # Left-align every column but the last, padding to width + 1
    format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
    return '\n'.join(format_str % tuple(row) for row in table)
2352 def _match_one(filter_part, dct):
2353 COMPARISON_OPERATORS = {
2361 operator_rex = re.compile(r'''(?x)\s*
2363 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2365 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2366 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2369 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2370 m = operator_rex.search(filter_part)
2372 op = COMPARISON_OPERATORS[m.group('op')]
2373 actual_value = dct.get(m.group('key'))
2374 if (m.group('strval') is not None or
2375 # If the original field is a string and matching comparisonvalue is
2376 # a number we should respect the origin of the original field
2377 # and process comparison value as a string (see
2378 # https://github.com/rg3/youtube-dl/issues/11082).
2379 actual_value is not None and m.group('intval') is not None and
2380 isinstance(actual_value, compat_str)):
2381 if m.group('op') not in ('=', '!='):
2383 'Operator %s does not support string values!' % m.group('op'))
2384 comparison_value = m.group('strval') or m.group('intval')
2387 comparison_value = int(m.group('intval'))
2389 comparison_value = parse_filesize(m.group('intval'))
2390 if comparison_value is None:
2391 comparison_value = parse_filesize(m.group('intval') + 'B')
2392 if comparison_value is None:
2394 'Invalid integer value %r in filter part %r' % (
2395 m.group('intval'), filter_part))
2396 if actual_value is None:
2397 return m.group('none_inclusive')
2398 return op(actual_value, comparison_value)
2401 '': lambda v: v is not None,
2402 '!': lambda v: v is None,
2404 operator_rex = re.compile(r'''(?x)\s*
2405 (?P<op>%s)\s*(?P<key>[a-z_]+)
2407 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2408 m = operator_rex.search(filter_part)
2410 op = UNARY_OPERATORS[m.group('op')]
2411 actual_value = dct.get(m.group('key'))
2412 return op(actual_value)
2414 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # All '&'-separated parts must match
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a match-filter callable: returns None when the video passes,
    or a human-readable skip message otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression ('12.3s' or 'HH:MM:SS[.f]')
    into seconds, or None."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        # ':' before the fraction denotes frames; treat as a decimal part
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT 'HH:MM:SS,mmm' timecode."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup into SRT text."""
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Flatten a <p> element to plain text, mapping <br/> to newlines
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            # Derive the end time from begin + duration
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
def cli_option(params, command_option, param):
    """Build ['--opt', value] for a CLI tool from a params dict entry;
    [] when the parameter is None/absent."""
    param = params.get(param)
    if param:
        param = compat_str(param)
    return [command_option, param] if param is not None else []
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build a boolean CLI option ('--opt true' or '--opt=true')."""
    param = params.get(param)
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the extra-args list stored under `param`, or `default`."""
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE(review): abridged to common codes here — verify against the
    # full ISO-639-2 table referenced above.
    _lang_map = {
        'ar': 'ara',
        'de': 'deu',
        'en': 'eng',
        'es': 'spa',
        'fr': 'fra',
        'it': 'ita',
        'ja': 'jpn',
        'ko': 'kor',
        'nl': 'nld',
        'pl': 'pol',
        'pt': 'por',
        'ru': 'rus',
        'tr': 'tur',
        'zh': 'zho',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
class ISO3166Utils(object):
    # From http://data.okfn.org/data/core/country-list
    # NOTE(review): abridged relative to the full ISO 3166 list — verify
    # completeness against the source referenced above.
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that honours a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Install default http/https openers that all funnel into proxy_open.
        # The lambda's default arguments bind the loop variable early so each
        # installed handler keeps its own scheme and method reference.
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy overrides whatever default was configured.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # explicit "no proxy" sentinel: fall through unproxied
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # The payload is read as a little-endian unsigned integer, hence the
    # byte reversal before hexlifying.
    plaintext_int = int(binascii.hexlify(data[::-1]), 16)
    ciphertext_int = pow(plaintext_int, exponent, modulus)
    return '%x' % ciphertext_int
def encode_base_n(num, n, table=None):
    """Render the non-negative integer `num` in base `n` using `table` as digits.

    When `table` is omitted, digits come from 0-9a-zA-Z (so n may be up to 62).
    Raises ValueError if the base exceeds the digit table, or if num is
    negative (previously this would loop forever, since floor division never
    drives a negative number to zero).
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if table is None:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num < 0:
        # Fail fast instead of hanging in the division loop below.
        raise ValueError('num must be non-negative')

    if num == 0:
        return table[0]

    ret = ''
    while num:
        # Peel off the least-significant digit and prepend it.
        ret = table[num % n] + ret
        num = num // n
    return ret
def decode_packed_codes(code):
    """Unpack JavaScript obfuscated with Dean Edwards' p.a.c.k.e.r."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # Build the keyword -> symbol map; an empty symbol means the base-n
    # keyword stands for itself.
    symbol_table = {}
    for idx in range(count - 1, -1, -1):
        keyword = encode_base_n(idx, base)
        symbol_table[keyword] = symbols[idx] or keyword

    # Substitute every word token back with its original symbol.
    return re.sub(
        r'\b(\w+)\b', lambda m: symbol_table[m.group(0)],
        obfuscated_code)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list (RFC 8216, section 4.2) into a dict.

    Quoted values have their surrounding double quotes stripped; unquoted
    values are returned verbatim as strings. Returns {} for empty input.
    """
    info = {}
    # A value is either a quoted string -- which may contain commas and may
    # legally be EMPTY (hence "[^"]*", not "+", which silently dropped
    # attributes like URI="") -- or a run of non-comma characters.
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]*"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]
        info[key] = val
    return info
def urshift(val, n):
    """Unsigned 32-bit right shift of `val` by `n` (JavaScript's >>> operator)."""
    if val < 0:
        # Map the negative value onto its 32-bit two's-complement image first.
        val += 0x100000000
    return val >> n
3087 # Based on png2str() written by @gdkchan and improved by @yokrysty
3088 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
3089 def decode_png(png_data):
# Decode a PNG byte string into (width, height, pixels) where pixels is a
# list of scanline rows of decoded byte values.
# NOTE(review): this view of the file is elided -- several original lines
# (chunk-loop framing, the _get_pixel body, Paeth predictor setup) are
# missing between the numbered lines below; do not assume they are absent
# in the real file.
3090 # Reference: https://www.w3.org/TR/PNG/
3091 header = png_data[8:]
# Validate the fixed 8-byte PNG signature and require IHDR as first chunk.
3093 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
3094 raise IOError('Not a valid PNG file.')
# Big-endian unsigned-int reader keyed by field width (1/2/4 bytes).
3096 int_map = {1: '>B', 2: '>H', 4: '>I'}
3097 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
# Chunk layout per the PNG spec: length(4) + type(4) + data(length) + CRC(4).
3102 length = unpack_integer(header[:4])
3105 chunk_type = header[:4]
3108 chunk_data = header[:length]
3109 header = header[length:]
3111 header = header[4:] # Skip CRC
# IHDR is the first collected chunk; width/height are its first two fields.
3119 ihdr = chunks[0]['data']
3121 width = unpack_integer(ihdr[:4])
3122 height = unpack_integer(ihdr[4:8])
# All IDAT payloads concatenate into one zlib stream.
3126 for chunk in chunks:
3127 if chunk['type'] == b'IDAT':
3128 idat += chunk['data']
# Raised when no IDAT data was collected above.
3131 raise IOError('Unable to read PNG data.')
3133 decompressed_data = bytearray(zlib.decompress(idat))
# Helper to read an already-decoded byte by flat index (body elided here --
# presumably returns 0 for out-of-range indices; confirm against upstream).
3138 def _get_pixel(idx):
# Each scanline is one filter-type byte followed by `stride` filtered bytes.
3143 for y in range(height):
3144 basePos = y * (1 + stride)
3145 filter_type = decompressed_data[basePos]
3149 pixels.append(current_row)
3151 for x in range(stride):
3152 color = decompressed_data[1 + basePos + x]
3153 basex = y * stride + x
# "left" is 3 bytes back, i.e. the same channel of the previous pixel --
# this assumes 3 bytes per pixel (TODO confirm: RGB, no alpha); "up" is
# the same offset one scanline above.
3158 left = _get_pixel(basex - 3)
3160 up = _get_pixel(basex - stride)
# Undo the per-scanline filter (PNG spec, filter types 0-4).
3162 if filter_type == 1: # Sub
3163 color = (color + left) & 0xff
3164 elif filter_type == 2: # Up
3165 color = (color + up) & 0xff
3166 elif filter_type == 3: # Average
3167 color = (color + ((left + up) >> 1)) & 0xff
3168 elif filter_type == 4: # Paeth
3174 c = _get_pixel(basex - stride - 3)
# Paeth predictor: add whichever of left (a), up (b), upper-left (c) lies
# closest to the initial estimate (pa/pb/pc computed in elided lines).
3182 if pa <= pb and pa <= pc:
3183 color = (color + a) & 0xff
3185 color = (color + b) & 0xff
3187 color = (color + c) & 0xff
3189 current_row.append(color)
3191 return width, height, pixels
3194 def write_xattr(path, key, value):
# Set extended attribute `key` = `value` (bytes) on `path`, trying in order:
# the pyxattr/xattr Python modules, NTFS Alternate Data Streams on Windows,
# then the setfattr/xattr command-line tools.
# Raises XAttrMetadataError when a backend fails and XAttrUnavailableError
# when no usable backend exists.
# NOTE(review): this view of the file is elided -- the try/except/import
# framing between the numbered lines below is missing; do not assume it is
# absent in the real file.
3195 # This mess below finds the best xattr tool for the job
3197 # try the pyxattr module...
# Both the pyxattr and xattr PyPI packages import as `xattr`;
# hasattr(xattr, 'set') identifies pyxattr.
3200 if hasattr(xattr, 'set'): # pyxattr
3201 # Unicode arguments are not supported in python-pyxattr until
3203 # See https://github.com/rg3/youtube-dl/issues/5498
3204 pyxattr_required_version = '0.5.0'
3205 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
3206 # TODO: fallback to CLI tools
3207 raise XAttrUnavailableError(
3208 'python-pyxattr is detected but is too old. '
3209 'youtube-dl requires %s or above while your version is %s. '
3210 'Falling back to other xattr implementations' % (
3211 pyxattr_required_version, xattr.__version__))
3213 setxattr = xattr.set
# (elided else-branch) the xattr package exposes setxattr instead of set.
3215 setxattr = xattr.setxattr
# Translate the OS-level failure into the project's metadata error.
3218 setxattr(path, key, value)
3219 except EnvironmentError as e:
3220 raise XAttrMetadataError(e.errno, e.strerror)
# Fallback path when no xattr module could be imported:
3223 if compat_os_name == 'nt':
3224 # Write xattrs to NTFS Alternate Data Streams:
3225 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
3226 assert ':' not in key
3227 assert os.path.exists(path)
3229 ads_fn = path + ':' + key
3231 with open(ads_fn, 'wb') as f:
3233 except EnvironmentError as e:
3234 raise XAttrMetadataError(e.errno, e.strerror)
# POSIX CLI fallback: setfattr (GNU attr) or the xattr binary.
3236 user_has_setfattr = check_executable('setfattr', ['--version'])
3237 user_has_xattr = check_executable('xattr', ['-h'])
3239 if user_has_setfattr or user_has_xattr:
# The CLI tools take the value as a text argument, so decode the bytes.
3241 value = value.decode('utf-8')
3242 if user_has_setfattr:
3243 executable = 'setfattr'
3244 opts = ['-n', key, '-v', value]
3245 elif user_has_xattr:
3246 executable = 'xattr'
3247 opts = ['-w', key, value]
# Build the argv list with filesystem-encoded program and path names.
3249 cmd = ([encodeFilename(executable, True)] +
3250 [encodeArgument(o) for o in opts] +
3251 [encodeFilename(path, True)])
3254 p = subprocess.Popen(
3255 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
3256 except EnvironmentError as e:
3257 raise XAttrMetadataError(e.errno, e.strerror)
3258 stdout, stderr = p.communicate()
3259 stderr = stderr.decode('utf-8', 'replace')
# A non-zero exit status from the CLI tool is surfaced with its stderr.
3260 if p.returncode != 0:
3261 raise XAttrMetadataError(p.returncode, stderr)
3264 # On Unix, and can't find pyxattr, setfattr, or xattr.
3265 if sys.platform.startswith('linux'):
3266 raise XAttrUnavailableError(
3267 "Couldn't find a tool to set the xattrs. "
3268 "Install either the python 'pyxattr' or 'xattr' "
3269 "modules, or the GNU 'attr' package "
3270 "(which contains the 'setfattr' tool).")
3272 raise XAttrUnavailableError(
3273 "Couldn't find a tool to set the xattrs. "
3274 "Install either the python 'xattr' module, "
3275 "or the 'xattr' binary.")