git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import contextlib
  11 import ctypes
  12 import datetime
  13 import email.utils
  14 import errno
  15 import functools
  16 import gzip
  17 import io
  18 import itertools
  19 import json
  20 import locale
  21 import math
  22 import operator
  23 import os
  24 import pipes
  25 import platform
  26 import re
  27 import socket
  28 import ssl
  29 import subprocess
  30 import sys
  31 import tempfile
  32 import traceback
  33 import xml.etree.ElementTree
  34 import zlib
  35
  36 from .compat import (
  37     compat_HTMLParser,
  38     compat_basestring,
  39     compat_chr,
  40     compat_etree_fromstring,
  41     compat_html_entities,
  42     compat_html_entities_html5,
  43     compat_http_client,
  44     compat_kwargs,
  45     compat_parse_qs,
  46     compat_shlex_quote,
  47     compat_socket_create_connection,
  48     compat_str,
  49     compat_struct_pack,
  50     compat_struct_unpack,
  51     compat_urllib_error,
  52     compat_urllib_parse,
  53     compat_urllib_parse_urlencode,
  54     compat_urllib_parse_urlparse,
  55     compat_urllib_parse_unquote_plus,
  56     compat_urllib_request,
  57     compat_urlparse,
  58     compat_xpath,
  59 )
  60
  61 from .socks import (
  62     ProxyType,
  63     sockssocket,
  64 )
  65
  66
  67 def register_socks_protocols():
  68     # "Register" SOCKS protocols
  69     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  70     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  71     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  72         if scheme not in compat_urlparse.uses_netloc:
  73             compat_urlparse.uses_netloc.append(scheme)
  74
  75
# This is not clearly defined otherwise
# (it is the type of the object returned by re.compile())
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers. YoutubeDLHandler.http_request adds each of
# these to a request that does not already carry a header of that name.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
  86
  87
# Sentinel meaning "no default value was supplied"; compared by identity so
# that None remains usable as an explicit default (see the xpath_* helpers).
NO_DEFAULT = object()

# Full English month names, January first.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Media file extensions, written without the leading dot.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. Multi-letter
# replacements ('AE', 'OE', 'ss', 'ae', 'oe') are wrapped in one-element
# lists so that itertools.chain yields them whole instead of per character.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
 113
 114 DATE_FORMATS = (
 115     '%d %B %Y',
 116     '%d %b %Y',
 117     '%B %d %Y',
 118     '%b %d %Y',
 119     '%b %dst %Y %I:%M',
 120     '%b %dnd %Y %I:%M',
 121     '%b %dth %Y %I:%M',
 122     '%Y %m %d',
 123     '%Y-%m-%d',
 124     '%Y/%m/%d',
 125     '%Y/%m/%d %H:%M:%S',
 126     '%Y-%m-%d %H:%M:%S',
 127     '%Y-%m-%d %H:%M:%S.%f',
 128     '%d.%m.%Y %H:%M',
 129     '%d.%m.%Y %H.%M',
 130     '%Y-%m-%dT%H:%M:%SZ',
 131     '%Y-%m-%dT%H:%M:%S.%fZ',
 132     '%Y-%m-%dT%H:%M:%S.%f0Z',
 133     '%Y-%m-%dT%H:%M:%S',
 134     '%Y-%m-%dT%H:%M:%S.%f',
 135     '%Y-%m-%dT%H:%M',
 136 )
 137
 138 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 139 DATE_FORMATS_DAY_FIRST.extend([
 140     '%d-%m-%Y',
 141     '%d.%m.%Y',
 142     '%d.%m.%y',
 143     '%d/%m/%Y',
 144     '%d/%m/%y',
 145     '%d/%m/%Y %H:%M:%S',
 146 ])
 147
 148 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 149 DATE_FORMATS_MONTH_FIRST.extend([
 150     '%m-%d-%Y',
 151     '%m.%d.%Y',
 152     '%m/%d/%Y',
 153     '%m/%d/%y',
 154     '%m/%d/%Y %H:%M:%S',
 155 ])
 156
 157
 158 def preferredencoding():
 159     """Get preferred encoding.
 160
 161     Returns the best encoding scheme for the system, based on
 162     locale.getpreferredencoding() and some further tweaks.
 163     """
 164     try:
 165         pref = locale.getpreferredencoding()
 166         'TEST'.encode(pref)
 167     except Exception:
 168         pref = 'UTF-8'
 169
 170     return pref
 171
 172
 173 def write_json_file(obj, fn):
 174     """ Encode obj as JSON and write it to fn, atomically if possible """
 175
 176     fn = encodeFilename(fn)
 177     if sys.version_info < (3, 0) and sys.platform != 'win32':
 178         encoding = get_filesystem_encoding()
 179         # os.path.basename returns a bytes object, but NamedTemporaryFile
 180         # will fail if the filename contains non ascii characters unless we
 181         # use a unicode object
 182         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 183         # the same for os.path.dirname
 184         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 185     else:
 186         path_basename = os.path.basename
 187         path_dirname = os.path.dirname
 188
 189     args = {
 190         'suffix': '.tmp',
 191         'prefix': path_basename(fn) + '.',
 192         'dir': path_dirname(fn),
 193         'delete': False,
 194     }
 195
 196     # In Python 2.x, json.dump expects a bytestream.
 197     # In Python 3.x, it writes to a character stream
 198     if sys.version_info < (3, 0):
 199         args['mode'] = 'wb'
 200     else:
 201         args.update({
 202             'mode': 'w',
 203             'encoding': 'utf-8',
 204         })
 205
 206     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 207
 208     try:
 209         with tf:
 210             json.dump(obj, tf)
 211         if sys.platform == 'win32':
 212             # Need to remove existing file on Windows, else os.rename raises
 213             # WindowsError or FileExistsError.
 214             try:
 215                 os.unlink(fn)
 216             except OSError:
 217                 pass
 218         os.rename(tf.name, fn)
 219     except Exception:
 220         try:
 221             os.remove(tf.name)
 222         except OSError:
 223             pass
 224         raise
 225
 226
 227 if sys.version_info >= (2, 7):
 228     def find_xpath_attr(node, xpath, key, val=None):
 229         """ Find the xpath xpath[@key=val] """
 230         assert re.match(r'^[a-zA-Z_-]+$', key)
 231         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 232         return node.find(expr)
 233 else:
 234     def find_xpath_attr(node, xpath, key, val=None):
 235         for f in node.findall(compat_xpath(xpath)):
 236             if key not in f.attrib:
 237                 continue
 238             if val is None or f.attrib.get(key) == val:
 239                 return f
 240         return None
 241
 242 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 243 # the namespace parameter
 244
 245
 246 def xpath_with_ns(path, ns_map):
 247     components = [c.split(':') for c in path.split('/')]
 248     replaced = []
 249     for c in components:
 250         if len(c) == 1:
 251             replaced.append(c[0])
 252         else:
 253             ns, tag = c
 254             replaced.append('{%s}%s' % (ns_map[ns], tag))
 255     return '/'.join(replaced)
 256
 257
 258 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 259     def _find_xpath(xpath):
 260         return node.find(compat_xpath(xpath))
 261
 262     if isinstance(xpath, (str, compat_str)):
 263         n = _find_xpath(xpath)
 264     else:
 265         for xp in xpath:
 266             n = _find_xpath(xp)
 267             if n is not None:
 268                 break
 269
 270     if n is None:
 271         if default is not NO_DEFAULT:
 272             return default
 273         elif fatal:
 274             name = xpath if name is None else name
 275             raise ExtractorError('Could not find XML element %s' % name)
 276         else:
 277             return None
 278     return n
 279
 280
 281 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 282     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 283     if n is None or n == default:
 284         return n
 285     if n.text is None:
 286         if default is not NO_DEFAULT:
 287             return default
 288         elif fatal:
 289             name = xpath if name is None else name
 290             raise ExtractorError('Could not find XML element\'s text %s' % name)
 291         else:
 292             return None
 293     return n.text
 294
 295
 296 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 297     n = find_xpath_attr(node, xpath, key)
 298     if n is None:
 299         if default is not NO_DEFAULT:
 300             return default
 301         elif fatal:
 302             name = '%s[@%s]' % (xpath, key) if name is None else name
 303             raise ExtractorError('Could not find XML attribute %s' % name)
 304         else:
 305             return None
 306     return n.attrib[key]
 307
 308
 309 def get_element_by_id(id, html):
 310     """Return the content of the tag with the specified ID in the passed HTML document"""
 311     return get_element_by_attribute('id', id, html)
 312
 313
 314 def get_element_by_class(class_name, html):
 315     return get_element_by_attribute(
 316         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 317         html, escape_value=False)
 318
 319
 320 def get_element_by_attribute(attribute, value, html, escape_value=True):
 321     """Return the content of the tag with the specified attribute in the passed HTML document"""
 322
 323     value = re.escape(value) if escape_value else value
 324
 325     m = re.search(r'''(?xs)
 326         <([a-zA-Z0-9:._-]+)
 327          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 328          \s+%s=['"]?%s['"]?
 329          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 330         \s*>
 331         (?P<content>.*?)
 332         </\1>
 333     ''' % (re.escape(attribute), value), html)
 334
 335     if not m:
 336         return None
 337     res = m.group('content')
 338
 339     if res.startswith('"') or res.startswith("'"):
 340         res = res[1:-1]
 341
 342     return unescapeHTML(res)
 343
 344
 345 class HTMLAttributeParser(compat_HTMLParser):
 346     """Trivial HTML parser to gather the attributes for a single element"""
 347     def __init__(self):
 348         self.attrs = {}
 349         compat_HTMLParser.__init__(self)
 350
 351     def handle_starttag(self, tag, attrs):
 352         self.attrs = dict(attrs)
 353
 354
 355 def extract_attributes(html_element):
 356     """Given a string for an HTML element such as
 357     <el
 358          a="foo" B="bar" c="&98;az" d=boz
 359          empty= noval entity="&amp;"
 360          sq='"' dq="'"
 361     >
 362     Decode and return a dictionary of attributes.
 363     {
 364         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 365         'empty': '', 'noval': None, 'entity': '&',
 366         'sq': '"', 'dq': '\''
 367     }.
 368     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 369     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 370     """
 371     parser = HTMLAttributeParser()
 372     parser.feed(html_element)
 373     parser.close()
 374     return parser.attrs
 375
 376
 377 def clean_html(html):
 378     """Clean an HTML snippet into a readable string"""
 379
 380     if html is None:  # Convenience for sanitizing descriptions etc.
 381         return html
 382
 383     # Newline vs <br />
 384     html = html.replace('\n', ' ')
 385     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 386     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 387     # Strip html tags
 388     html = re.sub('<.*?>', '', html)
 389     # Replace html entities
 390     html = unescapeHTML(html)
 391     return html.strip()
 392
 393
 394 def sanitize_open(filename, open_mode):
 395     """Try to open the given filename, and slightly tweak it if this fails.
 396
 397     Attempts to open the given filename. If this fails, it tries to change
 398     the filename slightly, step by step, until it's either able to open it
 399     or it fails and raises a final exception, like the standard open()
 400     function.
 401
 402     It returns the tuple (stream, definitive_file_name).
 403     """
 404     try:
 405         if filename == '-':
 406             if sys.platform == 'win32':
 407                 import msvcrt
 408                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 409             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 410         stream = open(encodeFilename(filename), open_mode)
 411         return (stream, filename)
 412     except (IOError, OSError) as err:
 413         if err.errno in (errno.EACCES,):
 414             raise
 415
 416         # In case of error, try to remove win32 forbidden chars
 417         alt_filename = sanitize_path(filename)
 418         if alt_filename == filename:
 419             raise
 420         else:
 421             # An exception here should be caught in the caller
 422             stream = open(encodeFilename(alt_filename), open_mode)
 423             return (stream, alt_filename)
 424
 425
 426 def timeconvert(timestr):
 427     """Convert RFC 2822 defined time string into system timestamp"""
 428     timestamp = None
 429     timetuple = email.utils.parsedate_tz(timestr)
 430     if timetuple is not None:
 431         timestamp = email.utils.mktime_tz(timetuple)
 432     return timestamp
 433
 434
 435 def sanitize_filename(s, restricted=False, is_id=False):
 436     """Sanitizes a string so it could be used as part of a filename.
 437     If restricted is set, use a stricter subset of allowed characters.
 438     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 439     """
 440     def replace_insane(char):
 441         if restricted and char in ACCENT_CHARS:
 442             return ACCENT_CHARS[char]
 443         if char == '?' or ord(char) < 32 or ord(char) == 127:
 444             return ''
 445         elif char == '"':
 446             return '' if restricted else '\''
 447         elif char == ':':
 448             return '_-' if restricted else ' -'
 449         elif char in '\\/|*<>':
 450             return '_'
 451         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 452             return '_'
 453         if restricted and ord(char) > 127:
 454             return '_'
 455         return char
 456
 457     # Handle timestamps
 458     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 459     result = ''.join(map(replace_insane, s))
 460     if not is_id:
 461         while '__' in result:
 462             result = result.replace('__', '_')
 463         result = result.strip('_')
 464         # Common case of "Foreign band name - English song title"
 465         if restricted and result.startswith('-_'):
 466             result = result[2:]
 467         if result.startswith('-'):
 468             result = '_' + result[len('-'):]
 469         result = result.lstrip('.')
 470         if not result:
 471             result = '_'
 472     return result
 473
 474
 475 def sanitize_path(s):
 476     """Sanitizes and normalizes path on Windows"""
 477     if sys.platform != 'win32':
 478         return s
 479     drive_or_unc, _ = os.path.splitdrive(s)
 480     if sys.version_info < (2, 7) and not drive_or_unc:
 481         drive_or_unc, _ = os.path.splitunc(s)
 482     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 483     if drive_or_unc:
 484         norm_path.pop(0)
 485     sanitized_path = [
 486         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
 487         for path_part in norm_path]
 488     if drive_or_unc:
 489         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 490     return os.path.join(*sanitized_path)
 491
 492
 493 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
 494 # unwanted failures due to missing protocol
 495 def sanitize_url(url):
 496     return 'http:%s' % url if url.startswith('//') else url
 497
 498
 499 def sanitized_Request(url, *args, **kwargs):
 500     return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
 501
 502
 503 def orderedSet(iterable):
 504     """ Remove all duplicates from the input iterable """
 505     res = []
 506     for el in iterable:
 507         if el not in res:
 508             res.append(el)
 509     return res
 510
 511
 512 def _htmlentity_transform(entity_with_semicolon):
 513     """Transforms an HTML entity to a character."""
 514     entity = entity_with_semicolon[:-1]
 515
 516     # Known non-numeric HTML entity
 517     if entity in compat_html_entities.name2codepoint:
 518         return compat_chr(compat_html_entities.name2codepoint[entity])
 519
 520     # TODO: HTML5 allows entities without a semicolon. For example,
 521     # '&Eacuteric' should be decoded as 'Éric'.
 522     if entity_with_semicolon in compat_html_entities_html5:
 523         return compat_html_entities_html5[entity_with_semicolon]
 524
 525     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 526     if mobj is not None:
 527         numstr = mobj.group(1)
 528         if numstr.startswith('x'):
 529             base = 16
 530             numstr = '0%s' % numstr
 531         else:
 532             base = 10
 533         # See https://github.com/rg3/youtube-dl/issues/7518
 534         try:
 535             return compat_chr(int(numstr, base))
 536         except ValueError:
 537             pass
 538
 539     # Unknown entity in name, return its literal representation
 540     return '&%s;' % entity
 541
 542
 543 def unescapeHTML(s):
 544     if s is None:
 545         return None
 546     assert type(s) == compat_str
 547
 548     return re.sub(
 549         r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 550
 551
 552 def get_subprocess_encoding():
 553     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 554         # For subprocess calls, encode with locale encoding
 555         # Refer to http://stackoverflow.com/a/9951851/35070
 556         encoding = preferredencoding()
 557     else:
 558         encoding = sys.getfilesystemencoding()
 559     if encoding is None:
 560         encoding = 'utf-8'
 561     return encoding
 562
 563
 564 def encodeFilename(s, for_subprocess=False):
 565     """
 566     @param s The name of the file
 567     """
 568
 569     assert type(s) == compat_str
 570
 571     # Python 3 has a Unicode API
 572     if sys.version_info >= (3, 0):
 573         return s
 574
 575     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 576     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 577     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 578     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 579         return s
 580
 581     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 582     if sys.platform.startswith('java'):
 583         return s
 584
 585     return s.encode(get_subprocess_encoding(), 'ignore')
 586
 587
 588 def decodeFilename(b, for_subprocess=False):
 589
 590     if sys.version_info >= (3, 0):
 591         return b
 592
 593     if not isinstance(b, bytes):
 594         return b
 595
 596     return b.decode(get_subprocess_encoding(), 'ignore')
 597
 598
 599 def encodeArgument(s):
 600     if not isinstance(s, compat_str):
 601         # Legacy code that uses byte strings
 602         # Uncomment the following line after fixing all post processors
 603         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 604         s = s.decode('ascii')
 605     return encodeFilename(s, True)
 606
 607
 608 def decodeArgument(b):
 609     return decodeFilename(b, True)
 610
 611
 612 def decodeOption(optval):
 613     if optval is None:
 614         return optval
 615     if isinstance(optval, bytes):
 616         optval = optval.decode(preferredencoding())
 617
 618     assert isinstance(optval, compat_str)
 619     return optval
 620
 621
 622 def formatSeconds(secs):
 623     if secs > 3600:
 624         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 625     elif secs > 60:
 626         return '%d:%02d' % (secs // 60, secs % 60)
 627     else:
 628         return '%d' % secs
 629
 630
 631 def make_HTTPS_handler(params, **kwargs):
 632     opts_no_check_certificate = params.get('nocheckcertificate', False)
 633     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 634         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 635         if opts_no_check_certificate:
 636             context.check_hostname = False
 637             context.verify_mode = ssl.CERT_NONE
 638         try:
 639             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 640         except TypeError:
 641             # Python 2.7.8
 642             # (create_default_context present but HTTPSHandler has no context=)
 643             pass
 644
 645     if sys.version_info < (3, 2):
 646         return YoutubeDLHTTPSHandler(params, **kwargs)
 647     else:  # Python < 3.4
 648         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 649         context.verify_mode = (ssl.CERT_NONE
 650                                if opts_no_check_certificate
 651                                else ssl.CERT_REQUIRED)
 652         context.set_default_verify_paths()
 653         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 654
 655
 656 def bug_reports_message():
 657     if ytdl_is_updateable():
 658         update_cmd = 'type  youtube-dl -U  to update'
 659     else:
 660         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 661     msg = '; please report this issue on https://yt-dl.org/bug .'
 662     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 663     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 664     return msg
 665
 666
 667 class ExtractorError(Exception):
 668     """Error during info extraction."""
 669
 670     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 671         """ tb, if given, is the original traceback (so that it can be printed out).
 672         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 673         """
 674
 675         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 676             expected = True
 677         if video_id is not None:
 678             msg = video_id + ': ' + msg
 679         if cause:
 680             msg += ' (caused by %r)' % cause
 681         if not expected:
 682             msg += bug_reports_message()
 683         super(ExtractorError, self).__init__(msg)
 684
 685         self.traceback = tb
 686         self.exc_info = sys.exc_info()  # preserve original exception
 687         self.cause = cause
 688         self.video_id = video_id
 689
 690     def format_traceback(self):
 691         if self.traceback is None:
 692             return None
 693         return ''.join(traceback.format_tb(self.traceback))
 694
 695
 696 class UnsupportedError(ExtractorError):
 697     def __init__(self, url):
 698         super(UnsupportedError, self).__init__(
 699             'Unsupported URL: %s' % url, expected=True)
 700         self.url = url
 701
 702
 703 class RegexNotFoundError(ExtractorError):
 704     """Error when a regex didn't match"""
 705     pass
 706
 707
 708 class DownloadError(Exception):
 709     """Download Error exception.
 710
 711     This exception may be thrown by FileDownloader objects if they are not
 712     configured to continue on errors. They will contain the appropriate
 713     error message.
 714     """
 715
 716     def __init__(self, msg, exc_info=None):
 717         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 718         super(DownloadError, self).__init__(msg)
 719         self.exc_info = exc_info
 720
 721
 722 class SameFileError(Exception):
 723     """Same File exception.
 724
 725     This exception will be thrown by FileDownloader objects if they detect
 726     multiple files would have to be downloaded to the same file on disk.
 727     """
 728     pass
 729
 730
 731 class PostProcessingError(Exception):
 732     """Post Processing exception.
 733
 734     This exception may be raised by PostProcessor's .run() method to
 735     indicate an error in the postprocessing task.
 736     """
 737
 738     def __init__(self, msg):
 739         self.msg = msg
 740
 741
 742 class MaxDownloadsReached(Exception):
 743     """ --max-downloads limit has been reached. """
 744     pass
 745
 746
 747 class UnavailableVideoError(Exception):
 748     """Unavailable Format exception.
 749
 750     This exception will be thrown when a video is requested
 751     in a format that is not available for that video.
 752     """
 753     pass
 754
 755
 756 class ContentTooShortError(Exception):
 757     """Content Too Short exception.
 758
 759     This exception may be raised by FileDownloader objects when a file they
 760     download is too small for what the server announced first, indicating
 761     the connection was probably interrupted.
 762     """
 763
 764     def __init__(self, downloaded, expected):
 765         # Both in bytes
 766         self.downloaded = downloaded
 767         self.expected = expected
 768
 769
 770 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 771     # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
 772     # expected HTTP responses to meet HTTP/1.0 or later (see also
 773     # https://github.com/rg3/youtube-dl/issues/6727)
 774     if sys.version_info < (3, 0):
 775         kwargs[b'strict'] = True
 776     hc = http_class(*args, **kwargs)
 777     source_address = ydl_handler._params.get('source_address')
 778     if source_address is not None:
 779         sa = (source_address, 0)
 780         if hasattr(hc, 'source_address'):  # Python 2.7+
 781             hc.source_address = sa
 782         else:  # Python 2.6
 783             def _hc_connect(self, *args, **kwargs):
 784                 sock = compat_socket_create_connection(
 785                     (self.host, self.port), self.timeout, sa)
 786                 if is_https:
 787                     self.sock = ssl.wrap_socket(
 788                         sock, self.key_file, self.cert_file,
 789                         ssl_version=ssl.PROTOCOL_TLSv1)
 790                 else:
 791                     self.sock = sock
 792             hc.connect = functools.partial(_hc_connect, hc)
 793
 794     return hc
 795
 796
 797 def handle_youtubedl_headers(headers):
 798     filtered_headers = headers
 799
 800     if 'Youtubedl-no-compression' in filtered_headers:
 801         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 802         del filtered_headers['Youtubedl-no-compression']
 803
 804     return filtered_headers
 805
 806
 807 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 808     """Handler for HTTP requests and responses.
 809
 810     This class, when installed with an OpenerDirector, automatically adds
 811     the standard headers to every HTTP request and handles gzipped and
 812     deflated responses from web servers. If compression is to be avoided in
 813     a particular request, the original request in the program code only has
 814     to include the HTTP header "Youtubedl-no-compression", which will be
 815     removed before making the real request.
 816
 817     Part of this code was copied from:
 818
 819     http://techknack.net/python-urllib2-handlers/
 820
 821     Andrew Rowls, the author of that code, agreed to release it to the
 822     public domain.
 823     """
 824
 825     def __init__(self, params, *args, **kwargs):
 826         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
 827         self._params = params
 828
 829     def http_open(self, req):
 830         conn_class = compat_http_client.HTTPConnection
 831
 832         socks_proxy = req.headers.get('Ytdl-socks-proxy')
 833         if socks_proxy:
 834             conn_class = make_socks_conn_class(conn_class, socks_proxy)
 835             del req.headers['Ytdl-socks-proxy']
 836
 837         return self.do_open(functools.partial(
 838             _create_http_connection, self, conn_class, False),
 839             req)
 840
 841     @staticmethod
 842     def deflate(data):
 843         try:
 844             return zlib.decompress(data, -zlib.MAX_WBITS)
 845         except zlib.error:
 846             return zlib.decompress(data)
 847
 848     @staticmethod
 849     def addinfourl_wrapper(stream, headers, url, code):
 850         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 851             return compat_urllib_request.addinfourl(stream, headers, url, code)
 852         ret = compat_urllib_request.addinfourl(stream, headers, url)
 853         ret.code = code
 854         return ret
 855
 856     def http_request(self, req):
 857         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
 858         # always respected by websites, some tend to give out URLs with non percent-encoded
 859         # non-ASCII characters (see telemb.py, ard.py [#3412])
 860         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
 861         # To work around aforementioned issue we will replace request's original URL with
 862         # percent-encoded one
 863         # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
 864         # the code of this workaround has been moved here from YoutubeDL.urlopen()
 865         url = req.get_full_url()
 866         url_escaped = escape_url(url)
 867
 868         # Substitute URL if any change after escaping
 869         if url != url_escaped:
 870             req = update_Request(req, url=url_escaped)
 871
 872         for h, v in std_headers.items():
 873             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
 874             # The dict keys are capitalized because of this bug by urllib
 875             if h.capitalize() not in req.headers:
 876                 req.add_header(h, v)
 877
 878         req.headers = handle_youtubedl_headers(req.headers)
 879
 880         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 881             # Python 2.6 is brain-dead when it comes to fragments
 882             req._Request__original = req._Request__original.partition('#')[0]
 883             req._Request__r_type = req._Request__r_type.partition('#')[0]
 884
 885         return req
 886
    def http_response(self, req, resp):
        """Decompress the response body and sanitize redirect targets.

        A response whose Content-encoding header is 'gzip' or 'deflate' is
        read completely, decompressed and re-wrapped through
        addinfourl_wrapper; the Content-encoding header is then removed.
        For 3xx responses the Location header is replaced by its
        escape_url() form when the two differ.

        Returns the response object to hand on (a new one if the body was
        decompressed).
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; the first
                # truncation that decompresses cleanly wins.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the original failure
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    # Python 2 gets the header back as a UTF-8 byte string
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp
 933
    # HTTPS requests and responses go through the same hooks as plain HTTP
    https_request = http_request
    https_response = http_response
 936
 937
 938 def make_socks_conn_class(base_class, socks_proxy):
 939     assert issubclass(base_class, (
 940         compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
 941
 942     url_components = compat_urlparse.urlparse(socks_proxy)
 943     if url_components.scheme.lower() == 'socks5':
 944         socks_type = ProxyType.SOCKS5
 945     elif url_components.scheme.lower() in ('socks', 'socks4'):
 946         socks_type = ProxyType.SOCKS4
 947     elif url_components.scheme.lower() == 'socks4a':
 948         socks_type = ProxyType.SOCKS4A
 949
 950     def unquote_if_non_empty(s):
 951         if not s:
 952             return s
 953         return compat_urllib_parse_unquote_plus(s)
 954
 955     proxy_args = (
 956         socks_type,
 957         url_components.hostname, url_components.port or 1080,
 958         True,  # Remote DNS
 959         unquote_if_non_empty(url_components.username),
 960         unquote_if_non_empty(url_components.password),
 961     )
 962
 963     class SocksConnection(base_class):
 964         def connect(self):
 965             self.sock = sockssocket()
 966             self.sock.setproxy(*proxy_args)
 967             if type(self.timeout) in (int, float):
 968                 self.sock.settimeout(self.timeout)
 969             self.sock.connect((self.host, self.port))
 970
 971             if isinstance(self, compat_http_client.HTTPSConnection):
 972                 if hasattr(self, '_context'):  # Python > 2.6
 973                     self.sock = self._context.wrap_socket(
 974                         self.sock, server_hostname=self.host)
 975                 else:
 976                     self.sock = ssl.wrap_socket(self.sock)
 977
 978     return SocksConnection
 979
 980
 981 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 982     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 983         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 984         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 985         self._params = params
 986
 987     def https_open(self, req):
 988         kwargs = {}
 989         conn_class = self._https_conn_class
 990
 991         if hasattr(self, '_context'):  # python > 2.6
 992             kwargs['context'] = self._context
 993         if hasattr(self, '_check_hostname'):  # python 3.x
 994             kwargs['check_hostname'] = self._check_hostname
 995
 996         socks_proxy = req.headers.get('Ytdl-socks-proxy')
 997         if socks_proxy:
 998             conn_class = make_socks_conn_class(conn_class, socks_proxy)
 999             del req.headers['Ytdl-socks-proxy']
1000
1001         return self.do_open(functools.partial(
1002             _create_http_connection, self, conn_class, True),
1003             req, **kwargs)
1004
1005
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that applies the same hooks to HTTP and HTTPS."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        """Delegate to HTTPCookieProcessor.http_response.

        The Set-Cookie escaping workaround described below is currently
        commented out, so the response is passed on unchanged.
        """
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # HTTPS uses the parent's request hook and the response hook defined above
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1028
1029
def extract_timezone(date_str):
    """Split a trailing UTC offset off a date string.

    Recognizes a final 'Z' or a final '+HH:MM' / '-HHMM' style offset
    (optionally preceded by one space) after at least 8 other characters.

    Returns a (timedelta, remaining_date_str) tuple; the timedelta is zero
    when no offset is found or when the suffix is 'Z'.
    """
    tz_match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not tz_match:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(tz_match.group('tz'))]
    sign_char = tz_match.group('sign')
    if not sign_char:
        # Plain 'Z' suffix: UTC
        return datetime.timedelta(), remainder

    factor = 1 if sign_char == '+' else -1
    offset = datetime.timedelta(
        hours=factor * int(tz_match.group('hours')),
        minutes=factor * int(tz_match.group('minutes')))
    return offset, remainder
1046
1047
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    delimiter separates the date part from the time part. timezone is the
    UTC offset (a timedelta) to subtract; when None it is extracted from
    date_str itself. Returns None for a None or unparsable date_str.
    """
    if date_str is None:
        return None

    # The strptime format below has no fractional seconds: drop them
    without_fraction = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, without_fraction = extract_timezone(without_fraction)

    try:
        layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        utc_dt = datetime.datetime.strptime(without_fraction, layout) - timezone
        return calendar.timegm(utc_dt.timetuple())
    except ValueError:
        return None
1065
1066
def date_formats(day_first=True):
    """Return the day-first or the month-first list of date formats."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1069
1070
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None if date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # Every format is tried; the last one that parses wins
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to RFC 2822 style dates
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return compat_str(result)
1097
1098
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp for a free-form date string.

    Returns None if date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    text = date_str.replace(',', ' ')

    # NOTE(review): any "PM" adds 12 hours, which looks wrong for
    # 12:xx PM times -- confirm whether this is intended.
    pm_delta = 12 if re.search(r'(?i)PM', text) else 0
    timezone, text = extract_timezone(text)

    # Remove AM/PM + timezone
    text = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', text)

    pm_shift = datetime.timedelta(hours=pm_delta)
    for fmt in date_formats(day_first):
        try:
            moment = datetime.datetime.strptime(text, fmt) - timezone + pm_shift
            return calendar.timegm(moment.timetuple())
        except ValueError:
            continue

    # Fall back to RFC 2822 style dates
    parsed = email.utils.parsedate_tz(text)
    if not parsed:
        return None
    return calendar.timegm(parsed) + pm_delta * 3600
1120
1121
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to default_ext."""
    if url is None:
        return default_ext

    # Text after the last dot of the part before the query string
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1133
1134
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the last extension of filename with '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
1137
1138
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    'yesterday' is accepted as well. Months and years are approximated as
    30 and 365 days. Raises ValueError if date_str has none of these forms.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' inside a plain literal is an invalid escape sequence
    match = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        # timedelta takes plural keyword names (days, weeks)
        delta = datetime.timedelta(**{unit + 's': amount})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1166
1167
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1176
1177
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A missing bound defaults to the earliest / latest representable
        date. Raises ValueError if start is after end.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed first
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1207
1208
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Byte strings are decoded using the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1217
1218
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps the file descriptors 1 and 2 to the identifiers passed to
    # GetStdHandle below
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the count reported by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A console handle is a character device (ignoring the remote flag)
        # for which GetConsoleMode succeeds
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters at a time, stopping before the next
        # non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character: write it on
        # its own, with a length of 2
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1292
1293
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On win32 the console writer is tried first when no encoding is given.
    Otherwise s is encoded (ignoring unencodable characters) for binary
    streams and for streams exposing a .buffer, and written as-is to
    anything else.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None and
        hasattr(stream, 'fileno'))
    if try_console and _windows_write_string(s, stream):
        return

    # Python 2 lies about mode of sys.stderr
    if 'b' in getattr(stream, 'mode', '') or sys.version_info[0] < 3:
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        codec = encoding or getattr(stream, 'encoding', None) or preferredencoding()
        stream.buffer.write(s.encode(codec, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
1314
1315
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    # Indexing yields ints on Python 3 and 1-character strings on Python 2
    if not isinstance(bs[0], int):
        return [ord(ch) for ch in bs]
    return list(bs)
1323
1324
def intlist_to_bytes(xs):
    """Pack a list of integer byte values into a byte string."""
    if xs:
        # One unsigned byte ('B') per list element
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
1329
1330
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx on win32, fcntl.flock elsewhere, and
# stubs raising IOError where fcntl cannot be imported.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high parts of the byte count passed to both calls
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can reuse it
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # The flag value 0x2 is passed for an exclusive lock, 0x0 otherwise
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1404
1405
class locked_file(object):
    """File wrapper that holds a file lock while used as a context manager.

    The file is opened on construction, locked on entering the ``with``
    block, and unlocked and closed on leaving it.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Only readers ask for a non-exclusive lock
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the open file if locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1435
1436
def get_filesystem_encoding():
    """Return the filesystem encoding, or 'utf-8' if Python reports none."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1440
1441
def shell_quote(args):
    """Return the arguments as one shell-quoted, space-separated string.

    Byte string arguments are decoded with the filesystem encoding first.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # The compat helper this file already imports is used instead of
        # pipes.quote: the pipes module is deprecated.
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
1451
1452
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    Data already smuggled into url is merged into data (which is updated
    in place) before the result is appended as a URL fragment.
    """
    url, already_smuggled = unsmuggle_url(url, {})
    data.update(already_smuggled)
    payload = json.dumps(data)
    fragment = compat_urllib_parse_urlencode({'__youtubedl_smuggle': payload})
    return url + '#' + fragment
1461
1462
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    URLs carrying no smuggled data are returned unchanged together with
    default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        plain_url, _, fragment = smug_url.rpartition('#')
        payload = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return plain_url, json.loads(payload)
    return smug_url, default
1470
1471
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. '1.50KiB'.

    Returns 'N/A' for None. Strings are converted with float() first.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # Power of 1024 that selects the unit; log() is undefined for zero
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    units = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    suffix = units[exponent]
    scaled = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (scaled, suffix)
1484
1485
def lookup_unit_table(unit_table, s):
    """Parse '<number> <unit>' at the start of s using unit_table.

    unit_table maps unit names to multipliers. The number may use ',' or
    '.' as its decimal separator. Returns the scaled value as an int, or
    None if s does not start with a number followed by a known unit.
    """
    unit_alternatives = '|'.join(map(re.escape, unit_table))
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % unit_alternatives, s)
    if found is None:
        return None
    number_text = found.group('num').replace(',', '.')
    multiplier = unit_table[found.group('unit')]
    return int(float(number_text) * multiplier)
1495
1496
def parse_filesize(s):
    """Parse a file size such as '1.5 MiB' into a number of bytes.

    Returns None if s is None or cannot be parsed.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        # 'XiB' and 'xB' are binary multiples, 'XB' and 'Xb' decimal ones
        unit_table[prefix + 'iB'] = 1024 ** power
        unit_table[prefix + 'B'] = 1000 ** power
        unit_table[prefix.lower() + 'B'] = 1024 ** power
        unit_table[prefix + 'b'] = 1000 ** power

    return lookup_unit_table(unit_table, s)
1541
1542
def parse_count(s):
    """Parse a count such as '1,000' or '1.5M' into an int.

    Returns None if s is None or cannot be parsed.
    """
    if s is None:
        return None

    text = s.strip()

    # Plain number, possibly with separators
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }
    return lookup_unit_table(multipliers, text)
1562
1563
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name

    Returns None for an unknown name.
    """
    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1571
1572
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations

    The abbreviation is the first three letters of the month name.
    Returns None for an unknown abbreviation.
    """
    for number, month_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month_name[:3] == abbrev:
            return number
    return None
1581
1582
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity by '&amp;' in XML

    The five predefined entities and numeric character references of up
    to four digits are left untouched."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1589
1590
def setproctitle(title):
    """Set the process name through libc's prctl (best effort).

    Does nothing on Jython, when libc.so.6 cannot be loaded, or when the
    loaded libc has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one item larger than
    # the title, so it always ends with a NUL terminator. A buffer of
    # exactly len(title_bytes) has none, and prctl would read past its end
    # for short titles.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # option 15: presumably PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1610
1611
def remove_start(s, start):
    """Return s without the prefix start; s is returned as-is otherwise."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
1614
1615
def remove_end(s, end):
    """Return s without the suffix end; s is returned as-is otherwise.

    None is passed through unchanged.
    """
    if s is not None and s.endswith(end):
        # Not s[:-len(end)]: for an empty end that is s[:0], which would
        # wipe the whole string.
        return s[:len(s) - len(end)]
    return s
1618
1619
def remove_quotes(s):
    """Strip one pair of matching single or double quotes around s."""
    if s is None or len(s) < 2:
        return s
    # Both ends must be the same quote character
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1627
1628
def url_basename(url):
    """Return the last component of the URL's path."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1632
1633
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
1637
1638
class PUTRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
1642
1643
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, returning default if that is not possible.

    If get_attr is given, the attribute of that name is read from v first.
    The result is int(v) * invscale // scale. None, the empty string and
    values int() cannot convert all yield default.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        # TypeError: v is of a non-numeric type such as a list or dict;
        # OverflowError: v is an infinite float
        return default
1656
1657
def str_or_none(v, default=None):
    """Return v converted to compat_str, or default if v is None."""
    if v is None:
        return default
    return compat_str(v)
1660
1661
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are removed before the conversion.
    """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1668
1669
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, returning default if that is not possible.

    The result is float(v) * invscale / scale. None and values float()
    cannot convert yield default.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError: v is of a non-numeric type such as a list or dict
        return default
1677
1678
def strip_or_none(v):
    """Return v.strip(), or None if v is None."""
    if v is None:
        return None
    return v.strip()
1681
1682
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Three notations are tried in turn: colon-separated
    ([[days:]hours:]mins:]secs[.ms]), unit-suffixed ('1h 2m 3s', with an
    optional leading 'PT' or 'T'), and a single fractional amount of hours
    or minutes ('1.5 hours').

    Returns None if s is not a string or matches none of them.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days = hours = mins = secs = ms = None
    clock_match = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if clock_match:
        days, hours, mins, secs, ms = clock_match.groups()
    else:
        units_match = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
        if units_match:
            days, hours, mins, secs, ms = units_match.groups()
        else:
            single_match = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            if not single_match:
                return None
            hours, mins = single_match.groups()

    # Each component with the factors that turn it into seconds; they are
    # applied one at a time, in this order
    components = (
        (secs, ()),
        (mins, (60,)),
        (hours, (60, 60)),
        (days, (24, 60, 60)),
        (ms, ()),
    )
    duration = 0
    for text, factors in components:
        if not text:
            continue
        amount = float(text)
        for factor in factors:
            amount = amount * factor
        duration += amount
    return duration
1729
1730
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension of filename.

    If expected_real_ext is given and the real extension differs from it,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return '{0}.{1}{2}'.format(name, ext, real_ext)
    return '{0}.{1}'.format(filename, ext)
1737
1738
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the real extension of filename with ext.

    If expected_real_ext is given and the real extension differs from it,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        stem = name
    else:
        stem = filename
    return '{0}.{1}'.format(stem, ext)
1744
1745
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False if the binary could not be started. """
    try:
        process = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Wait for the process to finish; its output is discarded
        process.communicate()
    except OSError:
        return False
    return exe
1754
1755
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    The executable is run with args; its combined stdout and stderr is
    passed to detect_exe_version. """
    try:
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1769
1770
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Return the first group of version_re found in output.

    Without version_re a generic "version <something>" pattern is used.
    If nothing matches, unrecognized is returned.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1780
1781
class PagedList(object):
    """Base class of the paged lists; subclasses supply getslice()."""

    def __len__(self):
        # This is only useful for tests: it materialises the complete list
        entries = self.getslice()
        return len(entries)
1786
1787
class OnDemandPagedList(PagedList):
    """Paged list that requests pages lazily, only as far as needed.

    pagefunc(pagenum) must return an iterable with the entries of the
    zero-based page pagenum. A page with fewer than pagesize entries is
    taken to be the last one. With use_cache set, every fetched page is
    kept and reused by later getslice() calls.
    """

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries with indices start <= i < end as a list.

        end=None means "up to the last entry".
        """
        # An empty or inverted range must not fetch anything. Without this
        # guard an end sitting exactly on the first id of the start page
        # (e.g. end=0) made the end offset below wrap around to a full page.
        if end is not None and end <= start:
            return []

        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
                if self._use_cache:
                    self._cache[pagenum] = page_results

            # Offset of the first wanted entry inside this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset one past the last wanted entry, if the range ends here
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1838
1839
class InAdvancePagedList(PagedList):
    """Paged list for sources whose number of pages is known beforehand.

    pagefunc(pagenum) must return an iterable with the entries of the
    zero-based page pagenum; pagecount is the total number of pages and
    pagesize the number of entries per page.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices start <= i < end as a list.

        end=None means "up to the last page".
        """
        # Empty or inverted range: nothing to fetch. (A negative count
        # would otherwise be used as a slice bound below.)
        if end is not None and end <= start:
            return []

        res = []
        start_page = start // self._pagesize
        # Never ask for pages beyond the declared page count, however
        # large end is.
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page starts in the middle
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the requested range
                    res.extend(page[:only_more])
                    break
            res.extend(page)
        return res
1867
1868
def uppercase_escape(s):
    """Decode the escapes made of a backslash, a capital U and eight hex
    digits that occur in s; everything else is left untouched."""
    decode = codecs.getdecoder('unicode_escape')

    def _decode_match(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _decode_match, s)
1875
1876
def lowercase_escape(s):
    """Decode the escapes made of a backslash, a small u and four hex
    digits that occur in s; everything else is left untouched."""
    decode = codecs.getdecoder('unicode_escape')

    def _decode_match(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _decode_match, s)
1883
1884
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 text strings are passed to quote() as UTF-8 bytes
    must_encode = sys.version_info < (3, 0) and isinstance(s, compat_str)
    quotable = s.encode('utf-8') if must_encode else s
    return compat_urllib_parse.quote(quotable, b"%/;:@&=+$,!~*'()?#[]")
1890
1891
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host name is IDNA-encoded, the other components percent-escaped
    ascii_netloc = parts.netloc.encode('idna').decode('ascii')
    escaped = parts._replace(
        netloc=ascii_netloc,
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
1902
1903
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file and close the file.

    batch_fd is an iterable of lines (text or UTF-8 encoded bytes) with a
    close() method. Each line is stripped of surrounding whitespace and of
    a leading byte order mark; empty lines and lines starting with '#',
    ';' or ']' are dropped. Returns the remaining lines as a list.
    """
    # A UTF-8 BOM shows up as U+FEFF once the line has been decoded as
    # UTF-8, and as these three characters when the bytes were decoded
    # as Latin-1. Both forms have to go.
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1918
1919
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments with compat_urllib_parse_urlencode() and
    return the result as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1922
1923
def update_url_query(url, query):
    """Return url with the entries of the query dict merged into its query
    string; keys already present in the URL are overridden.

    An empty query leaves url untouched.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parts.query)
    merged.update(query)
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
1932
1933
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request from req with some parts replaced.

    headers are merged over the headers of req, data and url replace the
    original ones when truthy, and query is merged into the URL. The
    request class follows the HTTP method of req (HEAD, PUT or the plain
    Request); a timeout attribute of req is carried over.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)
    request_class = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(req.get_method(), compat_urllib_request.Request)
    updated = request_class(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        updated.timeout = req.timeout
    return updated
1952
1953
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable one of several keys, in d.

    For a list or tuple of keys the value of the first key that is
    present and not None (and, with skip_false_values, not falsy) is
    returned, or default if there is none. A single key is a plain
    d.get(key, default).
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for candidate in key_or_keys:
        if candidate not in d:
            continue
        value = d[candidate]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
1962
1963
def try_get(src, getter, expected_type=None):
    """Return getter(src), or None if that fails or has the wrong type.

    AttributeError, KeyError, TypeError and IndexError raised by the getter
    are swallowed. With expected_type set, a result that is not an instance
    of it is turned into None as well.
    """
    try:
        value = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is not None and not isinstance(value, expected_type):
        return None
    return value
1972
1973
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as a compat_str, decoding it with encoding/errors
    unless it already is one."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1976
1977
# Age limit that parse_age_limit() reports for each US rating label
US_RATINGS = {
    'G': 0, 'PG': 10, 'PG-13': 13, 'R': 16, 'NC': 18,
}
1985
1986
# Age limit that parse_age_limit() reports for each TV Parental Guidelines
# label
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0, 'TV-Y7': 7, 'TV-G': 0,
    'TV-PG': 0, 'TV-14': 14, 'TV-MA': 17,
}
1995
1996
def parse_age_limit(s):
    """Turn an age limit given in one of several notations into an int.

    Accepted are plain ints from 0 to 21, strings of one or two digits
    with an optional trailing '+', and the labels listed in US_RATINGS and
    TV_PARENTAL_GUIDELINES. Anything else gives None.
    """
    if type(s) is int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]
    return TV_PARENTAL_GUIDELINES.get(s)
2008
2009
def strip_jsonp(code):
    """Remove a JSONP wrapper "callback(...);" (optionally followed by //
    comments) from code and return what is inside the parentheses.

    Input that does not look like such a call is returned unchanged.
    """
    jsonp_call = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_call, r'\1', code)
2013
2014
def js_to_json(code):
    """Rewrite a JavaScript object/array literal as JSON text.

    The tokens recognised by the regular expression below are converted:
    quoted strings are re-quoted with double quotes (with their escapes
    adapted), bare identifiers and numeric keys are quoted, /* */ comments
    and trailing commas are removed, and unquoted hexadecimal or octal
    integer literals are rewritten in decimal. Everything else is copied
    through verbatim.
    """
    INTEGER_TABLE = (
        (r'^0[xX][0-9a-fA-F]+', 16),
        (r'^0+[0-7]+', 8),
    )
    # JavaScript string escapes that need a different spelling in JSON
    STRING_ESCAPES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v == ',':
            return ""

        if v[0] in ("'", '"'):
            v = re.sub(
                r'(?s)\\.|"',
                lambda m: STRING_ESCAPES.get(m.group(0), m.group(0)),
                v[1:-1])
            # A quoted string stays a string even if its text happens to
            # start like a hex/octal literal (e.g. "00:01:07"), so it
            # must not reach the integer conversion below.
            return '"%s"' % v

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(0), base)
                # The token includes the colon when the number is a key
                return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
2052
2053
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def rank(qid):
        # Position in quality_ids, or -1 for an unknown id
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return rank
2062
2063
# Default output template; uses %(name)s-style placeholders for the title,
# id and ext fields: "<title>-<id>.<ext>"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2065
2066
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    # Cut so that the result, ellipses included, is length characters long
    kept = length - len(ELLIPSES)
    return s[:kept] + ELLIPSES
2075
2076
def version_tuple(v):
    """Split a version string at dots and dashes into a tuple of ints.

    Raises ValueError if a component is not an integer.
    """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2079
2080
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    A missing or unparsable version counts as new when assume_new is set
    (the default), and as outdated otherwise.
    """
    verdict_when_unknown = not assume_new
    if not version:
        return verdict_when_unknown
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return verdict_when_unknown
    return current < minimum
2088
2089
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded through zipimport ...
    module_loader = globals().get('__loader__')
    if isinstance(module_loader, zipimporter):
        return True
    # ... or when the interpreter has the sys.frozen attribute
    return hasattr(sys, 'frozen')
2095
2096
def args_to_str(args):
    """Get a short string representation for a subprocess command: the
    shell-quoted arguments joined by single spaces."""
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2100
2101
def error_to_compat_str(err):
    """Return str(err) as a text string."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2109
2110
def mimetype2ext(mt):
    """Map a MIME type to the file extension used for it.

    mt may carry parameters (as in a Content-Type header value such as
    "text/vtt; charset=utf-8"); they are ignored. Returns None for None,
    the extension registered below for known types, and otherwise the
    lowercased subtype itself.
    """
    if mt is None:
        return None

    # Only the "type/subtype" part identifies the format; drop any
    # ";name=value" parameters and the whitespace around them.
    mt = mt.split(';')[0].strip()

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')
    res = res.lower()

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m': 'f4m',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
    }.get(res, res)
2146
2147
def parse_codecs(codecs_str):
    """Split a codecs string into video and audio codec.

    codecs_str is a comma separated list of codec ids as used in the
    "codecs" MIME parameter, see http://tools.ietf.org/html/rfc6381

    Returns a dict with the keys 'vcodec' and 'acodec' (the full codec id,
    or 'none' when that kind of stream is absent), or an empty dict when
    nothing can be said. For every codec that is not recognised a warning
    is written to stderr.
    """
    if not codecs_str:
        return {}
    splited_codecs = [
        part.strip() for part in codecs_str.strip().strip(',').split(',')]
    splited_codecs = [part for part in splited_codecs if part]
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        # Only the part before the first dot names the codec family
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # Nothing was recognised, so vcodec and acodec are both None here;
        # fall back on the position of the codecs in the list.
        if len(splited_codecs) == 2:
            return {
                'vcodec': splited_codecs[0],
                'acodec': splited_codecs[1],
            }
        elif len(splited_codecs) == 1:
            # A single unknown codec is reported as audio only
            return {
                'vcodec': 'none',
                'acodec': splited_codecs[0],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
2182
2183
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of url_handle.

    The file name of an attachment Content-Disposition header wins if it
    has an extension; otherwise the Content-Type header decides.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        attachment = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if attachment:
            ext = determine_ext(attachment.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
2196
2197
def encode_data_uri(data, mime_type):
    """Return a base64 "data:" URI holding the bytes data, labelled with
    mime_type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2200
2201
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2210
2211
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # The UTF-32 marks come before the UTF-16 ones: the little endian
    # UTF-32 mark starts with the UTF-16 one.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Without a byte order mark the data is read as UTF-8
    encoding, payload = 'utf-8', first_bytes
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, payload = enc, first_bytes[len(bom):]
            break
    text = payload.decode(encoding, 'replace')

    # HTML if the first thing after optional whitespace is a '<'
    return re.match(r'^\s*<', text)
2230
2231
def determine_protocol(info_dict):
    """Return the protocol of a format described by info_dict.

    An explicit 'protocol' entry wins. Otherwise the protocol is derived
    from info_dict['url']: from its prefix (rtmp, mms, rtsp), then from its
    extension (m3u8, f4m), and finally from its URL scheme.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    return compat_urllib_parse_urlparse(url).scheme
2252
2253
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell of every column
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*rows)]
    # All columns but the last are left-aligned and padded to width + 1
    padded_columns = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded_columns) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
2260
2261
def _match_one(filter_part, dct):
    """Evaluate a single filter expression against dct.

    Two forms are understood: "key OP value" with a comparison operator
    (a '?' right after the operator makes the test pass when the key is
    missing or None), and "key" / "!key" testing whether the key has a
    non-None value. Raises ValueError for anything else, for a string
    value used with an ordering operator and for an unparsable number.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    comparison_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    comparison = comparison_rex.search(filter_part)
    if comparison:
        op_symbol = comparison.group('op')
        text_operand = comparison.group('strval')
        if text_operand is not None:
            if op_symbol not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % op_symbol)
            expected = text_operand
        else:
            number_operand = comparison.group('intval')
            try:
                expected = int(number_operand)
            except ValueError:
                # Not a plain integer: try it as a file size, with and
                # without an explicit trailing "B"
                expected = parse_filesize(number_operand)
                if expected is None:
                    expected = parse_filesize(number_operand + 'B')
                if expected is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            number_operand, filter_part))
        actual = dct.get(comparison.group('key'))
        if actual is None:
            return comparison.group('none_inclusive')
        return COMPARISON_OPERATORS[op_symbol](actual, expected)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    unary_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    unary = unary_rex.search(filter_part)
    if unary:
        presence_test = UNARY_OPERATORS[unary.group('op')]
        return presence_test(dct.get(unary.group('key')))

    raise ValueError('Invalid filter part %r' % filter_part)
2319
2320
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The parts separated by '&' must all hold; stop at the first failure
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2326
2327
def match_filter_func(filter_str):
    """Build a function that checks an info dict against filter_str.

    The returned function gives None when the dict passes the filter, and
    a message explaining the skip otherwise.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
2336
2337
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP/TTML time expression to seconds (float).

    Understood are plain offsets in seconds with an optional trailing 's',
    and clock times H:MM:SS with an optional fraction introduced by '.'
    or ':'. Returns None for empty input and for anything else.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock is None:
        return None
    hours, minutes, seconds = clock.groups()
    return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
2349
2350
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT time code HH:MM:SS,mmm."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    whole_seconds = seconds % 60
    milliseconds = (seconds % 1) * 1000
    # %d drops the fractional part of each component
    return '%02d:%02d:%02d,%03d' % (hours, minutes, whole_seconds, milliseconds)
2353
2354
def dfxp2srt(dfxp_data):
    """Convert a DFXP/TTML subtitle document (text) to SRT text.

    Paragraphs without a begin time, or with neither an end time nor a
    duration, are left out. Raises ValueError when the document contains
    no paragraph elements at all.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Parser target collecting the text of one paragraph; line break
        # elements become newlines
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Serialize the element and feed it through the target above
        text_parser = xml.etree.ElementTree.XMLParser(target=TTMLPElementParser())
        text_parser.feed(xml.etree.ElementTree.tostring(node))
        return text_parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))

    # Use the first namespace variant that yields any paragraphs
    paras = []
    for p_xpath in (_x('.//ttml:p'), _x('.//ttaf1:p'), _x('.//ttaf1_0604:p'), './/p'):
        paras = dfxp.findall(p_xpath)
        if paras:
            break

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    cues = []
    # Cue numbers follow the position in the document, so skipped
    # paragraphs leave gaps in the numbering
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        if begin_time is None:
            continue
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        if not end_time:
            dur = parse_dfxp_time_expr(para.attrib.get('dur'))
            if not dur:
                continue
            end_time = begin_time + dur
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(cues)
2408
2409
def cli_option(params, command_option, param):
    """Return [command_option, value] for the params entry named param, or
    an empty list when that entry is missing or None."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2413
2414
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render the boolean params entry named param as a command line option.

    The value is spelled true_value or false_value. With a separator the
    option and value are joined into a single argument, otherwise they are
    two arguments. The entry must be a bool.
    """
    flag = params.get(param)
    assert isinstance(flag, bool)
    spelled = true_value if flag else false_value
    if separator:
        return [command_option + separator + spelled]
    return [command_option, spelled]
2421
2422
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if the params entry named param equals
    expected_value, and an empty list otherwise."""
    actual = params.get(param)
    if actual == expected_value:
        return [command_option]
    return []
2426
2427
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra command line arguments stored in params.

    params[param] must be a list (or be missing/None). When it is missing
    or None, default is returned; a list default is handed out as a copy,
    so that callers extending the result cannot alter what later calls
    receive.
    """
    ex_args = params.get(param)
    if ex_args is None:
        # The default list object is shared between all calls - never
        # expose it directly.
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2434
2435
class ISO639Utils(object):
    """Conversion between two-letter (ISO 639-1) and three-letter
    (ISO 639-2/T) language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Two-letter code -> three-letter code
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters count, so "en-US" is treated as "en"
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup; None when no entry has this three-letter code
        return next(
            (short_name
             for short_name, long_name in cls._lang_map.items()
             if long_name == code),
            None)
2636
2637
2638 class ISO3166Utils(object):
2639     # From http://data.okfn.org/data/core/country-list
2640     _country_map = {
2641         'AF': 'Afghanistan',
2642         'AX': 'Åland Islands',
2643         'AL': 'Albania',
2644         'DZ': 'Algeria',
2645         'AS': 'American Samoa',
2646         'AD': 'Andorra',
2647         'AO': 'Angola',
2648         'AI': 'Anguilla',
2649         'AQ': 'Antarctica',
2650         'AG': 'Antigua and Barbuda',
2651         'AR': 'Argentina',
2652         'AM': 'Armenia',
2653         'AW': 'Aruba',
2654         'AU': 'Australia',
2655         'AT': 'Austria',
2656         'AZ': 'Azerbaijan',
2657         'BS': 'Bahamas',
2658         'BH': 'Bahrain',
2659         'BD': 'Bangladesh',
2660         'BB': 'Barbados',
2661         'BY': 'Belarus',
2662         'BE': 'Belgium',
2663         'BZ': 'Belize',
2664         'BJ': 'Benin',
2665         'BM': 'Bermuda',
2666         'BT': 'Bhutan',
2667         'BO': 'Bolivia, Plurinational State of',
2668         'BQ': 'Bonaire, Sint Eustatius and Saba',
2669         'BA': 'Bosnia and Herzegovina',
2670         'BW': 'Botswana',
2671         'BV': 'Bouvet Island',
2672         'BR': 'Brazil',
2673         'IO': 'British Indian Ocean Territory',
2674         'BN': 'Brunei Darussalam',
2675         'BG': 'Bulgaria',
2676         'BF': 'Burkina Faso',
2677         'BI': 'Burundi',
2678         'KH': 'Cambodia',
2679         'CM': 'Cameroon',
2680         'CA': 'Canada',
2681         'CV': 'Cape Verde',
2682         'KY': 'Cayman Islands',
2683         'CF': 'Central African Republic',
2684         'TD': 'Chad',
2685         'CL': 'Chile',
2686         'CN': 'China',
2687         'CX': 'Christmas Island',
2688         'CC': 'Cocos (Keeling) Islands',
2689         'CO': 'Colombia',
2690         'KM': 'Comoros',
2691         'CG': 'Congo',
2692         'CD': 'Congo, the Democratic Republic of the',
2693         'CK': 'Cook Islands',
2694         'CR': 'Costa Rica',
2695         'CI': 'Côte d\'Ivoire',
2696         'HR': 'Croatia',
2697         'CU': 'Cuba',
2698         'CW': 'Curaçao',
2699         'CY': 'Cyprus',
2700         'CZ': 'Czech Republic',
2701         'DK': 'Denmark',
2702         'DJ': 'Djibouti',
2703         'DM': 'Dominica',
2704         'DO': 'Dominican Republic',
2705         'EC': 'Ecuador',
2706         'EG': 'Egypt',
2707         'SV': 'El Salvador',
2708         'GQ': 'Equatorial Guinea',
2709         'ER': 'Eritrea',
2710         'EE': 'Estonia',
2711         'ET': 'Ethiopia',
2712         'FK': 'Falkland Islands (Malvinas)',
2713         'FO': 'Faroe Islands',
2714         'FJ': 'Fiji',
2715         'FI': 'Finland',
2716         'FR': 'France',
2717         'GF': 'French Guiana',
2718         'PF': 'French Polynesia',
2719         'TF': 'French Southern Territories',
2720         'GA': 'Gabon',
2721         'GM': 'Gambia',
2722         'GE': 'Georgia',
2723         'DE': 'Germany',
2724         'GH': 'Ghana',
2725         'GI': 'Gibraltar',
2726         'GR': 'Greece',
2727         'GL': 'Greenland',
2728         'GD': 'Grenada',
2729         'GP': 'Guadeloupe',
2730         'GU': 'Guam',
2731         'GT': 'Guatemala',
2732         'GG': 'Guernsey',
2733         'GN': 'Guinea',
2734         'GW': 'Guinea-Bissau',
2735         'GY': 'Guyana',
2736         'HT': 'Haiti',
2737         'HM': 'Heard Island and McDonald Islands',
2738         'VA': 'Holy See (Vatican City State)',
2739         'HN': 'Honduras',
2740         'HK': 'Hong Kong',
2741         'HU': 'Hungary',
2742         'IS': 'Iceland',
2743         'IN': 'India',
2744         'ID': 'Indonesia',
2745         'IR': 'Iran, Islamic Republic of',
2746         'IQ': 'Iraq',
2747         'IE': 'Ireland',
2748         'IM': 'Isle of Man',
2749         'IL': 'Israel',
2750         'IT': 'Italy',
2751         'JM': 'Jamaica',
2752         'JP': 'Japan',
2753         'JE': 'Jersey',
2754         'JO': 'Jordan',
2755         'KZ': 'Kazakhstan',
2756         'KE': 'Kenya',
2757         'KI': 'Kiribati',
2758         'KP': 'Korea, Democratic People\'s Republic of',
2759         'KR': 'Korea, Republic of',
2760         'KW': 'Kuwait',
2761         'KG': 'Kyrgyzstan',
2762         'LA': 'Lao People\'s Democratic Republic',
2763         'LV': 'Latvia',
2764         'LB': 'Lebanon',
2765         'LS': 'Lesotho',
2766         'LR': 'Liberia',
2767         'LY': 'Libya',
2768         'LI': 'Liechtenstein',
2769         'LT': 'Lithuania',
2770         'LU': 'Luxembourg',
2771         'MO': 'Macao',
2772         'MK': 'Macedonia, the Former Yugoslav Republic of',
2773         'MG': 'Madagascar',
2774         'MW': 'Malawi',
2775         'MY': 'Malaysia',
2776         'MV': 'Maldives',
2777         'ML': 'Mali',
2778         'MT': 'Malta',
2779         'MH': 'Marshall Islands',
2780         'MQ': 'Martinique',
2781         'MR': 'Mauritania',
2782         'MU': 'Mauritius',
2783         'YT': 'Mayotte',
2784         'MX': 'Mexico',
2785         'FM': 'Micronesia, Federated States of',
2786         'MD': 'Moldova, Republic of',
2787         'MC': 'Monaco',
2788         'MN': 'Mongolia',
2789         'ME': 'Montenegro',
2790         'MS': 'Montserrat',
2791         'MA': 'Morocco',
2792         'MZ': 'Mozambique',
2793         'MM': 'Myanmar',
2794         'NA': 'Namibia',
2795         'NR': 'Nauru',
2796         'NP': 'Nepal',
2797         'NL': 'Netherlands',
2798         'NC': 'New Caledonia',
2799         'NZ': 'New Zealand',
2800         'NI': 'Nicaragua',
2801         'NE': 'Niger',
2802         'NG': 'Nigeria',
2803         'NU': 'Niue',
2804         'NF': 'Norfolk Island',
2805         'MP': 'Northern Mariana Islands',
2806         'NO': 'Norway',
2807         'OM': 'Oman',
2808         'PK': 'Pakistan',
2809         'PW': 'Palau',
2810         'PS': 'Palestine, State of',
2811         'PA': 'Panama',
2812         'PG': 'Papua New Guinea',
2813         'PY': 'Paraguay',
2814         'PE': 'Peru',
2815         'PH': 'Philippines',
2816         'PN': 'Pitcairn',
2817         'PL': 'Poland',
2818         'PT': 'Portugal',
2819         'PR': 'Puerto Rico',
2820         'QA': 'Qatar',
2821         'RE': 'Réunion',
2822         'RO': 'Romania',
2823         'RU': 'Russian Federation',
2824         'RW': 'Rwanda',
2825         'BL': 'Saint Barthélemy',
2826         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2827         'KN': 'Saint Kitts and Nevis',
2828         'LC': 'Saint Lucia',
2829         'MF': 'Saint Martin (French part)',
2830         'PM': 'Saint Pierre and Miquelon',
2831         'VC': 'Saint Vincent and the Grenadines',
2832         'WS': 'Samoa',
2833         'SM': 'San Marino',
2834         'ST': 'Sao Tome and Principe',
2835         'SA': 'Saudi Arabia',
2836         'SN': 'Senegal',
2837         'RS': 'Serbia',
2838         'SC': 'Seychelles',
2839         'SL': 'Sierra Leone',
2840         'SG': 'Singapore',
2841         'SX': 'Sint Maarten (Dutch part)',
2842         'SK': 'Slovakia',
2843         'SI': 'Slovenia',
2844         'SB': 'Solomon Islands',
2845         'SO': 'Somalia',
2846         'ZA': 'South Africa',
2847         'GS': 'South Georgia and the South Sandwich Islands',
2848         'SS': 'South Sudan',
2849         'ES': 'Spain',
2850         'LK': 'Sri Lanka',
2851         'SD': 'Sudan',
2852         'SR': 'Suriname',
2853         'SJ': 'Svalbard and Jan Mayen',
2854         'SZ': 'Swaziland',
2855         'SE': 'Sweden',
2856         'CH': 'Switzerland',
2857         'SY': 'Syrian Arab Republic',
2858         'TW': 'Taiwan, Province of China',
2859         'TJ': 'Tajikistan',
2860         'TZ': 'Tanzania, United Republic of',
2861         'TH': 'Thailand',
2862         'TL': 'Timor-Leste',
2863         'TG': 'Togo',
2864         'TK': 'Tokelau',
2865         'TO': 'Tonga',
2866         'TT': 'Trinidad and Tobago',
2867         'TN': 'Tunisia',
2868         'TR': 'Turkey',
2869         'TM': 'Turkmenistan',
2870         'TC': 'Turks and Caicos Islands',
2871         'TV': 'Tuvalu',
2872         'UG': 'Uganda',
2873         'UA': 'Ukraine',
2874         'AE': 'United Arab Emirates',
2875         'GB': 'United Kingdom',
2876         'US': 'United States',
2877         'UM': 'United States Minor Outlying Islands',
2878         'UY': 'Uruguay',
2879         'UZ': 'Uzbekistan',
2880         'VU': 'Vanuatu',
2881         'VE': 'Venezuela, Bolivarian Republic of',
2882         'VN': 'Viet Nam',
2883         'VG': 'Virgin Islands, British',
2884         'VI': 'Virgin Islands, U.S.',
2885         'WF': 'Wallis and Futuna',
2886         'EH': 'Western Sahara',
2887         'YE': 'Yemen',
2888         'ZM': 'Zambia',
2889         'ZW': 'Zimbabwe',
2890     }
2891
2892     @classmethod
2893     def short2full(cls, code):
2894         """Convert an ISO 3166-2 country code to the corresponding full name"""
2895         return cls._country_map.get(code.upper())
2896
2897
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets a single request pick its own proxy.

    A request may carry a 'Ytdl-request-proxy' header; when present, its
    value replaces the proxy that would otherwise be used for that request
    and the header is removed before the request goes on.
    """

    def __init__(self, proxies=None):
        # Install default http/https openers that go through proxy_open()
        # with the '__noproxy__' sentinel.  The base initializer runs
        # afterwards (presumably replacing these for schemes that have a
        # configured proxy - see the stdlib ProxyHandler).
        for scheme in ('http', 'https'):
            def default_open(r, proxy='__noproxy__', type=scheme,
                             meth=self.proxy_open):
                return meth(r, proxy, type)

            setattr(self, scheme + '_open', default_open)
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Apply `proxy` to `req`, honouring a per-request override.

        Returns None when no proxy is to be used or when the proxy is a
        SOCKS one; otherwise returns whatever the base class proxy_open()
        returns.
        """
        override_header = 'Ytdl-request-proxy'
        per_request_proxy = req.headers.get(override_header)
        if per_request_proxy is not None:
            # The header is internal bookkeeping and must not be sent out
            del req.headers[override_header]
            proxy = per_request_proxy

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme not in ('socks', 'socks4', 'socks4a', 'socks5'):
            return compat_urllib_request.ProxyHandler.proxy_open(
                self, req, proxy, type)

        # SOCKS proxies are only recorded on the request here: youtube-dl's
        # http/https handlers take care of wrapping the socket with SOCKS
        req.add_header('Ytdl-socks-proxy', proxy)
        return None
2921
2922
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt with OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: the bytes-like object to encrypt
        exponent, modulus: the RSA parameters e and N, both integers
    Output: the encrypted data as a lowercase hex string (no prefix)

    Limitation: only a single block is encrypted
    '''

    # The input is read as a little-endian number: reverse the bytes, then
    # parse their hex dump as an ordinary (big-endian) integer.
    reversed_hex = binascii.hexlify(data[::-1])
    plain_number = int(reversed_hex, 16)
    cipher_number = pow(plain_number, exponent, modulus)
    return '{0:x}'.format(cipher_number)
2938
2939
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer `num` as a base-`n` string.

    `table` supplies the digit characters (index i is the digit for value
    i).  When it is omitted or empty, the first `n` characters of
    0-9, a-z, A-Z are used, which limits the default table to base 62.

    Raises ValueError if `n` exceeds the table length, if `num` is
    negative, or if `n` is smaller than 2 while `num` is non-zero.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Floor division never reaches 0 for a negative number (it settles at
    # -1) nor for base 1, so both used to loop forever; reject them instead.
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)
    if n < 2:
        raise ValueError(
            'base %d is too small to encode a non-zero number' % n)

    # Digits come out least significant first; collect and reverse once
    # rather than prepending to a string on every step.
    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
2956
2957
def decode_packed_codes(code):
    """Expand packed code found in `code` and return the expanded text.

    The packed call is located by its ``}('...',base,count,'a|b|c'.split('|')``
    tail.  Each index below `count` is encoded in `base` with
    encode_base_n(); that token maps to the symbol at the same index, or to
    the token itself when the symbol is empty.  Every word of the packed
    payload is then replaced through this table.

    Raises AttributeError when `code` contains no packed call, and KeyError
    when the payload contains a word that is missing from the table.
    """
    packed = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    obfuscated_code, base, count, symbols = packed.groups()
    radix = int(base)
    words = symbols.split('|')

    replacements = {}
    # Walk the indices from count - 1 down to 0
    for index in range(int(count) - 1, -1, -1):
        token = encode_base_n(index, radix)
        replacements[token] = words[index] or token

    def substitute(word_match):
        return replacements[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', substitute, obfuscated_code)
2976
2977
def parse_m3u8_attributes(attrib):
    """Parse a comma separated KEY=value attribute list into a dict.

    Values may be bare or enclosed in double quotes (quoted values may
    contain commas); the surrounding quotes are stripped.  When a key
    occurs more than once, the last occurrence wins.
    """
    parsed = {}
    attribute_re = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    for match in re.finditer(attribute_re, attrib):
        value = match.group('val')
        is_quoted = value.startswith('"')
        parsed[match.group('key')] = value[1:-1] if is_quoted else value
    return parsed
2985
2986
def urshift(val, n):
    """Shift `val` right by `n` bits, mapping negatives into 32-bit range.

    A negative `val` has 2**32 added before the shift, so it is shifted as
    the corresponding unsigned 32-bit value.
    """
    if val < 0:
        val += 0x100000000
    return val >> n
2989
2990
2991 # Based on png2str() written by @gdkchan and improved by @yokrysty
2992 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a PNG file into rows of unfiltered sample values.

    Returns a (width, height, pixels) tuple where `pixels` is a list of
    `height` rows, each holding width * 3 integers in the range 0-255.

    The decoder always works with 3 bytes per pixel (presumably 8-bit RGB;
    the bit depth and colour type stored in IHDR are not inspected) and
    does not verify chunk CRCs.

    Raises IOError when the PNG signature or the leading IHDR chunk is
    missing, or when the file has no IDAT data.
    """
    # Reference: https://www.w3.org/TR/PNG/
    if (png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a'
            or png_data[12:16] != b'IHDR'):
        raise IOError('Not a valid PNG file.')

    formats_by_size = {1: '>B', 2: '>H', 4: '>I'}

    def read_uint(raw):
        # Big-endian unsigned integer of 1, 2 or 4 bytes
        return compat_struct_unpack(formats_by_size[len(raw)], raw)[0]

    # Walk the chunk stream that follows the 8-byte signature.  Each chunk
    # is laid out as: 4-byte length, 4-byte type, data, 4-byte CRC.
    chunks = []
    cursor = 8
    end = len(png_data)
    while cursor < end:
        data_length = read_uint(png_data[cursor:cursor + 4])
        data_start = cursor + 8
        data_end = data_start + data_length
        chunks.append(
            (png_data[cursor + 4:data_start], png_data[data_start:data_end]))
        cursor = data_end + 4  # Skip CRC

    # The signature check above guarantees the first chunk is IHDR
    ihdr = chunks[0][1]
    width = read_uint(ihdr[:4])
    height = read_uint(ihdr[4:8])

    # The compressed image may be spread over several IDAT chunks
    idat = b''.join(
        chunk_data for chunk_type, chunk_data in chunks
        if chunk_type == b'IDAT')
    if not idat:
        raise IOError('Unable to read PNG data.')

    scanlines = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    for y in range(height):
        # Every scanline is prefixed with one byte naming its filter
        row_start = y * (1 + stride)
        filter_type = scanlines[row_start]

        previous_row = pixels[y - 1] if y > 0 else None
        row = []
        pixels.append(row)

        for x in range(stride):
            value = scanlines[row_start + 1 + x]

            # Neighbours are taken one pixel (3 bytes) to the left and one
            # row up; anything outside the image counts as 0
            left = row[x - 3] if x > 2 else 0
            up = previous_row[x] if y > 0 else 0

            if filter_type == 1:  # Sub
                value = (value + left) & 0xff
            elif filter_type == 2:  # Up
                value = (value + up) & 0xff
            elif filter_type == 3:  # Average
                value = (value + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                upper_left = previous_row[x - 3] if x > 2 and y > 0 else 0

                estimate = left + up - upper_left
                dist_left = abs(estimate - left)
                dist_up = abs(estimate - up)
                dist_upper_left = abs(estimate - upper_left)

                # Use the neighbour closest to the estimate; ties prefer
                # left, then up
                if dist_left <= dist_up and dist_left <= dist_upper_left:
                    predictor = left
                elif dist_up <= dist_upper_left:
                    predictor = up
                else:
                    predictor = upper_left
                value = (value + predictor) & 0xff

            row.append(value)

    return width, height, pixels