_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import contextlib
  11 import ctypes
  12 import datetime
  13 import email.utils
  14 import errno
  15 import functools
  16 import gzip
  17 import io
  18 import itertools
  19 import json
  20 import locale
  21 import math
  22 import operator
  23 import os
  24 import pipes
  25 import platform
  26 import re
  27 import socket
  28 import ssl
  29 import subprocess
  30 import sys
  31 import tempfile
  32 import traceback
  33 import xml.etree.ElementTree
  34 import zlib
  35
  36 from .compat import (
  37     compat_HTMLParser,
  38     compat_basestring,
  39     compat_chr,
  40     compat_etree_fromstring,
  41     compat_html_entities,
  42     compat_html_entities_html5,
  43     compat_http_client,
  44     compat_kwargs,
  45     compat_parse_qs,
  46     compat_shlex_quote,
  47     compat_socket_create_connection,
  48     compat_str,
  49     compat_struct_pack,
  50     compat_urllib_error,
  51     compat_urllib_parse,
  52     compat_urllib_parse_urlencode,
  53     compat_urllib_parse_urlparse,
  54     compat_urllib_parse_unquote_plus,
  55     compat_urllib_request,
  56     compat_urlparse,
  57     compat_xpath,
  58 )
  59
  60 from .socks import (
  61     ProxyType,
  62     sockssocket,
  63 )
  64
  65
  66 def register_socks_protocols():
  67     # "Register" SOCKS protocols
  68     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  69     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  70     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  71         if scheme not in compat_urlparse.uses_netloc:
  72             compat_urlparse.uses_netloc.append(scheme)
  73
  74
# The re module does not expose the type of compiled pattern objects under a
# clearly defined name, so derive it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers. YoutubeDLHandler.http_request adds each of
# these to an outgoing request that does not already carry that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
  85
  86
# Unique sentinel meaning "the caller supplied no default". The xpath_*
# helpers compare against it by identity, so None stays a usable default.
NO_DEFAULT = object()

# English month names in calendar order (index 0 is January).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# File extensions (without the leading dot) of media containers, audio
# formats and streaming manifests.
# NOTE(review): the consumers of this tuple are not visible in this part of
# the file.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. Most replacements
# are a single character; the bracketed list items ('AE', 'OE', 'ss', 'ae',
# 'oe') are the multi-character ones, which is why itertools.chain is used
# to interleave plain strings with one-element lists.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

# strptime-style format strings that are unambiguous about day/month order.
# NOTE(review): presumably tried one after another by the date-parsing
# helpers elsewhere in this module - confirm against those callers.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%b %d %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
)

# DATE_FORMATS plus the ambiguous numeric formats, read as day-before-month.
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# DATE_FORMATS plus the ambiguous numeric formats, read as month-before-day.
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
 155
 156
 157 def preferredencoding():
 158     """Get preferred encoding.
 159
 160     Returns the best encoding scheme for the system, based on
 161     locale.getpreferredencoding() and some further tweaks.
 162     """
 163     try:
 164         pref = locale.getpreferredencoding()
 165         'TEST'.encode(pref)
 166     except Exception:
 167         pref = 'UTF-8'
 168
 169     return pref
 170
 171
 172 def write_json_file(obj, fn):
 173     """ Encode obj as JSON and write it to fn, atomically if possible """
 174
 175     fn = encodeFilename(fn)
 176     if sys.version_info < (3, 0) and sys.platform != 'win32':
 177         encoding = get_filesystem_encoding()
 178         # os.path.basename returns a bytes object, but NamedTemporaryFile
 179         # will fail if the filename contains non ascii characters unless we
 180         # use a unicode object
 181         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 182         # the same for os.path.dirname
 183         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 184     else:
 185         path_basename = os.path.basename
 186         path_dirname = os.path.dirname
 187
 188     args = {
 189         'suffix': '.tmp',
 190         'prefix': path_basename(fn) + '.',
 191         'dir': path_dirname(fn),
 192         'delete': False,
 193     }
 194
 195     # In Python 2.x, json.dump expects a bytestream.
 196     # In Python 3.x, it writes to a character stream
 197     if sys.version_info < (3, 0):
 198         args['mode'] = 'wb'
 199     else:
 200         args.update({
 201             'mode': 'w',
 202             'encoding': 'utf-8',
 203         })
 204
 205     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 206
 207     try:
 208         with tf:
 209             json.dump(obj, tf)
 210         if sys.platform == 'win32':
 211             # Need to remove existing file on Windows, else os.rename raises
 212             # WindowsError or FileExistsError.
 213             try:
 214                 os.unlink(fn)
 215             except OSError:
 216                 pass
 217         os.rename(tf.name, fn)
 218     except Exception:
 219         try:
 220             os.remove(tf.name)
 221         except OSError:
 222             pass
 223         raise
 224
 225
 226 if sys.version_info >= (2, 7):
 227     def find_xpath_attr(node, xpath, key, val=None):
 228         """ Find the xpath xpath[@key=val] """
 229         assert re.match(r'^[a-zA-Z_-]+$', key)
 230         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 231         return node.find(expr)
 232 else:
 233     def find_xpath_attr(node, xpath, key, val=None):
 234         for f in node.findall(compat_xpath(xpath)):
 235             if key not in f.attrib:
 236                 continue
 237             if val is None or f.attrib.get(key) == val:
 238                 return f
 239         return None
 240
 241 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 242 # the namespace parameter
 243
 244
 245 def xpath_with_ns(path, ns_map):
 246     components = [c.split(':') for c in path.split('/')]
 247     replaced = []
 248     for c in components:
 249         if len(c) == 1:
 250             replaced.append(c[0])
 251         else:
 252             ns, tag = c
 253             replaced.append('{%s}%s' % (ns_map[ns], tag))
 254     return '/'.join(replaced)
 255
 256
 257 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 258     def _find_xpath(xpath):
 259         return node.find(compat_xpath(xpath))
 260
 261     if isinstance(xpath, (str, compat_str)):
 262         n = _find_xpath(xpath)
 263     else:
 264         for xp in xpath:
 265             n = _find_xpath(xp)
 266             if n is not None:
 267                 break
 268
 269     if n is None:
 270         if default is not NO_DEFAULT:
 271             return default
 272         elif fatal:
 273             name = xpath if name is None else name
 274             raise ExtractorError('Could not find XML element %s' % name)
 275         else:
 276             return None
 277     return n
 278
 279
 280 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 281     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 282     if n is None or n == default:
 283         return n
 284     if n.text is None:
 285         if default is not NO_DEFAULT:
 286             return default
 287         elif fatal:
 288             name = xpath if name is None else name
 289             raise ExtractorError('Could not find XML element\'s text %s' % name)
 290         else:
 291             return None
 292     return n.text
 293
 294
 295 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 296     n = find_xpath_attr(node, xpath, key)
 297     if n is None:
 298         if default is not NO_DEFAULT:
 299             return default
 300         elif fatal:
 301             name = '%s[@%s]' % (xpath, key) if name is None else name
 302             raise ExtractorError('Could not find XML attribute %s' % name)
 303         else:
 304             return None
 305     return n.attrib[key]
 306
 307
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document.

    Delegates to get_element_by_attribute with the attribute name 'id' and
    returns whatever that call returns.
    """
    return get_element_by_attribute('id', id, html)
 311
 312
 313 def get_element_by_attribute(attribute, value, html):
 314     """Return the content of the tag with the specified attribute in the passed HTML document"""
 315
 316     m = re.search(r'''(?xs)
 317         <([a-zA-Z0-9:._-]+)
 318          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 319          \s+%s=['"]?%s['"]?
 320          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 321         \s*>
 322         (?P<content>.*?)
 323         </\1>
 324     ''' % (re.escape(attribute), re.escape(value)), html)
 325
 326     if not m:
 327         return None
 328     res = m.group('content')
 329
 330     if res.startswith('"') or res.startswith("'"):
 331         res = res[1:-1]
 332
 333     return unescapeHTML(res)
 334
 335
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element

    After feeding markup, the attributes of the most recently seen start
    tag are available as the dict self.attrs (empty if no start tag was
    seen).
    """
    def __init__(self):
        # Attribute name -> value of the last start tag handled.
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # attrs arrives as (name, value) pairs; each start tag replaces the
        # previously stored attributes rather than merging with them.
        self.attrs = dict(attrs)
 344
 345
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&#98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    # A fresh parser per call; it keeps the attributes of the last start tag
    # it sees, so html_element is expected to hold a single element.
    parser = HTMLAttributeParser()
    parser.feed(html_element)
    parser.close()
    return parser.attrs
 366
 367
 368 def clean_html(html):
 369     """Clean an HTML snippet into a readable string"""
 370
 371     if html is None:  # Convenience for sanitizing descriptions etc.
 372         return html
 373
 374     # Newline vs <br />
 375     html = html.replace('\n', ' ')
 376     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 377     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 378     # Strip html tags
 379     html = re.sub('<.*?>', '', html)
 380     # Replace html entities
 381     html = unescapeHTML(html)
 382     return html.strip()
 383
 384
 385 def sanitize_open(filename, open_mode):
 386     """Try to open the given filename, and slightly tweak it if this fails.
 387
 388     Attempts to open the given filename. If this fails, it tries to change
 389     the filename slightly, step by step, until it's either able to open it
 390     or it fails and raises a final exception, like the standard open()
 391     function.
 392
 393     It returns the tuple (stream, definitive_file_name).
 394     """
 395     try:
 396         if filename == '-':
 397             if sys.platform == 'win32':
 398                 import msvcrt
 399                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 400             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 401         stream = open(encodeFilename(filename), open_mode)
 402         return (stream, filename)
 403     except (IOError, OSError) as err:
 404         if err.errno in (errno.EACCES,):
 405             raise
 406
 407         # In case of error, try to remove win32 forbidden chars
 408         alt_filename = sanitize_path(filename)
 409         if alt_filename == filename:
 410             raise
 411         else:
 412             # An exception here should be caught in the caller
 413             stream = open(encodeFilename(alt_filename), open_mode)
 414             return (stream, alt_filename)
 415
 416
 417 def timeconvert(timestr):
 418     """Convert RFC 2822 defined time string into system timestamp"""
 419     timestamp = None
 420     timetuple = email.utils.parsedate_tz(timestr)
 421     if timetuple is not None:
 422         timestamp = email.utils.mktime_tz(timetuple)
 423     return timestamp
 424
 425
 426 def sanitize_filename(s, restricted=False, is_id=False):
 427     """Sanitizes a string so it could be used as part of a filename.
 428     If restricted is set, use a stricter subset of allowed characters.
 429     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 430     """
 431     def replace_insane(char):
 432         if restricted and char in ACCENT_CHARS:
 433             return ACCENT_CHARS[char]
 434         if char == '?' or ord(char) < 32 or ord(char) == 127:
 435             return ''
 436         elif char == '"':
 437             return '' if restricted else '\''
 438         elif char == ':':
 439             return '_-' if restricted else ' -'
 440         elif char in '\\/|*<>':
 441             return '_'
 442         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 443             return '_'
 444         if restricted and ord(char) > 127:
 445             return '_'
 446         return char
 447
 448     # Handle timestamps
 449     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 450     result = ''.join(map(replace_insane, s))
 451     if not is_id:
 452         while '__' in result:
 453             result = result.replace('__', '_')
 454         result = result.strip('_')
 455         # Common case of "Foreign band name - English song title"
 456         if restricted and result.startswith('-_'):
 457             result = result[2:]
 458         if result.startswith('-'):
 459             result = '_' + result[len('-'):]
 460         result = result.lstrip('.')
 461         if not result:
 462             result = '_'
 463     return result
 464
 465
 466 def sanitize_path(s):
 467     """Sanitizes and normalizes path on Windows"""
 468     if sys.platform != 'win32':
 469         return s
 470     drive_or_unc, _ = os.path.splitdrive(s)
 471     if sys.version_info < (2, 7) and not drive_or_unc:
 472         drive_or_unc, _ = os.path.splitunc(s)
 473     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 474     if drive_or_unc:
 475         norm_path.pop(0)
 476     sanitized_path = [
 477         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
 478         for path_part in norm_path]
 479     if drive_or_unc:
 480         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 481     return os.path.join(*sanitized_path)
 482
 483
 484 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
 485 # unwanted failures due to missing protocol
 486 def sanitize_url(url):
 487     return 'http:%s' % url if url.startswith('//') else url
 488
 489
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request for url after passing it through sanitize_url.

    All further positional and keyword arguments are forwarded unchanged to
    compat_urllib_request.Request.
    """
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
 492
 493
 494 def orderedSet(iterable):
 495     """ Remove all duplicates from the input iterable """
 496     res = []
 497     for el in iterable:
 498         if el not in res:
 499             res.append(el)
 500     return res
 501
 502
 503 def _htmlentity_transform(entity_with_semicolon):
 504     """Transforms an HTML entity to a character."""
 505     entity = entity_with_semicolon[:-1]
 506
 507     # Known non-numeric HTML entity
 508     if entity in compat_html_entities.name2codepoint:
 509         return compat_chr(compat_html_entities.name2codepoint[entity])
 510
 511     # TODO: HTML5 allows entities without a semicolon. For example,
 512     # '&Eacuteric' should be decoded as 'Éric'.
 513     if entity_with_semicolon in compat_html_entities_html5:
 514         return compat_html_entities_html5[entity_with_semicolon]
 515
 516     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 517     if mobj is not None:
 518         numstr = mobj.group(1)
 519         if numstr.startswith('x'):
 520             base = 16
 521             numstr = '0%s' % numstr
 522         else:
 523             base = 10
 524         # See https://github.com/rg3/youtube-dl/issues/7518
 525         try:
 526             return compat_chr(int(numstr, base))
 527         except ValueError:
 528             pass
 529
 530     # Unknown entity in name, return its literal representation
 531     return '&%s;' % entity
 532
 533
 534 def unescapeHTML(s):
 535     if s is None:
 536         return None
 537     assert type(s) == compat_str
 538
 539     return re.sub(
 540         r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 541
 542
 543 def get_subprocess_encoding():
 544     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 545         # For subprocess calls, encode with locale encoding
 546         # Refer to http://stackoverflow.com/a/9951851/35070
 547         encoding = preferredencoding()
 548     else:
 549         encoding = sys.getfilesystemencoding()
 550     if encoding is None:
 551         encoding = 'utf-8'
 552     return encoding
 553
 554
 555 def encodeFilename(s, for_subprocess=False):
 556     """
 557     @param s The name of the file
 558     """
 559
 560     assert type(s) == compat_str
 561
 562     # Python 3 has a Unicode API
 563     if sys.version_info >= (3, 0):
 564         return s
 565
 566     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 567     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 568     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 569     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 570         return s
 571
 572     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 573     if sys.platform.startswith('java'):
 574         return s
 575
 576     return s.encode(get_subprocess_encoding(), 'ignore')
 577
 578
 579 def decodeFilename(b, for_subprocess=False):
 580
 581     if sys.version_info >= (3, 0):
 582         return b
 583
 584     if not isinstance(b, bytes):
 585         return b
 586
 587     return b.decode(get_subprocess_encoding(), 'ignore')
 588
 589
 590 def encodeArgument(s):
 591     if not isinstance(s, compat_str):
 592         # Legacy code that uses byte strings
 593         # Uncomment the following line after fixing all post processors
 594         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 595         s = s.decode('ascii')
 596     return encodeFilename(s, True)
 597
 598
def decodeArgument(b):
    """Decode a subprocess argument; calls decodeFilename with for_subprocess=True."""
    return decodeFilename(b, True)
 601
 602
 603 def decodeOption(optval):
 604     if optval is None:
 605         return optval
 606     if isinstance(optval, bytes):
 607         optval = optval.decode(preferredencoding())
 608
 609     assert isinstance(optval, compat_str)
 610     return optval
 611
 612
 613 def formatSeconds(secs):
 614     if secs > 3600:
 615         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 616     elif secs > 60:
 617         return '%d:%02d' % (secs // 60, secs % 60)
 618     else:
 619         return '%d' % secs
 620
 621
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler configured from params.

    Only the 'nocheckcertificate' entry of params (default False) is read
    here; params itself and any extra keyword arguments are forwarded to
    YoutubeDLHTTPSHandler. When an SSL context is created by this function,
    certificate verification is disabled on it if 'nocheckcertificate' is
    true. On Python < 3.2 without a usable context= parameter no context is
    created and the option is not applied here.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname is switched off before verify_mode is set to
            # CERT_NONE.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            # Fall through to the version-based construction below.
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python 3.2+ without create_default_context (i.e. < 3.4), or the TypeError fallback above
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 645
 646
 647 def bug_reports_message():
 648     if ytdl_is_updateable():
 649         update_cmd = 'type  youtube-dl -U  to update'
 650     else:
 651         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 652     msg = '; please report this issue on https://yt-dl.org/bug .'
 653     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 654     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 655     return msg
 656
 657
 658 class ExtractorError(Exception):
 659     """Error during info extraction."""
 660
 661     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 662         """ tb, if given, is the original traceback (so that it can be printed out).
 663         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 664         """
 665
 666         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 667             expected = True
 668         if video_id is not None:
 669             msg = video_id + ': ' + msg
 670         if cause:
 671             msg += ' (caused by %r)' % cause
 672         if not expected:
 673             msg += bug_reports_message()
 674         super(ExtractorError, self).__init__(msg)
 675
 676         self.traceback = tb
 677         self.exc_info = sys.exc_info()  # preserve original exception
 678         self.cause = cause
 679         self.video_id = video_id
 680
 681     def format_traceback(self):
 682         if self.traceback is None:
 683             return None
 684         return ''.join(traceback.format_tb(self.traceback))
 685
 686
 687 class UnsupportedError(ExtractorError):
 688     def __init__(self, url):
 689         super(UnsupportedError, self).__init__(
 690             'Unsupported URL: %s' % url, expected=True)
 691         self.url = url
 692
 693
 694 class RegexNotFoundError(ExtractorError):
 695     """Error when a regex didn't match"""
 696     pass
 697
 698
 699 class DownloadError(Exception):
 700     """Download Error exception.
 701
 702     This exception may be thrown by FileDownloader objects if they are not
 703     configured to continue on errors. They will contain the appropriate
 704     error message.
 705     """
 706
 707     def __init__(self, msg, exc_info=None):
 708         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 709         super(DownloadError, self).__init__(msg)
 710         self.exc_info = exc_info
 711
 712
 713 class SameFileError(Exception):
 714     """Same File exception.
 715
 716     This exception will be thrown by FileDownloader objects if they detect
 717     multiple files would have to be downloaded to the same file on disk.
 718     """
 719     pass
 720
 721
 722 class PostProcessingError(Exception):
 723     """Post Processing exception.
 724
 725     This exception may be raised by PostProcessor's .run() method to
 726     indicate an error in the postprocessing task.
 727     """
 728
 729     def __init__(self, msg):
 730         self.msg = msg
 731
 732
 733 class MaxDownloadsReached(Exception):
 734     """ --max-downloads limit has been reached. """
 735     pass
 736
 737
 738 class UnavailableVideoError(Exception):
 739     """Unavailable Format exception.
 740
 741     This exception will be thrown when a video is requested
 742     in a format that is not available for that video.
 743     """
 744     pass
 745
 746
 747 class ContentTooShortError(Exception):
 748     """Content Too Short exception.
 749
 750     This exception may be raised by FileDownloader objects when a file they
 751     download is too small for what the server announced first, indicating
 752     the connection was probably interrupted.
 753     """
 754
 755     def __init__(self, downloaded, expected):
 756         # Both in bytes
 757         self.downloaded = downloaded
 758         self.expected = expected
 759
 760
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class(*args, **kwargs) and bind it to the configured source address.

    ydl_handler is the urllib handler whose _params mapping is consulted for
    'source_address'. is_https tells the Python 2.6 fallback whether the
    socket it opens has to be wrapped in TLS. Returns the connection object.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        # The key is a bytes literal because this module enables
        # unicode_literals; presumably Python 2 needs byte-string keyword
        # names here.
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        # Port 0: only the local address is pinned here.
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address support on the connection object: replace
            # its connect() with one that opens the socket from sa itself.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            # Bind hc as `self`, since the function is attached to the
            # instance rather than to the class.
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
 786
 787
 788 def handle_youtubedl_headers(headers):
 789     filtered_headers = headers
 790
 791     if 'Youtubedl-no-compression' in filtered_headers:
 792         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 793         del filtered_headers['Youtubedl-no-compression']
 794
 795     return filtered_headers
 796
 797
 798 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 799     """Handler for HTTP requests and responses.
 800
 801     This class, when installed with an OpenerDirector, automatically adds
 802     the standard headers to every HTTP request and handles gzipped and
 803     deflated responses from web servers. If compression is to be avoided in
 804     a particular request, the original request in the program code only has
 805     to include the HTTP header "Youtubedl-no-compression", which will be
 806     removed before making the real request.
 807
 808     Part of this code was copied from:
 809
 810     http://techknack.net/python-urllib2-handlers/
 811
 812     Andrew Rowls, the author of that code, agreed to release it to the
 813     public domain.
 814     """
 815
    def __init__(self, params, *args, **kwargs):
        """Initialise the base HTTPHandler with *args/**kwargs and keep params.

        params is stored as self._params; _create_http_connection reads its
        'source_address' entry when opening connections for this handler.
        """
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params
 819
 820     def http_open(self, req):
 821         conn_class = compat_http_client.HTTPConnection
 822
 823         socks_proxy = req.headers.get('Ytdl-socks-proxy')
 824         if socks_proxy:
 825             conn_class = make_socks_conn_class(conn_class, socks_proxy)
 826             del req.headers['Ytdl-socks-proxy']
 827
 828         return self.do_open(functools.partial(
 829             _create_http_connection, self, conn_class, False),
 830             req)
 831
 832     @staticmethod
 833     def deflate(data):
 834         try:
 835             return zlib.decompress(data, -zlib.MAX_WBITS)
 836         except zlib.error:
 837             return zlib.decompress(data)
 838
 839     @staticmethod
 840     def addinfourl_wrapper(stream, headers, url, code):
 841         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 842             return compat_urllib_request.addinfourl(stream, headers, url, code)
 843         ret = compat_urllib_request.addinfourl(stream, headers, url)
 844         ret.code = code
 845         return ret
 846
    def http_request(self, req):
        """Pre-process an outgoing request and return the request to send.

        Percent-encodes the URL when needed, adds every std_headers entry
        the request does not already carry, and applies
        handle_youtubedl_headers to the headers. The returned object may be
        a new request when the URL had to be escaped.
        """
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        # Honour the internal 'Youtubedl-no-compression' marker header.
        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 does not handle URL fragments properly, so cut
            # everything from '#' on out of the request's private URL fields.
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req
 877
    def http_response(self, req, resp):
        """Post-process a response: decompress its body and escape redirect URLs.

        Returns resp itself, or a replacement built with
        self.addinfourl_wrapper when the body was gzip- or deflate-encoded.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with the last 1 to 1023 bytes cut off until one attempt works
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # None of the trimmed variants could be decompressed either
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            # The new body is already decompressed
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    # Undo the iso-8859-1 decoding and reinterpret the bytes as UTF-8
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        # Python 2 gets the header value back as a UTF-8 byte string
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp
 924
 925     https_request = http_request
 926     https_response = http_response
 927
 928
 929 def make_socks_conn_class(base_class, socks_proxy):
 930     assert issubclass(base_class, (
 931         compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
 932
 933     url_components = compat_urlparse.urlparse(socks_proxy)
 934     if url_components.scheme.lower() == 'socks5':
 935         socks_type = ProxyType.SOCKS5
 936     elif url_components.scheme.lower() in ('socks', 'socks4'):
 937         socks_type = ProxyType.SOCKS4
 938     elif url_components.scheme.lower() == 'socks4a':
 939         socks_type = ProxyType.SOCKS4A
 940
 941     def unquote_if_non_empty(s):
 942         if not s:
 943             return s
 944         return compat_urllib_parse_unquote_plus(s)
 945
 946     proxy_args = (
 947         socks_type,
 948         url_components.hostname, url_components.port or 1080,
 949         True,  # Remote DNS
 950         unquote_if_non_empty(url_components.username),
 951         unquote_if_non_empty(url_components.password),
 952     )
 953
 954     class SocksConnection(base_class):
 955         def connect(self):
 956             self.sock = sockssocket()
 957             self.sock.setproxy(*proxy_args)
 958             if type(self.timeout) in (int, float):
 959                 self.sock.settimeout(self.timeout)
 960             self.sock.connect((self.host, self.port))
 961
 962             if isinstance(self, compat_http_client.HTTPSConnection):
 963                 if hasattr(self, '_context'):  # Python > 2.6
 964                     self.sock = self._context.wrap_socket(
 965                         self.sock, server_hostname=self.host)
 966                 else:
 967                     self.sock = ssl.wrap_socket(self.sock)
 968
 969     return SocksConnection
 970
 971
 972 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 973     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 974         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 975         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 976         self._params = params
 977
 978     def https_open(self, req):
 979         kwargs = {}
 980         conn_class = self._https_conn_class
 981
 982         if hasattr(self, '_context'):  # python > 2.6
 983             kwargs['context'] = self._context
 984         if hasattr(self, '_check_hostname'):  # python 3.x
 985             kwargs['check_hostname'] = self._check_hostname
 986
 987         socks_proxy = req.headers.get('Ytdl-socks-proxy')
 988         if socks_proxy:
 989             conn_class = make_socks_conn_class(conn_class, socks_proxy)
 990             del req.headers['Ytdl-socks-proxy']
 991
 992         return self.do_open(functools.partial(
 993             _create_http_connection, self, conn_class, True),
 994             req, **kwargs)
 995
 996
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor whose handlers are also registered for HTTPS."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        """Hand the response to HTTPCookieProcessor.http_response unchanged.

        The Set-Cookie escaping workaround described below is currently
        disabled (commented out).
        """
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # HTTPS traffic goes through the same cookie handling as HTTP
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1019
1020
def extract_timezone(date_str):
    """Split a trailing UTC offset off date_str.

    Recognizes a final 'Z' or a final '+HH:MM' / '-HHMM' style offset
    (optionally preceded by a space) after at least 8 other characters.
    Returns a tuple (offset as datetime.timedelta, date_str without the
    offset); when nothing is recognized the offset is zero and date_str
    is returned unchanged.
    """
    tz_match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not tz_match:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(tz_match.group('tz'))]
    sign_char = tz_match.group('sign')
    if not sign_char:
        # A bare 'Z' means UTC
        return datetime.timedelta(), remainder

    direction = 1 if sign_char == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(tz_match.group('hours')),
        minutes=direction * int(tz_match.group('minutes')))
    return offset, remainder
1037
1038
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    delimiter is the separator between the date and the time part.
    timezone is the UTC offset as a datetime.timedelta; when it is None the
    offset is taken from date_str itself. Fractional seconds are dropped.
    Returns None if date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # strptime's %S does not accept fractions of a second
    without_fraction = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, without_fraction = extract_timezone(without_fraction)

    layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    try:
        utc_time = datetime.datetime.strptime(without_fraction, layout) - timezone
        return calendar.timegm(utc_time.timetuple())
    except ValueError:
        return None
1056
1057
def date_formats(day_first=True):
    """Return the strptime formats to try, day-first or month-first."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1060
1061
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None if date_str is None or could not be parsed.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    result = None
    for fmt in date_formats(day_first):
        try:
            # No early exit: if several formats match, the last one wins
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to the RFC 2822 style parser
        parsed_tuple = email.utils.parsedate_tz(cleaned)
        if parsed_tuple:
            try:
                result = datetime.datetime(*parsed_tuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return compat_str(result)
1088
1089
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp for a date string in one of many formats.

    The formats from date_formats(day_first) are tried first, then the
    RFC 2822 style parser email.utils.parsedate_tz. A trailing numeric UTC
    offset and an AM/PM marker are taken into account.
    Returns None if date_str is None or could not be parsed.
    """
    if date_str is None:
        return None

    date_str = date_str.replace(',', ' ')

    pm_delta = datetime.timedelta(hours=12 if re.search(r'(?i)PM', date_str) else 0)
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + pm_delta
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # parsedate_tz returns a plain tuple (there is no .timetuple() on it);
        # calendar.timegm only looks at its first six fields.
        # Apply the same corrections as in the strptime path above; the
        # seconds are computed by hand since timedelta.total_seconds() is
        # not available on Python 2.6.
        adjustment = pm_delta - timezone
        return (calendar.timegm(timetuple) +
                adjustment.days * 24 * 60 * 60 + adjustment.seconds)
1111
1112
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url, falling back to default_ext.

    The query string is ignored. Whatever follows the last dot is used if
    it is purely alphanumeric, or if it is a known extension followed by
    slashes.
    """
    if url is None:
        return default_ext

    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1124
1125
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1128
1129
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9]+(day|week|month|year)(s)?

    'now', 'today' and 'yesterday' are accepted on their own as well.
    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError if date_str is in none of these formats."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' inside a plain literal is an invalid escape sequence
    match = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta expects the plural keyword (days, weeks)
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1157
1158
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1167
1168
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A bound that is left out defaults to the earliest / latest date
        that datetime can represent. Raises ValueError if start is after end.
        """
        if start is None:
            self.start = datetime.datetime.min.date()
        else:
            self.start = date_from_str(start)
        if end is None:
            self.end = datetime.datetime.max.date()
        else:
            self.end = date_from_str(end)
        if self.end < self.start:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not a date already is parsed like the bounds are
        if isinstance(date, datetime.date):
            candidate = date
        else:
            candidate = date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1198
1199
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Byte strings are decoded with the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1208
1209
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps the file descriptors of stdout (1) and stderr (2) to the
    # arguments GetStdHandle expects (STD_OUTPUT_HANDLE, STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        # Neither stdout nor stderr; let the caller write it normally
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for invalid handles, non-character devices and handles that
        # GetConsoleMode rejects
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane,
        # or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters at a time, stopping before any non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written on its own with a length of 2
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1283
1284
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On Windows the console API is tried first unless an encoding is given.
    Otherwise s is encoded (dropping unencodable characters) for binary
    streams, Python 2 and streams with a .buffer, and written as text to
    anything else.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    may_use_console = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if may_use_console and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    needs_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if needs_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        target_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1305
1306
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of ints ([] for empty input)."""
    if not bs:
        return []
    first = bs[0]
    if isinstance(first, int):  # Python 3
        return list(bs)
    # Python 2: indexing gives one-character strings
    return [ord(char) for char in bs]
1314
1315
def intlist_to_bytes(xs):
    """Pack a sequence of ints (one unsigned byte each) into a byte string."""
    if xs:
        byte_format = '%dB' % len(xs)
        return compat_struct_pack(byte_format, *xs)
    return b''
1320
1321
# Cross-platform file locking
# Each branch below defines _lock_file(f, exclusive) and _unlock_file(f)
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high DWORD of the byte count passed to LockFileEx/UnlockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can pass the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # Flag 0x2 requests an exclusive lock, 0x0 a shared one
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock on the open file f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock held on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            """Always raises IOError: locking is unavailable without fcntl."""
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            """Always raises IOError: locking is unavailable without fcntl."""
            raise IOError(UNSUPPORTED_MSG)
1395
1396
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened on construction; entering takes a shared lock for
    mode 'r' and an exclusive one for 'a' and 'w'. Leaving the block unlocks
    and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows implementation of _lock_file raises OSError, which
            # is a separate class from IOError on Python 2; close the file in
            # both cases so the handle does not leak
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1426
1427
def get_filesystem_encoding():
    """Return the filesystem encoding, or 'utf-8' if Python reports none."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1431
1432
def shell_quote(args):
    """Return args joined into one shell-quoted command line string.

    Byte string arguments are decoded with the filesystem encoding first.
    """
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # compat_shlex_quote is already imported by this module; pipes.quote
        # is deprecated and the pipes module is gone in recent Python versions
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
1442
1443
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized as JSON and appended as a URL-encoded fragment.
    """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse_urlencode(payload)
    return url + '#' + fragment
1450
1451
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    URLs without smuggled data are returned unchanged, paired with default.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    serialized = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(serialized)
1459
1460
def format_bytes(bytes):
    """Format a byte count with a binary suffix, e.g. 1536 -> '1.50KiB'.

    bytes may be a number, a numeric string or None (which gives 'N/A').
    Counts beyond the largest known suffix are expressed in that suffix
    instead of failing.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        # Clamp so that huge values cannot index past the suffix list
        exponent = min(int(math.log(bytes, 1024.0)), len(suffixes) - 1)
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1473
1474
def lookup_unit_table(unit_table, s):
    """Parse '<number> <unit>' at the start of s and scale it by unit_table.

    unit_table maps unit names to multipliers. The number may use '.' or
    ',' as decimal separator. Returns the product as an int, or None if s
    does not start with a number followed by a known unit.
    """
    alternatives = '|'.join(re.escape(unit) for unit in unit_table)
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives
    found = re.match(pattern, s)
    if not found:
        return None
    number = float(found.group('num').replace(',', '.'))
    multiplier = unit_table[found.group('unit')]
    return int(number * multiplier)
1484
1485
def parse_filesize(s):
    """Parse a size such as '1.5 MiB' or '10KB' into a number of bytes.

    Returns None if s is None or is not recognized.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        unit_table[prefix + 'iB'] = binary
        unit_table[prefix + 'B'] = decimal
        unit_table[prefix.lower() + 'B'] = binary
        unit_table[prefix + 'b'] = decimal

    return lookup_unit_table(unit_table, s)
1530
1531
def parse_count(s):
    """Parse a count such as '1,000', '1.2k' or '3M' into an int.

    Returns None if s is None or is not recognized.
    """
    if s is None:
        return None

    text = s.strip()

    # Digits with separators only: no unit to look up
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    multipliers = {}
    for units, factor in ((('k', 'K'), 1000), (('m', 'M', 'kk', 'KK'), 1000 ** 2)):
        for unit in units:
            multipliers[unit] = factor

    return lookup_unit_table(multipliers, text)
1551
1552
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name

    Returns None for unknown names.
    """
    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1560
1561
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations

    The abbreviation is the first three letters of the month name.
    Returns None for unknown abbreviations.
    """
    for number, month_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month_name[:3] == abbrev:
            return number
    return None
1570
1571
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the predefined entities or a
    numeric character reference are left alone.
    """
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1578
1579
def setproctitle(title):
    """Set the process name to title via prctl() from libc.so.6.

    Does nothing on Jython, when libc.so.6 cannot be loaded or when the
    loaded library has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    # Initialising the buffer from the bytes makes it one item larger than
    # the title, so the name handed to prctl is NUL-terminated
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1599
1600
def remove_start(s, start):
    """Return s without the prefix start, or s unchanged (None stays None)."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
1603
1604
def remove_end(s, end):
    """Return s without the suffix end, or s unchanged (None stays None)."""
    # An empty suffix must be skipped explicitly: s[:-0] is the empty string
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
1607
1608
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s."""
    if s is None or len(s) < 2:
        return s
    opening, closing = s[0], s[-1]
    if opening == closing and opening in ('"', "'"):
        return s[1:-1]
    return s
1616
1617
def url_basename(url):
    """Return the last segment of the path of url ('' if the path is empty)."""
    url_path = compat_urlparse.urlparse(url).path
    segments = url_path.strip('/').split('/')
    return segments[-1]
1621
1622
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
1626
1627
class PUTRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
1631
1632
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, returning default if that is not possible.

    If get_attr is given, the attribute of that name is read from v first.
    The result is int(v) * invscale // scale. None, the empty string and
    values int() cannot handle all give default.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # int() raises TypeError rather than ValueError for e.g. lists and dicts
        return default
1645
1646
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default if v is None."""
    if v is None:
        return default
    return compat_str(v)
1649
1650
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are dropped before the conversion.
    Returns None for None; other unparsable input raises ValueError.
    """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1657
1658
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, returning default if that is not possible.

    The result is float(v) * invscale / scale. None and values float()
    cannot handle give default.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # float() raises TypeError rather than ValueError for e.g. lists and dicts
        return default
1666
1667
def strip_or_none(v):
    """Return v.strip(), or None if v is None."""
    if v is None:
        return None
    return v.strip()
1670
1671
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Three notations are tried in turn: colon separated
    ([[[DD:]HH:]MM:]SS[.ms]), unit suffixed (e.g. '1h 2m 3s', optionally
    prefixed with T or PT) and a single fractional amount of hours or
    minutes (e.g. '1.5 hours').
    Returns None if s is not a string or matches none of them.
    """
    if not isinstance(s, compat_basestring):
        return None

    text = s.strip()

    days = hours = mins = secs = ms = None
    found = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', text)
    if not found:
        found = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', text)
    if found:
        # The first two notations capture the same five groups
        days, hours, mins, secs, ms = found.groups()
    else:
        found = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', text)
        if not found:
            return None
        hours, mins = found.groups()

    total = 0
    if secs:
        total += float(secs)
    if mins:
        total += float(mins) * 60
    if hours:
        total += float(hours) * 60 * 60
    if days:
        total += float(days) * 24 * 60 * 60
    if ms:
        # ms still carries its leading dot, so this adds the fraction of a second
        total += float(ms)
    return total
1718
1719
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension of filename.

    If expected_real_ext is given and the real extension differs from it,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1726
1727
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of filename with ext.

    If expected_real_ext is given and the real extension differs from it,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        base = filename
    else:
        base = name
    return '{0}.{1}'.format(base, ext)
1733
1734
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False if the binary could not be started."""
    try:
        process = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Wait for the process to finish; its output is discarded
        process.communicate()
    except OSError:
        return False
    return exe
1743
1744
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    The executable is run with args; its combined stdout and stderr is
    searched with detect_exe_version."""
    try:
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1758
1759
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of an executable.

    version_re must contain one group capturing the version; by default
    the word following 'version' is used. Returns unrecognized if the
    pattern does not match.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    if not found:
        return unrecognized
    return found.group(1)
1769
1770
class PagedList(object):
    """Base class for paged lists; subclasses must provide getslice()."""

    def __len__(self):
        # This is only useful for tests: it materializes the whole list
        entries = self.getslice()
        return len(entries)
1775
1776
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages one at a time, only as a slice needs them.

    pagefunc is called with a zero-based page number and must return an
    iterable with that page's entries; pagesize is the number of entries on
    a full page.  When use_cache is true, fetched pages are stored in a dict
    keyed by page number and reused by later getslice() calls.
    """

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list.

        With end=None pages are fetched until a page shorter than pagesize
        is encountered.
        """
        res = []
        # Start at the page that contains index `start`
        for pagenum in itertools.count(start // self._pagesize):
            # Global index range covered by this page: [firstid, nextfirstid)
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                # Store the full, unsliced page
                self._cache[pagenum] = page_results

            # Offset of the first wanted entry inside this page (non-zero
            # only on the page that contains `start`)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset one past the last wanted entry inside this page, or None
            # when `end` does not fall within this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1827
1828
class InAdvancePagedList(PagedList):
    """Paged list whose number of pages is known in advance.

    pagefunc is called with a zero-based page number and must return an
    iterable with that page's entries.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list."""
        # Page holding `start`, and how many leading entries of it to drop
        first_page, offset = divmod(start, self._pagesize)
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start

        collected = []
        for pagenum in range(first_page, stop_page):
            entries = list(self._pagefunc(pagenum))
            if offset:
                # Only the first fetched page is trimmed at the front
                entries = entries[offset:]
                offset = None
            if remaining is not None:
                if len(entries) >= remaining:
                    # This page completes the requested slice
                    collected.extend(entries[:remaining])
                    break
                remaining -= len(entries)
            collected.extend(entries)
        return collected
1856
1857
def uppercase_escape(s):
    """Decode every backslash-U escape with 8 hex digits found in `s`."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        # The decoder returns (decoded_text, consumed_length)
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
1864
1865
def lowercase_escape(s):
    """Decode every backslash-u escape with 4 hex digits found in `s`."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        # The decoder returns (decoded_text, consumed_length)
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _expand, s)
1872
1873
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    on_python2 = sys.version_info < (3, 0)
    if on_python2 and isinstance(s, compat_str):
        # Python 2 text is encoded to UTF-8 bytes before quoting
        s = s.encode('utf-8')
    # Characters that are passed through unquoted
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1879
1880
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host is IDNA-encoded; every other component is percent-escaped
    escaped = parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment),
    )
    return escaped.geturl()
1891
1892
def read_batch_urls(batch_fd):
    """Read URLs from the file-like object `batch_fd`, one per line.

    Byte lines are decoded as UTF-8 (undecodable bytes are replaced).  A
    leading byte order mark and surrounding whitespace are removed.  Empty
    lines and lines starting with '#', ';' or ']' are skipped.  `batch_fd`
    is closed before returning.

    Returns the list of remaining URLs in file order.
    """
    # '\ufeff' is the BOM as it appears in correctly decoded text.  The
    # three-character form is what the UTF-8 BOM bytes look like when each
    # byte was decoded as a separate character; it is kept so input that
    # was handled before still is.
    BOMS = ('\ufeff', '\xef\xbb\xbf')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1907
1908
def urlencode_postdata(*args, **kargs):
    """URL-encode the given arguments and return the result as ASCII bytes.

    All arguments are forwarded unchanged to compat_urllib_parse_urlencode().
    """
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1911
1912
def update_url_query(url, query):
    """Return `url` with its query string updated from the mapping `query`.

    Keys in `query` replace same-named parameters already in the URL.  A
    false `query` returns `url` unchanged.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parts.query)
    merged.update(query)
    # Second argument True: sequence values become repeated parameters
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
1921
1922
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request object based on `req` with selected parts replaced.

    `headers` are merged over the original headers, `data` and `url`
    replace the original ones when true, and `query` is merged into the
    URL's query string.  The request class is chosen from the original
    request's method (HEAD, PUT, or the generic request class).
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)

    method = req.get_method()
    request_class = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(method, compat_urllib_request.Request)

    clone = request_class(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        clone.timeout = req.timeout
    return clone
1941
1942
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key out of several, in `d`.

    With a list or tuple of keys, the value of the first key that is
    present and not None is returned; when `skip_false_values` is true,
    false values (0, '', [], ...) are passed over as well.  `default` is
    returned when nothing qualifies.  With a single key this is just
    d.get(key, default).
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)

    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
1951
1952
def try_get(src, getter, expected_type=None):
    """Apply `getter` to `src`, returning None instead of failing.

    None is returned when the getter raises AttributeError, KeyError,
    TypeError or IndexError, or when `expected_type` is given and the
    obtained value is not an instance of it.
    """
    try:
        value = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is not None and not isinstance(value, expected_type):
        return None
    return value
1961
1962
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as compat_str, decoding it with `encoding` if needed.

    The default for `encoding` is evaluated once, when this function is
    defined.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1965
1966
# Maps a rating label to a numeric age limit; parse_age_limit() falls back
# to this table for inputs that are not plain numbers.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1974
1975
def parse_age_limit(s):
    """Convert an age limit string into an integer.

    Accepts one or two digits with an optional trailing '+' (e.g. '18+'),
    or a key of US_RATINGS.  Returns None for None and for anything else
    that is not recognized.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s)
1981
1982
def strip_jsonp(code):
    """Remove a JSONP wrapper of the form callback(...); around `code`.

    Input that does not match the wrapper pattern is returned unchanged.
    """
    jsonp_re = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_re, r'\1', code)
1986
1987
def js_to_json(code):
    """Convert a JavaScript object/array literal into JSON text.

    The following tokens are rewritten, everything else is left untouched:
    - single- or double-quoted strings become double-quoted JSON strings
    - bare identifiers and numeric keys are quoted
    - hexadecimal and octal integer literals become decimal
    - block comments and trailing commas before ']' or '}' are removed
    """
    INTEGER_TABLE = (
        (r'^0[xX][0-9a-fA-F]+', 16),
        (r'^0+[0-7]+', 8),
    )

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v == ',':
            return ""

        if v[0] in ("'", '"'):
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
            # A string literal stays a string even when its content looks
            # like a hex or octal number ("0x10", "017"); only unquoted
            # tokens may be converted to integers below.
            return '"%s"' % v

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(0), base)
                # A trailing ':' means the literal is an object key
                return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
2025
2026
def qualities(quality_ids):
    """Build a function that ranks a quality id by its position in `quality_ids`.

    The returned function gives the index of the id within `quality_ids`,
    or -1 for an id that is not listed.
    """
    def q(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q
2035
2036
# Default output template, using %-style named placeholders for the
# title, id and ext fields.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2038
2039
def limit_length(s, length):
    """Shorten `s` to at most `length` characters, ending it with '...'.

    Strings that already fit are returned unchanged; None stays None.
    """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    return s[:length - len(ELLIPSES)] + ELLIPSES
2048
2049
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of integers.

    Raises ValueError if a component is not an integer.
    """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2052
2053
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether `version` is older than `limit`.

    When `version` is empty or either version string cannot be parsed, the
    answer is the negation of `assume_new`.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum
2061
2062
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded by a zipimporter ...
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    # ... or when sys has a 'frozen' attribute
    return hasattr(sys, 'frozen')
2068
2069
def args_to_str(args):
    """Get a short string representation for a subprocess command.

    Each argument is shell-quoted and the results are joined with spaces.
    """
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2073
2074
def error_to_compat_str(err):
    """Return the message of `err` as a text string."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2082
2083
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    `mt` may carry parameters (e.g. 'text/vtt; charset=utf-8'); they are
    ignored.  Known types are translated through lookup tables; for any
    other type the subtype (the part after the last '/') is returned as is.
    Returns None when `mt` is None.
    """
    if mt is None:
        return None

    # Header values often carry parameters after ';' which are not part of
    # the type itself
    mt = mt.split(';', 1)[0].strip()

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
    }.get(res, res)
2110
2111
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of `url_handle`.

    The filename of an 'attachment' Content-Disposition header is tried
    first; otherwise the extension is derived from the Content-Type header.
    """
    getheader = url_handle.headers.get

    disposition = getheader('Content-Disposition')
    if disposition:
        found = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        ext = determine_ext(found.group('filename'), default_ext=None) if found else None
        if ext:
            return ext

    return mimetype2ext(getheader('Content-Type'))
2124
2125
def encode_data_uri(data, mime_type):
    """Build a base64 'data:' URI for the bytes `data` with type `mime_type`."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2128
2129
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # Nothing is blocked when no limit is set (age_limit is None) or when
    # the content is available for everyone (content_limit is None)
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2138
2139
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # The UTF-32 marks come before the UTF-16 ones because the UTF-16-LE
    # mark is a prefix of the UTF-32-LE mark
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]

    # Without a byte order mark the data is treated as UTF-8
    encoding, payload = 'utf-8', first_bytes
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, payload = enc, first_bytes[len(bom):]
            break
    text = payload.decode(encoding, 'replace')

    # The result is the match object (or None), not a strict boolean
    return re.match(r'^\s*<', text)
2158
2159
def determine_protocol(info_dict):
    """Work out the download protocol for `info_dict`.

    An explicit 'protocol' entry wins.  Otherwise the protocol is derived
    from the 'url' entry: its rtmp/mms/rtsp prefix, then its m3u8/f4m
    extension, and finally its URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2180
2181
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Width of the widest cell in each column
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    # Every column but the last is left-aligned and padded to width + 1
    padded = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
2188
2189
2190 def _match_one(filter_part, dct):
2191     COMPARISON_OPERATORS = {
2192         '<': operator.lt,
2193         '<=': operator.le,
2194         '>': operator.gt,
2195         '>=': operator.ge,
2196         '=': operator.eq,
2197         '!=': operator.ne,
2198     }
2199     operator_rex = re.compile(r'''(?x)\s*
2200         (?P<key>[a-z_]+)
2201         \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2202         (?:
2203             (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2204             (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2205         )
2206         \s*$
2207         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2208     m = operator_rex.search(filter_part)
2209     if m:
2210         op = COMPARISON_OPERATORS[m.group('op')]
2211         if m.group('strval') is not None:
2212             if m.group('op') not in ('=', '!='):
2213                 raise ValueError(
2214                     'Operator %s does not support string values!' % m.group('op'))
2215             comparison_value = m.group('strval')
2216         else:
2217             try:
2218                 comparison_value = int(m.group('intval'))
2219             except ValueError:
2220                 comparison_value = parse_filesize(m.group('intval'))
2221                 if comparison_value is None:
2222                     comparison_value = parse_filesize(m.group('intval') + 'B')
2223                 if comparison_value is None:
2224                     raise ValueError(
2225                         'Invalid integer value %r in filter part %r' % (
2226                             m.group('intval'), filter_part))
2227         actual_value = dct.get(m.group('key'))
2228         if actual_value is None:
2229             return m.group('none_inclusive')
2230         return op(actual_value, comparison_value)
2231
2232     UNARY_OPERATORS = {
2233         '': lambda v: v is not None,
2234         '!': lambda v: v is None,
2235     }
2236     operator_rex = re.compile(r'''(?x)\s*
2237         (?P<op>%s)\s*(?P<key>[a-z_]+)
2238         \s*$
2239         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2240     m = operator_rex.search(filter_part)
2241     if m:
2242         op = UNARY_OPERATORS[m.group('op')]
2243         actual_value = dct.get(m.group('key'))
2244         return op(actual_value)
2245
2246     raise ValueError('Invalid filter part %r' % filter_part)
2247
2248
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The filter is a conjunction of '&'-separated parts; stop at the first
    # part that does not match
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2254
2255
def match_filter_func(filter_str):
    """Wrap the filter `filter_str` in a function taking an info dict.

    The returned function gives None when the info dict passes the filter
    and a "skipping" message otherwise.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
2264
2265
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float).

    Supported forms are an offset such as '12.5' or '12.5s' and a clock
    value 'H:MM:SS' whose seconds may carry a fraction separated by '.' or
    ':'.  Returns None for empty or unrecognized input.
    """
    if not time_expr:
        return

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
2277
2278
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode 'HH:MM:SS,mmm'."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d drops the fractional part of each component
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2281
2282
def dfxp2srt(dfxp_data):
    """Convert a DFXP/TTML subtitle document (text) into SRT text.

    Paragraphs without a begin time, or with neither an end time nor a
    duration, are left out.  Raises ValueError when the document contains
    no paragraph elements.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Parser target collecting the text of one paragraph; line break
        # elements become newlines, all other tags are dropped
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Re-serialize the node and run it through the text collector
        collector = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=collector)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))

    # Try the known namespaces in turn, then un-namespaced paragraphs
    paras = (
        dfxp.findall(_x('.//ttml:p')) or
        dfxp.findall(_x('.//ttaf1:p')) or
        dfxp.findall(_x('.//ttaf1_0604:p')) or
        dfxp.findall('.//p'))

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    cues = []
    for index, para in enumerate(paras, 1):
        attrib = para.attrib
        begin_time = parse_dfxp_time_expr(attrib.get('begin'))
        end_time = parse_dfxp_time_expr(attrib.get('end'))
        dur = parse_dfxp_time_expr(attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(cues)
2336
2337
def cli_option(params, command_option, param):
    """Build [command_option, value] from params[param].

    Returns an empty list when the parameter is missing or None.
    """
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2341
2342
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command line form of the boolean parameter params[param].

    The value is rendered as `true_value` or `false_value`.  With a
    `separator` a single argument "option<separator>value" is produced,
    otherwise the option and the value are separate list items.  The
    parameter must be present and a bool.
    """
    flag = params.get(param)
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
2349
2350
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if params[param] equals `expected_value`, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2354
2355
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra command line arguments stored in params[param].

    When the parameter is missing or None, `default` is used instead.  A
    list default is returned as a copy, so a caller that extends the result
    cannot alter the shared default list (or the caller-supplied one) for
    later calls.  A parameter that is present must be a list.
    """
    ex_args = params.get(param)
    if ex_args is None:
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2362
2363
class ISO639Utils(object):
    """Conversion between ISO 639-1 and ISO 639-2/T language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Keys are ISO 639-1 codes, values the matching ISO 639-2/T codes
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are looked up, so anything after
        # them (e.g. in 'en-US') is ignored
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup by scanning the map; None when nothing matches
        matches = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matches, None)
2564
2565
2566 class ISO3166Utils(object):
2567     # From http://data.okfn.org/data/core/country-list
2568     _country_map = {
2569         'AF': 'Afghanistan',
2570         'AX': 'Åland Islands',
2571         'AL': 'Albania',
2572         'DZ': 'Algeria',
2573         'AS': 'American Samoa',
2574         'AD': 'Andorra',
2575         'AO': 'Angola',
2576         'AI': 'Anguilla',
2577         'AQ': 'Antarctica',
2578         'AG': 'Antigua and Barbuda',
2579         'AR': 'Argentina',
2580         'AM': 'Armenia',
2581         'AW': 'Aruba',
2582         'AU': 'Australia',
2583         'AT': 'Austria',
2584         'AZ': 'Azerbaijan',
2585         'BS': 'Bahamas',
2586         'BH': 'Bahrain',
2587         'BD': 'Bangladesh',
2588         'BB': 'Barbados',
2589         'BY': 'Belarus',
2590         'BE': 'Belgium',
2591         'BZ': 'Belize',
2592         'BJ': 'Benin',
2593         'BM': 'Bermuda',
2594         'BT': 'Bhutan',
2595         'BO': 'Bolivia, Plurinational State of',
2596         'BQ': 'Bonaire, Sint Eustatius and Saba',
2597         'BA': 'Bosnia and Herzegovina',
2598         'BW': 'Botswana',
2599         'BV': 'Bouvet Island',
2600         'BR': 'Brazil',
2601         'IO': 'British Indian Ocean Territory',
2602         'BN': 'Brunei Darussalam',
2603         'BG': 'Bulgaria',
2604         'BF': 'Burkina Faso',
2605         'BI': 'Burundi',
2606         'KH': 'Cambodia',
2607         'CM': 'Cameroon',
2608         'CA': 'Canada',
2609         'CV': 'Cape Verde',
2610         'KY': 'Cayman Islands',
2611         'CF': 'Central African Republic',
2612         'TD': 'Chad',
2613         'CL': 'Chile',
2614         'CN': 'China',
2615         'CX': 'Christmas Island',
2616         'CC': 'Cocos (Keeling) Islands',
2617         'CO': 'Colombia',
2618         'KM': 'Comoros',
2619         'CG': 'Congo',
2620         'CD': 'Congo, the Democratic Republic of the',
2621         'CK': 'Cook Islands',
2622         'CR': 'Costa Rica',
2623         'CI': 'Côte d\'Ivoire',
2624         'HR': 'Croatia',
2625         'CU': 'Cuba',
2626         'CW': 'Curaçao',
2627         'CY': 'Cyprus',
2628         'CZ': 'Czech Republic',
2629         'DK': 'Denmark',
2630         'DJ': 'Djibouti',
2631         'DM': 'Dominica',
2632         'DO': 'Dominican Republic',
2633         'EC': 'Ecuador',
2634         'EG': 'Egypt',
2635         'SV': 'El Salvador',
2636         'GQ': 'Equatorial Guinea',
2637         'ER': 'Eritrea',
2638         'EE': 'Estonia',
2639         'ET': 'Ethiopia',
2640         'FK': 'Falkland Islands (Malvinas)',
2641         'FO': 'Faroe Islands',
2642         'FJ': 'Fiji',
2643         'FI': 'Finland',
2644         'FR': 'France',
2645         'GF': 'French Guiana',
2646         'PF': 'French Polynesia',
2647         'TF': 'French Southern Territories',
2648         'GA': 'Gabon',
2649         'GM': 'Gambia',
2650         'GE': 'Georgia',
2651         'DE': 'Germany',
2652         'GH': 'Ghana',
2653         'GI': 'Gibraltar',
2654         'GR': 'Greece',
2655         'GL': 'Greenland',
2656         'GD': 'Grenada',
2657         'GP': 'Guadeloupe',
2658         'GU': 'Guam',
2659         'GT': 'Guatemala',
2660         'GG': 'Guernsey',
2661         'GN': 'Guinea',
2662         'GW': 'Guinea-Bissau',
2663         'GY': 'Guyana',
2664         'HT': 'Haiti',
2665         'HM': 'Heard Island and McDonald Islands',
2666         'VA': 'Holy See (Vatican City State)',
2667         'HN': 'Honduras',
2668         'HK': 'Hong Kong',
2669         'HU': 'Hungary',
2670         'IS': 'Iceland',
2671         'IN': 'India',
2672         'ID': 'Indonesia',
2673         'IR': 'Iran, Islamic Republic of',
2674         'IQ': 'Iraq',
2675         'IE': 'Ireland',
2676         'IM': 'Isle of Man',
2677         'IL': 'Israel',
2678         'IT': 'Italy',
2679         'JM': 'Jamaica',
2680         'JP': 'Japan',
2681         'JE': 'Jersey',
2682         'JO': 'Jordan',
2683         'KZ': 'Kazakhstan',
2684         'KE': 'Kenya',
2685         'KI': 'Kiribati',
2686         'KP': 'Korea, Democratic People\'s Republic of',
2687         'KR': 'Korea, Republic of',
2688         'KW': 'Kuwait',
2689         'KG': 'Kyrgyzstan',
2690         'LA': 'Lao People\'s Democratic Republic',
2691         'LV': 'Latvia',
2692         'LB': 'Lebanon',
2693         'LS': 'Lesotho',
2694         'LR': 'Liberia',
2695         'LY': 'Libya',
2696         'LI': 'Liechtenstein',
2697         'LT': 'Lithuania',
2698         'LU': 'Luxembourg',
2699         'MO': 'Macao',
2700         'MK': 'Macedonia, the Former Yugoslav Republic of',
2701         'MG': 'Madagascar',
2702         'MW': 'Malawi',
2703         'MY': 'Malaysia',
2704         'MV': 'Maldives',
2705         'ML': 'Mali',
2706         'MT': 'Malta',
2707         'MH': 'Marshall Islands',
2708         'MQ': 'Martinique',
2709         'MR': 'Mauritania',
2710         'MU': 'Mauritius',
2711         'YT': 'Mayotte',
2712         'MX': 'Mexico',
2713         'FM': 'Micronesia, Federated States of',
2714         'MD': 'Moldova, Republic of',
2715         'MC': 'Monaco',
2716         'MN': 'Mongolia',
2717         'ME': 'Montenegro',
2718         'MS': 'Montserrat',
2719         'MA': 'Morocco',
2720         'MZ': 'Mozambique',
2721         'MM': 'Myanmar',
2722         'NA': 'Namibia',
2723         'NR': 'Nauru',
2724         'NP': 'Nepal',
2725         'NL': 'Netherlands',
2726         'NC': 'New Caledonia',
2727         'NZ': 'New Zealand',
2728         'NI': 'Nicaragua',
2729         'NE': 'Niger',
2730         'NG': 'Nigeria',
2731         'NU': 'Niue',
2732         'NF': 'Norfolk Island',
2733         'MP': 'Northern Mariana Islands',
2734         'NO': 'Norway',
2735         'OM': 'Oman',
2736         'PK': 'Pakistan',
2737         'PW': 'Palau',
2738         'PS': 'Palestine, State of',
2739         'PA': 'Panama',
2740         'PG': 'Papua New Guinea',
2741         'PY': 'Paraguay',
2742         'PE': 'Peru',
2743         'PH': 'Philippines',
2744         'PN': 'Pitcairn',
2745         'PL': 'Poland',
2746         'PT': 'Portugal',
2747         'PR': 'Puerto Rico',
2748         'QA': 'Qatar',
2749         'RE': 'Réunion',
2750         'RO': 'Romania',
2751         'RU': 'Russian Federation',
2752         'RW': 'Rwanda',
2753         'BL': 'Saint Barthélemy',
2754         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2755         'KN': 'Saint Kitts and Nevis',
2756         'LC': 'Saint Lucia',
2757         'MF': 'Saint Martin (French part)',
2758         'PM': 'Saint Pierre and Miquelon',
2759         'VC': 'Saint Vincent and the Grenadines',
2760         'WS': 'Samoa',
2761         'SM': 'San Marino',
2762         'ST': 'Sao Tome and Principe',
2763         'SA': 'Saudi Arabia',
2764         'SN': 'Senegal',
2765         'RS': 'Serbia',
2766         'SC': 'Seychelles',
2767         'SL': 'Sierra Leone',
2768         'SG': 'Singapore',
2769         'SX': 'Sint Maarten (Dutch part)',
2770         'SK': 'Slovakia',
2771         'SI': 'Slovenia',
2772         'SB': 'Solomon Islands',
2773         'SO': 'Somalia',
2774         'ZA': 'South Africa',
2775         'GS': 'South Georgia and the South Sandwich Islands',
2776         'SS': 'South Sudan',
2777         'ES': 'Spain',
2778         'LK': 'Sri Lanka',
2779         'SD': 'Sudan',
2780         'SR': 'Suriname',
2781         'SJ': 'Svalbard and Jan Mayen',
2782         'SZ': 'Swaziland',
2783         'SE': 'Sweden',
2784         'CH': 'Switzerland',
2785         'SY': 'Syrian Arab Republic',
2786         'TW': 'Taiwan, Province of China',
2787         'TJ': 'Tajikistan',
2788         'TZ': 'Tanzania, United Republic of',
2789         'TH': 'Thailand',
2790         'TL': 'Timor-Leste',
2791         'TG': 'Togo',
2792         'TK': 'Tokelau',
2793         'TO': 'Tonga',
2794         'TT': 'Trinidad and Tobago',
2795         'TN': 'Tunisia',
2796         'TR': 'Turkey',
2797         'TM': 'Turkmenistan',
2798         'TC': 'Turks and Caicos Islands',
2799         'TV': 'Tuvalu',
2800         'UG': 'Uganda',
2801         'UA': 'Ukraine',
2802         'AE': 'United Arab Emirates',
2803         'GB': 'United Kingdom',
2804         'US': 'United States',
2805         'UM': 'United States Minor Outlying Islands',
2806         'UY': 'Uruguay',
2807         'UZ': 'Uzbekistan',
2808         'VU': 'Vanuatu',
2809         'VE': 'Venezuela, Bolivarian Republic of',
2810         'VN': 'Viet Nam',
2811         'VG': 'Virgin Islands, British',
2812         'VI': 'Virgin Islands, U.S.',
2813         'WF': 'Wallis and Futuna',
2814         'EH': 'Western Sahara',
2815         'YE': 'Yemen',
2816         'ZM': 'Zambia',
2817         'ZW': 'Zimbabwe',
2818     }
2819
2820     @classmethod
2821     def short2full(cls, code):
2822         """Convert an ISO 3166-2 country code to the corresponding full name"""
2823         return cls._country_map.get(code.upper())
2824
2825
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets each request choose its own proxy.

    A request may carry a 'Ytdl-request-proxy' header naming the proxy to use
    for that request only; the special value '__noproxy__' disables proxying.
    """

    def __init__(self, proxies=None):
        # Install "no proxy" fallbacks for http and https before running the
        # base initializer, so that any <scheme>_open attribute the base class
        # sets for a configured proxy replaces the fallback.
        for scheme in ('http', 'https'):
            def default_open(r, proxy='__noproxy__', type=scheme, meth=self.proxy_open):
                return meth(r, proxy, type)

            setattr(self, scheme + '_open', default_open)
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy overrides the handler-wide one; the marker
        # header is removed from the request once it has been consumed.
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            proxy = per_request_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            # No proxy: returning None leaves the request untouched
            return None

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # SOCKS proxies are only recorded on the request here;
            # youtube-dl's http/https handlers wrap the socket with SOCKS
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
2849
2850
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt data with OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The byte order is reversed before the bytes are read as one big
    # hexadecimal number, i.e. data is interpreted as a little-endian integer
    reversed_hex = binascii.hexlify(data[::-1])
    plain_number = int(reversed_hex, 16)
    return '%x' % pow(plain_number, exponent, modulus)
2866
2867
def encode_base_n(num, n, table=None):
    '''
    Encode a non-negative integer as a string of base-n digits.

    Input:
        num: the integer to encode, must not be negative
        n: the base, must be at least 2
        table: optional sequence of digit symbols indexed by digit value;
            defaults to the first n characters of 0-9, a-z, A-Z
    Output: the digits of num in base n, most significant first

    Raises ValueError if the base is smaller than 2 or larger than the digit
    table, or if num is negative.
    '''
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

    # A base below 2 would make the digit loop below spin forever (base 1)
    # or fail with an unrelated error (base 0), so reject it explicitly
    if n < 2:
        raise ValueError('base %d is too small, must be at least 2' % n)

    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    # Floor division never brings a negative number to zero, so a negative
    # input would loop forever
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)

    if num == 0:
        return table[0]

    # Collect digits least significant first and reverse once at the end
    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
2884
2885
def decode_packed_codes(code):
    '''
    Unpack JavaScript packed in the
    }('payload',base,count,'sym|bols'.split('|') form
    (presumably the output of Dean Edwards' P.A.C.K.E.R.).

    Input:
        code: text containing the packed call
    Output: the payload with every word that is the base-`base` encoding of
        an index below `count` replaced by the symbol at that index

    Words that have no entry in the symbol table, and indices whose symbol is
    empty or missing, are left unchanged.

    Raises AttributeError if code contains no packed call, because the
    regular expression search then returns None.
    '''
    mobj = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    for index in range(count):
        token = encode_base_n(index, base)
        # count may exceed the number of symbols actually listed; treat a
        # missing symbol like an empty one, i.e. keep the token itself
        symbol = symbols[index] if index < len(symbols) else ''
        symbol_table[token] = symbol or token

    def substitute(word_match):
        # Words without a table entry pass through unchanged
        word = word_match.group(0)
        return symbol_table.get(word, word)

    return re.sub(r'\b(\w+)\b', substitute, obfuscated_code)
2904
2905
def parse_m3u8_attributes(attrib):
    '''
    Parse an m3u8 attribute list such as BANDWIDTH=1280000,CODECS="a,b"

    Input:
        attrib: the attribute list as a string
    Output: dict mapping attribute names to their values as strings, with
        the surrounding double quotes of quoted values removed

    When a name occurs more than once the last value wins.
    '''
    attribute_re = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    parsed = {}
    for match in re.finditer(attribute_re, attrib):
        value = match.group('val')
        # Only quoted values can start with a double quote
        parsed[match.group('key')] = value[1:-1] if value.startswith('"') else value
    return parsed
2913
2914
def urshift(val, n):
    '''
    Shift val right by n bits, treating a negative val as an unsigned
    32-bit number (2**32 is added to it before shifting).
    '''
    if val < 0:
        val += 0x100000000
    return val >> n