_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import contextlib
  11 import ctypes
  12 import datetime
  13 import email.utils
  14 import errno
  15 import functools
  16 import gzip
  17 import io
  18 import itertools
  19 import json
  20 import locale
  21 import math
  22 import operator
  23 import os
  24 import pipes
  25 import platform
  26 import re
  27 import socket
  28 import ssl
  29 import subprocess
  30 import sys
  31 import tempfile
  32 import traceback
  33 import xml.etree.ElementTree
  34 import zlib
  35
  36 from .compat import (
  37     compat_HTMLParser,
  38     compat_basestring,
  39     compat_chr,
  40     compat_etree_fromstring,
  41     compat_html_entities,
  42     compat_html_entities_html5,
  43     compat_http_client,
  44     compat_kwargs,
  45     compat_parse_qs,
  46     compat_shlex_quote,
  47     compat_socket_create_connection,
  48     compat_str,
  49     compat_struct_pack,
  50     compat_urllib_error,
  51     compat_urllib_parse,
  52     compat_urllib_parse_urlencode,
  53     compat_urllib_parse_urlparse,
  54     compat_urllib_parse_unquote_plus,
  55     compat_urllib_request,
  56     compat_urlparse,
  57     compat_xpath,
  58 )
  59
  60 from .socks import (
  61     ProxyType,
  62     sockssocket,
  63 )
  64
  65
  66 def register_socks_protocols():
  67     # "Register" SOCKS protocols
  68     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  69     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  70     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  71         if scheme not in compat_urlparse.uses_netloc:
  72             compat_urlparse.uses_netloc.append(scheme)
  73
  74
# This is not clearly defined otherwise
# (the class of compiled pattern objects, obtained by compiling an empty pattern)
compiled_regex_type = type(re.compile(''))

# Default HTTP headers; YoutubeDLHandler.http_request adds each of them to
# every request that does not already carry a header of that name
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Sentinel meaning "no default was supplied"; lets helpers such as the
# xpath_* functions accept None as a legitimate default value
NO_DEFAULT = object()

# Month names in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# File name extensions (without the leading dot) regarded as known formats
# NOTE(review): the consumers of this tuple are outside this excerpt
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. Multi-letter
# replacements ('AE', 'OE', 'ss', 'ae', 'oe') are chained in as one-element
# lists so that zip() pairs each of them with a single source character.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
 112
 113
 114 def preferredencoding():
 115     """Get preferred encoding.
 116
 117     Returns the best encoding scheme for the system, based on
 118     locale.getpreferredencoding() and some further tweaks.
 119     """
 120     try:
 121         pref = locale.getpreferredencoding()
 122         'TEST'.encode(pref)
 123     except Exception:
 124         pref = 'UTF-8'
 125
 126     return pref
 127
 128
 129 def write_json_file(obj, fn):
 130     """ Encode obj as JSON and write it to fn, atomically if possible """
 131
 132     fn = encodeFilename(fn)
 133     if sys.version_info < (3, 0) and sys.platform != 'win32':
 134         encoding = get_filesystem_encoding()
 135         # os.path.basename returns a bytes object, but NamedTemporaryFile
 136         # will fail if the filename contains non ascii characters unless we
 137         # use a unicode object
 138         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 139         # the same for os.path.dirname
 140         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 141     else:
 142         path_basename = os.path.basename
 143         path_dirname = os.path.dirname
 144
 145     args = {
 146         'suffix': '.tmp',
 147         'prefix': path_basename(fn) + '.',
 148         'dir': path_dirname(fn),
 149         'delete': False,
 150     }
 151
 152     # In Python 2.x, json.dump expects a bytestream.
 153     # In Python 3.x, it writes to a character stream
 154     if sys.version_info < (3, 0):
 155         args['mode'] = 'wb'
 156     else:
 157         args.update({
 158             'mode': 'w',
 159             'encoding': 'utf-8',
 160         })
 161
 162     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 163
 164     try:
 165         with tf:
 166             json.dump(obj, tf)
 167         if sys.platform == 'win32':
 168             # Need to remove existing file on Windows, else os.rename raises
 169             # WindowsError or FileExistsError.
 170             try:
 171                 os.unlink(fn)
 172             except OSError:
 173                 pass
 174         os.rename(tf.name, fn)
 175     except Exception:
 176         try:
 177             os.remove(tf.name)
 178         except OSError:
 179             pass
 180         raise
 181
 182
 183 if sys.version_info >= (2, 7):
 184     def find_xpath_attr(node, xpath, key, val=None):
 185         """ Find the xpath xpath[@key=val] """
 186         assert re.match(r'^[a-zA-Z_-]+$', key)
 187         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 188         return node.find(expr)
 189 else:
 190     def find_xpath_attr(node, xpath, key, val=None):
 191         for f in node.findall(compat_xpath(xpath)):
 192             if key not in f.attrib:
 193                 continue
 194             if val is None or f.attrib.get(key) == val:
 195                 return f
 196         return None
 197
 198 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 199 # the namespace parameter
 200
 201
 202 def xpath_with_ns(path, ns_map):
 203     components = [c.split(':') for c in path.split('/')]
 204     replaced = []
 205     for c in components:
 206         if len(c) == 1:
 207             replaced.append(c[0])
 208         else:
 209             ns, tag = c
 210             replaced.append('{%s}%s' % (ns_map[ns], tag))
 211     return '/'.join(replaced)
 212
 213
 214 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 215     def _find_xpath(xpath):
 216         return node.find(compat_xpath(xpath))
 217
 218     if isinstance(xpath, (str, compat_str)):
 219         n = _find_xpath(xpath)
 220     else:
 221         for xp in xpath:
 222             n = _find_xpath(xp)
 223             if n is not None:
 224                 break
 225
 226     if n is None:
 227         if default is not NO_DEFAULT:
 228             return default
 229         elif fatal:
 230             name = xpath if name is None else name
 231             raise ExtractorError('Could not find XML element %s' % name)
 232         else:
 233             return None
 234     return n
 235
 236
 237 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 238     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 239     if n is None or n == default:
 240         return n
 241     if n.text is None:
 242         if default is not NO_DEFAULT:
 243             return default
 244         elif fatal:
 245             name = xpath if name is None else name
 246             raise ExtractorError('Could not find XML element\'s text %s' % name)
 247         else:
 248             return None
 249     return n.text
 250
 251
 252 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 253     n = find_xpath_attr(node, xpath, key)
 254     if n is None:
 255         if default is not NO_DEFAULT:
 256             return default
 257         elif fatal:
 258             name = '%s[@%s]' % (xpath, key) if name is None else name
 259             raise ExtractorError('Could not find XML attribute %s' % name)
 260         else:
 261             return None
 262     return n.attrib[key]
 263
 264
 265 def get_element_by_id(id, html):
 266     """Return the content of the tag with the specified ID in the passed HTML document"""
 267     return get_element_by_attribute('id', id, html)
 268
 269
 270 def get_element_by_attribute(attribute, value, html):
 271     """Return the content of the tag with the specified attribute in the passed HTML document"""
 272
 273     m = re.search(r'''(?xs)
 274         <([a-zA-Z0-9:._-]+)
 275          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 276          \s+%s=['"]?%s['"]?
 277          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 278         \s*>
 279         (?P<content>.*?)
 280         </\1>
 281     ''' % (re.escape(attribute), re.escape(value)), html)
 282
 283     if not m:
 284         return None
 285     res = m.group('content')
 286
 287     if res.startswith('"') or res.startswith("'"):
 288         res = res[1:-1]
 289
 290     return unescapeHTML(res)
 291
 292
 293 class HTMLAttributeParser(compat_HTMLParser):
 294     """Trivial HTML parser to gather the attributes for a single element"""
 295     def __init__(self):
 296         self.attrs = {}
 297         compat_HTMLParser.__init__(self)
 298
 299     def handle_starttag(self, tag, attrs):
 300         self.attrs = dict(attrs)
 301
 302
 303 def extract_attributes(html_element):
 304     """Given a string for an HTML element such as
 305     <el
 306          a="foo" B="bar" c="&98;az" d=boz
 307          empty= noval entity="&amp;"
 308          sq='"' dq="'"
 309     >
 310     Decode and return a dictionary of attributes.
 311     {
 312         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 313         'empty': '', 'noval': None, 'entity': '&',
 314         'sq': '"', 'dq': '\''
 315     }.
 316     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 317     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 318     """
 319     parser = HTMLAttributeParser()
 320     parser.feed(html_element)
 321     parser.close()
 322     return parser.attrs
 323
 324
 325 def clean_html(html):
 326     """Clean an HTML snippet into a readable string"""
 327
 328     if html is None:  # Convenience for sanitizing descriptions etc.
 329         return html
 330
 331     # Newline vs <br />
 332     html = html.replace('\n', ' ')
 333     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 334     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 335     # Strip html tags
 336     html = re.sub('<.*?>', '', html)
 337     # Replace html entities
 338     html = unescapeHTML(html)
 339     return html.strip()
 340
 341
 342 def sanitize_open(filename, open_mode):
 343     """Try to open the given filename, and slightly tweak it if this fails.
 344
 345     Attempts to open the given filename. If this fails, it tries to change
 346     the filename slightly, step by step, until it's either able to open it
 347     or it fails and raises a final exception, like the standard open()
 348     function.
 349
 350     It returns the tuple (stream, definitive_file_name).
 351     """
 352     try:
 353         if filename == '-':
 354             if sys.platform == 'win32':
 355                 import msvcrt
 356                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 357             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 358         stream = open(encodeFilename(filename), open_mode)
 359         return (stream, filename)
 360     except (IOError, OSError) as err:
 361         if err.errno in (errno.EACCES,):
 362             raise
 363
 364         # In case of error, try to remove win32 forbidden chars
 365         alt_filename = sanitize_path(filename)
 366         if alt_filename == filename:
 367             raise
 368         else:
 369             # An exception here should be caught in the caller
 370             stream = open(encodeFilename(alt_filename), open_mode)
 371             return (stream, alt_filename)
 372
 373
 374 def timeconvert(timestr):
 375     """Convert RFC 2822 defined time string into system timestamp"""
 376     timestamp = None
 377     timetuple = email.utils.parsedate_tz(timestr)
 378     if timetuple is not None:
 379         timestamp = email.utils.mktime_tz(timetuple)
 380     return timestamp
 381
 382
 383 def sanitize_filename(s, restricted=False, is_id=False):
 384     """Sanitizes a string so it could be used as part of a filename.
 385     If restricted is set, use a stricter subset of allowed characters.
 386     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 387     """
 388     def replace_insane(char):
 389         if restricted and char in ACCENT_CHARS:
 390             return ACCENT_CHARS[char]
 391         if char == '?' or ord(char) < 32 or ord(char) == 127:
 392             return ''
 393         elif char == '"':
 394             return '' if restricted else '\''
 395         elif char == ':':
 396             return '_-' if restricted else ' -'
 397         elif char in '\\/|*<>':
 398             return '_'
 399         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 400             return '_'
 401         if restricted and ord(char) > 127:
 402             return '_'
 403         return char
 404
 405     # Handle timestamps
 406     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 407     result = ''.join(map(replace_insane, s))
 408     if not is_id:
 409         while '__' in result:
 410             result = result.replace('__', '_')
 411         result = result.strip('_')
 412         # Common case of "Foreign band name - English song title"
 413         if restricted and result.startswith('-_'):
 414             result = result[2:]
 415         if result.startswith('-'):
 416             result = '_' + result[len('-'):]
 417         result = result.lstrip('.')
 418         if not result:
 419             result = '_'
 420     return result
 421
 422
 423 def sanitize_path(s):
 424     """Sanitizes and normalizes path on Windows"""
 425     if sys.platform != 'win32':
 426         return s
 427     drive_or_unc, _ = os.path.splitdrive(s)
 428     if sys.version_info < (2, 7) and not drive_or_unc:
 429         drive_or_unc, _ = os.path.splitunc(s)
 430     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 431     if drive_or_unc:
 432         norm_path.pop(0)
 433     sanitized_path = [
 434         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
 435         for path_part in norm_path]
 436     if drive_or_unc:
 437         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 438     return os.path.join(*sanitized_path)
 439
 440
 441 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
 442 # unwanted failures due to missing protocol
 443 def sanitize_url(url):
 444     return 'http:%s' % url if url.startswith('//') else url
 445
 446
 447 def sanitized_Request(url, *args, **kwargs):
 448     return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
 449
 450
 451 def orderedSet(iterable):
 452     """ Remove all duplicates from the input iterable """
 453     res = []
 454     for el in iterable:
 455         if el not in res:
 456             res.append(el)
 457     return res
 458
 459
 460 def _htmlentity_transform(entity_with_semicolon):
 461     """Transforms an HTML entity to a character."""
 462     entity = entity_with_semicolon[:-1]
 463
 464     # Known non-numeric HTML entity
 465     if entity in compat_html_entities.name2codepoint:
 466         return compat_chr(compat_html_entities.name2codepoint[entity])
 467
 468     # TODO: HTML5 allows entities without a semicolon. For example,
 469     # '&Eacuteric' should be decoded as 'Éric'.
 470     if entity_with_semicolon in compat_html_entities_html5:
 471         return compat_html_entities_html5[entity_with_semicolon]
 472
 473     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 474     if mobj is not None:
 475         numstr = mobj.group(1)
 476         if numstr.startswith('x'):
 477             base = 16
 478             numstr = '0%s' % numstr
 479         else:
 480             base = 10
 481         # See https://github.com/rg3/youtube-dl/issues/7518
 482         try:
 483             return compat_chr(int(numstr, base))
 484         except ValueError:
 485             pass
 486
 487     # Unknown entity in name, return its literal representation
 488     return '&%s;' % entity
 489
 490
 491 def unescapeHTML(s):
 492     if s is None:
 493         return None
 494     assert type(s) == compat_str
 495
 496     return re.sub(
 497         r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 498
 499
 500 def get_subprocess_encoding():
 501     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 502         # For subprocess calls, encode with locale encoding
 503         # Refer to http://stackoverflow.com/a/9951851/35070
 504         encoding = preferredencoding()
 505     else:
 506         encoding = sys.getfilesystemencoding()
 507     if encoding is None:
 508         encoding = 'utf-8'
 509     return encoding
 510
 511
 512 def encodeFilename(s, for_subprocess=False):
 513     """
 514     @param s The name of the file
 515     """
 516
 517     assert type(s) == compat_str
 518
 519     # Python 3 has a Unicode API
 520     if sys.version_info >= (3, 0):
 521         return s
 522
 523     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 524     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 525     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 526     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 527         return s
 528
 529     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 530     if sys.platform.startswith('java'):
 531         return s
 532
 533     return s.encode(get_subprocess_encoding(), 'ignore')
 534
 535
 536 def decodeFilename(b, for_subprocess=False):
 537
 538     if sys.version_info >= (3, 0):
 539         return b
 540
 541     if not isinstance(b, bytes):
 542         return b
 543
 544     return b.decode(get_subprocess_encoding(), 'ignore')
 545
 546
 547 def encodeArgument(s):
 548     if not isinstance(s, compat_str):
 549         # Legacy code that uses byte strings
 550         # Uncomment the following line after fixing all post processors
 551         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 552         s = s.decode('ascii')
 553     return encodeFilename(s, True)
 554
 555
 556 def decodeArgument(b):
 557     return decodeFilename(b, True)
 558
 559
 560 def decodeOption(optval):
 561     if optval is None:
 562         return optval
 563     if isinstance(optval, bytes):
 564         optval = optval.decode(preferredencoding())
 565
 566     assert isinstance(optval, compat_str)
 567     return optval
 568
 569
 570 def formatSeconds(secs):
 571     if secs > 3600:
 572         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 573     elif secs > 60:
 574         return '%d:%02d' % (secs // 60, secs % 60)
 575     else:
 576         return '%d' % secs
 577
 578
 579 def make_HTTPS_handler(params, **kwargs):
 580     opts_no_check_certificate = params.get('nocheckcertificate', False)
 581     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 582         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 583         if opts_no_check_certificate:
 584             context.check_hostname = False
 585             context.verify_mode = ssl.CERT_NONE
 586         try:
 587             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 588         except TypeError:
 589             # Python 2.7.8
 590             # (create_default_context present but HTTPSHandler has no context=)
 591             pass
 592
 593     if sys.version_info < (3, 2):
 594         return YoutubeDLHTTPSHandler(params, **kwargs)
 595     else:  # Python < 3.4
 596         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 597         context.verify_mode = (ssl.CERT_NONE
 598                                if opts_no_check_certificate
 599                                else ssl.CERT_REQUIRED)
 600         context.set_default_verify_paths()
 601         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 602
 603
 604 def bug_reports_message():
 605     if ytdl_is_updateable():
 606         update_cmd = 'type  youtube-dl -U  to update'
 607     else:
 608         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 609     msg = '; please report this issue on https://yt-dl.org/bug .'
 610     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 611     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 612     return msg
 613
 614
 615 class ExtractorError(Exception):
 616     """Error during info extraction."""
 617
 618     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 619         """ tb, if given, is the original traceback (so that it can be printed out).
 620         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 621         """
 622
 623         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 624             expected = True
 625         if video_id is not None:
 626             msg = video_id + ': ' + msg
 627         if cause:
 628             msg += ' (caused by %r)' % cause
 629         if not expected:
 630             msg += bug_reports_message()
 631         super(ExtractorError, self).__init__(msg)
 632
 633         self.traceback = tb
 634         self.exc_info = sys.exc_info()  # preserve original exception
 635         self.cause = cause
 636         self.video_id = video_id
 637
 638     def format_traceback(self):
 639         if self.traceback is None:
 640             return None
 641         return ''.join(traceback.format_tb(self.traceback))
 642
 643
 644 class UnsupportedError(ExtractorError):
 645     def __init__(self, url):
 646         super(UnsupportedError, self).__init__(
 647             'Unsupported URL: %s' % url, expected=True)
 648         self.url = url
 649
 650
 651 class RegexNotFoundError(ExtractorError):
 652     """Error when a regex didn't match"""
 653     pass
 654
 655
 656 class DownloadError(Exception):
 657     """Download Error exception.
 658
 659     This exception may be thrown by FileDownloader objects if they are not
 660     configured to continue on errors. They will contain the appropriate
 661     error message.
 662     """
 663
 664     def __init__(self, msg, exc_info=None):
 665         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 666         super(DownloadError, self).__init__(msg)
 667         self.exc_info = exc_info
 668
 669
 670 class SameFileError(Exception):
 671     """Same File exception.
 672
 673     This exception will be thrown by FileDownloader objects if they detect
 674     multiple files would have to be downloaded to the same file on disk.
 675     """
 676     pass
 677
 678
 679 class PostProcessingError(Exception):
 680     """Post Processing exception.
 681
 682     This exception may be raised by PostProcessor's .run() method to
 683     indicate an error in the postprocessing task.
 684     """
 685
 686     def __init__(self, msg):
 687         self.msg = msg
 688
 689
 690 class MaxDownloadsReached(Exception):
 691     """ --max-downloads limit has been reached. """
 692     pass
 693
 694
 695 class UnavailableVideoError(Exception):
 696     """Unavailable Format exception.
 697
 698     This exception will be thrown when a video is requested
 699     in a format that is not available for that video.
 700     """
 701     pass
 702
 703
 704 class ContentTooShortError(Exception):
 705     """Content Too Short exception.
 706
 707     This exception may be raised by FileDownloader objects when a file they
 708     download is too small for what the server announced first, indicating
 709     the connection was probably interrupted.
 710     """
 711
 712     def __init__(self, downloaded, expected):
 713         # Both in bytes
 714         self.downloaded = downloaded
 715         self.expected = expected
 716
 717
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and bind it to the configured source address.

    ydl_handler is the handler whose _params dictionary is consulted for
    'source_address'; http_class is called with *args and **kwargs to create
    the connection; is_https selects whether the Python 2.6 fallback wraps
    the socket with TLS. Returns the connection object.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        # Port 0 in the (host, port) pair
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address attribute: replace connect() on this instance
            # with a version that creates the socket from sa itself
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
 743
 744
 745 def handle_youtubedl_headers(headers):
 746     filtered_headers = headers
 747
 748     if 'Youtubedl-no-compression' in filtered_headers:
 749         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 750         del filtered_headers['Youtubedl-no-compression']
 751
 752     return filtered_headers
 753
 754
 755 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 756     """Handler for HTTP requests and responses.
 757
 758     This class, when installed with an OpenerDirector, automatically adds
 759     the standard headers to every HTTP request and handles gzipped and
 760     deflated responses from web servers. If compression is to be avoided in
 761     a particular request, the original request in the program code only has
 762     to include the HTTP header "Youtubedl-no-compression", which will be
 763     removed before making the real request.
 764
 765     Part of this code was copied from:
 766
 767     http://techknack.net/python-urllib2-handlers/
 768
 769     Andrew Rowls, the author of that code, agreed to release it to the
 770     public domain.
 771     """
 772
 773     def __init__(self, params, *args, **kwargs):
 774         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
 775         self._params = params
 776
 777     def http_open(self, req):
 778         conn_class = compat_http_client.HTTPConnection
 779
 780         socks_proxy = req.headers.get('Ytdl-socks-proxy')
 781         if socks_proxy:
 782             conn_class = make_socks_conn_class(conn_class, socks_proxy)
 783             del req.headers['Ytdl-socks-proxy']
 784
 785         return self.do_open(functools.partial(
 786             _create_http_connection, self, conn_class, False),
 787             req)
 788
 789     @staticmethod
 790     def deflate(data):
 791         try:
 792             return zlib.decompress(data, -zlib.MAX_WBITS)
 793         except zlib.error:
 794             return zlib.decompress(data)
 795
 796     @staticmethod
 797     def addinfourl_wrapper(stream, headers, url, code):
 798         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 799             return compat_urllib_request.addinfourl(stream, headers, url, code)
 800         ret = compat_urllib_request.addinfourl(stream, headers, url)
 801         ret.code = code
 802         return ret
 803
 804     def http_request(self, req):
 805         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
 806         # always respected by websites, some tend to give out URLs with non percent-encoded
 807         # non-ASCII characters (see telemb.py, ard.py [#3412])
 808         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
 809         # To work around aforementioned issue we will replace request's original URL with
 810         # percent-encoded one
 811         # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
 812         # the code of this workaround has been moved here from YoutubeDL.urlopen()
 813         url = req.get_full_url()
 814         url_escaped = escape_url(url)
 815
 816         # Substitute URL if any change after escaping
 817         if url != url_escaped:
 818             req = update_Request(req, url=url_escaped)
 819
 820         for h, v in std_headers.items():
 821             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
 822             # The dict keys are capitalized because of this bug by urllib
 823             if h.capitalize() not in req.headers:
 824                 req.add_header(h, v)
 825
 826         req.headers = handle_youtubedl_headers(req.headers)
 827
 828         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 829             # Python 2.6 is brain-dead when it comes to fragments
 830             req._Request__original = req._Request__original.partition('#')[0]
 831             req._Request__r_type = req._Request__r_type.partition('#')[0]
 832
 833         return req
 834
 835     def http_response(self, req, resp):
 836         old_resp = resp
 837         # gzip
 838         if resp.headers.get('Content-encoding', '') == 'gzip':
 839             content = resp.read()
 840             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 841             try:
 842                 uncompressed = io.BytesIO(gz.read())
 843             except IOError as original_ioerror:
 844                 # There may be junk add the end of the file
 845                 # See http://stackoverflow.com/q/4928560/35070 for details
 846                 for i in range(1, 1024):
 847                     try:
 848                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 849                         uncompressed = io.BytesIO(gz.read())
 850                     except IOError:
 851                         continue
 852                     break
 853                 else:
 854                     raise original_ioerror
 855             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 856             resp.msg = old_resp.msg
 857             del resp.headers['Content-encoding']
 858         # deflate
 859         if resp.headers.get('Content-encoding', '') == 'deflate':
 860             gz = io.BytesIO(self.deflate(resp.read()))
 861             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 862             resp.msg = old_resp.msg
 863             del resp.headers['Content-encoding']
 864         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
 865         # https://github.com/rg3/youtube-dl/issues/6457).
 866         if 300 <= resp.code < 400:
 867             location = resp.headers.get('Location')
 868             if location:
 869                 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
 870                 if sys.version_info >= (3, 0):
 871                     location = location.encode('iso-8859-1').decode('utf-8')
 872                 else:
 873                     location = location.decode('utf-8')
 874                 location_escaped = escape_url(location)
 875                 if location != location_escaped:
 876                     del resp.headers['Location']
 877                     if sys.version_info < (3, 0):
 878                         location_escaped = location_escaped.encode('utf-8')
 879                     resp.headers['Location'] = location_escaped
 880         return resp
 881
    # HTTPS traffic goes through the very same request/response hooks as HTTP.
    https_request = http_request
    https_response = http_response
 884
 885
 886 def make_socks_conn_class(base_class, socks_proxy):
 887     assert issubclass(base_class, (
 888         compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
 889
 890     url_components = compat_urlparse.urlparse(socks_proxy)
 891     if url_components.scheme.lower() == 'socks5':
 892         socks_type = ProxyType.SOCKS5
 893     elif url_components.scheme.lower() in ('socks', 'socks4'):
 894         socks_type = ProxyType.SOCKS4
 895     elif url_components.scheme.lower() == 'socks4a':
 896         socks_type = ProxyType.SOCKS4A
 897
 898     def unquote_if_non_empty(s):
 899         if not s:
 900             return s
 901         return compat_urllib_parse_unquote_plus(s)
 902
 903     proxy_args = (
 904         socks_type,
 905         url_components.hostname, url_components.port or 1080,
 906         True,  # Remote DNS
 907         unquote_if_non_empty(url_components.username),
 908         unquote_if_non_empty(url_components.password),
 909     )
 910
 911     class SocksConnection(base_class):
 912         def connect(self):
 913             self.sock = sockssocket()
 914             self.sock.setproxy(*proxy_args)
 915             if type(self.timeout) in (int, float):
 916                 self.sock.settimeout(self.timeout)
 917             self.sock.connect((self.host, self.port))
 918
 919             if isinstance(self, compat_http_client.HTTPSConnection):
 920                 if hasattr(self, '_context'):  # Python > 2.6
 921                     self.sock = self._context.wrap_socket(
 922                         self.sock, server_hostname=self.host)
 923                 else:
 924                     self.sock = ssl.wrap_socket(self.sock)
 925
 926     return SocksConnection
 927
 928
 929 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 930     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 931         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 932         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 933         self._params = params
 934
 935     def https_open(self, req):
 936         kwargs = {}
 937         conn_class = self._https_conn_class
 938
 939         if hasattr(self, '_context'):  # python > 2.6
 940             kwargs['context'] = self._context
 941         if hasattr(self, '_check_hostname'):  # python 3.x
 942             kwargs['check_hostname'] = self._check_hostname
 943
 944         socks_proxy = req.headers.get('Ytdl-socks-proxy')
 945         if socks_proxy:
 946             conn_class = make_socks_conn_class(conn_class, socks_proxy)
 947             del req.headers['Ytdl-socks-proxy']
 948
 949         return self.do_open(functools.partial(
 950             _create_http_connection, self, conn_class, True),
 951             req, **kwargs)
 952
 953
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also registers its hooks for HTTPS.

    As written, both methods simply delegate to HTTPCookieProcessor.
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # The workaround below would percent-encode the Set-Cookie header
        # before HTTPCookieProcessor processes it.  NOTE: it is currently
        # disabled (commented out), so the response is passed on unchanged.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Use the same cookie handling for HTTPS requests and responses
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
 976
 977
 978 def parse_iso8601(date_str, delimiter='T', timezone=None):
 979     """ Return a UNIX timestamp from the given date """
 980
 981     if date_str is None:
 982         return None
 983
 984     date_str = re.sub(r'\.[0-9]+', '', date_str)
 985
 986     if timezone is None:
 987         m = re.search(
 988             r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 989             date_str)
 990         if not m:
 991             timezone = datetime.timedelta()
 992         else:
 993             date_str = date_str[:-len(m.group(0))]
 994             if not m.group('sign'):
 995                 timezone = datetime.timedelta()
 996             else:
 997                 sign = 1 if m.group('sign') == '+' else -1
 998                 timezone = datetime.timedelta(
 999                     hours=sign * int(m.group('hours')),
1000                     minutes=sign * int(m.group('minutes')))
1001     try:
1002         date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1003         dt = datetime.datetime.strptime(date_str, date_format) - timezone
1004         return calendar.timegm(dt.timetuple())
1005     except ValueError:
1006         pass
1007
1008
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    date_str is tried against a list of known date formats; day_first
    decides whether ambiguous numeric dates are read as day/month (default)
    or month/day.  If several formats match, the last matching one wins.
    As a fallback the string is handed to email.utils.parsedate_tz.
    Returns None if date_str is None or no date could be extracted.
    """

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M',
        '%b %dnd %Y %I:%M',
        # "3rd", "23rd": this ordinal suffix used to be missing
        '%b %drd %Y %I:%M',
        '%b %dth %Y %I:%M',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d.%m.%y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    # Deliberately no break: a later matching format overrides an earlier one
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
    return None
1077
1078
1079 def determine_ext(url, default_ext='unknown_video'):
1080     if url is None:
1081         return default_ext
1082     guess = url.partition('?')[0].rpartition('.')[2]
1083     if re.match(r'^[A-Za-z0-9]+$', guess):
1084         return guess
1085     # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1086     elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1087         return guess.rstrip('/')
1088     else:
1089         return default_ext
1090
1091
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1094
1095
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    'now', 'today' and 'yesterday' on their own are accepted as well.
    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError if date_str has none of these forms."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names (days, weeks)
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1123
1124
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1133
1134
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A missing bound defaults to the earliest / latest representable date.
        Raises ValueError if start lies after end.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed as a date string
        day = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= day <= self.end

    def __str__(self):
        return ' - '.join((self.start.isoformat(), self.end.isoformat()))
1164
1165
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # A byte string result is decoded with the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1174
1175
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s is the text to write and out the target stream.  The text is written
    with the Win32 WriteConsoleW function, which is only attempted when out
    has file descriptor 1 or 2 and the corresponding standard handle is a
    console.  Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 and -12 are the Win32 STD_OUTPUT_HANDLE / STD_ERROR_HANDLE values)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for an invalid/missing handle, for a handle that is not a
        # character device, or when GetConsoleMode fails on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters at once, stopping before a non-BMP one
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; a length of 2
        # is passed for it
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever was actually written
            s = s[written.value:]
    return True
1249
1250
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On Windows, when no encoding is forced, the console API is tried first.
    Otherwise the text is encoded (unencodable characters are dropped) for
    binary streams and for streams exposing a .buffer; other streams receive
    the text as is.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    wants_console = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if wants_console and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    binary_stream = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if binary_stream:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        target_encoding = (
            encoding or getattr(out, 'encoding', None) or preferredencoding())
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1271
1272
def bytes_to_intlist(bs):
    """Return the items of the byte string bs as a list of integers."""
    if not bs:
        return []
    # Python 3 bytes already yield ints; Python 2 yields 1-char strings
    if isinstance(bs[0], int):
        return list(bs)
    return [ord(ch) for ch in bs]
1280
1281
def intlist_to_bytes(xs):
    """Pack a sequence of integers into a byte string, one unsigned byte each."""
    return compat_struct_pack('%dB' % len(xs), *xs) if xs else b''
1286
1287
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx on Windows, fcntl.flock where the fcntl
# module exists, and stubs raising IOError everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high parts of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raise OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can pass it again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # Flag 0x2 requests an exclusive lock, 0x0 a shared one
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Unlock a file locked by _lock_file; raise OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock on the open file f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock held on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            """Always raises IOError: locking is unavailable here."""
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            """Always raises IOError: locking is unavailable here."""
            raise IOError(UNSUPPORTED_MSG)
1361
1362
class locked_file(object):
    """Context manager around a file that is locked while it is in use.

    The file is opened on construction.  Entering the context takes a shared
    lock for mode 'r' and an exclusive lock for 'a' and 'w'; leaving it
    unlocks and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            # Readers may share the lock, writers need it exclusively
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the handle if the lock could not be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1392
1393
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None."""
    detected = sys.getfilesystemencoding()
    if detected is None:
        return 'utf-8'
    return detected
1397
1398
def shell_quote(args):
    """Return args as one shell-escaped command line string.

    args is an iterable of arguments.  Byte strings are decoded with the
    file system encoding first, every argument is then quoted and the
    results are joined with single spaces.
    """
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # compat_shlex_quote instead of pipes.quote: the latter is an
        # undocumented alias in a module that newer Pythons no longer ship
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
1408
1409
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded and appended to url as a URL-encoded fragment.
    """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse_urlencode(payload)
1416
1417
def unsmuggle_url(smug_url, default=None):
    """Split a URL made by smuggle_url into (url, data).

    URLs without smuggled data are returned unchanged, paired with default.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    plain_url, _, fragment = smug_url.rpartition('#')
    encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return plain_url, json.loads(encoded)
1425
1426
def format_bytes(bytes):
    """Format a byte count as a human readable string such as '1.50KiB'.

    bytes may be a number, a numeric str or None (giving 'N/A').  The value
    is scaled by the largest fitting power of 1024 and printed with two
    decimals.  Values below one byte are shown in 'B' and values beyond the
    largest known unit in 'YiB'.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the index inside the table: a negative exponent (tiny
        # fractions) would silently select a suffix from the end of the list
        # and a too large one would raise IndexError
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1439
1440
def lookup_unit_table(unit_table, s):
    """Parse '<number> <unit>' at the start of s using the given unit table.

    unit_table maps unit names to multipliers.  The number may use ',' or
    '.' as decimal separator.  Returns the product as an int, or None if s
    does not start with a number followed by a known unit.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives
    found = re.match(pattern, s)
    if found is None:
        return None
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1450
1451
def parse_filesize(s):
    """Parse a size such as '1.5 MiB' into a number of bytes.

    Returns None for None input or if lookup_unit_table finds no size.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary, decimal = 1024 ** power, 1000 ** power
        unit_table[prefix + 'iB'] = binary
        unit_table[prefix + 'B'] = decimal
        unit_table[prefix.lower() + 'B'] = binary
        unit_table[prefix + 'b'] = decimal

    return lookup_unit_table(unit_table, s)
1496
1497
def parse_count(s):
    """Parse a count such as '1,000' or '1.5k' into an int.

    Returns None for None input or if no count could be recognised.
    """
    if s is None:
        return None

    text = s.strip()

    # Plain numbers, possibly with thousands separators
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    thousand = 1000
    million = 1000 ** 2
    multipliers = {
        'k': thousand, 'K': thousand,
        'm': million, 'M': million,
        'kk': million, 'KK': million,
    }
    return lookup_unit_table(multipliers, text)
1517
1518
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name

    Returns None if name is not in ENGLISH_MONTH_NAMES. """
    for number, month in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month == name:
            return number
    return None
1526
1527
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations

    The abbreviation is the first three letters of the month name; None is
    returned if nothing matches. """
    for number, month in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month[:3] == abbrev:
            return number
    return None
1536
1537
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the entities amp, lt, gt, apos or
    quot, or a numeric character reference, are left alone."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1544
1545
def setproctitle(title):
    """Set the name of the current process to title (best effort).

    title must be a compat_str.  The name is set through prctl() from
    libc.so.6; on Jython, or when that library or function is unavailable,
    nothing happens.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one item larger than
    # the data, so it is always NUL-terminated.  A buffer of exactly
    # len(title_bytes) would hold the name without terminator and prctl
    # could read past its end.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1565
1566
def remove_start(s, start):
    """Return s without the prefix start; s is returned as is otherwise (also if None)."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
1569
1570
def remove_end(s, end):
    """Return s without the suffix end; s is returned as is otherwise (also if None).

    An empty end leaves s untouched.
    """
    # The truthiness check on end matters: with an empty suffix the slice
    # would be s[:-0], i.e. s[:0], and the whole string would be dropped.
    if s is not None and s.endswith(end) and end:
        return s[:-len(end)]
    return s
1573
1574
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s.

    None, strings shorter than two characters and strings without matching
    surrounding quotes are returned unchanged.
    """
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1582
1583
def url_basename(url):
    """Return the last segment of the path of url, ignoring surrounding slashes."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1587
1588
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
1592
1593
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, returning default if that is not possible.

    With get_attr the named attribute of v is converted instead of v.  The
    result is int(v) * invscale // scale.  None, the empty string and values
    int() cannot convert all yield default.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        # TypeError: unsupported types such as lists or dicts,
        # OverflowError: infinite floats
        return default
1606
1607
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default if v is None."""
    if v is None:
        return default
    return compat_str(v)
1610
1611
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are removed before the conversion; None is
    passed through. """
    if int_str is None:
        return None
    cleaned = re.sub(r'[,\.\+]', '', int_str)
    return int(cleaned)
1618
1619
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, returning default if that is not possible.

    The result is float(v) * invscale / scale.  None and values float()
    cannot convert yield default.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError covers unsupported types such as lists or dicts
        return default
1627
1628
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Understood are clock notation ([[[DD:]HH:]MM:]SS[.fraction]), unit
    notation such as '1h 30m 5.5s' (optionally prefixed by T or PT) and a
    single decimal amount of hours or minutes ('1.5 hours', '10 min').
    Returns None if s is not a string or matches none of these forms.
    """
    if not isinstance(s, compat_basestring):
        return None

    text = s.strip()

    # Tried in this order; the first matching pattern decides
    patterns = (
        r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$',
        r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''',
        r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$',
    )

    # Components a pattern does not provide stay None
    fields = dict.fromkeys(('days', 'hours', 'mins', 'secs', 'ms'))
    for pattern in patterns:
        found = re.match(pattern, text)
        if found:
            fields.update(found.groupdict())
            break
    else:
        return None

    duration = 0
    if fields['secs']:
        duration += float(fields['secs'])
    if fields['mins']:
        duration += float(fields['mins']) * 60
    if fields['hours']:
        duration += float(fields['hours']) * 60 * 60
    if fields['days']:
        duration += float(fields['days']) * 24 * 60 * 60
    if fields['ms']:
        # Holds the fractional part including its leading dot
        duration += float(fields['ms'])
    return duration
1675
1676
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the real extension of filename.

    If expected_real_ext is given and differs from the actual extension,
    ext is appended to the complete filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1683
1684
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of filename by ext.

    If expected_real_ext is given and differs from the actual extension,
    ext is appended to the complete filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
1690
1691
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False if the binary could not be started. """
    try:
        process = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1700
1701
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    The program is run with args and its combined stdout/stderr output is
    handed to detect_exe_version. """
    command = [encodeArgument(exe)] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1715
1716
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of a program.

    version_re must contain one group capturing the version; by default the
    token following the word 'version' is used.  Returns unrecognized if
    the pattern is not found.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    if not found:
        return unrecognized
    return found.group(1)
1726
1727
class PagedList(object):
    """Base class of the paged lists; relies on a getslice() method that this class itself does not define."""

    def __len__(self):
        # This is only useful for tests
        # (it fetches the complete list through getslice() just to count it)
        return len(self.getslice())
1732
1733
class OnDemandPagedList(PagedList):
    """Paged list that fetches its pages only when they are needed.

    pagefunc(pagenum) must return an iterable with the entries of the given
    page, and pagesize is the number of entries of a full page.  With
    use_cache, fetched pages are remembered per instance.
    """

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries start (inclusive) to end (exclusive) as a list."""
        collected = []
        size = self._pagesize
        for pagenum in itertools.count(start // size):
            page_first = pagenum * size
            next_page_first = page_first + size
            if start >= next_page_first:
                continue

            page = self._cache.get(pagenum) if self._use_cache else None
            if page is None:
                page = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page

            # Offsets inside this page; they only differ from "everything"
            # on the pages containing start or end
            lo = start % size if page_first <= start < next_page_first else 0
            hi = None
            if end is not None and page_first <= end <= next_page_first:
                hi = ((end - 1) % size) + 1

            if lo != 0 or hi is not None:
                page = page[lo:hi]
            collected.extend(page)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page) + lo < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == next_page_first:
                break
        return collected
1784
1785
class InAdvancePagedList(PagedList):
    """ Paged list whose total number of pages is known beforehand.

    pagefunc(pagenum) is called with a zero-based page number and must
    return an iterable with the entries of that page; pagecount is the
    total number of pages and pagesize the number of entries per page. """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with indices in [start, end) as a list.

        end=None means "up to the last page". """
        # An empty or inverted range selects nothing; a negative remaining
        # count would otherwise be used as a slice bound counted from the
        # end of the page.
        if end is not None and end <= start:
            return []

        res = []
        start_page = start // self._pagesize
        # Never request pages beyond the declared page count, even when end
        # points past the last entry
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page has leading entries to drop
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) >= only_more:
                    # This page completes the requested range
                    res.extend(page[:only_more])
                    break
                only_more -= len(page)
            res.extend(page)
        return res
1813
1814
def uppercase_escape(s):
    """ Replace each literal backslash + capital U + eight hex digits
    sequence in s with the character it denotes; other text is untouched. """
    decode = codecs.getdecoder('unicode_escape')

    def _expand(mobj):
        # The decoder returns (text, consumed_length); only the text matters
        return decode(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
1821
1822
def lowercase_escape(s):
    """ Replace each literal backslash + lowercase u + four hex digits
    sequence in s with the character it denotes; other text is untouched. """
    decode = codecs.getdecoder('unicode_escape')

    def _expand(mobj):
        # The decoder returns (text, consumed_length); only the text matters
        return decode(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _expand, s)
1829
1830
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    on_python2 = sys.version_info < (3, 0)
    if on_python2 and isinstance(s, compat_str):
        # On Python 2 the text is handed to quote() as UTF-8 bytes
        s = s.encode('utf-8')
    # Characters that stay unescaped: '%' plus URL delimiters and sub-delimiters
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1836
1837
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host is IDNA-encoded; every other component is percent-escaped
    escaped = parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
1848
1849
def read_batch_urls(batch_fd):
    """ Read URLs from a batch file object, one per line.

    batch_fd may yield bytes or text lines and is closed before returning.
    A leading byte order mark is removed, surrounding whitespace is
    stripped, and empty lines as well as lines starting with '#', ';' or
    ']' are skipped. Returns the list of remaining URLs. """
    # '\ufeff' is what a UTF-8 BOM becomes once the line is decoded as UTF-8.
    # '\xef\xbb\xbf' is the form the original check looked for (presumably
    # text that reached us mis-decoded byte by byte); it is still honoured.
    BOM_MARKS = ('\ufeff', '\xef\xbb\xbf')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOM_MARKS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1864
1865
def urlencode_postdata(*args, **kargs):
    """ URL-encode the given arguments (forwarded unchanged to
    compat_urllib_parse_urlencode) and return the result as ASCII bytes. """
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1868
1869
def update_url_query(url, query):
    """ Return url with the parameters from query merged into its query
    string; parameters already present are overridden. An empty query
    leaves url untouched. """
    if query:
        parts = compat_urlparse.urlparse(url)
        merged = compat_parse_qs(parts.query)
        merged.update(query)
        # doseq=True: list values are emitted as repeated parameters
        new_query = compat_urllib_parse_urlencode(merged, True)
        return compat_urlparse.urlunparse(parts._replace(query=new_query))
    return url
1878
1879
def update_Request(req, url=None, data=None, headers={}, query={}):
    """ Build a new request derived from req.

    url and data replace the original values when given (truthy), headers
    are merged over the original headers, and query is merged into the URL's
    query string. A HEAD request stays a HEADRequest; origin_req_host,
    unverifiable and (when present) timeout are carried over. """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)
    if req.get_method() == 'HEAD':
        request_cls = HEADRequest
    else:
        request_cls = compat_urllib_request.Request
    clone = request_cls(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        clone.timeout = req.timeout
    return clone
1892
1893
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """ Look up one key, or the first usable key out of several, in d.

    For a single key this is d.get(key, default). For a list or tuple of
    keys the value of the first key that is present and not None is
    returned; with skip_false_values (the default) falsy values are skipped
    as well. default is returned when no key qualifies. """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
1902
1903
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """ Return string as text: text is passed through unchanged, anything
    else is converted with compat_str(string, encoding, errors). """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1906
1907
# Rating label -> minimum age, used by parse_age_limit() for values that
# are not plain numbers
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """ Convert an age limit description to a number of years.

    Accepts None (returned as is), an integer in the same 0-99 range the
    textual form allows, a string of one or two digits with an optional
    trailing '+' (e.g. '16', '18+'), or a key of US_RATINGS. Anything
    unrecognized yields None. """
    if s is None:
        return None
    # Already-numeric limits used to fail in re.match() with a TypeError;
    # bool is excluded because it is an int subclass
    if isinstance(s, int) and not isinstance(s, bool):
        return s if 0 <= s <= 99 else None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)
1922
1923
def strip_jsonp(code):
    """ Unwrap a JSONP response of the form callback(...); and return the
    text between the parentheses. Trailing // comments are tolerated;
    input that does not look like JSONP is returned unchanged. """
    jsonp_pattern = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_pattern, r'\1', code)
1927
1928
def js_to_json(code):
    """ Convert a JavaScript object/array literal to JSON text.

    Handled: single-quoted strings, unquoted and numeric object keys,
    /* */ comments, trailing commas before ']' or '}', and hexadecimal or
    octal integer literals (converted to decimal). Quoted strings always
    stay strings, even when their content looks like a number. """
    # (pattern, base) pairs for non-decimal integer literals
    INTEGER_TABLE = (
        (r'^0[xX][0-9a-fA-F]+', 16),
        (r'^0+[0-7]+', 8),
    )
    # Rewrites applied inside string literals to obtain valid JSON escapes
    STRING_ESCAPES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }

    def fix_escape(m):
        return STRING_ESCAPES.get(m.group(0), m.group(0))

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v == ',':
            return ""

        if v[0] in ("'", '"'):
            # A quoted token is a string; it must not fall through to the
            # integer conversion below ("00:01:07" used to become 0)
            return '"%s"' % re.sub(r'(?s)\\.|"', fix_escape, v[1:-1])

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(0), base)
                # Numeric object keys must become quoted JSON keys
                return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    # The lookbehind keeps the hex/octal alternative from matching in the
    # middle of another number (100 used to become 10, 1.05 became 1.5)
    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        (?<![\w.])(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
1966
1967
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its position in quality_ids,
    or to -1 when the id is not listed. """
    def rank_of(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return rank_of
1976
1977
# Template of the form "<title>-<id>.<ext>", written with %-style named
# placeholders (presumably the default output filename template - its users
# are outside this part of the file)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1979
1980
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than length are cut so that the result, including the
    trailing '...', is exactly length characters long. None is passed
    through. """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    return s[:length - len(ELLIPSES)] + ELLIPSES
1989
1990
def version_tuple(v):
    """ Split a version string on '.' and '-' and return its components as
    a tuple of ints. Raises ValueError for non-numeric components. """
    return tuple(map(int, re.split(r'[-.]', v)))
1993
1994
def is_outdated_version(version, limit, assume_new=True):
    """ Tell whether version is older than limit.

    When version is empty or either version cannot be parsed, the answer is
    "not outdated" if assume_new is true and "outdated" otherwise. """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
    return outdated
2002
2003
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded from a zip archive ...
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    # ... or when the interpreter carries a 'frozen' attribute
    return hasattr(sys, 'frozen')
2009
2010
def args_to_str(args):
    # Get a short string representation for a subprocess command:
    # every argument is shell-quoted, then all are joined with spaces
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2014
2015
def error_to_compat_str(err):
    """ Return the message of err as text. """
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2023
2024
def mimetype2ext(mt):
    """ Map a MIME type to a file extension.

    Parameters following the type (e.g. '; charset=utf-8') and surrounding
    whitespace are ignored, and the known types are matched
    case-insensitively. For unknown types the subtype is returned as
    given. None is passed through. """
    if mt is None:
        return None

    # 'text/vtt; charset=utf-8' -> 'text/vtt'; without this the parameters
    # would end up in the returned extension
    mt = mt.split(';')[0].strip()

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mt.lower())
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')

    # Unknown subtypes keep their original spelling
    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
    }.get(res.lower(), res)
2051
2052
def urlhandle_detect_ext(url_handle):
    """ Guess a file extension from the headers of an opened URL.

    The filename of an attachment Content-Disposition header wins when it
    yields an extension; otherwise the Content-Type header is mapped via
    mimetype2ext(). """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
2065
2066
def encode_data_uri(data, mime_type):
    """ Build a base64 data: URI for the bytes in data, labelled with
    mime_type. """
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2069
2070
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set by the user, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2079
2080
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    The bytes are decoded according to a leading byte order mark (UTF-8
    when there is none). Returns a match object when the text starts with
    optional whitespace followed by '<', None otherwise. """

    # The 4-byte UTF-32 marks come before the 2-byte UTF-16 marks because
    # the UTF-32-LE mark starts with the UTF-16-LE one
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    text = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            text = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if text is None:
        text = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', text)
2099
2100
def determine_protocol(info_dict):
    """ Work out the download protocol for info_dict.

    An explicit 'protocol' entry wins. Otherwise the protocol is derived
    from the 'url' entry: first from an rtmp/mms/rtsp prefix, then from an
    m3u8/f4m extension, and finally from the URL scheme. """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    # Prefix checks, so that variants such as rtmpe:// are covered too
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2121
2122
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest rendered cell of every column
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*rows)]
    # All columns but the last are left-aligned and padded to width + 1
    cell_formats = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    line_format = ' '.join(cell_formats) + '%s'
    return '\n'.join(line_format % tuple(row) for row in rows)
2129
2130
2131 def _match_one(filter_part, dct):
2132     COMPARISON_OPERATORS = {
2133         '<': operator.lt,
2134         '<=': operator.le,
2135         '>': operator.gt,
2136         '>=': operator.ge,
2137         '=': operator.eq,
2138         '!=': operator.ne,
2139     }
2140     operator_rex = re.compile(r'''(?x)\s*
2141         (?P<key>[a-z_]+)
2142         \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2143         (?:
2144             (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2145             (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2146         )
2147         \s*$
2148         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2149     m = operator_rex.search(filter_part)
2150     if m:
2151         op = COMPARISON_OPERATORS[m.group('op')]
2152         if m.group('strval') is not None:
2153             if m.group('op') not in ('=', '!='):
2154                 raise ValueError(
2155                     'Operator %s does not support string values!' % m.group('op'))
2156             comparison_value = m.group('strval')
2157         else:
2158             try:
2159                 comparison_value = int(m.group('intval'))
2160             except ValueError:
2161                 comparison_value = parse_filesize(m.group('intval'))
2162                 if comparison_value is None:
2163                     comparison_value = parse_filesize(m.group('intval') + 'B')
2164                 if comparison_value is None:
2165                     raise ValueError(
2166                         'Invalid integer value %r in filter part %r' % (
2167                             m.group('intval'), filter_part))
2168         actual_value = dct.get(m.group('key'))
2169         if actual_value is None:
2170             return m.group('none_inclusive')
2171         return op(actual_value, comparison_value)
2172
2173     UNARY_OPERATORS = {
2174         '': lambda v: v is not None,
2175         '!': lambda v: v is None,
2176     }
2177     operator_rex = re.compile(r'''(?x)\s*
2178         (?P<op>%s)\s*(?P<key>[a-z_]+)
2179         \s*$
2180         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2181     m = operator_rex.search(filter_part)
2182     if m:
2183         op = UNARY_OPERATORS[m.group('op')]
2184         actual_value = dct.get(m.group('key'))
2185         return op(actual_value)
2186
2187     raise ValueError('Invalid filter part %r' % filter_part)
2188
2189
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The parts are separated by '&' and all of them have to match;
    # evaluation stops at the first part that fails
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2195
2196
def match_filter_func(filter_str):
    """ Build a filter callback out of filter_str.

    The returned function takes an info dict and returns None when it
    passes the filter, or a message explaining why it is skipped. """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
2205
2206
def parse_dfxp_time_expr(time_expr):
    """ Convert a DFXP/TTML time expression to seconds (float).

    Understands plain offsets in seconds with an optional 's' suffix
    ('12.5', '3s') and clock values 'H:MM:SS' whose fraction may be
    separated by '.' or ':'. Returns None for empty or unrecognized
    input. """
    if not time_expr:
        return

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        # A ':' before the fraction is treated like a decimal point
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
2218
2219
def srt_subtitles_timecode(seconds):
    """ Format a number of seconds as an SRT timecode 'HH:MM:SS,mmm'.

    The value is rounded to the nearest millisecond first. Formatting the
    float parts directly with %d truncates instead, so binary rounding
    error showed up as an off-by-one millisecond (2.3 -> '00:00:02,299').
    Negative values are not meaningful for a timecode. """
    total_ms = int(round(seconds * 1000))
    total_secs, millis = divmod(total_ms, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, mins = divmod(total_mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millis)
2222
2223
def dfxp2srt(dfxp_data):
    """ Convert a DFXP/TTML subtitle document (text) to SRT text.

    Paragraphs are looked up in the TTML namespace, then in two TTAF1
    namespaces, then without namespace. Paragraphs without a begin time, or
    with neither an end time nor a duration, are skipped; the SRT counter
    follows the paragraph position, so skipped paragraphs leave gaps.
    Raises ValueError when the document contains no paragraphs. """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Parser target that collects the text of one paragraph and turns
        # <br> elements into newlines
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Re-serialize the element and feed it through the collecting target
        parser = xml.etree.ElementTree.XMLParser(target=TTMLPElementParser())
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    paras = (
        dfxp.findall(_x('.//ttml:p'))
        or dfxp.findall(_x('.//ttaf1:p'))
        or dfxp.findall(_x('.//ttaf1_0604:p'))
        or dfxp.findall('.//p'))

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    out = []
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2277
2278
def cli_option(params, command_option, param):
    """ Return [command_option, value] for the value stored under param in
    params, or an empty list when it is unset (None). """
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2282
2283
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """ Turn the boolean stored under param in params into command line
    arguments.

    Returns [command_option, value], or the single argument
    command_option + separator + value when a separator is given; value is
    true_value or false_value. The parameter has to be a bool. """
    param = params.get(param)
    assert isinstance(param, bool)
    value = true_value if param else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
2290
2291
def cli_valueless_option(params, command_option, param, expected_value=True):
    """ Return [command_option] when the value stored under param in params
    equals expected_value, and an empty list otherwise. """
    if params.get(param) == expected_value:
        return [command_option]
    return []
2295
2296
def cli_configuration_args(params, param, default=[]):
    """ Return the extra command line arguments stored under param in
    params (they have to be a list), or default when they are unset.

    A list default is returned as a copy, so callers that extend the result
    cannot modify the shared default argument (or the list they passed
    in). """
    ex_args = params.get(param)
    if ex_args is None:
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2303
2304
class ISO639Utils(object):
    """ Conversion between two-letter and three-letter language codes. """

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked at; unknown codes
        give None."""
        short_code = code[:2]
        return cls._lang_map.get(short_code)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        Unknown codes give None."""
        matches = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matches, None)
2505
2506
class ISO3166Utils(object):
    """ Lookup of country names by two-letter country code. """

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter (ISO 3166-1 alpha-2) country code to the
        corresponding full name

        The lookup ignores case; unknown codes give None."""
        normalized = code.upper()
        return cls._country_map.get(normalized)
2765
2766
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler whose proxy can be overridden by individual requests.

    A request carrying a 'Ytdl-request-proxy' header is sent through the
    proxy named in that header instead of the handler-wide one; the header
    is removed from the request while it is being processed.
    """

    def __init__(self, proxies=None):
        # Install fallback http_open/https_open openers that go through
        # proxy_open() with the '__noproxy__' marker.  They are set before
        # the base constructor runs, which is then handed `proxies`.
        bound_proxy_open = self.proxy_open
        for scheme in ('http', 'https'):
            def default_open(r, proxy='__noproxy__', type=scheme, meth=bound_proxy_open):
                return meth(r, proxy, type)
            setattr(self, '%s_open' % scheme, default_open)
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Pick the proxy for req and delegate to ProxyHandler if needed.

        Returns None when no proxy applies or when a SOCKS proxy was
        requested; otherwise returns whatever ProxyHandler.proxy_open does.
        """
        # A per-request proxy takes precedence and its header is consumed
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = per_request_proxy

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # Only record the SOCKS proxy on the request here: youtube-dl's
            # http/https handlers do the wrapping of the socket with socks
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
2790
2791
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt a single block with OHDave's RSA algorithm.
    See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The bytes are reversed before being read as one big hexadecimal number
    reversed_hex = binascii.hexlify(data[::-1])
    plain_number = int(reversed_hex, 16)
    cipher_number = pow(plain_number, exponent, modulus)
    return '{0:x}'.format(cipher_number)
2807
2808
def encode_base_n(num, n, table=None):
    """
    Encode the integer num as a string of base-n digits.

    Input:
        num: integer to encode; a negative value is encoded as '-' followed
             by the encoding of its absolute value
        n: the base
        table: optional sequence of digit characters; when omitted or empty
               the first n characters of 0-9, a-z, A-Z are used
    Output: the encoded string

    Raises ValueError if n exceeds the table length, or if a non-zero num
    has to be encoded with a base smaller than 2.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    if n < 2:
        # Repeated division by such a base never reaches zero (or divides
        # by zero), so refuse instead of looping forever
        raise ValueError('base %d is too small to encode %d' % (n, num))

    # Floor division never brings a negative number to zero (-1 // n == -1),
    # so encode the magnitude and put the sign back in front afterwards
    sign = '-' if num < 0 else ''
    num = abs(num)

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    digits.reverse()  # digits were collected least significant first
    return sign + ''.join(digits)
2825
2826
def decode_packed_codes(code):
    """
    Expand packed code back into its unpacked form.

    The input is searched for an argument list of the shape
    }('PAYLOAD',BASE,COUNT,'sym0|sym1|...'.split('|')
    (presumably the output of a P.A.C.K.E.R.-style JavaScript packer).
    Every index below COUNT, written in base BASE, stands for the symbol
    at that position, or for itself when that symbol is empty.  Each word
    of the payload is replaced with what it stands for.

    Input:
        code: string containing the packed code
    Output: the payload with all words substituted
    """
    packed = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    obfuscated_code, base, count, symbols = packed.groups()
    radix = int(base)
    index = int(count)
    symbols = symbols.split('|')

    # Map each encoded index to its symbol, walking from the highest index
    # down to zero
    replacements = {}
    while index > 0:
        index -= 1
        encoded_index = encode_base_n(index, radix)
        replacements[encoded_index] = symbols[index] or encoded_index

    def substitute(match):
        return replacements[match.group(0)]

    return re.sub(r'\b(\w+)\b', substitute, obfuscated_code)