2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
40 compat_etree_fromstring,
46 compat_socket_create_connection,
51 compat_urllib_parse_urlencode,
52 compat_urllib_parse_urlparse,
53 compat_urllib_request,
def register_socks_protocols():
    """Teach urlparse that SOCKS proxy schemes carry a network location.

    In Python < 2.6.5, urlsplit() suffers from the bug
    https://bugs.python.org/issue7904: URLs whose scheme is not listed in
    urlparse.uses_netloc are not handled correctly, so each SOCKS scheme
    is appended to that list (once) before use.
    """
    for proto in ('socks', 'socks4', 'socks4a', 'socks5'):
        if proto not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(proto)
73 # This is not clearly defined otherwise
74 compiled_regex_type = type(re.compile(''))
77 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
78 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
79 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
80 'Accept-Encoding': 'gzip, deflate',
81 'Accept-Language': 'en-us,en;q=0.5',
87 ENGLISH_MONTH_NAMES = [
88 'January', 'February', 'March', 'April', 'May', 'June',
89 'July', 'August', 'September', 'October', 'November', 'December']
92 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
93 'flv', 'f4v', 'f4a', 'f4b',
94 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
104 'f4f', 'f4m', 'm3u8', 'smil')
106 # needed for sanitizing filenames in restricted mode
107 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ',
108 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'],
109 'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy')))
112 def preferredencoding():
113 """Get preferred encoding.
115 Returns the best encoding scheme for the system, based on
116 locale.getpreferredencoding() and some further tweaks.
119 pref = locale.getpreferredencoding()
127 def write_json_file(obj, fn):
128 """ Encode obj as JSON and write it to fn, atomically if possible """
130 fn = encodeFilename(fn)
131 if sys.version_info < (3, 0) and sys.platform != 'win32':
132 encoding = get_filesystem_encoding()
133 # os.path.basename returns a bytes object, but NamedTemporaryFile
134 # will fail if the filename contains non ascii characters unless we
135 # use a unicode object
136 path_basename = lambda f: os.path.basename(fn).decode(encoding)
137 # the same for os.path.dirname
138 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
140 path_basename = os.path.basename
141 path_dirname = os.path.dirname
145 'prefix': path_basename(fn) + '.',
146 'dir': path_dirname(fn),
150 # In Python 2.x, json.dump expects a bytestream.
151 # In Python 3.x, it writes to a character stream
152 if sys.version_info < (3, 0):
160 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
165 if sys.platform == 'win32':
166 # Need to remove existing file on Windows, else os.rename raises
167 # WindowsError or FileExistsError.
172 os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """Find the first element matching ``xpath`` that carries attribute
        ``key`` (restricted to the value ``val`` when one is given), i.e.
        xpath[@key] or xpath[@key='val']."""
        # The attribute name is interpolated into the XPath predicate, so
        # restrict it to characters that cannot alter the expression.
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
188 def find_xpath_attr(node, xpath, key, val=None):
189 for f in node.findall(compat_xpath(xpath)):
190 if key not in f.attrib:
192 if val is None or f.attrib.get(key) == val:
196 # On python2.6 the xml.etree.ElementTree.Element methods don't support
197 # the namespace parameter
200 def xpath_with_ns(path, ns_map):
201 components = [c.split(':') for c in path.split('/')]
205 replaced.append(c[0])
208 replaced.append('{%s}%s' % (ns_map[ns], tag))
209 return '/'.join(replaced)
212 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
213 def _find_xpath(xpath):
214 return node.find(compat_xpath(xpath))
216 if isinstance(xpath, (str, compat_str)):
217 n = _find_xpath(xpath)
225 if default is not NO_DEFAULT:
228 name = xpath if name is None else name
229 raise ExtractorError('Could not find XML element %s' % name)
235 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
236 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
237 if n is None or n == default:
240 if default is not NO_DEFAULT:
243 name = xpath if name is None else name
244 raise ExtractorError('Could not find XML element\'s text %s' % name)
250 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
251 n = find_xpath_attr(node, xpath, key)
253 if default is not NO_DEFAULT:
256 name = '%s[@%s]' % (xpath, key) if name is None else name
257 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed
    HTML document.

    Thin convenience wrapper over get_element_by_attribute() for the
    common case of looking up by the 'id' attribute.  NOTE: the parameter
    name `id` shadows the builtin but is kept for interface compatibility.
    """
    return get_element_by_attribute('id', id, html)
268 def get_element_by_attribute(attribute, value, html):
269 """Return the content of the tag with the specified attribute in the passed HTML document"""
271 m = re.search(r'''(?xs)
273 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
275 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
279 ''' % (re.escape(attribute), re.escape(value)), html)
283 res = m.group('content')
285 if res.startswith('"') or res.startswith("'"):
288 return unescapeHTML(res)
291 class HTMLAttributeParser(compat_HTMLParser):
292 """Trivial HTML parser to gather the attributes for a single element"""
295 compat_HTMLParser.__init__(self)
297 def handle_starttag(self, tag, attrs):
298 self.attrs = dict(attrs)
301 def extract_attributes(html_element):
302 """Given a string for an HTML element such as
304 a="foo" B="bar" c="&98;az" d=boz
305 empty= noval entity="&"
308 Decode and return a dictionary of attributes.
310 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
311 'empty': '', 'noval': None, 'entity': '&',
312 'sq': '"', 'dq': '\''
314 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
315 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
317 parser = HTMLAttributeParser()
318 parser.feed(html_element)
323 def clean_html(html):
324 """Clean an HTML snippet into a readable string"""
326 if html is None: # Convenience for sanitizing descriptions etc.
330 html = html.replace('\n', ' ')
331 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
332 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
334 html = re.sub('<.*?>', '', html)
335 # Replace html entities
336 html = unescapeHTML(html)
340 def sanitize_open(filename, open_mode):
341 """Try to open the given filename, and slightly tweak it if this fails.
343 Attempts to open the given filename. If this fails, it tries to change
344 the filename slightly, step by step, until it's either able to open it
345 or it fails and raises a final exception, like the standard open()
348 It returns the tuple (stream, definitive_file_name).
352 if sys.platform == 'win32':
354 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
355 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
356 stream = open(encodeFilename(filename), open_mode)
357 return (stream, filename)
358 except (IOError, OSError) as err:
359 if err.errno in (errno.EACCES,):
362 # In case of error, try to remove win32 forbidden chars
363 alt_filename = sanitize_path(filename)
364 if alt_filename == filename:
367 # An exception here should be caught in the caller
368 stream = open(encodeFilename(alt_filename), open_mode)
369 return (stream, alt_filename)
372 def timeconvert(timestr):
373 """Convert RFC 2822 defined time string into system timestamp"""
375 timetuple = email.utils.parsedate_tz(timestr)
376 if timetuple is not None:
377 timestamp = email.utils.mktime_tz(timetuple)
381 def sanitize_filename(s, restricted=False, is_id=False):
382 """Sanitizes a string so it could be used as part of a filename.
383 If restricted is set, use a stricter subset of allowed characters.
384 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
386 def replace_insane(char):
387 if restricted and char in ACCENT_CHARS:
388 return ACCENT_CHARS[char]
389 if char == '?' or ord(char) < 32 or ord(char) == 127:
392 return '' if restricted else '\''
394 return '_-' if restricted else ' -'
395 elif char in '\\/|*<>':
397 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
399 if restricted and ord(char) > 127:
404 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
405 result = ''.join(map(replace_insane, s))
407 while '__' in result:
408 result = result.replace('__', '_')
409 result = result.strip('_')
410 # Common case of "Foreign band name - English song title"
411 if restricted and result.startswith('-_'):
413 if result.startswith('-'):
414 result = '_' + result[len('-'):]
415 result = result.lstrip('.')
421 def sanitize_path(s):
422 """Sanitizes and normalizes path on Windows"""
423 if sys.platform != 'win32':
425 drive_or_unc, _ = os.path.splitdrive(s)
426 if sys.version_info < (2, 7) and not drive_or_unc:
427 drive_or_unc, _ = os.path.splitunc(s)
428 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
432 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
433 for path_part in norm_path]
435 sanitized_path.insert(0, drive_or_unc + os.path.sep)
436 return os.path.join(*sanitized_path)
# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitize_url(url):
    """Give scheme-relative URLs ('//host/path') an explicit http: scheme;
    return every other URL unchanged."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request after normalizing scheme-relative URLs via
    sanitize_url(); all other arguments are forwarded unchanged."""
    clean_url = sanitize_url(url)
    return compat_urllib_request.Request(clean_url, *args, **kwargs)
449 def orderedSet(iterable):
450 """ Remove all duplicates from the input iterable """
458 def _htmlentity_transform(entity):
459 """Transforms an HTML entity to a character."""
460 # Known non-numeric HTML entity
461 if entity in compat_html_entities.name2codepoint:
462 return compat_chr(compat_html_entities.name2codepoint[entity])
464 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
466 numstr = mobj.group(1)
467 if numstr.startswith('x'):
469 numstr = '0%s' % numstr
472 # See https://github.com/rg3/youtube-dl/issues/7518
474 return compat_chr(int(numstr, base))
478 # Unknown entity in name, return its literal representation
479 return '&%s;' % entity
485 assert type(s) == compat_str
488 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
491 def get_subprocess_encoding():
492 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
493 # For subprocess calls, encode with locale encoding
494 # Refer to http://stackoverflow.com/a/9951851/35070
495 encoding = preferredencoding()
497 encoding = sys.getfilesystemencoding()
503 def encodeFilename(s, for_subprocess=False):
505 @param s The name of the file
508 assert type(s) == compat_str
510 # Python 3 has a Unicode API
511 if sys.version_info >= (3, 0):
514 # Pass '' directly to use Unicode APIs on Windows 2000 and up
515 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
516 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
517 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
520 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
521 if sys.platform.startswith('java'):
524 return s.encode(get_subprocess_encoding(), 'ignore')
527 def decodeFilename(b, for_subprocess=False):
529 if sys.version_info >= (3, 0):
532 if not isinstance(b, bytes):
535 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for passing to a subprocess."""
    if not isinstance(s, compat_str):
        # Legacy byte-string input from old call sites; promote it to text
        # first.  TODO: turn this into an assertion once all post
        # processors consistently pass compat_str.
        s = s.decode('ascii')
    return encodeFilename(s, True)
def decodeArgument(b):
    """Decode a subprocess command-line argument back into text, using the
    subprocess-specific filename decoding."""
    return decodeFilename(b, True)
551 def decodeOption(optval):
554 if isinstance(optval, bytes):
555 optval = optval.decode(preferredencoding())
557 assert isinstance(optval, compat_str)
561 def formatSeconds(secs):
563 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
565 return '%d:%02d' % (secs // 60, secs % 60)
570 def make_HTTPS_handler(params, **kwargs):
571 opts_no_check_certificate = params.get('nocheckcertificate', False)
572 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
573 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
574 if opts_no_check_certificate:
575 context.check_hostname = False
576 context.verify_mode = ssl.CERT_NONE
578 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
581 # (create_default_context present but HTTPSHandler has no context=)
584 if sys.version_info < (3, 2):
585 return YoutubeDLHTTPSHandler(params, **kwargs)
587 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
588 context.verify_mode = (ssl.CERT_NONE
589 if opts_no_check_certificate
590 else ssl.CERT_REQUIRED)
591 context.set_default_verify_paths()
592 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
595 def bug_reports_message():
596 if ytdl_is_updateable():
597 update_cmd = 'type youtube-dl -U to update'
599 update_cmd = 'see https://yt-dl.org/update on how to update'
600 msg = '; please report this issue on https://yt-dl.org/bug .'
601 msg += ' Make sure you are using the latest version; %s.' % update_cmd
602 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
606 class ExtractorError(Exception):
607 """Error during info extraction."""
609 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
610 """ tb, if given, is the original traceback (so that it can be printed out).
611 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
614 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
616 if video_id is not None:
617 msg = video_id + ': ' + msg
619 msg += ' (caused by %r)' % cause
621 msg += bug_reports_message()
622 super(ExtractorError, self).__init__(msg)
625 self.exc_info = sys.exc_info() # preserve original exception
627 self.video_id = video_id
629 def format_traceback(self):
630 if self.traceback is None:
632 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised for URLs that no extractor is able to handle; always an
    'expected' error (not a youtube-dl bug)."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
642 class RegexNotFoundError(ExtractorError):
643 """Error when a regex didn't match"""
647 class DownloadError(Exception):
648 """Download Error exception.
650 This exception may be thrown by FileDownloader objects if they are not
651 configured to continue on errors. They will contain the appropriate
655 def __init__(self, msg, exc_info=None):
656 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
657 super(DownloadError, self).__init__(msg)
658 self.exc_info = exc_info
661 class SameFileError(Exception):
662 """Same File exception.
664 This exception will be thrown by FileDownloader objects if they detect
665 multiple files would have to be downloaded to the same file on disk.
670 class PostProcessingError(Exception):
671 """Post Processing exception.
673 This exception may be raised by PostProcessor's .run() method to
674 indicate an error in the postprocessing task.
677 def __init__(self, msg):
681 class MaxDownloadsReached(Exception):
682 """ --max-downloads limit has been reached. """
686 class UnavailableVideoError(Exception):
687 """Unavailable Format exception.
689 This exception will be thrown when a video is requested
690 in a format that is not available for that video.
695 class ContentTooShortError(Exception):
696 """Content Too Short exception.
698 This exception may be raised by FileDownloader objects when a file they
699 download is too small for what the server announced first, indicating
700 the connection was probably interrupted.
703 def __init__(self, downloaded, expected):
705 self.downloaded = downloaded
706 self.expected = expected
709 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
710 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
711 # expected HTTP responses to meet HTTP/1.0 or later (see also
712 # https://github.com/rg3/youtube-dl/issues/6727)
713 if sys.version_info < (3, 0):
714 kwargs[b'strict'] = True
715 hc = http_class(*args, **kwargs)
716 source_address = ydl_handler._params.get('source_address')
717 if source_address is not None:
718 sa = (source_address, 0)
719 if hasattr(hc, 'source_address'): # Python 2.7+
720 hc.source_address = sa
722 def _hc_connect(self, *args, **kwargs):
723 sock = compat_socket_create_connection(
724 (self.host, self.port), self.timeout, sa)
726 self.sock = ssl.wrap_socket(
727 sock, self.key_file, self.cert_file,
728 ssl_version=ssl.PROTOCOL_TLSv1)
731 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Apply youtube-dl's internal marker headers and strip them out.

    The 'Youtubedl-no-compression' marker asks us not to advertise
    compression support: when present, every 'Accept-Encoding' header is
    dropped along with the marker itself.  Without the marker, the input
    mapping is returned untouched.
    """
    result = headers
    if 'Youtubedl-no-compression' in result:
        stripped = {}
        for name, value in result.items():
            if name.lower() != 'accept-encoding':
                stripped[name] = value
        del stripped['Youtubedl-no-compression']
        result = stripped
    return result
746 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
747 """Handler for HTTP requests and responses.
749 This class, when installed with an OpenerDirector, automatically adds
750 the standard headers to every HTTP request and handles gzipped and
751 deflated responses from web servers. If compression is to be avoided in
752 a particular request, the original request in the program code only has
753 to include the HTTP header "Youtubedl-no-compression", which will be
754 removed before making the real request.
756 Part of this code was copied from:
758 http://techknack.net/python-urllib2-handlers/
760 Andrew Rowls, the author of that code, agreed to release it to the
764 def __init__(self, params, *args, **kwargs):
765 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
766 self._params = params
768 def http_open(self, req):
769 conn_class = compat_http_client.HTTPConnection
771 socks_proxy = req.headers.get('Ytdl-socks-proxy')
773 conn_class = make_socks_conn_class(conn_class, socks_proxy)
774 del req.headers['Ytdl-socks-proxy']
776 return self.do_open(functools.partial(
777 _create_http_connection, self, conn_class, False),
783 return zlib.decompress(data, -zlib.MAX_WBITS)
785 return zlib.decompress(data)
788 def addinfourl_wrapper(stream, headers, url, code):
789 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
790 return compat_urllib_request.addinfourl(stream, headers, url, code)
791 ret = compat_urllib_request.addinfourl(stream, headers, url)
795 def http_request(self, req):
796 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
797 # always respected by websites, some tend to give out URLs with non percent-encoded
798 # non-ASCII characters (see telemb.py, ard.py [#3412])
799 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
800 # To work around aforementioned issue we will replace request's original URL with
801 # percent-encoded one
802 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
803 # the code of this workaround has been moved here from YoutubeDL.urlopen()
804 url = req.get_full_url()
805 url_escaped = escape_url(url)
807 # Substitute URL if any change after escaping
808 if url != url_escaped:
809 req = update_Request(req, url=url_escaped)
811 for h, v in std_headers.items():
812 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
813 # The dict keys are capitalized because of this bug by urllib
814 if h.capitalize() not in req.headers:
817 req.headers = handle_youtubedl_headers(req.headers)
819 if sys.version_info < (2, 7) and '#' in req.get_full_url():
820 # Python 2.6 is brain-dead when it comes to fragments
821 req._Request__original = req._Request__original.partition('#')[0]
822 req._Request__r_type = req._Request__r_type.partition('#')[0]
826 def http_response(self, req, resp):
829 if resp.headers.get('Content-encoding', '') == 'gzip':
830 content = resp.read()
831 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
833 uncompressed = io.BytesIO(gz.read())
834 except IOError as original_ioerror:
835 # There may be junk add the end of the file
836 # See http://stackoverflow.com/q/4928560/35070 for details
837 for i in range(1, 1024):
839 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
840 uncompressed = io.BytesIO(gz.read())
845 raise original_ioerror
846 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
847 resp.msg = old_resp.msg
848 del resp.headers['Content-encoding']
850 if resp.headers.get('Content-encoding', '') == 'deflate':
851 gz = io.BytesIO(self.deflate(resp.read()))
852 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
853 resp.msg = old_resp.msg
854 del resp.headers['Content-encoding']
855 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
856 # https://github.com/rg3/youtube-dl/issues/6457).
857 if 300 <= resp.code < 400:
858 location = resp.headers.get('Location')
860 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
861 if sys.version_info >= (3, 0):
862 location = location.encode('iso-8859-1').decode('utf-8')
863 location_escaped = escape_url(location)
864 if location != location_escaped:
865 del resp.headers['Location']
866 resp.headers['Location'] = location_escaped
869 https_request = http_request
870 https_response = http_response
873 def make_socks_conn_class(base_class, socks_proxy):
874 assert issubclass(base_class, (
875 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
877 url_components = compat_urlparse.urlparse(socks_proxy)
878 if url_components.scheme.lower() == 'socks5':
879 socks_type = ProxyType.SOCKS5
880 elif url_components.scheme.lower() in ('socks', 'socks4'):
881 socks_type = ProxyType.SOCKS4
882 elif url_components.scheme.lower() == 'socks4a':
883 socks_type = ProxyType.SOCKS4A
887 url_components.hostname, url_components.port or 1080,
889 url_components.username, url_components.password
892 class SocksConnection(base_class):
894 self.sock = sockssocket()
895 self.sock.setproxy(*proxy_args)
896 if type(self.timeout) in (int, float):
897 self.sock.settimeout(self.timeout)
898 self.sock.connect((self.host, self.port))
900 if isinstance(self, compat_http_client.HTTPSConnection):
901 if hasattr(self, '_context'): # Python > 2.6
902 self.sock = self._context.wrap_socket(
903 self.sock, server_hostname=self.host)
905 self.sock = ssl.wrap_socket(self.sock)
907 return SocksConnection
910 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
911 def __init__(self, params, https_conn_class=None, *args, **kwargs):
912 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
913 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
914 self._params = params
916 def https_open(self, req):
918 conn_class = self._https_conn_class
920 if hasattr(self, '_context'): # python > 2.6
921 kwargs['context'] = self._context
922 if hasattr(self, '_check_hostname'): # python 3.x
923 kwargs['check_hostname'] = self._check_hostname
925 socks_proxy = req.headers.get('Ytdl-socks-proxy')
927 conn_class = make_socks_conn_class(conn_class, socks_proxy)
928 del req.headers['Ytdl-socks-proxy']
930 return self.do_open(functools.partial(
931 _create_http_connection, self, conn_class, True),
935 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
936 def __init__(self, cookiejar=None):
937 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
939 def http_response(self, request, response):
940 # Python 2 will choke on next HTTP request in row if there are non-ASCII
941 # characters in Set-Cookie HTTP header of last response (see
942 # https://github.com/rg3/youtube-dl/issues/6769).
943 # In order to at least prevent crashing we will percent encode Set-Cookie
944 # header before HTTPCookieProcessor starts processing it.
945 # if sys.version_info < (3, 0) and response.headers:
946 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
947 # set_cookie = response.headers.get(set_cookie_header)
949 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
950 # if set_cookie != set_cookie_escaped:
951 # del response.headers[set_cookie_header]
952 # response.headers[set_cookie_header] = set_cookie_escaped
953 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
955 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
956 https_response = http_response
959 def parse_iso8601(date_str, delimiter='T', timezone=None):
960 """ Return a UNIX timestamp from the given date """
965 date_str = re.sub(r'\.[0-9]+', '', date_str)
969 r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
972 timezone = datetime.timedelta()
974 date_str = date_str[:-len(m.group(0))]
975 if not m.group('sign'):
976 timezone = datetime.timedelta()
978 sign = 1 if m.group('sign') == '+' else -1
979 timezone = datetime.timedelta(
980 hours=sign * int(m.group('hours')),
981 minutes=sign * int(m.group('minutes')))
983 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
984 dt = datetime.datetime.strptime(date_str, date_format) - timezone
985 return calendar.timegm(dt.timetuple())
990 def unified_strdate(date_str, day_first=True):
991 """Return a string with the date in the format YYYYMMDD"""
997 date_str = date_str.replace(',', ' ')
998 # %z (UTC offset) is only supported in python>=3.2
999 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
1000 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
1001 # Remove AM/PM + timezone
1002 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1004 format_expressions = [
1015 '%Y/%m/%d %H:%M:%S',
1016 '%Y-%m-%d %H:%M:%S',
1017 '%Y-%m-%d %H:%M:%S.%f',
1020 '%Y-%m-%dT%H:%M:%SZ',
1021 '%Y-%m-%dT%H:%M:%S.%fZ',
1022 '%Y-%m-%dT%H:%M:%S.%f0Z',
1023 '%Y-%m-%dT%H:%M:%S',
1024 '%Y-%m-%dT%H:%M:%S.%f',
1028 format_expressions.extend([
1033 '%d/%m/%Y %H:%M:%S',
1036 format_expressions.extend([
1041 '%m/%d/%Y %H:%M:%S',
1043 for expression in format_expressions:
1045 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1048 if upload_date is None:
1049 timetuple = email.utils.parsedate_tz(date_str)
1051 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1052 if upload_date is not None:
1053 return compat_str(upload_date)
1056 def determine_ext(url, default_ext='unknown_video'):
1059 guess = url.partition('?')[0].rpartition('.')[2]
1060 if re.match(r'^[A-Za-z0-9]+$', guess):
1062 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1063 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1064 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name for a media file: the media
    extension is replaced by '<language>.<subtitle format>'."""
    # rsplit (not os.path.splitext) so a name without any dot keeps its
    # full text as the base.
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
1073 def date_from_str(date_str):
1075 Return a datetime object from a string in the format YYYYMMDD or
1076 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1077 today = datetime.date.today()
1078 if date_str in ('now', 'today'):
1080 if date_str == 'yesterday':
1081 return today - datetime.timedelta(days=1)
1082 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1083 if match is not None:
1084 sign = match.group('sign')
1085 time = int(match.group('time'))
1088 unit = match.group('unit')
1089 # A bad approximation?
1093 elif unit == 'year':
1097 delta = datetime.timedelta(**{unit: time})
1098 return today + delta
1099 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1102 def hyphenate_date(date_str):
1104 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1105 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1106 if match is not None:
1107 return '-'.join(match.groups())
1112 class DateRange(object):
1113 """Represents a time interval between two dates"""
1115 def __init__(self, start=None, end=None):
1116 """start and end must be strings in the format accepted by date"""
1117 if start is not None:
1118 self.start = date_from_str(start)
1120 self.start = datetime.datetime.min.date()
1122 self.end = date_from_str(end)
1124 self.end = datetime.datetime.max.date()
1125 if self.start > self.end:
1126 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1130 """Returns a range that only contains the given day"""
1131 return cls(day, day)
1133 def __contains__(self, date):
1134 """Check if the date is in the range"""
1135 if not isinstance(date, datetime.date):
1136 date = date_from_str(date)
1137 return self.start <= date <= self.end
1140 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1143 def platform_name():
1144 """ Returns the platform name as a compat_str """
1145 res = platform.platform()
1146 if isinstance(res, bytes):
1147 res = res.decode(preferredencoding())
1149 assert isinstance(res, compat_str)
1153 def _windows_write_string(s, out):
1154 """ Returns True if the string was written using special methods,
1155 False if it has yet to be written out."""
1156 # Adapted from http://stackoverflow.com/a/3259271/35070
1159 import ctypes.wintypes
1167 fileno = out.fileno()
1168 except AttributeError:
1169 # If the output stream doesn't have a fileno, it's virtual
1171 except io.UnsupportedOperation:
1172 # Some strange Windows pseudo files?
1174 if fileno not in WIN_OUTPUT_IDS:
1177 GetStdHandle = ctypes.WINFUNCTYPE(
1178 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1179 (b'GetStdHandle', ctypes.windll.kernel32))
1180 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1182 WriteConsoleW = ctypes.WINFUNCTYPE(
1183 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1184 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1185 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1186 written = ctypes.wintypes.DWORD(0)
1188 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1189 FILE_TYPE_CHAR = 0x0002
1190 FILE_TYPE_REMOTE = 0x8000
1191 GetConsoleMode = ctypes.WINFUNCTYPE(
1192 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1193 ctypes.POINTER(ctypes.wintypes.DWORD))(
1194 (b'GetConsoleMode', ctypes.windll.kernel32))
1195 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1197 def not_a_console(handle):
1198 if handle == INVALID_HANDLE_VALUE or handle is None:
1200 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1201 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1203 if not_a_console(h):
1206 def next_nonbmp_pos(s):
1208 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1209 except StopIteration:
1213 count = min(next_nonbmp_pos(s), 1024)
1215 ret = WriteConsoleW(
1216 h, s, count if count else 2, ctypes.byref(written), None)
1218 raise OSError('Failed to write string')
1219 if not count: # We just wrote a non-BMP character
1220 assert written.value == 2
1223 assert written.value > 0
1224 s = s[written.value:]
# Write the text *s* to stream *out*, choosing a console-aware path on
# Windows and an explicit-encoding path for byte streams / Python 2.
def write_string(s, out=None, encoding=None):
    assert type(s) == compat_str
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        # NOTE(review): _windows_write_string returns truthy on success —
        # presumably this branch returns early then; confirm in full source.
        if _windows_write_string(s, out):
    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        # Byte stream: encode ourselves, dropping unrepresentable characters.
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        # Python 3 text stream: write encoded bytes to the raw buffer.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
# Convert a byte string into a list of integer byte values (portable
# across Python 2, where indexing bytes yields 1-char strings, and 3).
def bytes_to_intlist(bs):
    if isinstance(bs[0], int):  # Python 3
        # Python 2 path: elements are str, so ord() is needed.
        return [ord(c) for c in bs]
# Inverse of bytes_to_intlist: pack a list of ints (0-255) into bytes.
def intlist_to_bytes(xs):
    return compat_struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
# Defines _lock_file/_unlock_file: Win32 LockFileEx/UnlockFileEx via ctypes,
# fcntl.flock on POSIX, and raising stubs where neither is available.
if sys.platform == 'win32':
    import ctypes.wintypes

    # Win32 OVERLAPPED structure required by LockFileEx/UnlockFileEx.
    class OVERLAPPED(ctypes.Structure):
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: low/high DWORD halves of the byte-range length.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object for the later unlock call.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
        # Some platforms, such as Jython, is missing fcntl

    # POSIX path: advisory whole-file locks via flock(2).
    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)

    UNSUPPORTED_MSG = 'file locking is not supported on this platform'

    # Fallback stubs: locking is a hard error rather than a silent no-op.
    def _lock_file(f, exclusive):
        raise IOError(UNSUPPORTED_MSG)

    def _unlock_file(f):
        raise IOError(UNSUPPORTED_MSG)
# File wrapper that takes a platform lock on __enter__ and releases it
# (and closes the file) on __exit__; delegates read/write to the real file.
class locked_file(object):
    def __init__(self, filename, mode, encoding=None):
        # Only plain read/append/write modes are supported.
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Writers need an exclusive lock; readers can share.
        exclusive = self.mode != 'r'
            _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
            _unlock_file(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to UTF-8 when unknown."""
    reported = sys.getfilesystemencoding()
    if reported is None:
        return 'utf-8'
    return reported
# Join *args* into one shell-safe command-line string, decoding any
# encodeFilename-produced byte strings first.
def shell_quote(args):
    encoding = get_filesystem_encoding()
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The payload rides in the fragment so servers never see it;
    # unsmuggle_url() strips it back out.
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata
# Inverse of smuggle_url(): split the fragment off and JSON-decode the
# smuggled payload; returns (url, default) when nothing was smuggled.
def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
# Human-readable byte count, e.g. 1536 -> '1.50KiB' (1024-based units).
def format_bytes(bytes):
    if type(bytes) is str:
        bytes = float(bytes)
        # log base 1024 picks the unit; e.g. exponent 1 -> KiB.
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
# Parse strings like '1.5MiB' against *unit_table* ({unit: multiplier})
# and return the integer value, accepting ',' as a decimal separator.
def lookup_unit_table(unit_table, s):
    units_re = '|'.join(re.escape(u) for u in unit_table)
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    # European-style decimal comma -> dot before float().
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
# Parse a human filesize string ('5.6mb', '120KiB', ...) into bytes.
def parse_filesize(s):
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    return lookup_unit_table(_UNIT_TABLE, s)

    # NOTE(review): the following lines belong to the sibling count parser
    # (plain digits fall through to str_to_int, otherwise a unit table).
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)
    return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
        # 1-based month number; a lookup miss is handled by the caller scope.
        return ENGLISH_MONTH_NAMES.index(name) + 1
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        # Compare against 3-letter prefixes of the full month names.
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
# Escape bare '&' characters in malformed XML without double-escaping
# already-valid entities (&amp;, &lt;, numeric references, ...).
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
# Best-effort: rename the current process via prctl(PR_SET_NAME) on Linux;
# silently does nothing when libc/prctl is unavailable.
def setproctitle(title):
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):

        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
        # 15 = PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
# Strip *start* from the beginning of *s* when present.
def remove_start(s, start):
    if s.startswith(start):
        return s[len(start):]
# Strip *end* from the end of *s* when present.
def remove_end(s, end):
        return s[:-len(end)]
# Strip one matching pair of surrounding quotes (single or double).
def remove_quotes(s):
    if s is None or len(s) < 2:
    for quote in ('"', "'", ):
        # Only remove when the same quote character wraps both ends.
        if s[0] == quote and s[-1] == quote:
def url_basename(url):
    """Return the last path segment of *url* (empty string for a bare host)."""
    url_path = compat_urlparse.urlparse(url).path
    segments = url_path.strip('/').split('/')
    return segments[-1]
# Request subclass that issues HEAD instead of GET/POST.
class HEADRequest(compat_urllib_request.Request):
    def get_method(self):
# Lenient int conversion: optional attribute fetch, scaling, and a
# *default* fallback instead of raising on bad input.
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
            v = getattr(v, get_attr, None)
        # Integer math: multiply by invscale, then floor-divide by scale.
        return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Stringify *v* with compat_str, or return *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    # Drop thousands separators and stray '+' before converting.
    int_str = re.sub(r'[,\.\+]', '', int_str)
# Lenient float conversion with scaling; falls back to *default* on error.
def float_or_none(v, scale=1, invscale=1, default=None):
        return float(v) * invscale / scale
# Parse a duration string into seconds (float). Tries, in order:
# colon-separated HH:MM:SS[.ms], verbose '1d 2h 3m 4.5s' forms, and
# decimal '1.5 hours' / '90 mins' forms.
def parse_duration(s):
    if not isinstance(s, compat_basestring):

    days, hours, mins, secs, ms = [None] * 5
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
        days, hours, mins, secs, ms = m.groups()
                (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
            days, hours, mins, secs, ms = m.groups()
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
                hours, mins = m.groups()
    # Accumulate whichever components matched into total seconds.
        duration += float(secs)
        duration += float(mins) * 60
        duration += float(hours) * 60 * 60
        duration += float(days) * 24 * 60 * 60
        duration += float(ms)
# Insert *ext* before the real extension ('a.mp4' -> 'a.temp.mp4'); when
# the real extension is not the expected one, append instead.
def prepend_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
# Swap the file extension for *ext*; when the current extension is not
# the expected one, keep the full name and just append the new extension.
def replace_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
        # Probe by actually running it; output is discarded via PIPEs.
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
        # stderr merged into stdout: many tools print their version there.
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
# Extract a version string from tool output; *unrecognized* is returned
# when no match is found.
def detect_exe_version(output, version_re=None, unrecognized='present'):
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
# Abstract base for lazily-paged result lists; subclasses implement
# getslice(start, end).
class PagedList(object):
        # This is only useful for tests
        return len(self.getslice())
# PagedList that fetches pages on demand from *pagefunc*, with optional
# per-page caching, and stops as soon as the requested slice is complete.
class OnDemandPagedList(PagedList):
    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache

    def getslice(self, start=0, end=None):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            # Page entirely before the requested range: skip it.
            if start >= nextfirstid:

                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
                self._cache[pagenum] = page_results

            # Offsets of the requested slice within this page (when the
            # slice boundary lands inside it).
                start % self._pagesize
                if firstid <= start < nextfirstid

                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
# PagedList variant where the total page count is known in advance, so the
# page range to fetch can be computed up front.
class InAdvancePagedList(PagedList):
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        start_page = start // self._pagesize
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Elements to drop from the first page / total still wanted.
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
                page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    page = page[:only_more]
# Decode \UXXXXXXXX escape sequences embedded in *s*.
def uppercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
# Decode \uXXXX escape sequences embedded in *s*.
def lowercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Everything outside this safe set gets percent-encoded.
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Hostname is IDNA-encoded; the other components are percent-escaped.
    return url_parsed._replace(
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
# Read a batch file of URLs, normalizing encoding/BOM and skipping
# comment lines; closes *batch_fd* when done.
def read_batch_urls(batch_fd):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        # Comment markers; ']' guards against PLS playlist headers.
        if url.startswith(('#', ';', ']')):

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return ASCII bytes suitable for POSTing."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
# Merge *query* into the URL's existing query string and rebuild the URL.
def update_url_query(url, query):
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
# Clone a urllib Request, optionally overriding url/data/headers/query,
# preserving HEAD-ness and the optional timeout attribute.
# NOTE(review): the {} defaults are never mutated here, so they are safe.
def update_Request(req, url=None, data=None, headers={}, query={}):
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    # Keep HEAD requests HEAD after cloning.
    req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
# Fetch the first usable value for one key or a sequence of candidate keys;
# by default falsy values are skipped as well as missing/None ones.
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            if key not in d or d[key] is None or skip_false_values and not d[key]:
    return d.get(key_or_keys, default)
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* as compat_str, decoding byte strings with *encoding*."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
# Parse an age limit like '18' or '18+'; falls back to the US_RATINGS
# lookup table for strings such as 'PG-13'.
def parse_age_limit(s):
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)
# Strip a JSONP wrapper callback(...) down to the bare JSON payload.
def strip_jsonp(code):
        r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
# Best-effort conversion of JavaScript object literals to valid JSON:
# normalizes quoting of strings/identifiers and drops trailing commas.
def js_to_json(code):
        if v in ('true', 'false', 'null'):
        if v.startswith('"'):
            v = re.sub(r"\\'", "'", v[1:-1])
        elif v.startswith("'"):
            # Translate single-quoted escapes into double-quote JSON escapes.
            v = re.sub(r"\\\\|\\'|\"", lambda m: {

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
    # Remove trailing commas before closing brackets/braces.
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
            # Position in the list is the quality rank.
            return quality_ids.index(qid)
# Output filename template used when the user does not supply one.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
        # Truncate so that the result including the ellipses fits *length*.
        return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
# Compare two version strings; unparsable/missing input yields the
# *assume_new* default rather than raising.
def is_outdated_version(version, limit, assume_new=True):
        return not assume_new
        return version_tuple(version) < version_tuple(limit)
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    # Updatable only when running from a zip bundle or a frozen exe.
    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
# Stringify an exception safely on both Python 2 and 3.
def error_to_compat_str(err):
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
# Map a MIME type to a conventional file extension; unknown subtypes
# fall back to the bare subtype string.
def mimetype2ext(mt):
    _, _, res = mt.rpartition('/')
        'smptett+xml': 'tt',
        'x-mp4-fragmented': 'mp4',
# Guess the file extension of a response: Content-Disposition filename
# first, then the Content-Type MIME mapping.
def urlhandle_detect_ext(url_handle):
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
            e = determine_ext(m.group('filename'), default_ext=None)

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI with a base64-encoded payload."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
    if content_limit is None:
        return False  # Content available for everyone
    # Block when the viewer's allowed age is below the content's limit.
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Known byte-order marks and the encodings they imply.
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
        # No BOM: assume UTF-8.
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
# Determine the download protocol for an info dict: explicit 'protocol'
# wins, then URL scheme prefixes, then extension, then the parsed scheme.
def determine_protocol(info_dict):
    protocol = info_dict.get('protocol')
    if protocol is not None:

    url = info_dict['url']
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):

    ext = determine_ext(url)

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell in each column decides that column's width.
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    parts = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(parts) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
# Evaluate a single filter expression (e.g. 'duration > 600', 'id = x',
# '!is_live') against dict *dct*; raises ValueError for malformed parts.
def _match_one(filter_part, dct):
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # Only equality tests make sense for strings.
            if m.group('op') not in ('=', '!='):
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
                comparison_value = int(m.group('intval'))
                # Not a plain int: try '50k'-style suffixed sizes, with and
                # without a trailing 'B'.
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # '?' after the operator means a missing key passes the filter.
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

        '': lambda v: v is not None,
        '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&' joins sub-filters; all of them must pass.
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
# Build a match-filter callback: returns None on pass, or a human-readable
# skip message when the info dict fails *filter_str*.
def match_filter_func(filter_str):
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
# Parse a TTML/DFXP time expression ('12.5s' or 'HH:MM:SS[.f]') to seconds.
def parse_dfxp_time_expr(time_expr):
    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
        # Frame-style ':NN' suffixes are treated as a decimal fraction.
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hrs = seconds / 3600
    mins = (seconds % 3600) / 60
    secs = seconds % 60
    msecs = (seconds % 1) * 1000
    # %d truncates each float component toward zero, as the original did.
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, msecs)
# Convert DFXP/TTML subtitle XML into SRT text, tolerating the several
# TTML namespace variants.
def dfxp2srt(dfxp_data):
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',

    # Target parser that flattens a <p> node to plain text, turning <br/>
    # (in any supported namespace) into newlines.
    class TTMLPElementParser(object):
        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):

        def data(self, data):

            return self.out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))

    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            # Derive the end from begin + dur when no explicit end is given.
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
# Render a boolean option either as two argv items or, with *separator*,
# as a single 'opt<sep>value' item.
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    param = params.get(param)
    assert isinstance(param, bool)
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit the bare flag *command_option* when params[param] == expected_value."""
    value = params.get(param)
    if value == expected_value:
        return [command_option]
    return []
# Fetch a list-valued option from *params*, falling back to *default*.
def cli_configuration_args(params, param, default=[]):
    ex_args = params.get(param)
    assert isinstance(ex_args, list)
# Helpers for converting between ISO 639-1 (2-letter) and ISO 639-2/T
# (3-letter) language codes via the class-level _lang_map table.
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt

    # NOTE(review): takes cls — presumably decorated @classmethod in full source.
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    # NOTE(review): takes cls — presumably decorated @classmethod in full source.
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup: scan the map for the matching long code.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
# Lookup of ISO 3166 alpha-2 country codes to full English country names.
class ISO3166Utils(object):
    # From http://data.okfn.org/data/core/country-list
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',

    # NOTE(review): takes cls — presumably decorated @classmethod in full source.
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
# ProxyHandler variant that lets each request override the proxy via the
# internal 'Ytdl-request-proxy' header; SOCKS proxies are delegated to the
# http/https handlers through the 'Ytdl-socks-proxy' header.
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # Consume the internal header so it never leaves the process.
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
# Textbook RSA over a single block: little-endian payload, pow(m, e, N),
# result rendered as a lowercase hex string.
def ohdave_rsa_encrypt(data, exponent, modulus):
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    data: data to encrypt, bytes-like object
    exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only

    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
# Render *num* in base *n* using *table* as the digit alphabet (defaults
# to 0-9a-zA-Z, so n may be at most 62 without a custom table).
def encode_base_n(num, n, table=None):
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
        table = FULL_TABLE[:n]

        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

        # Build the representation least-significant digit first.
        ret = table[num % n] + ret
# Reverse Dean Edwards-style p,a,c,k,e,d JavaScript: rebuild the symbol
# table from the packed payload and substitute each word back in.
def decode_packed_codes(code):
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
    obfucasted_code, base, count, symbols = mobj.groups()
    symbols = symbols.split('|')

        # Key is *count* rendered in the pack's base; empty slots keep the key.
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],