_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import calendar
   8 import codecs
   9 import contextlib
  10 import ctypes
  11 import datetime
  12 import email.utils
  13 import errno
  14 import functools
  15 import gzip
  16 import itertools
  17 import io
  18 import json
  19 import locale
  20 import math
  21 import operator
  22 import os
  23 import pipes
  24 import platform
  25 import re
  26 import ssl
  27 import socket
  28 import struct
  29 import subprocess
  30 import sys
  31 import tempfile
  32 import traceback
  33 import xml.etree.ElementTree
  34 import zlib
  35
  36 from .compat import (
  37     compat_basestring,
  38     compat_chr,
  39     compat_etree_fromstring,
  40     compat_html_entities,
  41     compat_http_client,
  42     compat_kwargs,
  43     compat_parse_qs,
  44     compat_socket_create_connection,
  45     compat_str,
  46     compat_urllib_error,
  47     compat_urllib_parse,
  48     compat_urllib_parse_urlparse,
  49     compat_urllib_request,
  50     compat_urlparse,
  51     shlex_quote,
  52 )
  53
  54
# Type object of compiled regular expressions, obtained by compiling an empty
# pattern (this is not clearly defined otherwise)
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers. YoutubeDLHandler.http_request adds each entry
# to an outgoing request that does not already carry that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Unique sentinel used as the "no default supplied" marker of the xpath_*
# helpers, so that None stays usable as an explicit default value
NO_DEFAULT = object()

# English month names in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
  72
  73
  74 def preferredencoding():
  75     """Get preferred encoding.
  76
  77     Returns the best encoding scheme for the system, based on
  78     locale.getpreferredencoding() and some further tweaks.
  79     """
  80     try:
  81         pref = locale.getpreferredencoding()
  82         'TEST'.encode(pref)
  83     except Exception:
  84         pref = 'UTF-8'
  85
  86     return pref
  87
  88
  89 def write_json_file(obj, fn):
  90     """ Encode obj as JSON and write it to fn, atomically if possible """
  91
  92     fn = encodeFilename(fn)
  93     if sys.version_info < (3, 0) and sys.platform != 'win32':
  94         encoding = get_filesystem_encoding()
  95         # os.path.basename returns a bytes object, but NamedTemporaryFile
  96         # will fail if the filename contains non ascii characters unless we
  97         # use a unicode object
  98         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  99         # the same for os.path.dirname
 100         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 101     else:
 102         path_basename = os.path.basename
 103         path_dirname = os.path.dirname
 104
 105     args = {
 106         'suffix': '.tmp',
 107         'prefix': path_basename(fn) + '.',
 108         'dir': path_dirname(fn),
 109         'delete': False,
 110     }
 111
 112     # In Python 2.x, json.dump expects a bytestream.
 113     # In Python 3.x, it writes to a character stream
 114     if sys.version_info < (3, 0):
 115         args['mode'] = 'wb'
 116     else:
 117         args.update({
 118             'mode': 'w',
 119             'encoding': 'utf-8',
 120         })
 121
 122     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 123
 124     try:
 125         with tf:
 126             json.dump(obj, tf)
 127         if sys.platform == 'win32':
 128             # Need to remove existing file on Windows, else os.rename raises
 129             # WindowsError or FileExistsError.
 130             try:
 131                 os.unlink(fn)
 132             except OSError:
 133                 pass
 134         os.rename(tf.name, fn)
 135     except Exception:
 136         try:
 137             os.remove(tf.name)
 138         except OSError:
 139             pass
 140         raise
 141
 142
 143 if sys.version_info >= (2, 7):
 144     def find_xpath_attr(node, xpath, key, val=None):
 145         """ Find the xpath xpath[@key=val] """
 146         assert re.match(r'^[a-zA-Z_-]+$', key)
 147         if val:
 148             assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 149         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 150         return node.find(expr)
 151 else:
 152     def find_xpath_attr(node, xpath, key, val=None):
 153         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 154         # .//node does not match if a node is a direct child of . !
 155         if isinstance(xpath, compat_str):
 156             xpath = xpath.encode('ascii')
 157
 158         for f in node.findall(xpath):
 159             if key not in f.attrib:
 160                 continue
 161             if val is None or f.attrib.get(key) == val:
 162                 return f
 163         return None
 164
 165 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 166 # the namespace parameter
 167
 168
 169 def xpath_with_ns(path, ns_map):
 170     components = [c.split(':') for c in path.split('/')]
 171     replaced = []
 172     for c in components:
 173         if len(c) == 1:
 174             replaced.append(c[0])
 175         else:
 176             ns, tag = c
 177             replaced.append('{%s}%s' % (ns_map[ns], tag))
 178     return '/'.join(replaced)
 179
 180
 181 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 182     def _find_xpath(xpath):
 183         if sys.version_info < (2, 7):  # Crazy 2.6
 184             xpath = xpath.encode('ascii')
 185         return node.find(xpath)
 186
 187     if isinstance(xpath, (str, compat_str)):
 188         n = _find_xpath(xpath)
 189     else:
 190         for xp in xpath:
 191             n = _find_xpath(xp)
 192             if n is not None:
 193                 break
 194
 195     if n is None:
 196         if default is not NO_DEFAULT:
 197             return default
 198         elif fatal:
 199             name = xpath if name is None else name
 200             raise ExtractorError('Could not find XML element %s' % name)
 201         else:
 202             return None
 203     return n
 204
 205
 206 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 207     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 208     if n is None or n == default:
 209         return n
 210     if n.text is None:
 211         if default is not NO_DEFAULT:
 212             return default
 213         elif fatal:
 214             name = xpath if name is None else name
 215             raise ExtractorError('Could not find XML element\'s text %s' % name)
 216         else:
 217             return None
 218     return n.text
 219
 220
 221 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 222     n = find_xpath_attr(node, xpath, key)
 223     if n is None:
 224         if default is not NO_DEFAULT:
 225             return default
 226         elif fatal:
 227             name = '%s[@%s]' % (xpath, key) if name is None else name
 228             raise ExtractorError('Could not find XML attribute %s' % name)
 229         else:
 230             return None
 231     return n.attrib[key]
 232
 233
 234 def get_element_by_id(id, html):
 235     """Return the content of the tag with the specified ID in the passed HTML document"""
 236     return get_element_by_attribute("id", id, html)
 237
 238
 239 def get_element_by_attribute(attribute, value, html):
 240     """Return the content of the tag with the specified attribute in the passed HTML document"""
 241
 242     m = re.search(r'''(?xs)
 243         <([a-zA-Z0-9:._-]+)
 244          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 245          \s+%s=['"]?%s['"]?
 246          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 247         \s*>
 248         (?P<content>.*?)
 249         </\1>
 250     ''' % (re.escape(attribute), re.escape(value)), html)
 251
 252     if not m:
 253         return None
 254     res = m.group('content')
 255
 256     if res.startswith('"') or res.startswith("'"):
 257         res = res[1:-1]
 258
 259     return unescapeHTML(res)
 260
 261
 262 def clean_html(html):
 263     """Clean an HTML snippet into a readable string"""
 264
 265     if html is None:  # Convenience for sanitizing descriptions etc.
 266         return html
 267
 268     # Newline vs <br />
 269     html = html.replace('\n', ' ')
 270     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 271     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 272     # Strip html tags
 273     html = re.sub('<.*?>', '', html)
 274     # Replace html entities
 275     html = unescapeHTML(html)
 276     return html.strip()
 277
 278
 279 def sanitize_open(filename, open_mode):
 280     """Try to open the given filename, and slightly tweak it if this fails.
 281
 282     Attempts to open the given filename. If this fails, it tries to change
 283     the filename slightly, step by step, until it's either able to open it
 284     or it fails and raises a final exception, like the standard open()
 285     function.
 286
 287     It returns the tuple (stream, definitive_file_name).
 288     """
 289     try:
 290         if filename == '-':
 291             if sys.platform == 'win32':
 292                 import msvcrt
 293                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 294             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 295         stream = open(encodeFilename(filename), open_mode)
 296         return (stream, filename)
 297     except (IOError, OSError) as err:
 298         if err.errno in (errno.EACCES,):
 299             raise
 300
 301         # In case of error, try to remove win32 forbidden chars
 302         alt_filename = sanitize_path(filename)
 303         if alt_filename == filename:
 304             raise
 305         else:
 306             # An exception here should be caught in the caller
 307             stream = open(encodeFilename(alt_filename), open_mode)
 308             return (stream, alt_filename)
 309
 310
 311 def timeconvert(timestr):
 312     """Convert RFC 2822 defined time string into system timestamp"""
 313     timestamp = None
 314     timetuple = email.utils.parsedate_tz(timestr)
 315     if timetuple is not None:
 316         timestamp = email.utils.mktime_tz(timetuple)
 317     return timestamp
 318
 319
 320 def sanitize_filename(s, restricted=False, is_id=False):
 321     """Sanitizes a string so it could be used as part of a filename.
 322     If restricted is set, use a stricter subset of allowed characters.
 323     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 324     """
 325     def replace_insane(char):
 326         if char == '?' or ord(char) < 32 or ord(char) == 127:
 327             return ''
 328         elif char == '"':
 329             return '' if restricted else '\''
 330         elif char == ':':
 331             return '_-' if restricted else ' -'
 332         elif char in '\\/|*<>':
 333             return '_'
 334         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 335             return '_'
 336         if restricted and ord(char) > 127:
 337             return '_'
 338         return char
 339
 340     # Handle timestamps
 341     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 342     result = ''.join(map(replace_insane, s))
 343     if not is_id:
 344         while '__' in result:
 345             result = result.replace('__', '_')
 346         result = result.strip('_')
 347         # Common case of "Foreign band name - English song title"
 348         if restricted and result.startswith('-_'):
 349             result = result[2:]
 350         if result.startswith('-'):
 351             result = '_' + result[len('-'):]
 352         result = result.lstrip('.')
 353         if not result:
 354             result = '_'
 355     return result
 356
 357
 358 def sanitize_path(s):
 359     """Sanitizes and normalizes path on Windows"""
 360     if sys.platform != 'win32':
 361         return s
 362     drive_or_unc, _ = os.path.splitdrive(s)
 363     if sys.version_info < (2, 7) and not drive_or_unc:
 364         drive_or_unc, _ = os.path.splitunc(s)
 365     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 366     if drive_or_unc:
 367         norm_path.pop(0)
 368     sanitized_path = [
 369         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
 370         for path_part in norm_path]
 371     if drive_or_unc:
 372         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 373     return os.path.join(*sanitized_path)
 374
 375
 376 def orderedSet(iterable):
 377     """ Remove all duplicates from the input iterable """
 378     res = []
 379     for el in iterable:
 380         if el not in res:
 381             res.append(el)
 382     return res
 383
 384
 385 def _htmlentity_transform(entity):
 386     """Transforms an HTML entity to a character."""
 387     # Known non-numeric HTML entity
 388     if entity in compat_html_entities.name2codepoint:
 389         return compat_chr(compat_html_entities.name2codepoint[entity])
 390
 391     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 392     if mobj is not None:
 393         numstr = mobj.group(1)
 394         if numstr.startswith('x'):
 395             base = 16
 396             numstr = '0%s' % numstr
 397         else:
 398             base = 10
 399         return compat_chr(int(numstr, base))
 400
 401     # Unknown entity in name, return its literal representation
 402     return ('&%s;' % entity)
 403
 404
 405 def unescapeHTML(s):
 406     if s is None:
 407         return None
 408     assert type(s) == compat_str
 409
 410     return re.sub(
 411         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 412
 413
 414 def get_subprocess_encoding():
 415     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 416         # For subprocess calls, encode with locale encoding
 417         # Refer to http://stackoverflow.com/a/9951851/35070
 418         encoding = preferredencoding()
 419     else:
 420         encoding = sys.getfilesystemencoding()
 421     if encoding is None:
 422         encoding = 'utf-8'
 423     return encoding
 424
 425
 426 def encodeFilename(s, for_subprocess=False):
 427     """
 428     @param s The name of the file
 429     """
 430
 431     assert type(s) == compat_str
 432
 433     # Python 3 has a Unicode API
 434     if sys.version_info >= (3, 0):
 435         return s
 436
 437     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 438     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 439     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 440     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 441         return s
 442
 443     return s.encode(get_subprocess_encoding(), 'ignore')
 444
 445
 446 def decodeFilename(b, for_subprocess=False):
 447
 448     if sys.version_info >= (3, 0):
 449         return b
 450
 451     if not isinstance(b, bytes):
 452         return b
 453
 454     return b.decode(get_subprocess_encoding(), 'ignore')
 455
 456
 457 def encodeArgument(s):
 458     if not isinstance(s, compat_str):
 459         # Legacy code that uses byte strings
 460         # Uncomment the following line after fixing all post processors
 461         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 462         s = s.decode('ascii')
 463     return encodeFilename(s, True)
 464
 465
 466 def decodeArgument(b):
 467     return decodeFilename(b, True)
 468
 469
 470 def decodeOption(optval):
 471     if optval is None:
 472         return optval
 473     if isinstance(optval, bytes):
 474         optval = optval.decode(preferredencoding())
 475
 476     assert isinstance(optval, compat_str)
 477     return optval
 478
 479
 480 def formatSeconds(secs):
 481     if secs > 3600:
 482         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 483     elif secs > 60:
 484         return '%d:%02d' % (secs // 60, secs % 60)
 485     else:
 486         return '%d' % secs
 487
 488
 489 def make_HTTPS_handler(params, **kwargs):
 490     opts_no_check_certificate = params.get('nocheckcertificate', False)
 491     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 492         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 493         if opts_no_check_certificate:
 494             context.check_hostname = False
 495             context.verify_mode = ssl.CERT_NONE
 496         try:
 497             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 498         except TypeError:
 499             # Python 2.7.8
 500             # (create_default_context present but HTTPSHandler has no context=)
 501             pass
 502
 503     if sys.version_info < (3, 2):
 504         return YoutubeDLHTTPSHandler(params, **kwargs)
 505     else:  # Python < 3.4
 506         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 507         context.verify_mode = (ssl.CERT_NONE
 508                                if opts_no_check_certificate
 509                                else ssl.CERT_REQUIRED)
 510         context.set_default_verify_paths()
 511         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 512
 513
 514 def bug_reports_message():
 515     if ytdl_is_updateable():
 516         update_cmd = 'type  youtube-dl -U  to update'
 517     else:
 518         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 519     msg = '; please report this issue on https://yt-dl.org/bug .'
 520     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 521     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 522     return msg
 523
 524
 525 class ExtractorError(Exception):
 526     """Error during info extraction."""
 527
 528     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 529         """ tb, if given, is the original traceback (so that it can be printed out).
 530         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 531         """
 532
 533         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 534             expected = True
 535         if video_id is not None:
 536             msg = video_id + ': ' + msg
 537         if cause:
 538             msg += ' (caused by %r)' % cause
 539         if not expected:
 540             msg += bug_reports_message()
 541         super(ExtractorError, self).__init__(msg)
 542
 543         self.traceback = tb
 544         self.exc_info = sys.exc_info()  # preserve original exception
 545         self.cause = cause
 546         self.video_id = video_id
 547
 548     def format_traceback(self):
 549         if self.traceback is None:
 550             return None
 551         return ''.join(traceback.format_tb(self.traceback))
 552
 553
 554 class UnsupportedError(ExtractorError):
 555     def __init__(self, url):
 556         super(UnsupportedError, self).__init__(
 557             'Unsupported URL: %s' % url, expected=True)
 558         self.url = url
 559
 560
 561 class RegexNotFoundError(ExtractorError):
 562     """Error when a regex didn't match"""
 563     pass
 564
 565
 566 class DownloadError(Exception):
 567     """Download Error exception.
 568
 569     This exception may be thrown by FileDownloader objects if they are not
 570     configured to continue on errors. They will contain the appropriate
 571     error message.
 572     """
 573
 574     def __init__(self, msg, exc_info=None):
 575         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 576         super(DownloadError, self).__init__(msg)
 577         self.exc_info = exc_info
 578
 579
 580 class SameFileError(Exception):
 581     """Same File exception.
 582
 583     This exception will be thrown by FileDownloader objects if they detect
 584     multiple files would have to be downloaded to the same file on disk.
 585     """
 586     pass
 587
 588
 589 class PostProcessingError(Exception):
 590     """Post Processing exception.
 591
 592     This exception may be raised by PostProcessor's .run() method to
 593     indicate an error in the postprocessing task.
 594     """
 595
 596     def __init__(self, msg):
 597         self.msg = msg
 598
 599
 600 class MaxDownloadsReached(Exception):
 601     """ --max-downloads limit has been reached. """
 602     pass
 603
 604
 605 class UnavailableVideoError(Exception):
 606     """Unavailable Format exception.
 607
 608     This exception will be thrown when a video is requested
 609     in a format that is not available for that video.
 610     """
 611     pass
 612
 613
 614 class ContentTooShortError(Exception):
 615     """Content Too Short exception.
 616
 617     This exception may be raised by FileDownloader objects when a file they
 618     download is too small for what the server announced first, indicating
 619     the connection was probably interrupted.
 620     """
 621
 622     def __init__(self, downloaded, expected):
 623         # Both in bytes
 624         self.downloaded = downloaded
 625         self.expected = expected
 626
 627
 628 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 629     # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
 630     # expected HTTP responses to meet HTTP/1.0 or later (see also
 631     # https://github.com/rg3/youtube-dl/issues/6727)
 632     if sys.version_info < (3, 0):
 633         kwargs[b'strict'] = True
 634     hc = http_class(*args, **kwargs)
 635     source_address = ydl_handler._params.get('source_address')
 636     if source_address is not None:
 637         sa = (source_address, 0)
 638         if hasattr(hc, 'source_address'):  # Python 2.7+
 639             hc.source_address = sa
 640         else:  # Python 2.6
 641             def _hc_connect(self, *args, **kwargs):
 642                 sock = compat_socket_create_connection(
 643                     (self.host, self.port), self.timeout, sa)
 644                 if is_https:
 645                     self.sock = ssl.wrap_socket(
 646                         sock, self.key_file, self.cert_file,
 647                         ssl_version=ssl.PROTOCOL_TLSv1)
 648                 else:
 649                     self.sock = sock
 650             hc.connect = functools.partial(_hc_connect, hc)
 651
 652     return hc
 653
 654
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Keep params (read later for 'source_address'); args and kwargs
        are passed on to the HTTPHandler constructor."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req over a connection created by _create_http_connection."""
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate-encoded data.

        Decompression is first attempted with a negative window size and,
        if zlib rejects that, retried with zlib's default settings.
        """
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Create an addinfourl response carrying the status code.

        When addinfourl has no getcode attribute, the code cannot be passed
        to the constructor and is set on the object afterwards.
        """
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Pre-process an outgoing request.

        Percent-encodes the URL where needed, adds the missing standard
        headers, honours the Youtubedl-no-compression marker header and
        returns the (possibly replaced) request.
        """
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            # Build a new request of the matching type that carries over the
            # data, headers, origin host, unverifiable flag and timeout
            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
            new_req = req_type(
                url_escaped, data=req.data, headers=req.headers,
                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
            new_req.timeout = req.timeout
            req = new_req

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        # The marker header asks for an uncompressed response: drop
        # Accept-Encoding and the marker itself before sending
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            # (strip the fragment from the request's private URL attributes)
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Post-process a response.

        Decompresses gzip and deflate bodies and percent-encodes the
        Location header of redirect responses; returns the response to use.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off; if no attempt
                # succeeds, the original error is raised
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                # (so the header value is re-interpreted as UTF-8 here)
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets the same request and response processing
    https_request = http_request
    https_response = http_response
 778
 779
 780 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 781     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 782         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 783         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 784         self._params = params
 785
 786     def https_open(self, req):
 787         kwargs = {}
 788         if hasattr(self, '_context'):  # python > 2.6
 789             kwargs['context'] = self._context
 790         if hasattr(self, '_check_hostname'):  # python 3.x
 791             kwargs['check_hostname'] = self._check_hostname
 792         return self.do_open(functools.partial(
 793             _create_http_connection, self, self._https_conn_class, True),
 794             req, **kwargs)
 795
 796
 797 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
 798     def __init__(self, cookiejar=None):
 799         compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
 800
 801     def http_response(self, request, response):
 802         # Python 2 will choke on next HTTP request in row if there are non-ASCII
 803         # characters in Set-Cookie HTTP header of last response (see
 804         # https://github.com/rg3/youtube-dl/issues/6769).
 805         # In order to at least prevent crashing we will percent encode Set-Cookie
 806         # header before HTTPCookieProcessor starts processing it.
 807         # if sys.version_info < (3, 0) and response.headers:
 808         #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
 809         #         set_cookie = response.headers.get(set_cookie_header)
 810         #         if set_cookie:
 811         #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
 812         #             if set_cookie != set_cookie_escaped:
 813         #                 del response.headers[set_cookie_header]
 814         #                 response.headers[set_cookie_header] = set_cookie_escaped
 815         return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
 816
 817     https_request = compat_urllib_request.HTTPCookieProcessor.http_request
 818     https_response = http_response
 819
 820
 821 def parse_iso8601(date_str, delimiter='T', timezone=None):
 822     """ Return a UNIX timestamp from the given date """
 823
 824     if date_str is None:
 825         return None
 826
 827     date_str = re.sub(r'\.[0-9]+', '', date_str)
 828
 829     if timezone is None:
 830         m = re.search(
 831             r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 832             date_str)
 833         if not m:
 834             timezone = datetime.timedelta()
 835         else:
 836             date_str = date_str[:-len(m.group(0))]
 837             if not m.group('sign'):
 838                 timezone = datetime.timedelta()
 839             else:
 840                 sign = 1 if m.group('sign') == '+' else -1
 841                 timezone = datetime.timedelta(
 842                     hours=sign * int(m.group('hours')),
 843                     minutes=sign * int(m.group('minutes')))
 844     try:
 845         date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 846         dt = datetime.datetime.strptime(date_str, date_format) - timezone
 847         return calendar.timegm(dt.timetuple())
 848     except ValueError:
 849         pass
 850
 851
 852 def unified_strdate(date_str, day_first=True):
 853     """Return a string with the date in the format YYYYMMDD"""
 854
 855     if date_str is None:
 856         return None
 857     upload_date = None
 858     # Replace commas
 859     date_str = date_str.replace(',', ' ')
 860     # %z (UTC offset) is only supported in python>=3.2
 861     if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
 862         date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 863     # Remove AM/PM + timezone
 864     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
 865
 866     format_expressions = [
 867         '%d %B %Y',
 868         '%d %b %Y',
 869         '%B %d %Y',
 870         '%b %d %Y',
 871         '%b %dst %Y %I:%M%p',
 872         '%b %dnd %Y %I:%M%p',
 873         '%b %dth %Y %I:%M%p',
 874         '%Y %m %d',
 875         '%Y-%m-%d',
 876         '%Y/%m/%d',
 877         '%Y/%m/%d %H:%M:%S',
 878         '%Y-%m-%d %H:%M:%S',
 879         '%Y-%m-%d %H:%M:%S.%f',
 880         '%d.%m.%Y %H:%M',
 881         '%d.%m.%Y %H.%M',
 882         '%Y-%m-%dT%H:%M:%SZ',
 883         '%Y-%m-%dT%H:%M:%S.%fZ',
 884         '%Y-%m-%dT%H:%M:%S.%f0Z',
 885         '%Y-%m-%dT%H:%M:%S',
 886         '%Y-%m-%dT%H:%M:%S.%f',
 887         '%Y-%m-%dT%H:%M',
 888     ]
 889     if day_first:
 890         format_expressions.extend([
 891             '%d-%m-%Y',
 892             '%d.%m.%Y',
 893             '%d/%m/%Y',
 894             '%d/%m/%y',
 895             '%d/%m/%Y %H:%M:%S',
 896         ])
 897     else:
 898         format_expressions.extend([
 899             '%m-%d-%Y',
 900             '%m.%d.%Y',
 901             '%m/%d/%Y',
 902             '%m/%d/%y',
 903             '%m/%d/%Y %H:%M:%S',
 904         ])
 905     for expression in format_expressions:
 906         try:
 907             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 908         except ValueError:
 909             pass
 910     if upload_date is None:
 911         timetuple = email.utils.parsedate_tz(date_str)
 912         if timetuple:
 913             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 914     if upload_date is not None:
 915         return compat_str(upload_date)
 916
 917
 918 def determine_ext(url, default_ext='unknown_video'):
 919     if url is None:
 920         return default_ext
 921     guess = url.partition('?')[0].rpartition('.')[2]
 922     if re.match(r'^[A-Za-z0-9]+$', guess):
 923         return guess
 924     else:
 925         return default_ext
 926
 927
 928 def subtitles_filename(filename, sub_lang, sub_format):
 929     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 930
 931
 932 def date_from_str(date_str):
 933     """
 934     Return a datetime object from a string in the format YYYYMMDD or
 935     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 936     today = datetime.date.today()
 937     if date_str in ('now', 'today'):
 938         return today
 939     if date_str == 'yesterday':
 940         return today - datetime.timedelta(days=1)
 941     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 942     if match is not None:
 943         sign = match.group('sign')
 944         time = int(match.group('time'))
 945         if sign == '-':
 946             time = -time
 947         unit = match.group('unit')
 948         # A bad aproximation?
 949         if unit == 'month':
 950             unit = 'day'
 951             time *= 30
 952         elif unit == 'year':
 953             unit = 'day'
 954             time *= 365
 955         unit += 's'
 956         delta = datetime.timedelta(**{unit: time})
 957         return today + delta
 958     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 959
 960
 961 def hyphenate_date(date_str):
 962     """
 963     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 964     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 965     if match is not None:
 966         return '-'.join(match.groups())
 967     else:
 968         return date_str
 969
 970
 971 class DateRange(object):
 972     """Represents a time interval between two dates"""
 973
 974     def __init__(self, start=None, end=None):
 975         """start and end must be strings in the format accepted by date"""
 976         if start is not None:
 977             self.start = date_from_str(start)
 978         else:
 979             self.start = datetime.datetime.min.date()
 980         if end is not None:
 981             self.end = date_from_str(end)
 982         else:
 983             self.end = datetime.datetime.max.date()
 984         if self.start > self.end:
 985             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 986
 987     @classmethod
 988     def day(cls, day):
 989         """Returns a range that only contains the given day"""
 990         return cls(day, day)
 991
 992     def __contains__(self, date):
 993         """Check if the date is in the range"""
 994         if not isinstance(date, datetime.date):
 995             date = date_from_str(date)
 996         return self.start <= date <= self.end
 997
 998     def __str__(self):
 999         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1000
1001
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Byte strings are decoded using the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1010
1011
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s is the text to write and out the target stream.  The string is
    written through the kernel32 WriteConsoleW function when out's file
    descriptor is 1 or 2 and the corresponding standard handle refers to
    a console.  Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> identifier passed to GetStdHandle
    # (presumably STD_OUTPUT_HANDLE / STD_ERROR_HANDLE -- confirm against
    # the Windows documentation)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is invalid, is not a character device
        # (ignoring the remote flag) or GetConsoleMode fails on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters, stopping before the next non-BMP
        # character; count == 0 means s starts with a non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever was actually written
            s = s[written.value:]
    return True
1085
1086
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On win32, when no encoding is given and out has a fileno, the
    console-specific writer is tried first.  Otherwise the text is
    encoded (dropping unencodable characters) for binary streams and
    streams exposing a .buffer, or written as-is.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        stream_encoding = (
            encoding or getattr(out, 'encoding', None) or preferredencoding())
        out.buffer.write(s.encode(stream_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1107
1108
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Python 2: indexing yields one-character strings
        return [ord(c) for c in bs]
    # Python 3: indexing already yields ints
    return list(bs)
1116
1117
def intlist_to_bytes(xs):
    """Pack a list of integer byte values into a byte string."""
    if xs:
        # One unsigned byte ('B') per list element
        return struct_pack('%dB' % len(xs), *xs)
    return b''
1122
1123
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx via ctypes on win32, fcntl.flock elsewhere.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # ctypes layout for the structure passed to LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high parts of the byte count passed to (Un)LockFileEx; chosen
    # large enough to span the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f, starting at offset 0.

        Raises OSError if LockFileEx reports failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can reuse it
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # NOTE(review): 0x2 is presumably LOCKFILE_EXCLUSIVE_LOCK -- confirm
        # against the LockFileEx documentation
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file on f.

        Raises OSError if UnlockFileEx reports failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared flock on f."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
1187
1188
class locked_file(object):
    """Context manager around a file that is locked while in use.

    The file is opened on construction, locked on entering the context
    (exclusively unless mode is 'r') and unlocked and closed on exit.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Readers share the lock, writers take it exclusively
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leave the file open when locking failed
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Close the file even when unlocking raises
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1218
1219
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1223
1224
def shell_quote(args):
    """Return the arguments quoted for a shell and joined by spaces.

    Byte-string arguments are decoded with the filesystem encoding first.
    """
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # shlex_quote (already imported from .compat and used by
        # args_to_str) replaces pipes.quote: the pipes module is deprecated
        # and no longer exists in recent Python versions
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
1234
1235
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # The data travels JSON-encoded in the URL fragment
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
1242
1243
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    URLs carrying no smuggled data are returned unchanged together with
    default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, fragment = smug_url.rpartition('#')
        encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return url, json.loads(encoded)
    return smug_url, default
1251
1252
def format_bytes(bytes):
    """Format a byte count as a human-readable string such as '1.50KiB'.

    bytes may be a number, a numeric string, or None (which yields 'N/A').
    Values beyond the largest known unit are expressed in YiB and values
    below one byte in B.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the known units: without this, huge values raise
        # IndexError and tiny fractions pick a suffix via a negative index
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1265
1266
def parse_filesize(s):
    """Parse a size such as '5 GB' or '1,24 KiB' into a number of bytes.

    Returns None when s is None or does not start with a number followed
    by a known unit.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for exponent, prefix in enumerate('KMGTPEZY', 1):
        binary = 1024 ** exponent
        decimal = 1000 ** exponent
        _UNIT_TABLE[prefix + 'iB'] = binary
        _UNIT_TABLE[prefix + 'B'] = decimal
        _UNIT_TABLE[prefix.lower() + 'B'] = binary
        _UNIT_TABLE[prefix + 'b'] = decimal

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if not m:
        return None

    # A comma is accepted as decimal separator
    number = float(m.group('num').replace(',', '.'))
    return int(number * _UNIT_TABLE[m.group('unit')])
1319
1320
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1328
1329
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    # The abbreviation is the first three letters of the English name
    for number, month in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month[:3] == abbrev:
            return number
    return None
1338
1339
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, leaving the ampersands that
    already start a known entity or numeric character reference alone"""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
1346
1347
def setproctitle(title):
    """Try to set the process title through libc's prctl.

    Does nothing when libc.so.6 cannot be loaded or does not provide
    prctl.  title must be a compat_str.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one byte larger than
    # the title, i.e. NUL-terminated; a buffer of exactly len(title_bytes)
    # would hand prctl an unterminated string
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # NOTE(review): 15 is presumably PR_SET_NAME -- confirm against prctl(2)
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1361
1362
def remove_start(s, start):
    """Return s without the prefix start, if it begins with it."""
    return s[len(start):] if s.startswith(start) else s
1367
1368
def remove_end(s, end):
    """Return s without the suffix end, if it ends with it."""
    # An empty suffix must be skipped: s[:-0] would be the empty string
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1373
1374
def url_basename(url):
    """Return the last component of the URL's path."""
    parsed = compat_urlparse.urlparse(url)
    trimmed_path = parsed.path.strip('/')
    return trimmed_path.rpartition('/')[2]
1378
1379
class HEADRequest(compat_urllib_request.Request):
    """Request whose get_method() always reports the HEAD method."""

    def get_method(self):
        return "HEAD"
1383
1384
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, returning default when that is not possible.

    If get_attr is given, the attribute of that name is read from v first.
    The result is int(v) * invscale // scale.  None, the empty string and
    values int() cannot convert all yield default.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # TypeError: objects int() does not accept at all (lists, dicts...)
        return default
1397
1398
def str_or_none(v, default=None):
    """Return v converted to compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1401
1402
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop commas, dots and plus signs before converting
    cleaned = re.sub(r'[,\.\+]', '', int_str)
    return int(cleaned)
1409
1410
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, returning default when that is not possible.

    The result is float(v) * invscale / scale.  None and values float()
    cannot convert yield default.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError: objects float() does not accept at all (lists, dicts...)
        return default
1418
1419
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Returns None when s is not a string or is not recognised by the
    pattern below.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None

    # A lone (possibly fractional) minutes or hours figure is converted directly
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    # Seconds contributed by one unit of each integer group
    seconds_per_unit = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    res = 0
    for group_name, factor in seconds_per_unit:
        captured = m.group(group_name)
        if captured:
            res += int(captured) * factor
    # The fractional part (including its leading dot) makes the result a float
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
1464
1465
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the file's real extension.

    If expected_real_ext is given and the real extension differs from it,
    ext is appended to the full filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1472
1473
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the file's real extension with ext.

    If expected_real_ext is given and the real extension differs from it,
    ext is appended to the full filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
1479
1480
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False when the binary cannot be started. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1489
1490
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = process.communicate()
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1504
1505
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    version_re must capture the version in its first group; when it is
    None a generic 'version <x>' pattern is used.  Returns unrecognized
    when nothing matches.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1515
1516
class PagedList(object):
    """Base class for paged lists; relies on a getslice() method that this
    class itself does not define."""

    def __len__(self):
        # This is only useful for tests
        all_entries = self.getslice()
        return len(all_entries)
1521
1522
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages one at a time, only as far as needed.

    pagefunc is called with a page number and must return an iterable of
    that page's entries; pagesize is the number of entries on a full page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries from index start up to (not including) end,
        or up to the last available entry when end is None."""
        res = []
        # Start with the page that contains index start
        for pagenum in itertools.count(start // self._pagesize):
            # Global index of the first entry of this page and of the next one
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of start within this page (0 on all later pages)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset of end within this page, when end falls on this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1564
1565
class InAdvancePagedList(PagedList):
    """Paged list whose total number of pages is known up front.

    pagefunc is called with a page number and must return an iterable of
    that page's entries.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries from index start up to (not including) end,
        or through the last page when end is None."""
        first_page, offset = divmod(start, self._pagesize)
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start

        collected = []
        for page_index in range(first_page, stop_page):
            entries = list(self._pagefunc(page_index))
            if offset:
                # Only the first fetched page is trimmed at the front
                entries = entries[offset:]
                offset = 0
            if remaining is not None:
                if len(entries) >= remaining:
                    # This page completes the requested slice
                    collected.extend(entries[:remaining])
                    break
                remaining -= len(entries)
            collected.extend(entries)
        return collected
1593
1594
def uppercase_escape(s):
    r"""Decode every \UXXXXXXXX escape sequence found in s."""
    decode = codecs.getdecoder('unicode_escape')

    def _decode_match(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _decode_match, s)
1601
1602
def lowercase_escape(s):
    r"""Decode every \uXXXX escape sequence found in s."""
    decode = codecs.getdecoder('unicode_escape')

    def _decode_match(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _decode_match, s)
1609
1610
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 the text is UTF-8 encoded before being quoted
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1616
1617
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # Scheme and network location are left untouched
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1627
# struct_pack / struct_unpack: struct.pack / struct.unpack, wrapped where the
# struct module rejects text format strings
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        bytes_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(bytes_spec, *args)

    def struct_unpack(spec, *args):
        bytes_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(bytes_spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1644
1645
def read_batch_urls(batch_fd):
    """Read the URLs from a batch file object, closing it afterwards.

    Each line is stripped; empty lines and lines starting with '#', ';'
    or ']' are skipped.  Returns the remaining lines as a list.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # The UTF-8 byte order mark shows up as '\xef\xbb\xbf' when the raw
        # bytes were decoded one character per byte, and as '\ufeff' when
        # they were decoded as UTF-8 (as done above); strip either form
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1660
1661
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1664
1665
def encode_dict(d, encoding='utf-8'):
    """Return a copy of d with every key and value encoded to bytes."""
    encoded = {}
    for key, value in d.items():
        encoded_key = key.encode(encoding)
        encoded_value = value.encode(encoding)
        encoded[encoded_key] = encoded_value
    return encoded
1668
1669
# Age limit associated with each US rating label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Parse an age limit such as '18', '18+' or a US rating label.

    Returns the age as an int, or None when s is None or not recognised.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
1684
1685
def strip_jsonp(code):
    """Strip a JSONP wrapper 'callback(...);' and return the inner payload.

    Input that does not look like a JSONP call is returned unchanged.
    """
    jsonp_call = re.compile(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$')
    return jsonp_call.sub(r'\1', code)
1689
1690
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Single-quoted strings and bare identifiers are turned into
    double-quoted strings, and trailing commas before a closing bracket or
    brace are removed.
    """
    # Rewrites applied inside a single-quoted string
    single_quoted_fixes = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null'):
            return token
        if token.startswith('"'):
            # \' needs no escaping inside a double-quoted string
            token = re.sub(r"\\'", "'", token[1:-1])
        elif token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: single_quoted_fixes[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    requoted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas before a closing bracket or brace
    return re.sub(r',(\s*[\]}])', lambda m: m.group(1), requoted)
1714
1715
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list, or -1 for ids not in the list
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1724
1725
# %-style template built from the 'title', 'id' and 'ext' fields
# (presumably the default output filename template -- verify against callers)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1727
1728
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    # The ellipses count towards the maximum length
    return s[:length - len(ELLIPSES)] + ELLIPSES
1737
1738
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
1741
1742
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is older than limit.

    When version is empty or either version cannot be parsed, the answer
    is the opposite of assume_new.
    """
    if version:
        try:
            return version_tuple(version) < version_tuple(limit)
        except ValueError:
            pass
    return not assume_new
1750
1751
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded by a zipimporter or sys has a
    # 'frozen' attribute
    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return loaded_from_zip or hasattr(sys, 'frozen')
1757
1758
def args_to_str(args):
    """
    Get a short string representation for a subprocess command: every
    argument is shell-quoted and the results are joined with spaces.
    """
    quoted_args = map(shlex_quote, args)
    return ' '.join(quoted_args)
1762
1763
def mimetype2ext(mt):
    """
    Guess a file extension from a MIME type such as 'video/mp4'.

    Parameters after ';' (e.g. '; charset=utf-8') are dropped, and the
    subtype is stripped and lower-cased before it is looked up. A few
    subtypes are mapped to their customary extension; any other subtype is
    returned as is. Returns None when mt is None, which happens when a
    response carries no Content-Type header.
    """
    if mt is None:
        return None

    # Cut the parameters off first so that a '/' inside a parameter value
    # cannot be mistaken for the type/subtype separator.
    media_type = mt.split(';', 1)[0]
    _, _, res = media_type.rpartition('/')
    res = res.strip().lower()

    return {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
        'ttml+xml': 'ttml',
    }.get(res, res)
1772
1773
def urlhandle_detect_ext(url_handle):
    """
    Guess the file extension of the resource behind an opened URL handle.

    The filename of an 'attachment' Content-Disposition header wins when it
    yields an extension (via determine_ext); otherwise the extension is
    derived from the Content-Type header with mimetype2ext.
    """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(name):
            return url_handle.headers[name]

    disposition = getheader('Content-Disposition')
    if disposition:
        filename_match = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if filename_match:
            ext = determine_ext(
                filename_match.group('filename'), default_ext=None)
            if ext:
                return ext

    content_type = getheader('Content-Type')
    return mimetype2ext(content_type)
1790
1791
def encode_data_uri(data, mime_type):
    """Return a base64 'data:' URI that embeds the bytes data as mime_type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
1794
1795
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked when no age limit is set, or when the content
    # carries no limit (i.e. it is available for everyone).
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
1804
1805
def is_html(first_bytes):
    """
    Detect whether a file contains HTML by examining its first bytes.

    A leading byte order mark selects the encoding used for decoding
    (UTF-8 otherwise). The result is a regex match object when the decoded
    text starts with optional whitespace followed by '<', else None.
    """

    # Order matters: the UTF-32 marks must be tried before the UTF-16 ones,
    # because the UTF-32-LE mark starts with the UTF-16-LE mark.
    bom_encodings = (
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    )

    encoding, skip = 'utf-8', 0
    for bom, bom_encoding in bom_encodings:
        if first_bytes.startswith(bom):
            encoding, skip = bom_encoding, len(bom)
            break

    text = first_bytes[skip:].decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
1824
1825
def determine_protocol(info_dict):
    """
    Work out the download protocol for an info dict.

    An explicit 'protocol' entry wins. Otherwise the protocol is derived
    from info_dict['url']: first by URL prefix (rtmp, mms, rtsp), then by
    file extension (m3u8, f4m), and finally by the URL scheme.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    return compat_urllib_parse_urlparse(url).scheme
1846
1847
1848 def render_table(header_row, data):
1849     """ Render a list of rows, each as a list of values """
1850     table = [header_row] + data
1851     max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
1852     format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
1853     return '\n'.join(format_str % tuple(row) for row in table)
1854
1855
1856 def _match_one(filter_part, dct):
1857     COMPARISON_OPERATORS = {
1858         '<': operator.lt,
1859         '<=': operator.le,
1860         '>': operator.gt,
1861         '>=': operator.ge,
1862         '=': operator.eq,
1863         '!=': operator.ne,
1864     }
1865     operator_rex = re.compile(r'''(?x)\s*
1866         (?P<key>[a-z_]+)
1867         \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1868         (?:
1869             (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1870             (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1871         )
1872         \s*$
1873         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1874     m = operator_rex.search(filter_part)
1875     if m:
1876         op = COMPARISON_OPERATORS[m.group('op')]
1877         if m.group('strval') is not None:
1878             if m.group('op') not in ('=', '!='):
1879                 raise ValueError(
1880                     'Operator %s does not support string values!' % m.group('op'))
1881             comparison_value = m.group('strval')
1882         else:
1883             try:
1884                 comparison_value = int(m.group('intval'))
1885             except ValueError:
1886                 comparison_value = parse_filesize(m.group('intval'))
1887                 if comparison_value is None:
1888                     comparison_value = parse_filesize(m.group('intval') + 'B')
1889                 if comparison_value is None:
1890                     raise ValueError(
1891                         'Invalid integer value %r in filter part %r' % (
1892                             m.group('intval'), filter_part))
1893         actual_value = dct.get(m.group('key'))
1894         if actual_value is None:
1895             return m.group('none_inclusive')
1896         return op(actual_value, comparison_value)
1897
1898     UNARY_OPERATORS = {
1899         '': lambda v: v is not None,
1900         '!': lambda v: v is None,
1901     }
1902     operator_rex = re.compile(r'''(?x)\s*
1903         (?P<op>%s)\s*(?P<key>[a-z_]+)
1904         \s*$
1905         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1906     m = operator_rex.search(filter_part)
1907     if m:
1908         op = UNARY_OPERATORS[m.group('op')]
1909         actual_value = dct.get(m.group('key'))
1910         return op(actual_value)
1911
1912     raise ValueError('Invalid filter part %r' % filter_part)
1913
1914
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # filter_str is a list of '&'-separated parts that must all hold;
    # evaluation stops at the first part that does not.
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
1920
1921
def match_filter_func(filter_str):
    """
    Wrap filter_str into a function that checks an info dict against it.

    The returned function yields None when the info dict passes the filter
    and a human-readable skip message otherwise.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
1930
1931
def parse_dfxp_time_expr(time_expr):
    """
    Convert a DFXP time expression into seconds (float).

    Accepts an offset in seconds with an optional 's' suffix ("12.5s") or a
    clock value "H:MM:SS[.fraction]". An empty or missing expression gives
    0.0; an expression in any other format gives None.
    """
    if not time_expr:
        return 0.0

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset is not None:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if clock is None:
        return None
    hours, minutes, seconds = clock.groups()
    return 3600 * int(hours) + 60 * int(minutes) + float(seconds)
1943
1944
def srt_subtitles_timecode(seconds):
    """
    Format a time in seconds as an SRT timecode, 'HH:MM:SS,mmm'.

    The value is rounded to whole milliseconds before it is split into
    fields. Truncating the fractional part instead would lose a millisecond
    whenever the float is stored slightly below the intended value
    (e.g. 4.35 would come out as ',349').
    """
    total_msec = int(round(seconds * 1000))
    total_secs, msec = divmod(total_msec, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, mins = divmod(total_mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, msec)
1947
1948
def dfxp2srt(dfxp_data):
    """
    Convert a DFXP/TTML subtitle document (text) into SRT text.

    Paragraphs are looked up in the 'ttml' namespace first, then in
    'ttaf1', then without a namespace. Every paragraph needs a 'begin'
    attribute and either 'end' or 'dur'; it becomes one numbered SRT cue.
    Raises ValueError when the document contains no paragraphs.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })

    def parse_node(node):
        # Flatten a paragraph (or span) into text: <br> becomes a newline,
        # <span> is flattened recursively, any other child is kept as
        # serialized XML.
        str_or_empty = functools.partial(str_or_none, default='')

        out = str_or_empty(node.text)

        for child in node:
            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                out += '\n' + str_or_empty(child.tail)
            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                # The tail is the text between </span> and the next sibling;
                # it is part of this node's text and must not be dropped.
                out += str_or_empty(parse_node(child)) + str_or_empty(child.tail)
            else:
                out += str_or_empty(xml.etree.ElementTree.tostring(child))

        return out

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        if not end_time:
            # No usable 'end': derive it from the duration instead.
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
1989
1990
def cli_option(params, command_option, param):
    """
    Build the command line arguments for an option that takes a value.

    Returns [command_option, value] when params has a non-None value for
    param, and an empty list otherwise.
    """
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
1994
1995
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """
    Build the command line arguments for a boolean option.

    The value of param in params must be a bool; it is rendered as
    true_value or false_value. With a separator the option and the value
    are fused into one argument, otherwise they are two arguments.
    """
    flag = params.get(param)
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if not separator:
        return [command_option, rendered]
    return [command_option + separator + rendered]
2002
2003
def cli_valueless_option(params, command_option, param, expected_value=True):
    """
    Build the command line arguments for a switch without a value.

    Returns [command_option] when params[param] equals expected_value, and
    an empty list otherwise.
    """
    if params.get(param) == expected_value:
        return [command_option]
    return []
2007
2008
def cli_configuration_args(params, param, default=None):
    """
    Return the list of extra command line arguments stored under param.

    When params has no (or a None) value for param, default is returned;
    if no default was given, a new empty list is returned. A value that is
    present must be a list.
    """
    ex_args = params.get(param)
    if ex_args is None:
        # Hand out a fresh list on every call: a shared default list would
        # let one caller's in-place changes leak into all later calls.
        return [] if default is None else default
    assert isinstance(ex_args, list)
    return ex_args
2015
2016
2017 class ISO639Utils(object):
2018     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2019     _lang_map = {
2020         'aa': 'aar',
2021         'ab': 'abk',
2022         'ae': 'ave',
2023         'af': 'afr',
2024         'ak': 'aka',
2025         'am': 'amh',
2026         'an': 'arg',
2027         'ar': 'ara',
2028         'as': 'asm',
2029         'av': 'ava',
2030         'ay': 'aym',
2031         'az': 'aze',
2032         'ba': 'bak',
2033         'be': 'bel',
2034         'bg': 'bul',
2035         'bh': 'bih',
2036         'bi': 'bis',
2037         'bm': 'bam',
2038         'bn': 'ben',
2039         'bo': 'bod',
2040         'br': 'bre',
2041         'bs': 'bos',
2042         'ca': 'cat',
2043         'ce': 'che',
2044         'ch': 'cha',
2045         'co': 'cos',
2046         'cr': 'cre',
2047         'cs': 'ces',
2048         'cu': 'chu',
2049         'cv': 'chv',
2050         'cy': 'cym',
2051         'da': 'dan',
2052         'de': 'deu',
2053         'dv': 'div',
2054         'dz': 'dzo',
2055         'ee': 'ewe',
2056         'el': 'ell',
2057         'en': 'eng',
2058         'eo': 'epo',
2059         'es': 'spa',
2060         'et': 'est',
2061         'eu': 'eus',
2062         'fa': 'fas',
2063         'ff': 'ful',
2064         'fi': 'fin',
2065         'fj': 'fij',
2066         'fo': 'fao',
2067         'fr': 'fra',
2068         'fy': 'fry',
2069         'ga': 'gle',
2070         'gd': 'gla',
2071         'gl': 'glg',
2072         'gn': 'grn',
2073         'gu': 'guj',
2074         'gv': 'glv',
2075         'ha': 'hau',
2076         'he': 'heb',
2077         'hi': 'hin',
2078         'ho': 'hmo',
2079         'hr': 'hrv',
2080         'ht': 'hat',
2081         'hu': 'hun',
2082         'hy': 'hye',
2083         'hz': 'her',
2084         'ia': 'ina',
2085         'id': 'ind',
2086         'ie': 'ile',
2087         'ig': 'ibo',
2088         'ii': 'iii',
2089         'ik': 'ipk',
2090         'io': 'ido',
2091         'is': 'isl',
2092         'it': 'ita',
2093         'iu': 'iku',
2094         'ja': 'jpn',
2095         'jv': 'jav',
2096         'ka': 'kat',
2097         'kg': 'kon',
2098         'ki': 'kik',
2099         'kj': 'kua',
2100         'kk': 'kaz',
2101         'kl': 'kal',
2102         'km': 'khm',
2103         'kn': 'kan',
2104         'ko': 'kor',
2105         'kr': 'kau',
2106         'ks': 'kas',
2107         'ku': 'kur',
2108         'kv': 'kom',
2109         'kw': 'cor',
2110         'ky': 'kir',
2111         'la': 'lat',
2112         'lb': 'ltz',
2113         'lg': 'lug',
2114         'li': 'lim',
2115         'ln': 'lin',
2116         'lo': 'lao',
2117         'lt': 'lit',
2118         'lu': 'lub',
2119         'lv': 'lav',
2120         'mg': 'mlg',
2121         'mh': 'mah',
2122         'mi': 'mri',
2123         'mk': 'mkd',
2124         'ml': 'mal',
2125         'mn': 'mon',
2126         'mr': 'mar',
2127         'ms': 'msa',
2128         'mt': 'mlt',
2129         'my': 'mya',
2130         'na': 'nau',
2131         'nb': 'nob',
2132         'nd': 'nde',
2133         'ne': 'nep',
2134         'ng': 'ndo',
2135         'nl': 'nld',
2136         'nn': 'nno',
2137         'no': 'nor',
2138         'nr': 'nbl',
2139         'nv': 'nav',
2140         'ny': 'nya',
2141         'oc': 'oci',
2142         'oj': 'oji',
2143         'om': 'orm',
2144         'or': 'ori',
2145         'os': 'oss',
2146         'pa': 'pan',
2147         'pi': 'pli',
2148         'pl': 'pol',
2149         'ps': 'pus',
2150         'pt': 'por',
2151         'qu': 'que',
2152         'rm': 'roh',
2153         'rn': 'run',
2154         'ro': 'ron',
2155         'ru': 'rus',
2156         'rw': 'kin',
2157         'sa': 'san',
2158         'sc': 'srd',
2159         'sd': 'snd',
2160         'se': 'sme',
2161         'sg': 'sag',
2162         'si': 'sin',
2163         'sk': 'slk',
2164         'sl': 'slv',
2165         'sm': 'smo',
2166         'sn': 'sna',
2167         'so': 'som',
2168         'sq': 'sqi',
2169         'sr': 'srp',
2170         'ss': 'ssw',
2171         'st': 'sot',
2172         'su': 'sun',
2173         'sv': 'swe',
2174         'sw': 'swa',
2175         'ta': 'tam',
2176         'te': 'tel',
2177         'tg': 'tgk',
2178         'th': 'tha',
2179         'ti': 'tir',
2180         'tk': 'tuk',
2181         'tl': 'tgl',
2182         'tn': 'tsn',
2183         'to': 'ton',
2184         'tr': 'tur',
2185         'ts': 'tso',
2186         'tt': 'tat',
2187         'tw': 'twi',
2188         'ty': 'tah',
2189         'ug': 'uig',
2190         'uk': 'ukr',
2191         'ur': 'urd',
2192         'uz': 'uzb',
2193         've': 'ven',
2194         'vi': 'vie',
2195         'vo': 'vol',
2196         'wa': 'wln',
2197         'wo': 'wol',
2198         'xh': 'xho',
2199         'yi': 'yid',
2200         'yo': 'yor',
2201         'za': 'zha',
2202         'zh': 'zho',
2203         'zu': 'zul',
2204     }
2205
2206     @classmethod
2207     def short2long(cls, code):
2208         """Convert language code from ISO 639-1 to ISO 639-2/T"""
2209         return cls._lang_map.get(code[:2])
2210
2211     @classmethod
2212     def long2short(cls, code):
2213         """Convert language code from ISO 639-2/T to ISO 639-1"""
2214         for short_name, long_name in cls._lang_map.items():
2215             if long_name == code:
2216                 return short_name
2217
2218
2219 class ISO3166Utils(object):
2220     # From http://data.okfn.org/data/core/country-list
2221     _country_map = {
2222         'AF': 'Afghanistan',
2223         'AX': 'Åland Islands',
2224         'AL': 'Albania',
2225         'DZ': 'Algeria',
2226         'AS': 'American Samoa',
2227         'AD': 'Andorra',
2228         'AO': 'Angola',
2229         'AI': 'Anguilla',
2230         'AQ': 'Antarctica',
2231         'AG': 'Antigua and Barbuda',
2232         'AR': 'Argentina',
2233         'AM': 'Armenia',
2234         'AW': 'Aruba',
2235         'AU': 'Australia',
2236         'AT': 'Austria',
2237         'AZ': 'Azerbaijan',
2238         'BS': 'Bahamas',
2239         'BH': 'Bahrain',
2240         'BD': 'Bangladesh',
2241         'BB': 'Barbados',
2242         'BY': 'Belarus',
2243         'BE': 'Belgium',
2244         'BZ': 'Belize',
2245         'BJ': 'Benin',
2246         'BM': 'Bermuda',
2247         'BT': 'Bhutan',
2248         'BO': 'Bolivia, Plurinational State of',
2249         'BQ': 'Bonaire, Sint Eustatius and Saba',
2250         'BA': 'Bosnia and Herzegovina',
2251         'BW': 'Botswana',
2252         'BV': 'Bouvet Island',
2253         'BR': 'Brazil',
2254         'IO': 'British Indian Ocean Territory',
2255         'BN': 'Brunei Darussalam',
2256         'BG': 'Bulgaria',
2257         'BF': 'Burkina Faso',
2258         'BI': 'Burundi',
2259         'KH': 'Cambodia',
2260         'CM': 'Cameroon',
2261         'CA': 'Canada',
2262         'CV': 'Cape Verde',
2263         'KY': 'Cayman Islands',
2264         'CF': 'Central African Republic',
2265         'TD': 'Chad',
2266         'CL': 'Chile',
2267         'CN': 'China',
2268         'CX': 'Christmas Island',
2269         'CC': 'Cocos (Keeling) Islands',
2270         'CO': 'Colombia',
2271         'KM': 'Comoros',
2272         'CG': 'Congo',
2273         'CD': 'Congo, the Democratic Republic of the',
2274         'CK': 'Cook Islands',
2275         'CR': 'Costa Rica',
2276         'CI': 'Côte d\'Ivoire',
2277         'HR': 'Croatia',
2278         'CU': 'Cuba',
2279         'CW': 'Curaçao',
2280         'CY': 'Cyprus',
2281         'CZ': 'Czech Republic',
2282         'DK': 'Denmark',
2283         'DJ': 'Djibouti',
2284         'DM': 'Dominica',
2285         'DO': 'Dominican Republic',
2286         'EC': 'Ecuador',
2287         'EG': 'Egypt',
2288         'SV': 'El Salvador',
2289         'GQ': 'Equatorial Guinea',
2290         'ER': 'Eritrea',
2291         'EE': 'Estonia',
2292         'ET': 'Ethiopia',
2293         'FK': 'Falkland Islands (Malvinas)',
2294         'FO': 'Faroe Islands',
2295         'FJ': 'Fiji',
2296         'FI': 'Finland',
2297         'FR': 'France',
2298         'GF': 'French Guiana',
2299         'PF': 'French Polynesia',
2300         'TF': 'French Southern Territories',
2301         'GA': 'Gabon',
2302         'GM': 'Gambia',
2303         'GE': 'Georgia',
2304         'DE': 'Germany',
2305         'GH': 'Ghana',
2306         'GI': 'Gibraltar',
2307         'GR': 'Greece',
2308         'GL': 'Greenland',
2309         'GD': 'Grenada',
2310         'GP': 'Guadeloupe',
2311         'GU': 'Guam',
2312         'GT': 'Guatemala',
2313         'GG': 'Guernsey',
2314         'GN': 'Guinea',
2315         'GW': 'Guinea-Bissau',
2316         'GY': 'Guyana',
2317         'HT': 'Haiti',
2318         'HM': 'Heard Island and McDonald Islands',
2319         'VA': 'Holy See (Vatican City State)',
2320         'HN': 'Honduras',
2321         'HK': 'Hong Kong',
2322         'HU': 'Hungary',
2323         'IS': 'Iceland',
2324         'IN': 'India',
2325         'ID': 'Indonesia',
2326         'IR': 'Iran, Islamic Republic of',
2327         'IQ': 'Iraq',
2328         'IE': 'Ireland',
2329         'IM': 'Isle of Man',
2330         'IL': 'Israel',
2331         'IT': 'Italy',
2332         'JM': 'Jamaica',
2333         'JP': 'Japan',
2334         'JE': 'Jersey',
2335         'JO': 'Jordan',
2336         'KZ': 'Kazakhstan',
2337         'KE': 'Kenya',
2338         'KI': 'Kiribati',
2339         'KP': 'Korea, Democratic People\'s Republic of',
2340         'KR': 'Korea, Republic of',
2341         'KW': 'Kuwait',
2342         'KG': 'Kyrgyzstan',
2343         'LA': 'Lao People\'s Democratic Republic',
2344         'LV': 'Latvia',
2345         'LB': 'Lebanon',
2346         'LS': 'Lesotho',
2347         'LR': 'Liberia',
2348         'LY': 'Libya',
2349         'LI': 'Liechtenstein',
2350         'LT': 'Lithuania',
2351         'LU': 'Luxembourg',
2352         'MO': 'Macao',
2353         'MK': 'Macedonia, the Former Yugoslav Republic of',
2354         'MG': 'Madagascar',
2355         'MW': 'Malawi',
2356         'MY': 'Malaysia',
2357         'MV': 'Maldives',
2358         'ML': 'Mali',
2359         'MT': 'Malta',
2360         'MH': 'Marshall Islands',
2361         'MQ': 'Martinique',
2362         'MR': 'Mauritania',
2363         'MU': 'Mauritius',
2364         'YT': 'Mayotte',
2365         'MX': 'Mexico',
2366         'FM': 'Micronesia, Federated States of',
2367         'MD': 'Moldova, Republic of',
2368         'MC': 'Monaco',
2369         'MN': 'Mongolia',
2370         'ME': 'Montenegro',
2371         'MS': 'Montserrat',
2372         'MA': 'Morocco',
2373         'MZ': 'Mozambique',
2374         'MM': 'Myanmar',
2375         'NA': 'Namibia',
2376         'NR': 'Nauru',
2377         'NP': 'Nepal',
2378         'NL': 'Netherlands',
2379         'NC': 'New Caledonia',
2380         'NZ': 'New Zealand',
2381         'NI': 'Nicaragua',
2382         'NE': 'Niger',
2383         'NG': 'Nigeria',
2384         'NU': 'Niue',
2385         'NF': 'Norfolk Island',
2386         'MP': 'Northern Mariana Islands',
2387         'NO': 'Norway',
2388         'OM': 'Oman',
2389         'PK': 'Pakistan',
2390         'PW': 'Palau',
2391         'PS': 'Palestine, State of',
2392         'PA': 'Panama',
2393         'PG': 'Papua New Guinea',
2394         'PY': 'Paraguay',
2395         'PE': 'Peru',
2396         'PH': 'Philippines',
2397         'PN': 'Pitcairn',
2398         'PL': 'Poland',
2399         'PT': 'Portugal',
2400         'PR': 'Puerto Rico',
2401         'QA': 'Qatar',
2402         'RE': 'Réunion',
2403         'RO': 'Romania',
2404         'RU': 'Russian Federation',
2405         'RW': 'Rwanda',
2406         'BL': 'Saint Barthélemy',
2407         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2408         'KN': 'Saint Kitts and Nevis',
2409         'LC': 'Saint Lucia',
2410         'MF': 'Saint Martin (French part)',
2411         'PM': 'Saint Pierre and Miquelon',
2412         'VC': 'Saint Vincent and the Grenadines',
2413         'WS': 'Samoa',
2414         'SM': 'San Marino',
2415         'ST': 'Sao Tome and Principe',
2416         'SA': 'Saudi Arabia',
2417         'SN': 'Senegal',
2418         'RS': 'Serbia',
2419         'SC': 'Seychelles',
2420         'SL': 'Sierra Leone',
2421         'SG': 'Singapore',
2422         'SX': 'Sint Maarten (Dutch part)',
2423         'SK': 'Slovakia',
2424         'SI': 'Slovenia',
2425         'SB': 'Solomon Islands',
2426         'SO': 'Somalia',
2427         'ZA': 'South Africa',
2428         'GS': 'South Georgia and the South Sandwich Islands',
2429         'SS': 'South Sudan',
2430         'ES': 'Spain',
2431         'LK': 'Sri Lanka',
2432         'SD': 'Sudan',
2433         'SR': 'Suriname',
2434         'SJ': 'Svalbard and Jan Mayen',
2435         'SZ': 'Swaziland',
2436         'SE': 'Sweden',
2437         'CH': 'Switzerland',
2438         'SY': 'Syrian Arab Republic',
2439         'TW': 'Taiwan, Province of China',
2440         'TJ': 'Tajikistan',
2441         'TZ': 'Tanzania, United Republic of',
2442         'TH': 'Thailand',
2443         'TL': 'Timor-Leste',
2444         'TG': 'Togo',
2445         'TK': 'Tokelau',
2446         'TO': 'Tonga',
2447         'TT': 'Trinidad and Tobago',
2448         'TN': 'Tunisia',
2449         'TR': 'Turkey',
2450         'TM': 'Turkmenistan',
2451         'TC': 'Turks and Caicos Islands',
2452         'TV': 'Tuvalu',
2453         'UG': 'Uganda',
2454         'UA': 'Ukraine',
2455         'AE': 'United Arab Emirates',
2456         'GB': 'United Kingdom',
2457         'US': 'United States',
2458         'UM': 'United States Minor Outlying Islands',
2459         'UY': 'Uruguay',
2460         'UZ': 'Uzbekistan',
2461         'VU': 'Vanuatu',
2462         'VE': 'Venezuela, Bolivarian Republic of',
2463         'VN': 'Viet Nam',
2464         'VG': 'Virgin Islands, British',
2465         'VI': 'Virgin Islands, U.S.',
2466         'WF': 'Wallis and Futuna',
2467         'EH': 'Western Sahara',
2468         'YE': 'Yemen',
2469         'ZM': 'Zambia',
2470         'ZW': 'Zimbabwe',
2471     }
2472
2473     @classmethod
2474     def short2full(cls, code):
2475         """Convert an ISO 3166-2 country code to the corresponding full name"""
2476         return cls._country_map.get(code.upper())
2477
2478
2479 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2480     def __init__(self, proxies=None):
2481         # Set default handlers
2482         for type in ('http', 'https'):
2483             setattr(self, '%s_open' % type,
2484                     lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2485                         meth(r, proxy, type))
2486         return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2487
2488     def proxy_open(self, req, proxy, type):
2489         req_proxy = req.headers.get('Ytdl-request-proxy')
2490         if req_proxy is not None:
2491             proxy = req_proxy
2492             del req.headers['Ytdl-request-proxy']
2493
2494         if proxy == '__noproxy__':
2495             return None  # No Proxy
2496         return compat_urllib_request.ProxyHandler.proxy_open(
2497             self, req, proxy, type)