2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
39 compat_etree_fromstring,
44 compat_socket_create_connection,
48 compat_urllib_parse_urlparse,
49 compat_urllib_request,
# Type of a compiled regular expression; `re` does not expose it directly
# on older Pythons, so derive it for isinstance checks elsewhere.
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

    # Default HTTP request headers (interior of the std_headers dict).
    # The browser-like User-Agent reduces the chance of crippled pages.
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
# English month names, used for locale-independent date parsing
# (see month_by_name / month_by_abbreviation below).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    pref = locale.getpreferredencoding()
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
        # Plain helpers suffice when no decoding is needed.
        path_basename = os.path.basename
        path_dirname = os.path.dirname

        # NamedTemporaryFile arguments: keep the temp file next to the
        # target so the final os.rename stays on one filesystem.
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    if sys.platform == 'win32':
        # Need to remove existing file on Windows, else os.rename raises
        # WindowsError or FileExistsError.
    os.rename(tf.name, fn)
# Two implementations: Python >= 2.7 can express the attribute test as an
# xpath predicate; the 2.6 fallback scans candidate nodes by hand.
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)

    def find_xpath_attr(node, xpath, key, val=None):
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        for f in node.findall(xpath):
            if key not in f.attrib:
            if val is None or f.attrib.get(key) == val:
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    """Expand 'ns:tag' steps of *path* to '{uri}tag' using *ns_map*."""
    components = [c.split(':') for c in path.split('/')]
        # Step without a namespace prefix: keep it unchanged.
        replaced.append(c[0])
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find *xpath* under *node*; on a miss return *default* when given,
    otherwise raise ExtractorError if *fatal* is set."""
    def _find_xpath(xpath):
        if sys.version_info < (2, 7):  # Crazy 2.6
            xpath = xpath.encode('ascii')
        return node.find(xpath)

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)

    if default is not NO_DEFAULT:
    name = xpath if name is None else name
    raise ExtractorError('Could not find XML element %s' % name)
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element() but return the matched element's text."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
    if default is not NO_DEFAULT:
    name = xpath if name is None else name
    raise ExtractorError('Could not find XML element\'s text %s' % name)
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute *key* of the element matched by *xpath*, with the
    same default/fatal semantics as xpath_element()."""
    n = find_xpath_attr(node, xpath, key)
    if default is not NO_DEFAULT:
    name = '%s[@%s]' % (xpath, key) if name is None else name
    raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the HTML tag whose ``id`` attribute equals *id*."""
    # An element id is just its 'id' attribute; delegate to the generic search.
    attribute = "id"
    return get_element_by_attribute(attribute, id, html)
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    m = re.search(r'''(?xs)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
    ''' % (re.escape(attribute), re.escape(value)), html)

    res = m.group('content')

    # The 'content' group may have captured surrounding quotes; strip them.
    if res.startswith('"') or res.startswith("'"):

    return unescapeHTML(res)
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.

    # Normalise line breaks: literal newlines become spaces, while <br>
    # and paragraph boundaries become newlines.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip any remaining tags.
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if sys.platform == 'win32':
        # Put stdout in binary mode so byte output is not mangled.
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
    stream = open(encodeFilename(filename), open_mode)
    return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""

    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Per-character policy: drop/replace unprintable or shell-hostile chars.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return '' if restricted else '\''
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
        if restricted and ord(char) > 127:

    # Keep timestamps such as 12:34:56 readable by turning ':' into '_'.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    # Collapse runs of underscores introduced by the replacements above.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
        # Replace characters Windows forbids in path components, but keep
        # the '.'/'..' pseudo-components intact.
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character references: decimal (&#65;) or hex (&#x41;).
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    numstr = mobj.group(1)
    if numstr.startswith('x'):
        numstr = '0%s' % numstr
    # See https://github.com/rg3/youtube-dl/issues/7518
    return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
    # NOTE(review): fragment of unescapeHTML — expands &entity; references
    # through _htmlentity_transform; input must already be a unicode string.
    assert type(s) == compat_str
        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
    """Return the encoding to use when exchanging data with subprocesses."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
        encoding = sys.getfilesystemencoding()
def encodeFilename(s, for_subprocess=False):
    """Encode a unicode filename for the OS / subprocess APIs.

    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
    return s.encode(get_subprocess_encoding(), 'ignore')
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename(): decode an OS-level filename to unicode."""
    if sys.version_info >= (3, 0):
    if not isinstance(b, bytes):
    return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode *s* for passing to a subprocess (see encodeFilename)."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
def decodeArgument(b):
    """Decode a value that came from a subprocess call (see decodeFilename)."""
    for_subprocess = True
    return decodeFilename(b, for_subprocess)
def decodeOption(optval):
    """Decode a command-line option value to a unicode string."""
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS or M:SS."""
    return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    return '%d:%02d' % (secs // 60, secs % 60)
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler honouring the 'nocheckcertificate'
    option, with fallbacks for older Python ssl modules."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)

    # (create_default_context present but HTTPSHandler has no context=)
    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    # Pre-3.4 path: build an SSLContext by hand.
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    context.verify_mode = (ssl.CERT_NONE
                           if opts_no_check_certificate
                           else ssl.CERT_REQUIRED)
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    """Return the standard 'please report this issue' suffix for messages."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network-level failures are always treated as expected errors.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
        if video_id is not None:
            msg = video_id + ': ' + msg
            msg += ' (caused by %r)' % cause
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)
        self.exc_info = sys.exc_info()  # preserve original exception
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
        return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor recognizes the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both values are byte counts.
        self.downloaded = downloaded
        self.expected = expected
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Create an HTTP(S) connection, applying the handler's configured
    'source_address' and old-Python workarounds."""
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+

            # Python 2.6 fallback: patch connect() to bind the socket to
            # the requested source address by hand.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                self.sock = ssl.wrap_socket(
                    sock, self.key_file, self.cert_file,
                    ssl_version=ssl.PROTOCOL_TLSv1)
            hc.connect = functools.partial(_hc_connect, hc)
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),

        # Fragment of the deflate() helper: try raw deflate first, then
        # zlib-wrapped deflate.
        return zlib.decompress(data, -zlib.MAX_WBITS)
        return zlib.decompress(data)

    def addinfourl_wrapper(stream, headers, url, code):
        # Wrap a response object so .getcode() also works on old urllibs.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
                url_escaped, data=req.data, headers=req.headers,
                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
            new_req.timeout = req.timeout

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

    def http_response(self, req, resp):
        # Transparently decompress gzip-encoded bodies.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                    uncompressed = io.BytesIO(gz.read())
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Transparently decompress deflate-encoded bodies.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
            if sys.version_info >= (3, 0):
                location = location.encode('iso-8859-1').decode('utf-8')
            location_escaped = escape_url(location)
            if location != location_escaped:
                del resp.headers['Location']
                resp.headers['Location'] = location_escaped

    https_request = http_request
    https_response = http_response
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPSHandler supporting a custom connection class and SSL context."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # Forward context / check_hostname only where the base class has them.
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor wrapper used by youtube-dl."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #         if set_cookie != set_cookie_escaped:
        #             del response.headers[set_cookie_header]
        #             response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    # Drop fractional seconds; the strptime format below has no slot for them.
    date_str = re.sub(r'\.[0-9]+', '', date_str)
        r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        timezone = datetime.timedelta()
        date_str = date_str[:-len(m.group(0))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d %H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S.%f',
    # Additional formats depend on the day_first flag (DD.MM vs MM/DD).
    format_expressions.extend([
    format_expressions.extend([
    # Try each format in turn until one parses.
    for expression in format_expressions:
        upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    # Last resort: the RFC 2822 parser.
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from *url*, falling back to *default_ext*."""
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
    # Known media extensions that may carry a trailing slash.
    elif guess.rstrip('/') in (
            'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
            'flv', 'f4v', 'f4a', 'f4b',
            'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
            'mkv', 'mka', 'mk3d',
            'f4f', 'f4m', 'm3u8', 'smil'):
        return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name for *filename*: strip its extension and
    append '.<language>.<format>'."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        unit = match.group('unit')
        # A bad aproximation?
        delta = datetime.timedelta(**{unit: time})
    # Plain YYYYMMDD date.
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
            self.start = datetime.datetime.min.date()
            self.end = date_from_str(end)
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
    if fileno not in WIN_OUTPUT_IDS:

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A real console is a local character device with a console mode.
        if handle == INVALID_HANDLE_VALUE or handle is None:
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane.
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:

        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            assert written.value > 0
            s = s[written.value:]
def write_string(s, out=None, encoding=None):
    """Write unicode string *s* to stream *out*, coping with Windows
    consoles and byte-mode streams."""
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Convert a bytes-like value into a list of integer byte values."""
    if isinstance(bs[0], int):  # Python 3
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a list of integer byte values into a bytes object."""
    return struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes

    class OVERLAPPED(ctypes.Structure):
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high dwords of the byte count: lock the whole file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

    # POSIX: advisory locks via fcntl.flock.
    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
class locked_file(object):
    """File wrapper that holds an advisory lock for the 'with' duration."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Writers take an exclusive lock; readers a shared one.
        exclusive = self.mode != 'r'
        _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
        _unlock_file(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), defaulting to 'utf-8' when the
    interpreter reports none."""
    reported = sys.getfilesystemencoding()
    if reported is None:
        return 'utf-8'
    return reported
def shell_quote(args):
    """Quote a sequence of arguments for display as a shell command line."""
    encoding = get_filesystem_encoding()
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # JSON-encode *data* and append it as a URL fragment.
    sdata = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata
def unsmuggle_url(smug_url, default=None):
    """Extract (url, data) previously packed by smuggle_url; data falls back to default."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    # Without this return the function silently yielded None.
    return url, data
def format_bytes(bytes):
    """Render a byte count as a human-readable string ('1.00KiB'); None -> 'N/A'."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        # math.log(0) raises, so handle zero explicitly.
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def parse_filesize(s):
    """Parse a human file size ('5 MB', '1KiB') into a byte count, or None."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
    }

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if not m:
        return None

    # Accept ',' as a decimal separator too.
    num_str = m.group('num').replace(',', '.')
    mult = _UNIT_TABLE[m.group('unit')]
    return int(float(num_str) * mult)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        # Unknown month name -> None rather than a ValueError for callers.
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviation (e.g. 'Jan' -> 1) """
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, leaving existing entities intact."""
    # Negative lookahead keeps already-escaped entities and numeric
    # character references untouched.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    """Best-effort: set the process title via prctl(PR_SET_NAME) on Linux."""
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        # Not Linux / no glibc - silently skip.
        return
    title_bytes = title.encode('utf-8')
    # +1 for the terminating NUL; an exactly-sized buffer makes the
    # .value assignment below raise ValueError.
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return s without the prefix `start` (unchanged if it is not a prefix)."""
    if s.startswith(start):
        return s[len(start):]
    # Previously fell through and returned None for non-matching input.
    return s
def remove_end(s, end):
    """Return s without the suffix `end` (unchanged if it is not a suffix)."""
    # `end` must be non-empty: s[:-0] would wrongly yield ''.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the last path component of a URL (query/fragment excluded)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    """A Request that issues HEAD instead of GET."""
    def get_method(self):
        # Missing return made this yield None, i.e. a GET request.
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v to int (optionally via attribute get_attr and scaling); None/'' -> default."""
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Stringify v with compat_str; None passes through as default."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: strips ',', '.' and '+' first """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to float with optional scaling; None -> default."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration string ('3:45', '2h 3m', '10.5 mins', 'PT1H2M3S'-ish) into seconds."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None
    res = 0
    # 'only_*' forms are fractional and returned directly.
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    if m.group('secs'):
        res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
    if m.group('mins'):
        res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
    if m.group('days'):
        res += int(m.group('days')) * 24 * 60 * 60
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the real extension ('a.mp4' -> 'a.ext.mp4').

    If expected_real_ext is given and does not match, `ext` is appended instead.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension ('a.mp4' -> 'a.ext').

    If expected_real_ext is given and does not match, `ext` is appended instead.
    """
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found / not executable.
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from program output, or `unrecognized`."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized
class PagedList(object):
    """Base class for lazily-paged sequences; subclasses implement getslice()."""
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages on demand from pagefunc(pagenum)."""
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted item within this page.
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted item within this page.
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known in advance."""
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Items to drop from the first fetched page.
        skip_elems = start - start_page * self._pagesize
        # Total number of items still wanted (None = unbounded).
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escape sequences embedded in s."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode \\uXXXX escape sequences embedded in s."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, quote() needs a byte string.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # _replace + geturl rebuilds the URL with each component escaped;
    # without .geturl() a ParseResult (not a string) was returned.
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
# Some Python 2.6 builds (and certain 2.7 point releases) reject unicode
# format specs in struct; probe once at import time and wrap if needed.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read a batch file and return its non-comment, non-empty URLs."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # '#', ';' and ']' introduce comment lines.
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
def encode_dict(d, encoding='utf-8'):
    """Encode every string key and value of d to bytes with the given encoding."""
    def encode(v):
        return v.encode(encoding) if isinstance(v, compat_basestring) else v
    return dict((encode(k), encode(v)) for k, v in d.items())
def parse_age_limit(s):
    """Parse an age limit like '18+' or a US rating string into an int, or None."""
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP callback wrapper ('cb({...});') down to the JSON payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object literal into parseable JSON."""
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            v = re.sub(r"\\'", "'", v[1:-1])
        elif v.startswith("'"):
            v = v[1:-1]
            # Re-escape for double-quoted JSON strings.
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas before closing brackets/braces.
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality ids rank below everything else.
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dot/dash separated version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(int(part) for part in parts)
def is_outdated_version(version, limit, assume_new=True):
    """Compare two version strings; unparseable input yields `not assume_new`."""
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    frozen_build = hasattr(sys, 'frozen')
    return running_from_zip or frozen_build
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
def mimetype2ext(mt):
    """Map a MIME type to a file extension; unknown subtypes pass through."""
    _, _, res = mt.rpartition('/')

    # NOTE(review): part of the mapping table appears to have been lost;
    # only 'x-mp4-fragmented' survives in the source — re-check against
    # upstream before relying on the other entries.
    return {
        '3gpp': '3gp',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
    }.get(res, res)
def urlhandle_detect_ext(url_handle):
    """Guess the file extension for a response: filename from
    Content-Disposition first, then the Content-Type MIME type."""
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI from raw bytes and a MIME type."""
    b64 = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, b64)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Longer BOMs must be checked before shorter prefixes of themselves
    # (utf-32-le before utf-16-le).
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Determine the download protocol for an info dict, from the explicit
    'protocol' field, the URL scheme, or the URL's file extension."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    widths = []
    for column in zip(*table):
        widths.append(max(len(compat_str(cell)) for cell in column))
    # Left-pad every column but the last to its widest cell plus one space.
    format_str = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    rendered_rows = [format_str % tuple(row) for row in table]
    return '\n'.join(rendered_rows)
1877 def _match_one(filter_part, dct):
1878 COMPARISON_OPERATORS = {
1886 operator_rex = re.compile(r'''(?x)\s*
1888 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1890 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1891 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1894 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1895 m = operator_rex.search(filter_part)
1897 op = COMPARISON_OPERATORS[m.group('op')]
1898 if m.group('strval') is not None:
1899 if m.group('op') not in ('=', '!='):
1901 'Operator %s does not support string values!' % m.group('op'))
1902 comparison_value = m.group('strval')
1905 comparison_value = int(m.group('intval'))
1907 comparison_value = parse_filesize(m.group('intval'))
1908 if comparison_value is None:
1909 comparison_value = parse_filesize(m.group('intval') + 'B')
1910 if comparison_value is None:
1912 'Invalid integer value %r in filter part %r' % (
1913 m.group('intval'), filter_part))
1914 actual_value = dct.get(m.group('key'))
1915 if actual_value is None:
1916 return m.group('none_inclusive')
1917 return op(actual_value, comparison_value)
1920 '': lambda v: v is not None,
1921 '!': lambda v: v is None,
1923 operator_rex = re.compile(r'''(?x)\s*
1924 (?P<op>%s)\s*(?P<key>[a-z_]+)
1926 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1927 m = operator_rex.search(filter_part)
1929 op = UNARY_OPERATORS[m.group('op')]
1930 actual_value = dct.get(m.group('key'))
1931 return op(actual_value)
1933 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&' joins sub-filters; all of them must pass.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a match-filter callable: returns None on pass, or a skip message."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('12.3s' or 'HH:MM:SS.f') into seconds."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
def srt_subtitles_timecode(seconds):
    """Format a second count as an SRT timecode: HH:MM:SS,mmm."""
    hrs = seconds / 3600
    mins = (seconds % 3600) / 60
    secs = seconds % 60
    msecs = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, msecs)
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup into SRT text."""
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })

    def parse_node(node):
        # Flatten a <p>/<span> subtree into plain text with newlines for <br>.
        str_or_empty = functools.partial(str_or_none, default='')

        out = str_or_empty(node.text)

        for child in node:
            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                out += '\n' + str_or_empty(child.tail)
            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                out += str_or_empty(parse_node(child))
            else:
                out += str_or_empty(xml.etree.ElementTree.tostring(child))

        return out

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    # Try the TTML namespace, the older ttaf1 namespace, then no namespace.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        if not end_time:
            # No 'end' attribute: derive it from 'dur'.
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as CLI args; with separator, as one 'opt<sep>value' token."""
    param = params.get(param)
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] alone when params[param] equals expected_value."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the extra-args list stored under `param`, or `default`.

    Note: `default` is returned as-is — callers must not mutate it.
    """
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE(review): the original two-way language table appears to have been
    # lost; this is a partial restoration of common entries — rebuild the
    # full map from the reference above.
    _lang_map = {
        'de': 'deu',
        'en': 'eng',
        'es': 'spa',
        'fr': 'fra',
        'it': 'ita',
        'ja': 'jpn',
        'nl': 'nld',
        'pt': 'por',
        'ru': 'rus',
        'zh': 'zho',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
class ISO3166Utils(object):
    # From http://data.okfn.org/data/core/country-list
    # NOTE(review): many entries appear to have been lost; restore the
    # complete table from the country-list dataset above. 'AX' was also
    # mojibake ('Ã…land') and has been repaired.
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler honouring a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers so proxy_open is consulted even for schemes
        # with no configured proxy.
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # Per-request proxy overrides the configured one; strip the
            # internal header before the request goes out.
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)