2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
43 compat_socket_create_connection,
47 compat_urllib_parse_urlparse,
48 compat_urllib_request,
54 # This is not clearly defined otherwise
55 compiled_regex_type = type(re.compile(''))
58 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
59 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
60 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
61 'Accept-Encoding': 'gzip, deflate',
62 'Accept-Language': 'en-us,en;q=0.5',
68 ENGLISH_MONTH_NAMES = [
69 'January', 'February', 'March', 'April', 'May', 'June',
70 'July', 'August', 'September', 'October', 'November', 'December']
73 def preferredencoding():
74 """Get preferred encoding.
76 Returns the best encoding scheme for the system, based on
77 locale.getpreferredencoding() and some further tweaks.
80 pref = locale.getpreferredencoding()
88 def write_json_file(obj, fn):
89 """ Encode obj as JSON and write it to fn, atomically if possible """
91 fn = encodeFilename(fn)
92 if sys.version_info < (3, 0) and sys.platform != 'win32':
93 encoding = get_filesystem_encoding()
94 # os.path.basename returns a bytes object, but NamedTemporaryFile
95 # will fail if the filename contains non ascii characters unless we
96 # use a unicode object
97 path_basename = lambda f: os.path.basename(fn).decode(encoding)
98 # the same for os.path.dirname
99 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
101 path_basename = os.path.basename
102 path_dirname = os.path.dirname
106 'prefix': path_basename(fn) + '.',
107 'dir': path_dirname(fn),
111 # In Python 2.x, json.dump expects a bytestream.
112 # In Python 3.x, it writes to a character stream
113 if sys.version_info < (3, 0):
121 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
126 if sys.platform == 'win32':
127 # Need to remove existing file on Windows, else os.rename raises
128 # WindowsError or FileExistsError.
133 os.rename(tf.name, fn)
142 if sys.version_info >= (2, 7):
143 def find_xpath_attr(node, xpath, key, val=None):
144 """ Find the xpath xpath[@key=val] """
145 assert re.match(r'^[a-zA-Z_-]+$', key)
147 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
148 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
149 return node.find(expr)
151 def find_xpath_attr(node, xpath, key, val=None):
152 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
153 # .//node does not match if a node is a direct child of . !
154 if isinstance(xpath, compat_str):
155 xpath = xpath.encode('ascii')
157 for f in node.findall(xpath):
158 if key not in f.attrib:
160 if val is None or f.attrib.get(key) == val:
164 # On python2.6 the xml.etree.ElementTree.Element methods don't support
165 # the namespace parameter
168 def xpath_with_ns(path, ns_map):
169 components = [c.split(':') for c in path.split('/')]
173 replaced.append(c[0])
176 replaced.append('{%s}%s' % (ns_map[ns], tag))
177 return '/'.join(replaced)
180 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
181 if sys.version_info < (2, 7): # Crazy 2.6
182 xpath = xpath.encode('ascii')
186 if default is not NO_DEFAULT:
189 name = xpath if name is None else name
190 raise ExtractorError('Could not find XML element %s' % name)
196 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
197 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
198 if n is None or n == default:
201 if default is not NO_DEFAULT:
204 name = xpath if name is None else name
205 raise ExtractorError('Could not find XML element\'s text %s' % name)
211 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
212 n = find_xpath_attr(node, xpath, key)
214 if default is not NO_DEFAULT:
217 name = '%s[@%s]' % (xpath, key) if name is None else name
218 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the inner content of the HTML tag whose id attribute equals *id*."""
    # NOTE: the parameter *id* shadows the builtin of the same name; kept
    # unchanged for API compatibility with existing callers.
    return get_element_by_attribute("id", id, html)
229 def get_element_by_attribute(attribute, value, html):
230 """Return the content of the tag with the specified attribute in the passed HTML document"""
232 m = re.search(r'''(?xs)
234 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
236 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
240 ''' % (re.escape(attribute), re.escape(value)), html)
244 res = m.group('content')
246 if res.startswith('"') or res.startswith("'"):
249 return unescapeHTML(res)
252 def clean_html(html):
253 """Clean an HTML snippet into a readable string"""
255 if html is None: # Convenience for sanitizing descriptions etc.
259 html = html.replace('\n', ' ')
260 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
261 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
263 html = re.sub('<.*?>', '', html)
264 # Replace html entities
265 html = unescapeHTML(html)
269 def sanitize_open(filename, open_mode):
270 """Try to open the given filename, and slightly tweak it if this fails.
272 Attempts to open the given filename. If this fails, it tries to change
273 the filename slightly, step by step, until it's either able to open it
274 or it fails and raises a final exception, like the standard open()
277 It returns the tuple (stream, definitive_file_name).
281 if sys.platform == 'win32':
283 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
284 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
285 stream = open(encodeFilename(filename), open_mode)
286 return (stream, filename)
287 except (IOError, OSError) as err:
288 if err.errno in (errno.EACCES,):
291 # In case of error, try to remove win32 forbidden chars
292 alt_filename = sanitize_path(filename)
293 if alt_filename == filename:
296 # An exception here should be caught in the caller
297 stream = open(encodeFilename(alt_filename), open_mode)
298 return (stream, alt_filename)
301 def timeconvert(timestr):
302 """Convert RFC 2822 defined time string into system timestamp"""
304 timetuple = email.utils.parsedate_tz(timestr)
305 if timetuple is not None:
306 timestamp = email.utils.mktime_tz(timetuple)
310 def sanitize_filename(s, restricted=False, is_id=False):
311 """Sanitizes a string so it could be used as part of a filename.
312 If restricted is set, use a stricter subset of allowed characters.
313 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
315 def replace_insane(char):
316 if char == '?' or ord(char) < 32 or ord(char) == 127:
319 return '' if restricted else '\''
321 return '_-' if restricted else ' -'
322 elif char in '\\/|*<>':
324 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
326 if restricted and ord(char) > 127:
331 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
332 result = ''.join(map(replace_insane, s))
334 while '__' in result:
335 result = result.replace('__', '_')
336 result = result.strip('_')
337 # Common case of "Foreign band name - English song title"
338 if restricted and result.startswith('-_'):
340 if result.startswith('-'):
341 result = '_' + result[len('-'):]
342 result = result.lstrip('.')
348 def sanitize_path(s):
349 """Sanitizes and normalizes path on Windows"""
350 if sys.platform != 'win32':
352 drive_or_unc, _ = os.path.splitdrive(s)
353 if sys.version_info < (2, 7) and not drive_or_unc:
354 drive_or_unc, _ = os.path.splitunc(s)
355 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
359 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
360 for path_part in norm_path]
362 sanitized_path.insert(0, drive_or_unc + os.path.sep)
363 return os.path.join(*sanitized_path)
366 def orderedSet(iterable):
367 """ Remove all duplicates from the input iterable """
375 def _htmlentity_transform(entity):
376 """Transforms an HTML entity to a character."""
377 # Known non-numeric HTML entity
378 if entity in compat_html_entities.name2codepoint:
379 return compat_chr(compat_html_entities.name2codepoint[entity])
381 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
383 numstr = mobj.group(1)
384 if numstr.startswith('x'):
386 numstr = '0%s' % numstr
389 return compat_chr(int(numstr, base))
391 # Unknown entity in name, return its literal representation
392 return ('&%s;' % entity)
398 assert type(s) == compat_str
401 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
404 def get_subprocess_encoding():
405 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
406 # For subprocess calls, encode with locale encoding
407 # Refer to http://stackoverflow.com/a/9951851/35070
408 encoding = preferredencoding()
410 encoding = sys.getfilesystemencoding()
416 def encodeFilename(s, for_subprocess=False):
418 @param s The name of the file
421 assert type(s) == compat_str
423 # Python 3 has a Unicode API
424 if sys.version_info >= (3, 0):
427 # Pass '' directly to use Unicode APIs on Windows 2000 and up
428 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
429 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
430 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
433 return s.encode(get_subprocess_encoding(), 'ignore')
436 def decodeFilename(b, for_subprocess=False):
438 if sys.version_info >= (3, 0):
441 if not isinstance(b, bytes):
444 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for passing to a subprocess.

    Delegates to encodeFilename with for_subprocess=True so the argument is
    encoded with the subprocess (locale) encoding rather than the filesystem one.
    """
    if not isinstance(s, compat_str):
        # Legacy callers may still hand us byte strings; normalize to text first.
        # TODO: replace with an assertion once all post processors pass compat_str.
        s = s.decode('ascii')
    return encodeFilename(s, for_subprocess=True)
def decodeArgument(b):
    """Decode a subprocess argument back to text (counterpart of encodeArgument)."""
    return decodeFilename(b, for_subprocess=True)
460 def decodeOption(optval):
463 if isinstance(optval, bytes):
464 optval = optval.decode(preferredencoding())
466 assert isinstance(optval, compat_str)
470 def formatSeconds(secs):
472 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
474 return '%d:%02d' % (secs // 60, secs % 60)
479 def make_HTTPS_handler(params, **kwargs):
480 opts_no_check_certificate = params.get('nocheckcertificate', False)
481 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
482 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
483 if opts_no_check_certificate:
484 context.check_hostname = False
485 context.verify_mode = ssl.CERT_NONE
487 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
490 # (create_default_context present but HTTPSHandler has no context=)
493 if sys.version_info < (3, 2):
494 return YoutubeDLHTTPSHandler(params, **kwargs)
496 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
497 context.verify_mode = (ssl.CERT_NONE
498 if opts_no_check_certificate
499 else ssl.CERT_REQUIRED)
500 context.set_default_verify_paths()
501 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
504 def bug_reports_message():
505 if ytdl_is_updateable():
506 update_cmd = 'type youtube-dl -U to update'
508 update_cmd = 'see https://yt-dl.org/update on how to update'
509 msg = '; please report this issue on https://yt-dl.org/bug .'
510 msg += ' Make sure you are using the latest version; %s.' % update_cmd
511 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
515 class ExtractorError(Exception):
516 """Error during info extraction."""
518 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
519 """ tb, if given, is the original traceback (so that it can be printed out).
520 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
523 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
525 if video_id is not None:
526 msg = video_id + ': ' + msg
528 msg += ' (caused by %r)' % cause
530 msg += bug_reports_message()
531 super(ExtractorError, self).__init__(msg)
534 self.exc_info = sys.exc_info() # preserve original exception
536 self.video_id = video_id
538 def format_traceback(self):
539 if self.traceback is None:
541 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        # expected=True: this is a normal user-facing error, not a bug.
        super(UnsupportedError, self).__init__(message, expected=True)
551 class RegexNotFoundError(ExtractorError):
552 """Error when a regex didn't match"""
556 class DownloadError(Exception):
557 """Download Error exception.
559 This exception may be thrown by FileDownloader objects if they are not
560 configured to continue on errors. They will contain the appropriate
564 def __init__(self, msg, exc_info=None):
565 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
566 super(DownloadError, self).__init__(msg)
567 self.exc_info = exc_info
570 class SameFileError(Exception):
571 """Same File exception.
573 This exception will be thrown by FileDownloader objects if they detect
574 multiple files would have to be downloaded to the same file on disk.
579 class PostProcessingError(Exception):
580 """Post Processing exception.
582 This exception may be raised by PostProcessor's .run() method to
583 indicate an error in the postprocessing task.
586 def __init__(self, msg):
590 class MaxDownloadsReached(Exception):
591 """ --max-downloads limit has been reached. """
595 class UnavailableVideoError(Exception):
596 """Unavailable Format exception.
598 This exception will be thrown when a video is requested
599 in a format that is not available for that video.
604 class ContentTooShortError(Exception):
605 """Content Too Short exception.
607 This exception may be raised by FileDownloader objects when a file they
608 download is too small for what the server announced first, indicating
609 the connection was probably interrupted.
612 def __init__(self, downloaded, expected):
614 self.downloaded = downloaded
615 self.expected = expected
618 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
619 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
620 # expected HTTP responses to meet HTTP/1.0 or later (see also
621 # https://github.com/rg3/youtube-dl/issues/6727)
622 if sys.version_info < (3, 0):
623 kwargs[b'strict'] = True
624 hc = http_class(*args, **kwargs)
625 source_address = ydl_handler._params.get('source_address')
626 if source_address is not None:
627 sa = (source_address, 0)
628 if hasattr(hc, 'source_address'): # Python 2.7+
629 hc.source_address = sa
631 def _hc_connect(self, *args, **kwargs):
632 sock = compat_socket_create_connection(
633 (self.host, self.port), self.timeout, sa)
635 self.sock = ssl.wrap_socket(
636 sock, self.key_file, self.cert_file,
637 ssl_version=ssl.PROTOCOL_TLSv1)
640 hc.connect = functools.partial(_hc_connect, hc)
645 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
646 """Handler for HTTP requests and responses.
648 This class, when installed with an OpenerDirector, automatically adds
649 the standard headers to every HTTP request and handles gzipped and
650 deflated responses from web servers. If compression is to be avoided in
651 a particular request, the original request in the program code only has
652 to include the HTTP header "Youtubedl-No-Compression", which will be
653 removed before making the real request.
655 Part of this code was copied from:
657 http://techknack.net/python-urllib2-handlers/
659 Andrew Rowls, the author of that code, agreed to release it to the
663 def __init__(self, params, *args, **kwargs):
664 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
665 self._params = params
667 def http_open(self, req):
668 return self.do_open(functools.partial(
669 _create_http_connection, self, compat_http_client.HTTPConnection, False),
675 return zlib.decompress(data, -zlib.MAX_WBITS)
677 return zlib.decompress(data)
680 def addinfourl_wrapper(stream, headers, url, code):
681 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
682 return compat_urllib_request.addinfourl(stream, headers, url, code)
683 ret = compat_urllib_request.addinfourl(stream, headers, url)
687 def http_request(self, req):
688 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
689 # always respected by websites, some tend to give out URLs with non percent-encoded
690 # non-ASCII characters (see telemb.py, ard.py [#3412])
691 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
692 # To work around aforementioned issue we will replace request's original URL with
693 # percent-encoded one
694 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
695 # the code of this workaround has been moved here from YoutubeDL.urlopen()
696 url = req.get_full_url()
697 url_escaped = escape_url(url)
699 # Substitute URL if any change after escaping
700 if url != url_escaped:
701 req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
703 url_escaped, data=req.data, headers=req.headers,
704 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
705 new_req.timeout = req.timeout
708 for h, v in std_headers.items():
709 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
710 # The dict keys are capitalized because of this bug by urllib
711 if h.capitalize() not in req.headers:
713 if 'Youtubedl-no-compression' in req.headers:
714 if 'Accept-encoding' in req.headers:
715 del req.headers['Accept-encoding']
716 del req.headers['Youtubedl-no-compression']
718 if sys.version_info < (2, 7) and '#' in req.get_full_url():
719 # Python 2.6 is brain-dead when it comes to fragments
720 req._Request__original = req._Request__original.partition('#')[0]
721 req._Request__r_type = req._Request__r_type.partition('#')[0]
725 def http_response(self, req, resp):
728 if resp.headers.get('Content-encoding', '') == 'gzip':
729 content = resp.read()
730 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
732 uncompressed = io.BytesIO(gz.read())
733 except IOError as original_ioerror:
734 # There may be junk add the end of the file
735 # See http://stackoverflow.com/q/4928560/35070 for details
736 for i in range(1, 1024):
738 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
739 uncompressed = io.BytesIO(gz.read())
744 raise original_ioerror
745 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
746 resp.msg = old_resp.msg
748 if resp.headers.get('Content-encoding', '') == 'deflate':
749 gz = io.BytesIO(self.deflate(resp.read()))
750 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
751 resp.msg = old_resp.msg
752 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
753 # https://github.com/rg3/youtube-dl/issues/6457).
754 if 300 <= resp.code < 400:
755 location = resp.headers.get('Location')
757 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
758 if sys.version_info >= (3, 0):
759 location = location.encode('iso-8859-1').decode('utf-8')
760 location_escaped = escape_url(location)
761 if location != location_escaped:
762 del resp.headers['Location']
763 resp.headers['Location'] = location_escaped
766 https_request = http_request
767 https_response = http_response
770 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
771 def __init__(self, params, https_conn_class=None, *args, **kwargs):
772 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
773 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
774 self._params = params
776 def https_open(self, req):
778 if hasattr(self, '_context'): # python > 2.6
779 kwargs['context'] = self._context
780 if hasattr(self, '_check_hostname'): # python 3.x
781 kwargs['check_hostname'] = self._check_hostname
782 return self.do_open(functools.partial(
783 _create_http_connection, self, self._https_conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also applies cookie handling to HTTPS traffic."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on the next HTTP request in a row if there are
        # non-ASCII characters in the Set-Cookie HTTP header of the last
        # response (see https://github.com/rg3/youtube-dl/issues/6769).
        # The percent-encoding workaround below is intentionally left disabled:
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #         if set_cookie != set_cookie_escaped:
        #             del response.headers[set_cookie_header]
        #             response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Reuse the HTTP handlers for HTTPS requests/responses as well.
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
811 def parse_iso8601(date_str, delimiter='T', timezone=None):
812 """ Return a UNIX timestamp from the given date """
817 date_str = re.sub(r'\.[0-9]+', '', date_str)
821 r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
824 timezone = datetime.timedelta()
826 date_str = date_str[:-len(m.group(0))]
827 if not m.group('sign'):
828 timezone = datetime.timedelta()
830 sign = 1 if m.group('sign') == '+' else -1
831 timezone = datetime.timedelta(
832 hours=sign * int(m.group('hours')),
833 minutes=sign * int(m.group('minutes')))
835 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
836 dt = datetime.datetime.strptime(date_str, date_format) - timezone
837 return calendar.timegm(dt.timetuple())
842 def unified_strdate(date_str, day_first=True):
843 """Return a string with the date in the format YYYYMMDD"""
849 date_str = date_str.replace(',', ' ')
850 # %z (UTC offset) is only supported in python>=3.2
851 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
852 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
853 # Remove AM/PM + timezone
854 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
856 format_expressions = [
861 '%b %dst %Y %I:%M%p',
862 '%b %dnd %Y %I:%M%p',
863 '%b %dth %Y %I:%M%p',
869 '%Y-%m-%d %H:%M:%S.%f',
872 '%Y-%m-%dT%H:%M:%SZ',
873 '%Y-%m-%dT%H:%M:%S.%fZ',
874 '%Y-%m-%dT%H:%M:%S.%f0Z',
876 '%Y-%m-%dT%H:%M:%S.%f',
880 format_expressions.extend([
888 format_expressions.extend([
895 for expression in format_expressions:
897 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
900 if upload_date is None:
901 timetuple = email.utils.parsedate_tz(date_str)
903 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
907 def determine_ext(url, default_ext='unknown_video'):
910 guess = url.partition('?')[0].rpartition('.')[2]
911 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name <base>.<lang>.<format> from a media *filename*."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
921 def date_from_str(date_str):
923 Return a datetime object from a string in the format YYYYMMDD or
924 (now|today)[+-][0-9](day|week|month|year)(s)?"""
925 today = datetime.date.today()
926 if date_str in ('now', 'today'):
928 if date_str == 'yesterday':
929 return today - datetime.timedelta(days=1)
930 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
931 if match is not None:
932 sign = match.group('sign')
933 time = int(match.group('time'))
936 unit = match.group('unit')
937 # A bad aproximation?
945 delta = datetime.timedelta(**{unit: time})
947 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
950 def hyphenate_date(date_str):
952 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
953 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
954 if match is not None:
955 return '-'.join(match.groups())
960 class DateRange(object):
961 """Represents a time interval between two dates"""
963 def __init__(self, start=None, end=None):
964 """start and end must be strings in the format accepted by date"""
965 if start is not None:
966 self.start = date_from_str(start)
968 self.start = datetime.datetime.min.date()
970 self.end = date_from_str(end)
972 self.end = datetime.datetime.max.date()
973 if self.start > self.end:
974 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
978 """Returns a range that only contains the given day"""
981 def __contains__(self, date):
982 """Check if the date is in the range"""
983 if not isinstance(date, datetime.date):
984 date = date_from_str(date)
985 return self.start <= date <= self.end
988 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
992 """ Returns the platform name as a compat_str """
993 res = platform.platform()
994 if isinstance(res, bytes):
995 res = res.decode(preferredencoding())
997 assert isinstance(res, compat_str)
1001 def _windows_write_string(s, out):
1002 """ Returns True if the string was written using special methods,
1003 False if it has yet to be written out."""
1004 # Adapted from http://stackoverflow.com/a/3259271/35070
1007 import ctypes.wintypes
1015 fileno = out.fileno()
1016 except AttributeError:
1017 # If the output stream doesn't have a fileno, it's virtual
1019 except io.UnsupportedOperation:
1020 # Some strange Windows pseudo files?
1022 if fileno not in WIN_OUTPUT_IDS:
1025 GetStdHandle = ctypes.WINFUNCTYPE(
1026 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1027 (b"GetStdHandle", ctypes.windll.kernel32))
1028 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1030 WriteConsoleW = ctypes.WINFUNCTYPE(
1031 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1032 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1033 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
1034 written = ctypes.wintypes.DWORD(0)
1036 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
1037 FILE_TYPE_CHAR = 0x0002
1038 FILE_TYPE_REMOTE = 0x8000
1039 GetConsoleMode = ctypes.WINFUNCTYPE(
1040 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1041 ctypes.POINTER(ctypes.wintypes.DWORD))(
1042 (b"GetConsoleMode", ctypes.windll.kernel32))
1043 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1045 def not_a_console(handle):
1046 if handle == INVALID_HANDLE_VALUE or handle is None:
1048 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1049 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1051 if not_a_console(h):
1054 def next_nonbmp_pos(s):
1056 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1057 except StopIteration:
1061 count = min(next_nonbmp_pos(s), 1024)
1063 ret = WriteConsoleW(
1064 h, s, count if count else 2, ctypes.byref(written), None)
1066 raise OSError('Failed to write string')
1067 if not count: # We just wrote a non-BMP character
1068 assert written.value == 2
1071 assert written.value > 0
1072 s = s[written.value:]
1076 def write_string(s, out=None, encoding=None):
1079 assert type(s) == compat_str
1081 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1082 if _windows_write_string(s, out):
1085 if ('b' in getattr(out, 'mode', '') or
1086 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1087 byt = s.encode(encoding or preferredencoding(), 'ignore')
1089 elif hasattr(out, 'buffer'):
1090 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1091 byt = s.encode(enc, 'ignore')
1092 out.buffer.write(byt)
1098 def bytes_to_intlist(bs):
1101 if isinstance(bs[0], int): # Python 3
1104 return [ord(c) for c in bs]
1107 def intlist_to_bytes(xs):
1110 return struct_pack('%dB' % len(xs), *xs)
1113 # Cross-platform file locking
1114 if sys.platform == 'win32':
1115 import ctypes.wintypes
1118 class OVERLAPPED(ctypes.Structure):
1120 ('Internal', ctypes.wintypes.LPVOID),
1121 ('InternalHigh', ctypes.wintypes.LPVOID),
1122 ('Offset', ctypes.wintypes.DWORD),
1123 ('OffsetHigh', ctypes.wintypes.DWORD),
1124 ('hEvent', ctypes.wintypes.HANDLE),
1127 kernel32 = ctypes.windll.kernel32
1128 LockFileEx = kernel32.LockFileEx
1129 LockFileEx.argtypes = [
1130 ctypes.wintypes.HANDLE, # hFile
1131 ctypes.wintypes.DWORD, # dwFlags
1132 ctypes.wintypes.DWORD, # dwReserved
1133 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1134 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1135 ctypes.POINTER(OVERLAPPED) # Overlapped
1137 LockFileEx.restype = ctypes.wintypes.BOOL
1138 UnlockFileEx = kernel32.UnlockFileEx
1139 UnlockFileEx.argtypes = [
1140 ctypes.wintypes.HANDLE, # hFile
1141 ctypes.wintypes.DWORD, # dwReserved
1142 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1143 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1144 ctypes.POINTER(OVERLAPPED) # Overlapped
1146 UnlockFileEx.restype = ctypes.wintypes.BOOL
1147 whole_low = 0xffffffff
1148 whole_high = 0x7fffffff
def _lock_file(f, exclusive):
    # Acquire a whole-file lock on *f* via the Win32 LockFileEx API.
    # 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock.
    overlapped = OVERLAPPED()
    overlapped.Offset = 0
    overlapped.OffsetHigh = 0
    overlapped.hEvent = 0
    # Stash the OVERLAPPED pointer on the file object so it stays alive
    # until the matching _unlock_file call uses it.
    f._lock_file_overlapped_p = ctypes.pointer(overlapped)
    handle = msvcrt.get_osfhandle(f.fileno())
    if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                      whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Locking file failed: %r' % ctypes.FormatError())
def _unlock_file(f):
    # Release the lock taken by _lock_file, reusing the OVERLAPPED pointer
    # that _lock_file stashed on the file object.
    assert f._lock_file_overlapped_p
    handle = msvcrt.get_osfhandle(f.fileno())
    if not UnlockFileEx(handle, 0,
                        whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1171 def _lock_file(f, exclusive):
1172 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1174 def _unlock_file(f):
1175 fcntl.flock(f, fcntl.LOCK_UN)
1178 class locked_file(object):
1179 def __init__(self, filename, mode, encoding=None):
1180 assert mode in ['r', 'a', 'w']
1181 self.f = io.open(filename, mode, encoding=encoding)
1184 def __enter__(self):
1185 exclusive = self.mode != 'r'
1187 _lock_file(self.f, exclusive)
1193 def __exit__(self, etype, value, traceback):
1195 _unlock_file(self.f)
    def write(self, *args):
        # Delegate straight to the wrapped file object.
        return self.f.write(*args)

    def read(self, *args):
        # Delegate straight to the wrapped file object.
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' when None."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
1214 def shell_quote(args):
1216 encoding = get_filesystem_encoding()
1218 if isinstance(a, bytes):
1219 # We may get a filename encoded with 'encodeFilename'
1220 a = a.decode(encoding)
1221 quoted_args.append(pipes.quote(a))
1222 return ' '.join(quoted_args)
def smuggle_url(url, data):
    """Pass additional data in a URL for internal use.

    The data is JSON-encoded and appended as a URL fragment so that it can be
    recovered later with unsmuggle_url.
    """
    smuggled = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, smuggled)
1233 def unsmuggle_url(smug_url, default=None):
1234 if '#__youtubedl_smuggle' not in smug_url:
1235 return smug_url, default
1236 url, _, sdata = smug_url.rpartition('#')
1237 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1238 data = json.loads(jsond)
1242 def format_bytes(bytes):
1245 if type(bytes) is str:
1246 bytes = float(bytes)
1250 exponent = int(math.log(bytes, 1024.0))
1251 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1252 converted = float(bytes) / float(1024 ** exponent)
1253 return '%.2f%s' % (converted, suffix)
def parse_filesize(s):
    """Parse a human-readable file size ('5 MiB', '2,3GB') into bytes.

    NOTE(review): the `_UNIT_TABLE` mapping, the None guard and the
    `m = re.search(...)` binding with its `if not m: return None` check
    are elided in this view — confirm against the full file.
    """
    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    # European-style decimal comma is normalised to a point.
    num_str = m.group('num').replace(',', '.')
    mult = _UNIT_TABLE[m.group('unit')]
    return int(float(num_str) * mult)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        # Unknown month names yield None rather than raising.
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    three-letter abbreviation """
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        # Unknown abbreviations yield None rather than raising.
        return None
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' by '&amp;' in XML, leaving existing entities
    (named, decimal and hex character references) untouched."""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    """Best-effort: rename the current process via libc prctl(PR_SET_NAME).

    NOTE(review): the `try:` that guards the LoadLibrary/prctl calls is
    elided in this view; only its `except AttributeError:` is visible.
    """
    assert isinstance(title, compat_str)
    libc = ctypes.cdll.LoadLibrary("libc.so.6")
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    # 15 == PR_SET_NAME (see <linux/prctl.h>)
    libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return s with the prefix `start` removed; s unchanged otherwise."""
    if s.startswith(start):
        return s[len(start):]
    # No such prefix: return the input unchanged instead of None.
    return s
def remove_end(s, end):
    """Return s with the suffix `end` removed; s unchanged otherwise."""
    if s.endswith(end):
        # Slice via len(s) - len(end) so an empty suffix is a no-op
        # (s[:-0] would wrongly yield '').
        return s[:len(s) - len(end)]
    return s
def url_basename(url):
    """Return the last path component of url (query/fragment ignored)."""
    parsed_path = compat_urlparse.urlparse(url).path
    components = parsed_path.strip('/').split('/')
    return components[-1]
class HEADRequest(compat_urllib_request.Request):
    """Request subclass that issues an HTTP HEAD instead of GET."""

    def get_method(self):
        # urllib consults this method to pick the HTTP verb; without the
        # return the request would report None and break.
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int with optional scaling; None maps to `default`.

    When get_attr is given, the named attribute is read off v first.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v is None:
        return default
    return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Coerce v to compat_str, passing None through as `default`."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Strip thousands separators and an optional leading '+'.
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float with optional scaling; None maps to `default`."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration string ('3 min', '1:02:03', '4.5s') into seconds.

    NOTE(review): this view elides the `re.match` call that binds `m`,
    an `only_secs` style alternative, the `res = 0` initialisation, the
    `if m.group(...)` guards around several additions, and the final
    `return res` — confirm against the full file.
    """
    if not isinstance(s, compat_basestring):
        (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
        (?P<only_hours>[0-9.]+)\s*(?:hours?)|
        \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
        (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
        (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
        (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
        (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
    # "N minutes" / "N hours" shortcuts return directly.
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
    res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
    res += int(m.group('days')) * 24 * 60 * 60
    res += float(m.group('ms'))
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the real extension of `filename`.

    If expected_real_ext is given and the actual extension differs,
    `ext` is appended after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace filename's extension with `ext`.

    If expected_real_ext is given and the actual extension differs,
    `ext` is appended to the whole filename instead of replacing.
    """
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found / not executable: report absence, don't crash.
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        # Executable missing or not runnable.
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's --version output.

    Falls back to `unrecognized` when no version can be matched.
    """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized
class PagedList(object):
    """Abstract base for lazily paged result lists; subclasses provide
    getslice(start, end).

    NOTE(review): the `def __len__(self):` header is elided in this
    view; the visible `return` belongs to it.
    """
    # This is only useful for tests
    return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages on demand via pagefunc(pagenum).

    NOTE(review): several lines of getslice are elided in this view
    (`res = []`, the `continue`/`break` statements, the `startv =` /
    `endv =` assignment headers and the final `return res`) — confirm
    against the full file.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # Walk pages starting at the page containing `start`.
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
            page_results = list(self._pagefunc(pagenum))
            # Offset into the first interesting page.
                start % self._pagesize
                if firstid <= start < nextfirstid
            # One-past-the-end offset within the last interesting page.
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known up front.

    NOTE(review): elided in this view: `res = []`, the `end_page = min(`
    opener, the `if skip_elems:` guard with its reset, the loop's
    `break`, `res.extend(page)` and the final `return res` — confirm
    against the full file.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        start_page = start // self._pagesize
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Elements to drop from the first page to honour `start`.
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
                page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    page = page[:only_more]
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escapes found in s into characters."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode literal \\uXXXX escapes found in s into characters."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() chokes on unicode input; pre-encode it there.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Escape each component individually, then reassemble the URL string.
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
    # Probe whether this interpreter's struct accepts a unicode format
    # string.  NOTE(review): the module-level `try:` / `except TypeError:`
    # / `else:` framing of this probe is elided in this view.
    struct.pack('!I', 0)
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        # Coerce the format spec to bytes for old-struct compatibility.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)

    # Modern struct: use the builtins directly.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, skipping comments and blanks.

    NOTE(review): the inner `def fixup(url):` header, a `url = url.strip()`
    line and fixup's return statements are elided in this view — confirm
    against the full file.
    """
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # Strip a UTF-8 BOM that survived decoding as raw characters.
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        # Lines starting with '#', ';' or ']' are treated as comments.
        if url.startswith(('#', ';', ']')):

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
def encode_dict(d, encoding='utf-8'):
    """Return a copy of d with every key and value encoded to bytes."""
    encoded = {}
    for key, value in d.items():
        encoded[key.encode(encoding)] = value.encode(encoding)
    return encoded
    # Pick an ElementTree iterator: Element.iter where available, falling
    # back to findall('.//*') on very old interpreters.
    # NOTE(review): the module-level `try:` opening this guard is elided
    # in this view.
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
    # Body of parse_xml(s).  NOTE(review): the `def parse_xml(s):` header
    # and the final `return tree` are elided in this view.
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # Only Python >= 2.7 accepts a custom parser keyword here.
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+' into an int.

    Returns None for None input; non-numeric strings fall back to the
    US_RATINGS lookup table (e.g. 'PG-13').
    """
    if s is None:
        # re.match would raise TypeError on None.
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, returning the bare JSON payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert JavaScript-ish object literals into valid JSON text.

    NOTE(review): this view elides the inner `def fix_kv(m):` header,
    its `v = m.group(0)` binding, several return statements, the
    escape-mapping dict body and the final `return res` — confirm
    against the full file.
    """
        if v in ('true', 'false', 'null'):
        if v.startswith('"'):
            v = re.sub(r"\\'", "'", v[1:-1])
        elif v.startswith("'"):
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
    # Drop trailing commas before closing brackets/braces.
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality rank; unknown ids rank
        # lowest (-1) instead of raising ValueError.
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Truncate so the result, including the ellipses, fits `length`.
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Compare two dotted version strings.

    Empty/unparsable input falls back to `not assume_new` instead of
    raising, so callers get a usable boolean either way.
    """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    is_frozen = hasattr(sys, 'frozen')
    return running_from_zip or is_frozen
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
def mimetype2ext(mt):
    """Map a MIME type to a file extension, defaulting to the subtype."""
    _, _, res = mt.rpartition('/')
    # Subtypes whose conventional extension differs from their name.
    return {
        'x-mp4-fragmented': 'mp4',
    }.get(res, res)
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for a urllib response object.

    Prefers a Content-Disposition filename, falling back to the
    Content-Type MIME type.  NOTE(review): the `try:` framing and the
    `if cd:` / `if m:` / `if e:` guards with an intermediate `return e`
    are elided in this view — confirm against the full file.
    """
        # Python 3 exposes headers as a mapping.
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
    e = determine_ext(m.group('filename'), default_ext=None)

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Wrap raw bytes into an RFC 2397 data: URI with the given MIME type."""
    b64_payload = base64.b64encode(data).decode('ascii')
    return 'data:{0};base64,{1}'.format(mime_type, b64_payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Recognised byte-order marks, longest first so UTF-32 wins over UTF-16.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        # No BOM: assume UTF-8 and decode leniently.
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Work out the download protocol for an info dict.

    NOTE(review): the `return` statements of each branch (the explicit
    protocol, 'rtmp', 'mms', 'rtsp' and the m3u8/f4m handling after the
    `determine_ext` call) are elided in this view — confirm against the
    full file.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:

    url = info_dict['url']
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):

    ext = determine_ext(url)

    # Fallback: the URL scheme itself (http, https, ftp, ...).
    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    all_rows = [header_row] + data
    column_widths = []
    for column in zip(*all_rows):
        column_widths.append(max(len(compat_str(cell)) for cell in column))
    # Left-pad every column but the last; the last keeps its natural width.
    pieces = ['%-' + compat_str(width + 1) + 's' for width in column_widths[:-1]]
    row_format = ' '.join(pieces) + '%s'
    return '\n'.join(row_format % tuple(row) for row in all_rows)
def _match_one(filter_part, dct):
    """Evaluate one comparison or unary filter expression against dct.

    NOTE(review): this view elides the COMPARISON_OPERATORS entries, the
    `(?P<key>...)` part of the regex, the `if m:` guards, the
    `UNARY_OPERATORS = {` opener and the try/except around int parsing —
    confirm against the full file.
    """
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
        (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    op = COMPARISON_OPERATORS[m.group('op')]
    if m.group('strval') is not None:
        if m.group('op') not in ('=', '!='):
            # String operands only make sense for (in)equality tests.
                'Operator %s does not support string values!' % m.group('op'))
        comparison_value = m.group('strval')
        comparison_value = int(m.group('intval'))
        # Fall back to human-readable sizes ('500KiB', then '500K' + 'B').
        comparison_value = parse_filesize(m.group('intval'))
        if comparison_value is None:
            comparison_value = parse_filesize(m.group('intval') + 'B')
        if comparison_value is None:
                'Invalid integer value %r in filter part %r' % (
                    m.group('intval'), filter_part))
    actual_value = dct.get(m.group('key'))
    if actual_value is None:
        # '?' after the operator makes a missing key pass the test.
        return m.group('none_inclusive')
    return op(actual_value, comparison_value)

        '': lambda v: v is not None,
        '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    op = UNARY_OPERATORS[m.group('op')]
    actual_value = dct.get(m.group('key'))
    return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&'-separated clauses must all hold.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a match-filter callback for the given filter expression.

    The returned function yields None when the video passes, or a
    human-readable skip message when it does not.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('3.5s' or 'HH:MM:SS.mmm') into
    seconds; returns None for empty/unrecognised input."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
def srt_subtitles_timecode(seconds):
    """Format a seconds offset as an SRT timecode: HH:MM:SS,mmm."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle XML into SRT-formatted text.

    NOTE(review): several structural lines are elided in this view (the
    ns_map closer, parse_node's child loop header and `return out`, the
    `out = []` accumulator, missing-attribute handling around
    begin/end/dur, and the final joined return) — confirm against the
    full file.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',

    def parse_node(node):
        # Flatten a paragraph node to plain text, honouring <br>/<span>.
        str_or_empty = functools.partial(str_or_none, default='')

        out = str_or_empty(node.text)

            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                out += '\n' + str_or_empty(child.tail)
            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                out += str_or_empty(parse_node(child))
                out += str_or_empty(xml.etree.ElementTree.tostring(child))

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))

    # Try both TTML namespaces before falling back to un-namespaced <p>.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as CLI args.

    With a separator, produces one '--opt<sep>value' token; otherwise two
    separate tokens. params[param] must be a bool.
    """
    param = params.get(param)
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the extra-args list stored at params[param], or `default`
    when the key is unset. The stored value must be a list."""
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE(review): the `_lang_map` table, the @classmethod decorators and
    # long2short's trailing `return short_name` are elided in this view —
    # confirm against the full file.
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup over the forward map.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
class ISO3166Utils(object):
    # From http://data.okfn.org/data/core/country-list
    # NOTE(review): the `_country_map = {` opener, most entries, the
    # closing brace and the @classmethod decorator are elided in this
    # view — confirm against the full file.
        'AF': 'Afghanistan',
        'AX': 'Ã…land Islands',  # NOTE(review): looks mojibake-encoded ('Åland') — confirm file encoding
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',

    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers so proxy_open is consulted for each scheme
        # even when no proxy is configured ('__noproxy__' sentinel).
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # The per-request header overrides any configured proxy; the
            # previous code deleted the header without applying it.
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)