git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import contextlib
  11 import ctypes
  12 import datetime
  13 import email.utils
  14 import errno
  15 import functools
  16 import gzip
  17 import itertools
  18 import io
  19 import json
  20 import locale
  21 import math
  22 import operator
  23 import os
  24 import pipes
  25 import platform
  26 import re
  27 import ssl
  28 import socket
  29 import struct
  30 import subprocess
  31 import sys
  32 import tempfile
  33 import traceback
  34 import xml.etree.ElementTree
  35 import zlib
  36
  37 from .compat import (
  38     compat_basestring,
  39     compat_chr,
  40     compat_etree_fromstring,
  41     compat_html_entities,
  42     compat_http_client,
  43     compat_kwargs,
  44     compat_parse_qs,
  45     compat_socket_create_connection,
  46     compat_str,
  47     compat_urllib_error,
  48     compat_urllib_parse,
  49     compat_urllib_parse_urlparse,
  50     compat_urllib_request,
  51     compat_urlparse,
  52     shlex_quote,
  53 )
  54
  55
  56 # This is not clearly defined otherwise
  57 compiled_regex_type = type(re.compile(''))
  58
  59 std_headers = {
  60     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
  61     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  62     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  63     'Accept-Encoding': 'gzip, deflate',
  64     'Accept-Language': 'en-us,en;q=0.5',
  65 }
  66
  67
  68 NO_DEFAULT = object()
  69
  70 ENGLISH_MONTH_NAMES = [
  71     'January', 'February', 'March', 'April', 'May', 'June',
  72     'July', 'August', 'September', 'October', 'November', 'December']
  73
  74 KNOWN_EXTENSIONS = (
  75     'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
  76     'flv', 'f4v', 'f4a', 'f4b',
  77     'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
  78     'mkv', 'mka', 'mk3d',
  79     'avi', 'divx',
  80     'mov',
  81     'asf', 'wmv', 'wma',
  82     '3gp', '3g2',
  83     'mp3',
  84     'flac',
  85     'ape',
  86     'wav',
  87     'f4f', 'f4m', 'm3u8', 'smil')
  88
  89
  90 def preferredencoding():
  91     """Get preferred encoding.
  92
  93     Returns the best encoding scheme for the system, based on
  94     locale.getpreferredencoding() and some further tweaks.
  95     """
  96     try:
  97         pref = locale.getpreferredencoding()
  98         'TEST'.encode(pref)
  99     except Exception:
 100         pref = 'UTF-8'
 101
 102     return pref
 103
 104
 105 def write_json_file(obj, fn):
 106     """ Encode obj as JSON and write it to fn, atomically if possible """
 107
 108     fn = encodeFilename(fn)
 109     if sys.version_info < (3, 0) and sys.platform != 'win32':
 110         encoding = get_filesystem_encoding()
 111         # os.path.basename returns a bytes object, but NamedTemporaryFile
 112         # will fail if the filename contains non ascii characters unless we
 113         # use a unicode object
 114         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 115         # the same for os.path.dirname
 116         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 117     else:
 118         path_basename = os.path.basename
 119         path_dirname = os.path.dirname
 120
 121     args = {
 122         'suffix': '.tmp',
 123         'prefix': path_basename(fn) + '.',
 124         'dir': path_dirname(fn),
 125         'delete': False,
 126     }
 127
 128     # In Python 2.x, json.dump expects a bytestream.
 129     # In Python 3.x, it writes to a character stream
 130     if sys.version_info < (3, 0):
 131         args['mode'] = 'wb'
 132     else:
 133         args.update({
 134             'mode': 'w',
 135             'encoding': 'utf-8',
 136         })
 137
 138     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 139
 140     try:
 141         with tf:
 142             json.dump(obj, tf)
 143         if sys.platform == 'win32':
 144             # Need to remove existing file on Windows, else os.rename raises
 145             # WindowsError or FileExistsError.
 146             try:
 147                 os.unlink(fn)
 148             except OSError:
 149                 pass
 150         os.rename(tf.name, fn)
 151     except Exception:
 152         try:
 153             os.remove(tf.name)
 154         except OSError:
 155             pass
 156         raise
 157
 158
 159 if sys.version_info >= (2, 7):
 160     def find_xpath_attr(node, xpath, key, val=None):
 161         """ Find the xpath xpath[@key=val] """
 162         assert re.match(r'^[a-zA-Z_-]+$', key)
 163         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 164         return node.find(expr)
 165 else:
 166     def find_xpath_attr(node, xpath, key, val=None):
 167         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 168         # .//node does not match if a node is a direct child of . !
 169         if isinstance(xpath, compat_str):
 170             xpath = xpath.encode('ascii')
 171
 172         for f in node.findall(xpath):
 173             if key not in f.attrib:
 174                 continue
 175             if val is None or f.attrib.get(key) == val:
 176                 return f
 177         return None
 178
 179 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 180 # the namespace parameter
 181
 182
 183 def xpath_with_ns(path, ns_map):
 184     components = [c.split(':') for c in path.split('/')]
 185     replaced = []
 186     for c in components:
 187         if len(c) == 1:
 188             replaced.append(c[0])
 189         else:
 190             ns, tag = c
 191             replaced.append('{%s}%s' % (ns_map[ns], tag))
 192     return '/'.join(replaced)
 193
 194
 195 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 196     def _find_xpath(xpath):
 197         if sys.version_info < (2, 7):  # Crazy 2.6
 198             xpath = xpath.encode('ascii')
 199         return node.find(xpath)
 200
 201     if isinstance(xpath, (str, compat_str)):
 202         n = _find_xpath(xpath)
 203     else:
 204         for xp in xpath:
 205             n = _find_xpath(xp)
 206             if n is not None:
 207                 break
 208
 209     if n is None:
 210         if default is not NO_DEFAULT:
 211             return default
 212         elif fatal:
 213             name = xpath if name is None else name
 214             raise ExtractorError('Could not find XML element %s' % name)
 215         else:
 216             return None
 217     return n
 218
 219
 220 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 221     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 222     if n is None or n == default:
 223         return n
 224     if n.text is None:
 225         if default is not NO_DEFAULT:
 226             return default
 227         elif fatal:
 228             name = xpath if name is None else name
 229             raise ExtractorError('Could not find XML element\'s text %s' % name)
 230         else:
 231             return None
 232     return n.text
 233
 234
 235 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 236     n = find_xpath_attr(node, xpath, key)
 237     if n is None:
 238         if default is not NO_DEFAULT:
 239             return default
 240         elif fatal:
 241             name = '%s[@%s]' % (xpath, key) if name is None else name
 242             raise ExtractorError('Could not find XML attribute %s' % name)
 243         else:
 244             return None
 245     return n.attrib[key]
 246
 247
 248 def get_element_by_id(id, html):
 249     """Return the content of the tag with the specified ID in the passed HTML document"""
 250     return get_element_by_attribute('id', id, html)
 251
 252
 253 def get_element_by_attribute(attribute, value, html):
 254     """Return the content of the tag with the specified attribute in the passed HTML document"""
 255
 256     m = re.search(r'''(?xs)
 257         <([a-zA-Z0-9:._-]+)
 258          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 259          \s+%s=['"]?%s['"]?
 260          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 261         \s*>
 262         (?P<content>.*?)
 263         </\1>
 264     ''' % (re.escape(attribute), re.escape(value)), html)
 265
 266     if not m:
 267         return None
 268     res = m.group('content')
 269
 270     if res.startswith('"') or res.startswith("'"):
 271         res = res[1:-1]
 272
 273     return unescapeHTML(res)
 274
 275
 276 def clean_html(html):
 277     """Clean an HTML snippet into a readable string"""
 278
 279     if html is None:  # Convenience for sanitizing descriptions etc.
 280         return html
 281
 282     # Newline vs <br />
 283     html = html.replace('\n', ' ')
 284     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 285     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 286     # Strip html tags
 287     html = re.sub('<.*?>', '', html)
 288     # Replace html entities
 289     html = unescapeHTML(html)
 290     return html.strip()
 291
 292
 293 def sanitize_open(filename, open_mode):
 294     """Try to open the given filename, and slightly tweak it if this fails.
 295
 296     Attempts to open the given filename. If this fails, it tries to change
 297     the filename slightly, step by step, until it's either able to open it
 298     or it fails and raises a final exception, like the standard open()
 299     function.
 300
 301     It returns the tuple (stream, definitive_file_name).
 302     """
 303     try:
 304         if filename == '-':
 305             if sys.platform == 'win32':
 306                 import msvcrt
 307                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 308             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 309         stream = open(encodeFilename(filename), open_mode)
 310         return (stream, filename)
 311     except (IOError, OSError) as err:
 312         if err.errno in (errno.EACCES,):
 313             raise
 314
 315         # In case of error, try to remove win32 forbidden chars
 316         alt_filename = sanitize_path(filename)
 317         if alt_filename == filename:
 318             raise
 319         else:
 320             # An exception here should be caught in the caller
 321             stream = open(encodeFilename(alt_filename), open_mode)
 322             return (stream, alt_filename)
 323
 324
 325 def timeconvert(timestr):
 326     """Convert RFC 2822 defined time string into system timestamp"""
 327     timestamp = None
 328     timetuple = email.utils.parsedate_tz(timestr)
 329     if timetuple is not None:
 330         timestamp = email.utils.mktime_tz(timetuple)
 331     return timestamp
 332
 333
 334 def sanitize_filename(s, restricted=False, is_id=False):
 335     """Sanitizes a string so it could be used as part of a filename.
 336     If restricted is set, use a stricter subset of allowed characters.
 337     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 338     """
 339     def replace_insane(char):
 340         if char == '?' or ord(char) < 32 or ord(char) == 127:
 341             return ''
 342         elif char == '"':
 343             return '' if restricted else '\''
 344         elif char == ':':
 345             return '_-' if restricted else ' -'
 346         elif char in '\\/|*<>':
 347             return '_'
 348         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 349             return '_'
 350         if restricted and ord(char) > 127:
 351             return '_'
 352         return char
 353
 354     # Handle timestamps
 355     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 356     result = ''.join(map(replace_insane, s))
 357     if not is_id:
 358         while '__' in result:
 359             result = result.replace('__', '_')
 360         result = result.strip('_')
 361         # Common case of "Foreign band name - English song title"
 362         if restricted and result.startswith('-_'):
 363             result = result[2:]
 364         if result.startswith('-'):
 365             result = '_' + result[len('-'):]
 366         result = result.lstrip('.')
 367         if not result:
 368             result = '_'
 369     return result
 370
 371
 372 def sanitize_path(s):
 373     """Sanitizes and normalizes path on Windows"""
 374     if sys.platform != 'win32':
 375         return s
 376     drive_or_unc, _ = os.path.splitdrive(s)
 377     if sys.version_info < (2, 7) and not drive_or_unc:
 378         drive_or_unc, _ = os.path.splitunc(s)
 379     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 380     if drive_or_unc:
 381         norm_path.pop(0)
 382     sanitized_path = [
 383         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
 384         for path_part in norm_path]
 385     if drive_or_unc:
 386         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 387     return os.path.join(*sanitized_path)
 388
 389
 390 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
 391 # unwanted failures due to missing protocol
 392 def sanitized_Request(url, *args, **kwargs):
 393     return compat_urllib_request.Request(
 394         'http:%s' % url if url.startswith('//') else url, *args, **kwargs)
 395
 396
 397 def orderedSet(iterable):
 398     """ Remove all duplicates from the input iterable """
 399     res = []
 400     for el in iterable:
 401         if el not in res:
 402             res.append(el)
 403     return res
 404
 405
 406 def _htmlentity_transform(entity):
 407     """Transforms an HTML entity to a character."""
 408     # Known non-numeric HTML entity
 409     if entity in compat_html_entities.name2codepoint:
 410         return compat_chr(compat_html_entities.name2codepoint[entity])
 411
 412     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 413     if mobj is not None:
 414         numstr = mobj.group(1)
 415         if numstr.startswith('x'):
 416             base = 16
 417             numstr = '0%s' % numstr
 418         else:
 419             base = 10
 420         # See https://github.com/rg3/youtube-dl/issues/7518
 421         try:
 422             return compat_chr(int(numstr, base))
 423         except ValueError:
 424             pass
 425
 426     # Unknown entity in name, return its literal representation
 427     return '&%s;' % entity
 428
 429
 430 def unescapeHTML(s):
 431     if s is None:
 432         return None
 433     assert type(s) == compat_str
 434
 435     return re.sub(
 436         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 437
 438
 439 def get_subprocess_encoding():
 440     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 441         # For subprocess calls, encode with locale encoding
 442         # Refer to http://stackoverflow.com/a/9951851/35070
 443         encoding = preferredencoding()
 444     else:
 445         encoding = sys.getfilesystemencoding()
 446     if encoding is None:
 447         encoding = 'utf-8'
 448     return encoding
 449
 450
 451 def encodeFilename(s, for_subprocess=False):
 452     """
 453     @param s The name of the file
 454     """
 455
 456     assert type(s) == compat_str
 457
 458     # Python 3 has a Unicode API
 459     if sys.version_info >= (3, 0):
 460         return s
 461
 462     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 463     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 464     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 465     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 466         return s
 467
 468     return s.encode(get_subprocess_encoding(), 'ignore')
 469
 470
 471 def decodeFilename(b, for_subprocess=False):
 472
 473     if sys.version_info >= (3, 0):
 474         return b
 475
 476     if not isinstance(b, bytes):
 477         return b
 478
 479     return b.decode(get_subprocess_encoding(), 'ignore')
 480
 481
 482 def encodeArgument(s):
 483     if not isinstance(s, compat_str):
 484         # Legacy code that uses byte strings
 485         # Uncomment the following line after fixing all post processors
 486         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 487         s = s.decode('ascii')
 488     return encodeFilename(s, True)
 489
 490
 491 def decodeArgument(b):
 492     return decodeFilename(b, True)
 493
 494
 495 def decodeOption(optval):
 496     if optval is None:
 497         return optval
 498     if isinstance(optval, bytes):
 499         optval = optval.decode(preferredencoding())
 500
 501     assert isinstance(optval, compat_str)
 502     return optval
 503
 504
 505 def formatSeconds(secs):
 506     if secs > 3600:
 507         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 508     elif secs > 60:
 509         return '%d:%02d' % (secs // 60, secs % 60)
 510     else:
 511         return '%d' % secs
 512
 513
 514 def make_HTTPS_handler(params, **kwargs):
 515     opts_no_check_certificate = params.get('nocheckcertificate', False)
 516     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 517         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 518         if opts_no_check_certificate:
 519             context.check_hostname = False
 520             context.verify_mode = ssl.CERT_NONE
 521         try:
 522             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 523         except TypeError:
 524             # Python 2.7.8
 525             # (create_default_context present but HTTPSHandler has no context=)
 526             pass
 527
 528     if sys.version_info < (3, 2):
 529         return YoutubeDLHTTPSHandler(params, **kwargs)
 530     else:  # Python < 3.4
 531         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 532         context.verify_mode = (ssl.CERT_NONE
 533                                if opts_no_check_certificate
 534                                else ssl.CERT_REQUIRED)
 535         context.set_default_verify_paths()
 536         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 537
 538
 539 def bug_reports_message():
 540     if ytdl_is_updateable():
 541         update_cmd = 'type  youtube-dl -U  to update'
 542     else:
 543         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 544     msg = '; please report this issue on https://yt-dl.org/bug .'
 545     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 546     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 547     return msg
 548
 549
 550 class ExtractorError(Exception):
 551     """Error during info extraction."""
 552
 553     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 554         """ tb, if given, is the original traceback (so that it can be printed out).
 555         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 556         """
 557
 558         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 559             expected = True
 560         if video_id is not None:
 561             msg = video_id + ': ' + msg
 562         if cause:
 563             msg += ' (caused by %r)' % cause
 564         if not expected:
 565             msg += bug_reports_message()
 566         super(ExtractorError, self).__init__(msg)
 567
 568         self.traceback = tb
 569         self.exc_info = sys.exc_info()  # preserve original exception
 570         self.cause = cause
 571         self.video_id = video_id
 572
 573     def format_traceback(self):
 574         if self.traceback is None:
 575             return None
 576         return ''.join(traceback.format_tb(self.traceback))
 577
 578
 579 class UnsupportedError(ExtractorError):
 580     def __init__(self, url):
 581         super(UnsupportedError, self).__init__(
 582             'Unsupported URL: %s' % url, expected=True)
 583         self.url = url
 584
 585
 586 class RegexNotFoundError(ExtractorError):
 587     """Error when a regex didn't match"""
 588     pass
 589
 590
 591 class DownloadError(Exception):
 592     """Download Error exception.
 593
 594     This exception may be thrown by FileDownloader objects if they are not
 595     configured to continue on errors. They will contain the appropriate
 596     error message.
 597     """
 598
 599     def __init__(self, msg, exc_info=None):
 600         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 601         super(DownloadError, self).__init__(msg)
 602         self.exc_info = exc_info
 603
 604
 605 class SameFileError(Exception):
 606     """Same File exception.
 607
 608     This exception will be thrown by FileDownloader objects if they detect
 609     multiple files would have to be downloaded to the same file on disk.
 610     """
 611     pass
 612
 613
 614 class PostProcessingError(Exception):
 615     """Post Processing exception.
 616
 617     This exception may be raised by PostProcessor's .run() method to
 618     indicate an error in the postprocessing task.
 619     """
 620
 621     def __init__(self, msg):
 622         self.msg = msg
 623
 624
 625 class MaxDownloadsReached(Exception):
 626     """ --max-downloads limit has been reached. """
 627     pass
 628
 629
 630 class UnavailableVideoError(Exception):
 631     """Unavailable Format exception.
 632
 633     This exception will be thrown when a video is requested
 634     in a format that is not available for that video.
 635     """
 636     pass
 637
 638
 639 class ContentTooShortError(Exception):
 640     """Content Too Short exception.
 641
 642     This exception may be raised by FileDownloader objects when a file they
 643     download is too small for what the server announced first, indicating
 644     the connection was probably interrupted.
 645     """
 646
 647     def __init__(self, downloaded, expected):
 648         # Both in bytes
 649         self.downloaded = downloaded
 650         self.expected = expected
 651
 652
 653 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 654     # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
 655     # expected HTTP responses to meet HTTP/1.0 or later (see also
 656     # https://github.com/rg3/youtube-dl/issues/6727)
 657     if sys.version_info < (3, 0):
 658         kwargs[b'strict'] = True
 659     hc = http_class(*args, **kwargs)
 660     source_address = ydl_handler._params.get('source_address')
 661     if source_address is not None:
 662         sa = (source_address, 0)
 663         if hasattr(hc, 'source_address'):  # Python 2.7+
 664             hc.source_address = sa
 665         else:  # Python 2.6
 666             def _hc_connect(self, *args, **kwargs):
 667                 sock = compat_socket_create_connection(
 668                     (self.host, self.port), self.timeout, sa)
 669                 if is_https:
 670                     self.sock = ssl.wrap_socket(
 671                         sock, self.key_file, self.cert_file,
 672                         ssl_version=ssl.PROTOCOL_TLSv1)
 673                 else:
 674                     self.sock = sock
 675             hc.connect = functools.partial(_hc_connect, hc)
 676
 677     return hc
 678
 679
 680 def handle_youtubedl_headers(headers):
 681     filtered_headers = headers
 682
 683     if 'Youtubedl-no-compression' in filtered_headers:
 684         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 685         del filtered_headers['Youtubedl-no-compression']
 686
 687     return filtered_headers
 688
 689
 690 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 691     """Handler for HTTP requests and responses.
 692
 693     This class, when installed with an OpenerDirector, automatically adds
 694     the standard headers to every HTTP request and handles gzipped and
 695     deflated responses from web servers. If compression is to be avoided in
 696     a particular request, the original request in the program code only has
 697     to include the HTTP header "Youtubedl-no-compression", which will be
 698     removed before making the real request.
 699
 700     Part of this code was copied from:
 701
 702     http://techknack.net/python-urllib2-handlers/
 703
 704     Andrew Rowls, the author of that code, agreed to release it to the
 705     public domain.
 706     """
 707
 708     def __init__(self, params, *args, **kwargs):
 709         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
 710         self._params = params
 711
 712     def http_open(self, req):
 713         return self.do_open(functools.partial(
 714             _create_http_connection, self, compat_http_client.HTTPConnection, False),
 715             req)
 716
 717     @staticmethod
 718     def deflate(data):
 719         try:
 720             return zlib.decompress(data, -zlib.MAX_WBITS)
 721         except zlib.error:
 722             return zlib.decompress(data)
 723
 724     @staticmethod
 725     def addinfourl_wrapper(stream, headers, url, code):
 726         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 727             return compat_urllib_request.addinfourl(stream, headers, url, code)
 728         ret = compat_urllib_request.addinfourl(stream, headers, url)
 729         ret.code = code
 730         return ret
 731
 732     def http_request(self, req):
 733         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
 734         # always respected by websites, some tend to give out URLs with non percent-encoded
 735         # non-ASCII characters (see telemb.py, ard.py [#3412])
 736         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
 737         # To work around aforementioned issue we will replace request's original URL with
 738         # percent-encoded one
 739         # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
 740         # the code of this workaround has been moved here from YoutubeDL.urlopen()
 741         url = req.get_full_url()
 742         url_escaped = escape_url(url)
 743
 744         # Substitute URL if any change after escaping
 745         if url != url_escaped:
 746             req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
 747             new_req = req_type(
 748                 url_escaped, data=req.data, headers=req.headers,
 749                 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
 750             new_req.timeout = req.timeout
 751             req = new_req
 752
 753         for h, v in std_headers.items():
 754             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
 755             # The dict keys are capitalized because of this bug by urllib
 756             if h.capitalize() not in req.headers:
 757                 req.add_header(h, v)
 758
 759         req.headers = handle_youtubedl_headers(req.headers)
 760
 761         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 762             # Python 2.6 is brain-dead when it comes to fragments
 763             req._Request__original = req._Request__original.partition('#')[0]
 764             req._Request__r_type = req._Request__r_type.partition('#')[0]
 765
 766         return req
 767
 768     def http_response(self, req, resp):
 769         old_resp = resp
 770         # gzip
 771         if resp.headers.get('Content-encoding', '') == 'gzip':
 772             content = resp.read()
 773             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 774             try:
 775                 uncompressed = io.BytesIO(gz.read())
 776             except IOError as original_ioerror:
 777                 # There may be junk add the end of the file
 778                 # See http://stackoverflow.com/q/4928560/35070 for details
 779                 for i in range(1, 1024):
 780                     try:
 781                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 782                         uncompressed = io.BytesIO(gz.read())
 783                     except IOError:
 784                         continue
 785                     break
 786                 else:
 787                     raise original_ioerror
 788             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 789             resp.msg = old_resp.msg
 790             del resp.headers['Content-encoding']
 791         # deflate
 792         if resp.headers.get('Content-encoding', '') == 'deflate':
 793             gz = io.BytesIO(self.deflate(resp.read()))
 794             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 795             resp.msg = old_resp.msg
 796             del resp.headers['Content-encoding']
 797         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
 798         # https://github.com/rg3/youtube-dl/issues/6457).
 799         if 300 <= resp.code < 400:
 800             location = resp.headers.get('Location')
 801             if location:
 802                 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
 803                 if sys.version_info >= (3, 0):
 804                     location = location.encode('iso-8859-1').decode('utf-8')
 805                 location_escaped = escape_url(location)
 806                 if location != location_escaped:
 807                     del resp.headers['Location']
 808                     resp.headers['Location'] = location_escaped
 809         return resp
 810
 811     https_request = http_request
 812     https_response = http_response
 813
 814
 815 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 816     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 817         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 818         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 819         self._params = params
 820
 821     def https_open(self, req):
 822         kwargs = {}
 823         if hasattr(self, '_context'):  # python > 2.6
 824             kwargs['context'] = self._context
 825         if hasattr(self, '_check_hostname'):  # python 3.x
 826             kwargs['check_hostname'] = self._check_hostname
 827         return self.do_open(functools.partial(
 828             _create_http_connection, self, self._https_conn_class, True),
 829             req, **kwargs)
 830
 831
 832 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
 833     def __init__(self, cookiejar=None):
 834         compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
 835
 836     def http_response(self, request, response):
 837         # Python 2 will choke on next HTTP request in row if there are non-ASCII
 838         # characters in Set-Cookie HTTP header of last response (see
 839         # https://github.com/rg3/youtube-dl/issues/6769).
 840         # In order to at least prevent crashing we will percent encode Set-Cookie
 841         # header before HTTPCookieProcessor starts processing it.
 842         # if sys.version_info < (3, 0) and response.headers:
 843         #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
 844         #         set_cookie = response.headers.get(set_cookie_header)
 845         #         if set_cookie:
 846         #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
 847         #             if set_cookie != set_cookie_escaped:
 848         #                 del response.headers[set_cookie_header]
 849         #                 response.headers[set_cookie_header] = set_cookie_escaped
 850         return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
 851
 852     https_request = compat_urllib_request.HTTPCookieProcessor.http_request
 853     https_response = http_response
 854
 855
 856 def parse_iso8601(date_str, delimiter='T', timezone=None):
 857     """ Return a UNIX timestamp from the given date """
 858
 859     if date_str is None:
 860         return None
 861
 862     date_str = re.sub(r'\.[0-9]+', '', date_str)
 863
 864     if timezone is None:
 865         m = re.search(
 866             r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 867             date_str)
 868         if not m:
 869             timezone = datetime.timedelta()
 870         else:
 871             date_str = date_str[:-len(m.group(0))]
 872             if not m.group('sign'):
 873                 timezone = datetime.timedelta()
 874             else:
 875                 sign = 1 if m.group('sign') == '+' else -1
 876                 timezone = datetime.timedelta(
 877                     hours=sign * int(m.group('hours')),
 878                     minutes=sign * int(m.group('minutes')))
 879     try:
 880         date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 881         dt = datetime.datetime.strptime(date_str, date_format) - timezone
 882         return calendar.timegm(dt.timetuple())
 883     except ValueError:
 884         pass
 885
 886
 887 def unified_strdate(date_str, day_first=True):
 888     """Return a string with the date in the format YYYYMMDD"""
 889
 890     if date_str is None:
 891         return None
 892     upload_date = None
 893     # Replace commas
 894     date_str = date_str.replace(',', ' ')
 895     # %z (UTC offset) is only supported in python>=3.2
 896     if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
 897         date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 898     # Remove AM/PM + timezone
 899     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
 900
 901     format_expressions = [
 902         '%d %B %Y',
 903         '%d %b %Y',
 904         '%B %d %Y',
 905         '%b %d %Y',
 906         '%b %dst %Y %I:%M',
 907         '%b %dnd %Y %I:%M',
 908         '%b %dth %Y %I:%M',
 909         '%Y %m %d',
 910         '%Y-%m-%d',
 911         '%Y/%m/%d',
 912         '%Y/%m/%d %H:%M:%S',
 913         '%Y-%m-%d %H:%M:%S',
 914         '%Y-%m-%d %H:%M:%S.%f',
 915         '%d.%m.%Y %H:%M',
 916         '%d.%m.%Y %H.%M',
 917         '%Y-%m-%dT%H:%M:%SZ',
 918         '%Y-%m-%dT%H:%M:%S.%fZ',
 919         '%Y-%m-%dT%H:%M:%S.%f0Z',
 920         '%Y-%m-%dT%H:%M:%S',
 921         '%Y-%m-%dT%H:%M:%S.%f',
 922         '%Y-%m-%dT%H:%M',
 923     ]
 924     if day_first:
 925         format_expressions.extend([
 926             '%d-%m-%Y',
 927             '%d.%m.%Y',
 928             '%d/%m/%Y',
 929             '%d/%m/%y',
 930             '%d/%m/%Y %H:%M:%S',
 931         ])
 932     else:
 933         format_expressions.extend([
 934             '%m-%d-%Y',
 935             '%m.%d.%Y',
 936             '%m/%d/%Y',
 937             '%m/%d/%y',
 938             '%m/%d/%Y %H:%M:%S',
 939         ])
 940     for expression in format_expressions:
 941         try:
 942             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 943         except ValueError:
 944             pass
 945     if upload_date is None:
 946         timetuple = email.utils.parsedate_tz(date_str)
 947         if timetuple:
 948             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 949     if upload_date is not None:
 950         return compat_str(upload_date)
 951
 952
 953 def determine_ext(url, default_ext='unknown_video'):
 954     if url is None:
 955         return default_ext
 956     guess = url.partition('?')[0].rpartition('.')[2]
 957     if re.match(r'^[A-Za-z0-9]+$', guess):
 958         return guess
 959     # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
 960     elif guess.rstrip('/') in KNOWN_EXTENSIONS:
 961         return guess.rstrip('/')
 962     else:
 963         return default_ext
 964
 965
 966 def subtitles_filename(filename, sub_lang, sub_format):
 967     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 968
 969
 970 def date_from_str(date_str):
 971     """
 972     Return a datetime object from a string in the format YYYYMMDD or
 973     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 974     today = datetime.date.today()
 975     if date_str in ('now', 'today'):
 976         return today
 977     if date_str == 'yesterday':
 978         return today - datetime.timedelta(days=1)
 979     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 980     if match is not None:
 981         sign = match.group('sign')
 982         time = int(match.group('time'))
 983         if sign == '-':
 984             time = -time
 985         unit = match.group('unit')
 986         # A bad approximation?
 987         if unit == 'month':
 988             unit = 'day'
 989             time *= 30
 990         elif unit == 'year':
 991             unit = 'day'
 992             time *= 365
 993         unit += 's'
 994         delta = datetime.timedelta(**{unit: time})
 995         return today + delta
 996     return datetime.datetime.strptime(date_str, '%Y%m%d').date()
 997
 998
 999 def hyphenate_date(date_str):
1000     """
1001     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1002     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1003     if match is not None:
1004         return '-'.join(match.groups())
1005     else:
1006         return date_str
1007
1008
1009 class DateRange(object):
1010     """Represents a time interval between two dates"""
1011
1012     def __init__(self, start=None, end=None):
1013         """start and end must be strings in the format accepted by date"""
1014         if start is not None:
1015             self.start = date_from_str(start)
1016         else:
1017             self.start = datetime.datetime.min.date()
1018         if end is not None:
1019             self.end = date_from_str(end)
1020         else:
1021             self.end = datetime.datetime.max.date()
1022         if self.start > self.end:
1023             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1024
1025     @classmethod
1026     def day(cls, day):
1027         """Returns a range that only contains the given day"""
1028         return cls(day, day)
1029
1030     def __contains__(self, date):
1031         """Check if the date is in the range"""
1032         if not isinstance(date, datetime.date):
1033             date = date_from_str(date)
1034         return self.start <= date <= self.end
1035
1036     def __str__(self):
1037         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1038
1039
1040 def platform_name():
1041     """ Returns the platform name as a compat_str """
1042     res = platform.platform()
1043     if isinstance(res, bytes):
1044         res = res.decode(preferredencoding())
1045
1046     assert isinstance(res, compat_str)
1047     return res
1048
1049
1050 def _windows_write_string(s, out):
1051     """ Returns True if the string was written using special methods,
1052     False if it has yet to be written out."""
1053     # Adapted from http://stackoverflow.com/a/3259271/35070
1054
1055     import ctypes
1056     import ctypes.wintypes
1057
1058     WIN_OUTPUT_IDS = {
1059         1: -11,
1060         2: -12,
1061     }
1062
1063     try:
1064         fileno = out.fileno()
1065     except AttributeError:
1066         # If the output stream doesn't have a fileno, it's virtual
1067         return False
1068     except io.UnsupportedOperation:
1069         # Some strange Windows pseudo files?
1070         return False
1071     if fileno not in WIN_OUTPUT_IDS:
1072         return False
1073
1074     GetStdHandle = ctypes.WINFUNCTYPE(
1075         ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1076         (b'GetStdHandle', ctypes.windll.kernel32))
1077     h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1078
1079     WriteConsoleW = ctypes.WINFUNCTYPE(
1080         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1081         ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1082         ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1083     written = ctypes.wintypes.DWORD(0)
1084
1085     GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1086     FILE_TYPE_CHAR = 0x0002
1087     FILE_TYPE_REMOTE = 0x8000
1088     GetConsoleMode = ctypes.WINFUNCTYPE(
1089         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1090         ctypes.POINTER(ctypes.wintypes.DWORD))(
1091         (b'GetConsoleMode', ctypes.windll.kernel32))
1092     INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1093
1094     def not_a_console(handle):
1095         if handle == INVALID_HANDLE_VALUE or handle is None:
1096             return True
1097         return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1098                 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1099
1100     if not_a_console(h):
1101         return False
1102
1103     def next_nonbmp_pos(s):
1104         try:
1105             return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1106         except StopIteration:
1107             return len(s)
1108
1109     while s:
1110         count = min(next_nonbmp_pos(s), 1024)
1111
1112         ret = WriteConsoleW(
1113             h, s, count if count else 2, ctypes.byref(written), None)
1114         if ret == 0:
1115             raise OSError('Failed to write string')
1116         if not count:  # We just wrote a non-BMP character
1117             assert written.value == 2
1118             s = s[1:]
1119         else:
1120             assert written.value > 0
1121             s = s[written.value:]
1122     return True
1123
1124
1125 def write_string(s, out=None, encoding=None):
1126     if out is None:
1127         out = sys.stderr
1128     assert type(s) == compat_str
1129
1130     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1131         if _windows_write_string(s, out):
1132             return
1133
1134     if ('b' in getattr(out, 'mode', '') or
1135             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
1136         byt = s.encode(encoding or preferredencoding(), 'ignore')
1137         out.write(byt)
1138     elif hasattr(out, 'buffer'):
1139         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1140         byt = s.encode(enc, 'ignore')
1141         out.buffer.write(byt)
1142     else:
1143         out.write(s)
1144     out.flush()
1145
1146
1147 def bytes_to_intlist(bs):
1148     if not bs:
1149         return []
1150     if isinstance(bs[0], int):  # Python 3
1151         return list(bs)
1152     else:
1153         return [ord(c) for c in bs]
1154
1155
1156 def intlist_to_bytes(xs):
1157     if not xs:
1158         return b''
1159     return struct_pack('%dB' % len(xs), *xs)
1160
1161
1162 # Cross-platform file locking
1163 if sys.platform == 'win32':
1164     import ctypes.wintypes
1165     import msvcrt
1166
1167     class OVERLAPPED(ctypes.Structure):
1168         _fields_ = [
1169             ('Internal', ctypes.wintypes.LPVOID),
1170             ('InternalHigh', ctypes.wintypes.LPVOID),
1171             ('Offset', ctypes.wintypes.DWORD),
1172             ('OffsetHigh', ctypes.wintypes.DWORD),
1173             ('hEvent', ctypes.wintypes.HANDLE),
1174         ]
1175
1176     kernel32 = ctypes.windll.kernel32
1177     LockFileEx = kernel32.LockFileEx
1178     LockFileEx.argtypes = [
1179         ctypes.wintypes.HANDLE,     # hFile
1180         ctypes.wintypes.DWORD,      # dwFlags
1181         ctypes.wintypes.DWORD,      # dwReserved
1182         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
1183         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
1184         ctypes.POINTER(OVERLAPPED)  # Overlapped
1185     ]
1186     LockFileEx.restype = ctypes.wintypes.BOOL
1187     UnlockFileEx = kernel32.UnlockFileEx
1188     UnlockFileEx.argtypes = [
1189         ctypes.wintypes.HANDLE,     # hFile
1190         ctypes.wintypes.DWORD,      # dwReserved
1191         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
1192         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
1193         ctypes.POINTER(OVERLAPPED)  # Overlapped
1194     ]
1195     UnlockFileEx.restype = ctypes.wintypes.BOOL
1196     whole_low = 0xffffffff
1197     whole_high = 0x7fffffff
1198
1199     def _lock_file(f, exclusive):
1200         overlapped = OVERLAPPED()
1201         overlapped.Offset = 0
1202         overlapped.OffsetHigh = 0
1203         overlapped.hEvent = 0
1204         f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1205         handle = msvcrt.get_osfhandle(f.fileno())
1206         if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1207                           whole_low, whole_high, f._lock_file_overlapped_p):
1208             raise OSError('Locking file failed: %r' % ctypes.FormatError())
1209
1210     def _unlock_file(f):
1211         assert f._lock_file_overlapped_p
1212         handle = msvcrt.get_osfhandle(f.fileno())
1213         if not UnlockFileEx(handle, 0,
1214                             whole_low, whole_high, f._lock_file_overlapped_p):
1215             raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1216
1217 else:
1218     import fcntl
1219
1220     def _lock_file(f, exclusive):
1221         fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1222
1223     def _unlock_file(f):
1224         fcntl.flock(f, fcntl.LOCK_UN)
1225
1226
1227 class locked_file(object):
1228     def __init__(self, filename, mode, encoding=None):
1229         assert mode in ['r', 'a', 'w']
1230         self.f = io.open(filename, mode, encoding=encoding)
1231         self.mode = mode
1232
1233     def __enter__(self):
1234         exclusive = self.mode != 'r'
1235         try:
1236             _lock_file(self.f, exclusive)
1237         except IOError:
1238             self.f.close()
1239             raise
1240         return self
1241
1242     def __exit__(self, etype, value, traceback):
1243         try:
1244             _unlock_file(self.f)
1245         finally:
1246             self.f.close()
1247
1248     def __iter__(self):
1249         return iter(self.f)
1250
1251     def write(self, *args):
1252         return self.f.write(*args)
1253
1254     def read(self, *args):
1255         return self.f.read(*args)
1256
1257
1258 def get_filesystem_encoding():
1259     encoding = sys.getfilesystemencoding()
1260     return encoding if encoding is not None else 'utf-8'
1261
1262
1263 def shell_quote(args):
1264     quoted_args = []
1265     encoding = get_filesystem_encoding()
1266     for a in args:
1267         if isinstance(a, bytes):
1268             # We may get a filename encoded with 'encodeFilename'
1269             a = a.decode(encoding)
1270         quoted_args.append(pipes.quote(a))
1271     return ' '.join(quoted_args)
1272
1273
1274 def smuggle_url(url, data):
1275     """ Pass additional data in a URL for internal use. """
1276
1277     sdata = compat_urllib_parse.urlencode(
1278         {'__youtubedl_smuggle': json.dumps(data)})
1279     return url + '#' + sdata
1280
1281
1282 def unsmuggle_url(smug_url, default=None):
1283     if '#__youtubedl_smuggle' not in smug_url:
1284         return smug_url, default
1285     url, _, sdata = smug_url.rpartition('#')
1286     jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1287     data = json.loads(jsond)
1288     return url, data
1289
1290
1291 def format_bytes(bytes):
1292     if bytes is None:
1293         return 'N/A'
1294     if type(bytes) is str:
1295         bytes = float(bytes)
1296     if bytes == 0.0:
1297         exponent = 0
1298     else:
1299         exponent = int(math.log(bytes, 1024.0))
1300     suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1301     converted = float(bytes) / float(1024 ** exponent)
1302     return '%.2f%s' % (converted, suffix)
1303
1304
1305 def parse_filesize(s):
1306     if s is None:
1307         return None
1308
1309     # The lower-case forms are of course incorrect and unofficial,
1310     # but we support those too
1311     _UNIT_TABLE = {
1312         'B': 1,
1313         'b': 1,
1314         'KiB': 1024,
1315         'KB': 1000,
1316         'kB': 1024,
1317         'Kb': 1000,
1318         'MiB': 1024 ** 2,
1319         'MB': 1000 ** 2,
1320         'mB': 1024 ** 2,
1321         'Mb': 1000 ** 2,
1322         'GiB': 1024 ** 3,
1323         'GB': 1000 ** 3,
1324         'gB': 1024 ** 3,
1325         'Gb': 1000 ** 3,
1326         'TiB': 1024 ** 4,
1327         'TB': 1000 ** 4,
1328         'tB': 1024 ** 4,
1329         'Tb': 1000 ** 4,
1330         'PiB': 1024 ** 5,
1331         'PB': 1000 ** 5,
1332         'pB': 1024 ** 5,
1333         'Pb': 1000 ** 5,
1334         'EiB': 1024 ** 6,
1335         'EB': 1000 ** 6,
1336         'eB': 1024 ** 6,
1337         'Eb': 1000 ** 6,
1338         'ZiB': 1024 ** 7,
1339         'ZB': 1000 ** 7,
1340         'zB': 1024 ** 7,
1341         'Zb': 1000 ** 7,
1342         'YiB': 1024 ** 8,
1343         'YB': 1000 ** 8,
1344         'yB': 1024 ** 8,
1345         'Yb': 1000 ** 8,
1346     }
1347
1348     units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1349     m = re.match(
1350         r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1351     if not m:
1352         return None
1353
1354     num_str = m.group('num').replace(',', '.')
1355     mult = _UNIT_TABLE[m.group('unit')]
1356     return int(float(num_str) * mult)
1357
1358
1359 def month_by_name(name):
1360     """ Return the number of a month by (locale-independently) English name """
1361
1362     try:
1363         return ENGLISH_MONTH_NAMES.index(name) + 1
1364     except ValueError:
1365         return None
1366
1367
1368 def month_by_abbreviation(abbrev):
1369     """ Return the number of a month by (locale-independently) English
1370         abbreviations """
1371
1372     try:
1373         return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1374     except ValueError:
1375         return None
1376
1377
1378 def fix_xml_ampersands(xml_str):
1379     """Replace all the '&' by '&amp;' in XML"""
1380     return re.sub(
1381         r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1382         '&amp;',
1383         xml_str)
1384
1385
1386 def setproctitle(title):
1387     assert isinstance(title, compat_str)
1388     try:
1389         libc = ctypes.cdll.LoadLibrary('libc.so.6')
1390     except OSError:
1391         return
1392     title_bytes = title.encode('utf-8')
1393     buf = ctypes.create_string_buffer(len(title_bytes))
1394     buf.value = title_bytes
1395     try:
1396         libc.prctl(15, buf, 0, 0, 0)
1397     except AttributeError:
1398         return  # Strange libc, just skip this
1399
1400
1401 def remove_start(s, start):
1402     if s.startswith(start):
1403         return s[len(start):]
1404     return s
1405
1406
1407 def remove_end(s, end):
1408     if s.endswith(end):
1409         return s[:-len(end)]
1410     return s
1411
1412
1413 def remove_quotes(s):
1414     if s is None or len(s) < 2:
1415         return s
1416     for quote in ('"', "'", ):
1417         if s[0] == quote and s[-1] == quote:
1418             return s[1:-1]
1419     return s
1420
1421
1422 def url_basename(url):
1423     path = compat_urlparse.urlparse(url).path
1424     return path.strip('/').split('/')[-1]
1425
1426
1427 class HEADRequest(compat_urllib_request.Request):
1428     def get_method(self):
1429         return 'HEAD'
1430
1431
1432 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1433     if get_attr:
1434         if v is not None:
1435             v = getattr(v, get_attr, None)
1436     if v == '':
1437         v = None
1438     if v is None:
1439         return default
1440     try:
1441         return int(v) * invscale // scale
1442     except ValueError:
1443         return default
1444
1445
1446 def str_or_none(v, default=None):
1447     return default if v is None else compat_str(v)
1448
1449
1450 def str_to_int(int_str):
1451     """ A more relaxed version of int_or_none """
1452     if int_str is None:
1453         return None
1454     int_str = re.sub(r'[,\.\+]', '', int_str)
1455     return int(int_str)
1456
1457
1458 def float_or_none(v, scale=1, invscale=1, default=None):
1459     if v is None:
1460         return default
1461     try:
1462         return float(v) * invscale / scale
1463     except ValueError:
1464         return default
1465
1466
1467 def parse_duration(s):
1468     if not isinstance(s, compat_basestring):
1469         return None
1470
1471     s = s.strip()
1472
1473     m = re.match(
1474         r'''(?ix)(?:P?T)?
1475         (?:
1476             (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
1477             (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1478
1479             \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
1480             (?:
1481                 (?:
1482                     (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1483                     (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1484                 )?
1485                 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1486             )?
1487             (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1488         )$''', s)
1489     if not m:
1490         return None
1491     res = 0
1492     if m.group('only_mins'):
1493         return float_or_none(m.group('only_mins'), invscale=60)
1494     if m.group('only_hours'):
1495         return float_or_none(m.group('only_hours'), invscale=60 * 60)
1496     if m.group('secs'):
1497         res += int(m.group('secs'))
1498     if m.group('mins_reversed'):
1499         res += int(m.group('mins_reversed')) * 60
1500     if m.group('mins'):
1501         res += int(m.group('mins')) * 60
1502     if m.group('hours'):
1503         res += int(m.group('hours')) * 60 * 60
1504     if m.group('hours_reversed'):
1505         res += int(m.group('hours_reversed')) * 60 * 60
1506     if m.group('days'):
1507         res += int(m.group('days')) * 24 * 60 * 60
1508     if m.group('ms'):
1509         res += float(m.group('ms'))
1510     return res
1511
1512
1513 def prepend_extension(filename, ext, expected_real_ext=None):
1514     name, real_ext = os.path.splitext(filename)
1515     return (
1516         '{0}.{1}{2}'.format(name, ext, real_ext)
1517         if not expected_real_ext or real_ext[1:] == expected_real_ext
1518         else '{0}.{1}'.format(filename, ext))
1519
1520
1521 def replace_extension(filename, ext, expected_real_ext=None):
1522     name, real_ext = os.path.splitext(filename)
1523     return '{0}.{1}'.format(
1524         name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1525         ext)
1526
1527
1528 def check_executable(exe, args=[]):
1529     """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1530     args can be a list of arguments for a short output (like -version) """
1531     try:
1532         subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1533     except OSError:
1534         return False
1535     return exe
1536
1537
1538 def get_exe_version(exe, args=['--version'],
1539                     version_re=None, unrecognized='present'):
1540     """ Returns the version of the specified executable,
1541     or False if the executable is not present """
1542     try:
1543         out, _ = subprocess.Popen(
1544             [encodeArgument(exe)] + args,
1545             stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1546     except OSError:
1547         return False
1548     if isinstance(out, bytes):  # Python 2.x
1549         out = out.decode('ascii', 'ignore')
1550     return detect_exe_version(out, version_re, unrecognized)
1551
1552
1553 def detect_exe_version(output, version_re=None, unrecognized='present'):
1554     assert isinstance(output, compat_str)
1555     if version_re is None:
1556         version_re = r'version\s+([-0-9._a-zA-Z]+)'
1557     m = re.search(version_re, output)
1558     if m:
1559         return m.group(1)
1560     else:
1561         return unrecognized
1562
1563
1564 class PagedList(object):
1565     def __len__(self):
1566         # This is only useful for tests
1567         return len(self.getslice())
1568
1569
1570 class OnDemandPagedList(PagedList):
1571     def __init__(self, pagefunc, pagesize, use_cache=False):
1572         self._pagefunc = pagefunc
1573         self._pagesize = pagesize
1574         self._use_cache = use_cache
1575         if use_cache:
1576             self._cache = {}
1577
1578     def getslice(self, start=0, end=None):
1579         res = []
1580         for pagenum in itertools.count(start // self._pagesize):
1581             firstid = pagenum * self._pagesize
1582             nextfirstid = pagenum * self._pagesize + self._pagesize
1583             if start >= nextfirstid:
1584                 continue
1585
1586             page_results = None
1587             if self._use_cache:
1588                 page_results = self._cache.get(pagenum)
1589             if page_results is None:
1590                 page_results = list(self._pagefunc(pagenum))
1591             if self._use_cache:
1592                 self._cache[pagenum] = page_results
1593
1594             startv = (
1595                 start % self._pagesize
1596                 if firstid <= start < nextfirstid
1597                 else 0)
1598
1599             endv = (
1600                 ((end - 1) % self._pagesize) + 1
1601                 if (end is not None and firstid <= end <= nextfirstid)
1602                 else None)
1603
1604             if startv != 0 or endv is not None:
1605                 page_results = page_results[startv:endv]
1606             res.extend(page_results)
1607
1608             # A little optimization - if current page is not "full", ie. does
1609             # not contain page_size videos then we can assume that this page
1610             # is the last one - there are no more ids on further pages -
1611             # i.e. no need to query again.
1612             if len(page_results) + startv < self._pagesize:
1613                 break
1614
1615             # If we got the whole page, but the next page is not interesting,
1616             # break out early as well
1617             if end == nextfirstid:
1618                 break
1619         return res
1620
1621
1622 class InAdvancePagedList(PagedList):
1623     def __init__(self, pagefunc, pagecount, pagesize):
1624         self._pagefunc = pagefunc
1625         self._pagecount = pagecount
1626         self._pagesize = pagesize
1627
1628     def getslice(self, start=0, end=None):
1629         res = []
1630         start_page = start // self._pagesize
1631         end_page = (
1632             self._pagecount if end is None else (end // self._pagesize + 1))
1633         skip_elems = start - start_page * self._pagesize
1634         only_more = None if end is None else end - start
1635         for pagenum in range(start_page, end_page):
1636             page = list(self._pagefunc(pagenum))
1637             if skip_elems:
1638                 page = page[skip_elems:]
1639                 skip_elems = None
1640             if only_more is not None:
1641                 if len(page) < only_more:
1642                     only_more -= len(page)
1643                 else:
1644                     page = page[:only_more]
1645                     res.extend(page)
1646                     break
1647             res.extend(page)
1648         return res
1649
1650
1651 def uppercase_escape(s):
1652     unicode_escape = codecs.getdecoder('unicode_escape')
1653     return re.sub(
1654         r'\\U[0-9a-fA-F]{8}',
1655         lambda m: unicode_escape(m.group(0))[0],
1656         s)
1657
1658
1659 def lowercase_escape(s):
1660     unicode_escape = codecs.getdecoder('unicode_escape')
1661     return re.sub(
1662         r'\\u[0-9a-fA-F]{4}',
1663         lambda m: unicode_escape(m.group(0))[0],
1664         s)
1665
1666
1667 def escape_rfc3986(s):
1668     """Escape non-ASCII characters as suggested by RFC 3986"""
1669     if sys.version_info < (3, 0) and isinstance(s, compat_str):
1670         s = s.encode('utf-8')
1671     return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1672
1673
1674 def escape_url(url):
1675     """Escape URL as suggested by RFC 3986"""
1676     url_parsed = compat_urllib_parse_urlparse(url)
1677     return url_parsed._replace(
1678         path=escape_rfc3986(url_parsed.path),
1679         params=escape_rfc3986(url_parsed.params),
1680         query=escape_rfc3986(url_parsed.query),
1681         fragment=escape_rfc3986(url_parsed.fragment)
1682     ).geturl()
1683
1684 try:
1685     struct.pack('!I', 0)
1686 except TypeError:
1687     # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1688     def struct_pack(spec, *args):
1689         if isinstance(spec, compat_str):
1690             spec = spec.encode('ascii')
1691         return struct.pack(spec, *args)
1692
1693     def struct_unpack(spec, *args):
1694         if isinstance(spec, compat_str):
1695             spec = spec.encode('ascii')
1696         return struct.unpack(spec, *args)
1697 else:
1698     struct_pack = struct.pack
1699     struct_unpack = struct.unpack
1700
1701
1702 def read_batch_urls(batch_fd):
1703     def fixup(url):
1704         if not isinstance(url, compat_str):
1705             url = url.decode('utf-8', 'replace')
1706         BOM_UTF8 = '\xef\xbb\xbf'
1707         if url.startswith(BOM_UTF8):
1708             url = url[len(BOM_UTF8):]
1709         url = url.strip()
1710         if url.startswith(('#', ';', ']')):
1711             return False
1712         return url
1713
1714     with contextlib.closing(batch_fd) as fd:
1715         return [url for url in map(fixup, fd) if url]
1716
1717
1718 def urlencode_postdata(*args, **kargs):
1719     return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
1720
1721
1722 def encode_dict(d, encoding='utf-8'):
1723     def encode(v):
1724         return v.encode(encoding) if isinstance(v, compat_basestring) else v
1725     return dict((encode(k), encode(v)) for k, v in d.items())
1726
1727
1728 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
1729     if isinstance(key_or_keys, (list, tuple)):
1730         for key in key_or_keys:
1731             if key not in d or d[key] is None or skip_false_values and not d[key]:
1732                 continue
1733             return d[key]
1734         return default
1735     return d.get(key_or_keys, default)
1736
1737
1738 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
1739     return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
1740
1741
1742 US_RATINGS = {
1743     'G': 0,
1744     'PG': 10,
1745     'PG-13': 13,
1746     'R': 16,
1747     'NC': 18,
1748 }
1749
1750
1751 def parse_age_limit(s):
1752     if s is None:
1753         return None
1754     m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1755     return int(m.group('age')) if m else US_RATINGS.get(s)
1756
1757
1758 def strip_jsonp(code):
1759     return re.sub(
1760         r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1761
1762
1763 def js_to_json(code):
1764     def fix_kv(m):
1765         v = m.group(0)
1766         if v in ('true', 'false', 'null'):
1767             return v
1768         if v.startswith('"'):
1769             v = re.sub(r"\\'", "'", v[1:-1])
1770         elif v.startswith("'"):
1771             v = v[1:-1]
1772             v = re.sub(r"\\\\|\\'|\"", lambda m: {
1773                 '\\\\': '\\\\',
1774                 "\\'": "'",
1775                 '"': '\\"',
1776             }[m.group(0)], v)
1777         return '"%s"' % v
1778
1779     res = re.sub(r'''(?x)
1780         "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
1781         '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
1782         [a-zA-Z_][.a-zA-Z_0-9]*
1783         ''', fix_kv, code)
1784     res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
1785     return res
1786
1787
1788 def qualities(quality_ids):
1789     """ Get a numeric quality value out of a list of possible values """
1790     def q(qid):
1791         try:
1792             return quality_ids.index(qid)
1793         except ValueError:
1794             return -1
1795     return q
1796
1797
1798 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1799
1800
1801 def limit_length(s, length):
1802     """ Add ellipses to overly long strings """
1803     if s is None:
1804         return None
1805     ELLIPSES = '...'
1806     if len(s) > length:
1807         return s[:length - len(ELLIPSES)] + ELLIPSES
1808     return s
1809
1810
1811 def version_tuple(v):
1812     return tuple(int(e) for e in re.split(r'[-.]', v))
1813
1814
1815 def is_outdated_version(version, limit, assume_new=True):
1816     if not version:
1817         return not assume_new
1818     try:
1819         return version_tuple(version) < version_tuple(limit)
1820     except ValueError:
1821         return not assume_new
1822
1823
1824 def ytdl_is_updateable():
1825     """ Returns if youtube-dl can be updated with -U """
1826     from zipimport import zipimporter
1827
1828     return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
1829
1830
1831 def args_to_str(args):
1832     # Get a short string representation for a subprocess command
1833     return ' '.join(shlex_quote(a) for a in args)
1834
1835
1836 def error_to_compat_str(err):
1837     err_str = str(err)
1838     # On python 2 error byte string must be decoded with proper
1839     # encoding rather than ascii
1840     if sys.version_info[0] < 3:
1841         err_str = err_str.decode(preferredencoding())
1842     return err_str
1843
1844
1845 def mimetype2ext(mt):
1846     ext = {
1847         'audio/mp4': 'm4a',
1848     }.get(mt)
1849     if ext is not None:
1850         return ext
1851
1852     _, _, res = mt.rpartition('/')
1853
1854     return {
1855         '3gpp': '3gp',
1856         'smptett+xml': 'tt',
1857         'srt': 'srt',
1858         'ttaf+xml': 'dfxp',
1859         'ttml+xml': 'ttml',
1860         'vtt': 'vtt',
1861         'x-flv': 'flv',
1862         'x-mp4-fragmented': 'mp4',
1863         'x-ms-wmv': 'wmv',
1864     }.get(res, res)
1865
1866
1867 def urlhandle_detect_ext(url_handle):
1868     try:
1869         url_handle.headers
1870         getheader = lambda h: url_handle.headers[h]
1871     except AttributeError:  # Python < 3
1872         getheader = url_handle.info().getheader
1873
1874     cd = getheader('Content-Disposition')
1875     if cd:
1876         m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1877         if m:
1878             e = determine_ext(m.group('filename'), default_ext=None)
1879             if e:
1880                 return e
1881
1882     return mimetype2ext(getheader('Content-Type'))
1883
1884
1885 def encode_data_uri(data, mime_type):
1886     return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
1887
1888
1889 def age_restricted(content_limit, age_limit):
1890     """ Returns True iff the content should be blocked """
1891
1892     if age_limit is None:  # No limit set
1893         return False
1894     if content_limit is None:
1895         return False  # Content available for everyone
1896     return age_limit < content_limit
1897
1898
1899 def is_html(first_bytes):
1900     """ Detect whether a file contains HTML by examining its first bytes. """
1901
1902     BOMS = [
1903         (b'\xef\xbb\xbf', 'utf-8'),
1904         (b'\x00\x00\xfe\xff', 'utf-32-be'),
1905         (b'\xff\xfe\x00\x00', 'utf-32-le'),
1906         (b'\xff\xfe', 'utf-16-le'),
1907         (b'\xfe\xff', 'utf-16-be'),
1908     ]
1909     for bom, enc in BOMS:
1910         if first_bytes.startswith(bom):
1911             s = first_bytes[len(bom):].decode(enc, 'replace')
1912             break
1913     else:
1914         s = first_bytes.decode('utf-8', 'replace')
1915
1916     return re.match(r'^\s*<', s)
1917
1918
1919 def determine_protocol(info_dict):
1920     protocol = info_dict.get('protocol')
1921     if protocol is not None:
1922         return protocol
1923
1924     url = info_dict['url']
1925     if url.startswith('rtmp'):
1926         return 'rtmp'
1927     elif url.startswith('mms'):
1928         return 'mms'
1929     elif url.startswith('rtsp'):
1930         return 'rtsp'
1931
1932     ext = determine_ext(url)
1933     if ext == 'm3u8':
1934         return 'm3u8'
1935     elif ext == 'f4m':
1936         return 'f4m'
1937
1938     return compat_urllib_parse_urlparse(url).scheme
1939
1940
1941 def render_table(header_row, data):
1942     """ Render a list of rows, each as a list of values """
1943     table = [header_row] + data
1944     max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
1945     format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
1946     return '\n'.join(format_str % tuple(row) for row in table)
1947
1948
1949 def _match_one(filter_part, dct):
1950     COMPARISON_OPERATORS = {
1951         '<': operator.lt,
1952         '<=': operator.le,
1953         '>': operator.gt,
1954         '>=': operator.ge,
1955         '=': operator.eq,
1956         '!=': operator.ne,
1957     }
1958     operator_rex = re.compile(r'''(?x)\s*
1959         (?P<key>[a-z_]+)
1960         \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1961         (?:
1962             (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1963             (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1964         )
1965         \s*$
1966         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1967     m = operator_rex.search(filter_part)
1968     if m:
1969         op = COMPARISON_OPERATORS[m.group('op')]
1970         if m.group('strval') is not None:
1971             if m.group('op') not in ('=', '!='):
1972                 raise ValueError(
1973                     'Operator %s does not support string values!' % m.group('op'))
1974             comparison_value = m.group('strval')
1975         else:
1976             try:
1977                 comparison_value = int(m.group('intval'))
1978             except ValueError:
1979                 comparison_value = parse_filesize(m.group('intval'))
1980                 if comparison_value is None:
1981                     comparison_value = parse_filesize(m.group('intval') + 'B')
1982                 if comparison_value is None:
1983                     raise ValueError(
1984                         'Invalid integer value %r in filter part %r' % (
1985                             m.group('intval'), filter_part))
1986         actual_value = dct.get(m.group('key'))
1987         if actual_value is None:
1988             return m.group('none_inclusive')
1989         return op(actual_value, comparison_value)
1990
1991     UNARY_OPERATORS = {
1992         '': lambda v: v is not None,
1993         '!': lambda v: v is None,
1994     }
1995     operator_rex = re.compile(r'''(?x)\s*
1996         (?P<op>%s)\s*(?P<key>[a-z_]+)
1997         \s*$
1998         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1999     m = operator_rex.search(filter_part)
2000     if m:
2001         op = UNARY_OPERATORS[m.group('op')]
2002         actual_value = dct.get(m.group('key'))
2003         return op(actual_value)
2004
2005     raise ValueError('Invalid filter part %r' % filter_part)
2006
2007
2008 def match_str(filter_str, dct):
2009     """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2010
2011     return all(
2012         _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2013
2014
2015 def match_filter_func(filter_str):
2016     def _match_func(info_dict):
2017         if match_str(filter_str, info_dict):
2018             return None
2019         else:
2020             video_title = info_dict.get('title', info_dict.get('id', 'video'))
2021             return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2022     return _match_func
2023
2024
2025 def parse_dfxp_time_expr(time_expr):
2026     if not time_expr:
2027         return
2028
2029     mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2030     if mobj:
2031         return float(mobj.group('time_offset'))
2032
2033     mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
2034     if mobj:
2035         return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
2036
2037
2038 def srt_subtitles_timecode(seconds):
2039     return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
2040
2041
2042 def dfxp2srt(dfxp_data):
2043     _x = functools.partial(xpath_with_ns, ns_map={
2044         'ttml': 'http://www.w3.org/ns/ttml',
2045         'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
2046     })
2047
2048     class TTMLPElementParser(object):
2049         out = ''
2050
2051         def start(self, tag, attrib):
2052             if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2053                 self.out += '\n'
2054
2055         def end(self, tag):
2056             pass
2057
2058         def data(self, data):
2059             self.out += data
2060
2061         def close(self):
2062             return self.out.strip()
2063
2064     def parse_node(node):
2065         target = TTMLPElementParser()
2066         parser = xml.etree.ElementTree.XMLParser(target=target)
2067         parser.feed(xml.etree.ElementTree.tostring(node))
2068         return parser.close()
2069
2070     dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
2071     out = []
2072     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
2073
2074     if not paras:
2075         raise ValueError('Invalid dfxp/TTML subtitle')
2076
2077     for para, index in zip(paras, itertools.count(1)):
2078         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
2079         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
2080         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2081         if begin_time is None:
2082             continue
2083         if not end_time:
2084             if not dur:
2085                 continue
2086             end_time = begin_time + dur
2087         out.append('%d\n%s --> %s\n%s\n\n' % (
2088             index,
2089             srt_subtitles_timecode(begin_time),
2090             srt_subtitles_timecode(end_time),
2091             parse_node(para)))
2092
2093     return ''.join(out)
2094
2095
2096 def cli_option(params, command_option, param):
2097     param = params.get(param)
2098     return [command_option, param] if param is not None else []
2099
2100
2101 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2102     param = params.get(param)
2103     assert isinstance(param, bool)
2104     if separator:
2105         return [command_option + separator + (true_value if param else false_value)]
2106     return [command_option, true_value if param else false_value]
2107
2108
2109 def cli_valueless_option(params, command_option, param, expected_value=True):
2110     param = params.get(param)
2111     return [command_option] if param == expected_value else []
2112
2113
2114 def cli_configuration_args(params, param, default=[]):
2115     ex_args = params.get(param)
2116     if ex_args is None:
2117         return default
2118     assert isinstance(ex_args, list)
2119     return ex_args
2120
2121
2122 class ISO639Utils(object):
2123     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2124     _lang_map = {
2125         'aa': 'aar',
2126         'ab': 'abk',
2127         'ae': 'ave',
2128         'af': 'afr',
2129         'ak': 'aka',
2130         'am': 'amh',
2131         'an': 'arg',
2132         'ar': 'ara',
2133         'as': 'asm',
2134         'av': 'ava',
2135         'ay': 'aym',
2136         'az': 'aze',
2137         'ba': 'bak',
2138         'be': 'bel',
2139         'bg': 'bul',
2140         'bh': 'bih',
2141         'bi': 'bis',
2142         'bm': 'bam',
2143         'bn': 'ben',
2144         'bo': 'bod',
2145         'br': 'bre',
2146         'bs': 'bos',
2147         'ca': 'cat',
2148         'ce': 'che',
2149         'ch': 'cha',
2150         'co': 'cos',
2151         'cr': 'cre',
2152         'cs': 'ces',
2153         'cu': 'chu',
2154         'cv': 'chv',
2155         'cy': 'cym',
2156         'da': 'dan',
2157         'de': 'deu',
2158         'dv': 'div',
2159         'dz': 'dzo',
2160         'ee': 'ewe',
2161         'el': 'ell',
2162         'en': 'eng',
2163         'eo': 'epo',
2164         'es': 'spa',
2165         'et': 'est',
2166         'eu': 'eus',
2167         'fa': 'fas',
2168         'ff': 'ful',
2169         'fi': 'fin',
2170         'fj': 'fij',
2171         'fo': 'fao',
2172         'fr': 'fra',
2173         'fy': 'fry',
2174         'ga': 'gle',
2175         'gd': 'gla',
2176         'gl': 'glg',
2177         'gn': 'grn',
2178         'gu': 'guj',
2179         'gv': 'glv',
2180         'ha': 'hau',
2181         'he': 'heb',
2182         'hi': 'hin',
2183         'ho': 'hmo',
2184         'hr': 'hrv',
2185         'ht': 'hat',
2186         'hu': 'hun',
2187         'hy': 'hye',
2188         'hz': 'her',
2189         'ia': 'ina',
2190         'id': 'ind',
2191         'ie': 'ile',
2192         'ig': 'ibo',
2193         'ii': 'iii',
2194         'ik': 'ipk',
2195         'io': 'ido',
2196         'is': 'isl',
2197         'it': 'ita',
2198         'iu': 'iku',
2199         'ja': 'jpn',
2200         'jv': 'jav',
2201         'ka': 'kat',
2202         'kg': 'kon',
2203         'ki': 'kik',
2204         'kj': 'kua',
2205         'kk': 'kaz',
2206         'kl': 'kal',
2207         'km': 'khm',
2208         'kn': 'kan',
2209         'ko': 'kor',
2210         'kr': 'kau',
2211         'ks': 'kas',
2212         'ku': 'kur',
2213         'kv': 'kom',
2214         'kw': 'cor',
2215         'ky': 'kir',
2216         'la': 'lat',
2217         'lb': 'ltz',
2218         'lg': 'lug',
2219         'li': 'lim',
2220         'ln': 'lin',
2221         'lo': 'lao',
2222         'lt': 'lit',
2223         'lu': 'lub',
2224         'lv': 'lav',
2225         'mg': 'mlg',
2226         'mh': 'mah',
2227         'mi': 'mri',
2228         'mk': 'mkd',
2229         'ml': 'mal',
2230         'mn': 'mon',
2231         'mr': 'mar',
2232         'ms': 'msa',
2233         'mt': 'mlt',
2234         'my': 'mya',
2235         'na': 'nau',
2236         'nb': 'nob',
2237         'nd': 'nde',
2238         'ne': 'nep',
2239         'ng': 'ndo',
2240         'nl': 'nld',
2241         'nn': 'nno',
2242         'no': 'nor',
2243         'nr': 'nbl',
2244         'nv': 'nav',
2245         'ny': 'nya',
2246         'oc': 'oci',
2247         'oj': 'oji',
2248         'om': 'orm',
2249         'or': 'ori',
2250         'os': 'oss',
2251         'pa': 'pan',
2252         'pi': 'pli',
2253         'pl': 'pol',
2254         'ps': 'pus',
2255         'pt': 'por',
2256         'qu': 'que',
2257         'rm': 'roh',
2258         'rn': 'run',
2259         'ro': 'ron',
2260         'ru': 'rus',
2261         'rw': 'kin',
2262         'sa': 'san',
2263         'sc': 'srd',
2264         'sd': 'snd',
2265         'se': 'sme',
2266         'sg': 'sag',
2267         'si': 'sin',
2268         'sk': 'slk',
2269         'sl': 'slv',
2270         'sm': 'smo',
2271         'sn': 'sna',
2272         'so': 'som',
2273         'sq': 'sqi',
2274         'sr': 'srp',
2275         'ss': 'ssw',
2276         'st': 'sot',
2277         'su': 'sun',
2278         'sv': 'swe',
2279         'sw': 'swa',
2280         'ta': 'tam',
2281         'te': 'tel',
2282         'tg': 'tgk',
2283         'th': 'tha',
2284         'ti': 'tir',
2285         'tk': 'tuk',
2286         'tl': 'tgl',
2287         'tn': 'tsn',
2288         'to': 'ton',
2289         'tr': 'tur',
2290         'ts': 'tso',
2291         'tt': 'tat',
2292         'tw': 'twi',
2293         'ty': 'tah',
2294         'ug': 'uig',
2295         'uk': 'ukr',
2296         'ur': 'urd',
2297         'uz': 'uzb',
2298         've': 'ven',
2299         'vi': 'vie',
2300         'vo': 'vol',
2301         'wa': 'wln',
2302         'wo': 'wol',
2303         'xh': 'xho',
2304         'yi': 'yid',
2305         'yo': 'yor',
2306         'za': 'zha',
2307         'zh': 'zho',
2308         'zu': 'zul',
2309     }
2310
2311     @classmethod
2312     def short2long(cls, code):
2313         """Convert language code from ISO 639-1 to ISO 639-2/T"""
2314         return cls._lang_map.get(code[:2])
2315
2316     @classmethod
2317     def long2short(cls, code):
2318         """Convert language code from ISO 639-2/T to ISO 639-1"""
2319         for short_name, long_name in cls._lang_map.items():
2320             if long_name == code:
2321                 return short_name
2322
2323
2324 class ISO3166Utils(object):
2325     # From http://data.okfn.org/data/core/country-list
2326     _country_map = {
2327         'AF': 'Afghanistan',
2328         'AX': 'Åland Islands',
2329         'AL': 'Albania',
2330         'DZ': 'Algeria',
2331         'AS': 'American Samoa',
2332         'AD': 'Andorra',
2333         'AO': 'Angola',
2334         'AI': 'Anguilla',
2335         'AQ': 'Antarctica',
2336         'AG': 'Antigua and Barbuda',
2337         'AR': 'Argentina',
2338         'AM': 'Armenia',
2339         'AW': 'Aruba',
2340         'AU': 'Australia',
2341         'AT': 'Austria',
2342         'AZ': 'Azerbaijan',
2343         'BS': 'Bahamas',
2344         'BH': 'Bahrain',
2345         'BD': 'Bangladesh',
2346         'BB': 'Barbados',
2347         'BY': 'Belarus',
2348         'BE': 'Belgium',
2349         'BZ': 'Belize',
2350         'BJ': 'Benin',
2351         'BM': 'Bermuda',
2352         'BT': 'Bhutan',
2353         'BO': 'Bolivia, Plurinational State of',
2354         'BQ': 'Bonaire, Sint Eustatius and Saba',
2355         'BA': 'Bosnia and Herzegovina',
2356         'BW': 'Botswana',
2357         'BV': 'Bouvet Island',
2358         'BR': 'Brazil',
2359         'IO': 'British Indian Ocean Territory',
2360         'BN': 'Brunei Darussalam',
2361         'BG': 'Bulgaria',
2362         'BF': 'Burkina Faso',
2363         'BI': 'Burundi',
2364         'KH': 'Cambodia',
2365         'CM': 'Cameroon',
2366         'CA': 'Canada',
2367         'CV': 'Cape Verde',
2368         'KY': 'Cayman Islands',
2369         'CF': 'Central African Republic',
2370         'TD': 'Chad',
2371         'CL': 'Chile',
2372         'CN': 'China',
2373         'CX': 'Christmas Island',
2374         'CC': 'Cocos (Keeling) Islands',
2375         'CO': 'Colombia',
2376         'KM': 'Comoros',
2377         'CG': 'Congo',
2378         'CD': 'Congo, the Democratic Republic of the',
2379         'CK': 'Cook Islands',
2380         'CR': 'Costa Rica',
2381         'CI': 'Côte d\'Ivoire',
2382         'HR': 'Croatia',
2383         'CU': 'Cuba',
2384         'CW': 'Curaçao',
2385         'CY': 'Cyprus',
2386         'CZ': 'Czech Republic',
2387         'DK': 'Denmark',
2388         'DJ': 'Djibouti',
2389         'DM': 'Dominica',
2390         'DO': 'Dominican Republic',
2391         'EC': 'Ecuador',
2392         'EG': 'Egypt',
2393         'SV': 'El Salvador',
2394         'GQ': 'Equatorial Guinea',
2395         'ER': 'Eritrea',
2396         'EE': 'Estonia',
2397         'ET': 'Ethiopia',
2398         'FK': 'Falkland Islands (Malvinas)',
2399         'FO': 'Faroe Islands',
2400         'FJ': 'Fiji',
2401         'FI': 'Finland',
2402         'FR': 'France',
2403         'GF': 'French Guiana',
2404         'PF': 'French Polynesia',
2405         'TF': 'French Southern Territories',
2406         'GA': 'Gabon',
2407         'GM': 'Gambia',
2408         'GE': 'Georgia',
2409         'DE': 'Germany',
2410         'GH': 'Ghana',
2411         'GI': 'Gibraltar',
2412         'GR': 'Greece',
2413         'GL': 'Greenland',
2414         'GD': 'Grenada',
2415         'GP': 'Guadeloupe',
2416         'GU': 'Guam',
2417         'GT': 'Guatemala',
2418         'GG': 'Guernsey',
2419         'GN': 'Guinea',
2420         'GW': 'Guinea-Bissau',
2421         'GY': 'Guyana',
2422         'HT': 'Haiti',
2423         'HM': 'Heard Island and McDonald Islands',
2424         'VA': 'Holy See (Vatican City State)',
2425         'HN': 'Honduras',
2426         'HK': 'Hong Kong',
2427         'HU': 'Hungary',
2428         'IS': 'Iceland',
2429         'IN': 'India',
2430         'ID': 'Indonesia',
2431         'IR': 'Iran, Islamic Republic of',
2432         'IQ': 'Iraq',
2433         'IE': 'Ireland',
2434         'IM': 'Isle of Man',
2435         'IL': 'Israel',
2436         'IT': 'Italy',
2437         'JM': 'Jamaica',
2438         'JP': 'Japan',
2439         'JE': 'Jersey',
2440         'JO': 'Jordan',
2441         'KZ': 'Kazakhstan',
2442         'KE': 'Kenya',
2443         'KI': 'Kiribati',
2444         'KP': 'Korea, Democratic People\'s Republic of',
2445         'KR': 'Korea, Republic of',
2446         'KW': 'Kuwait',
2447         'KG': 'Kyrgyzstan',
2448         'LA': 'Lao People\'s Democratic Republic',
2449         'LV': 'Latvia',
2450         'LB': 'Lebanon',
2451         'LS': 'Lesotho',
2452         'LR': 'Liberia',
2453         'LY': 'Libya',
2454         'LI': 'Liechtenstein',
2455         'LT': 'Lithuania',
2456         'LU': 'Luxembourg',
2457         'MO': 'Macao',
2458         'MK': 'Macedonia, the Former Yugoslav Republic of',
2459         'MG': 'Madagascar',
2460         'MW': 'Malawi',
2461         'MY': 'Malaysia',
2462         'MV': 'Maldives',
2463         'ML': 'Mali',
2464         'MT': 'Malta',
2465         'MH': 'Marshall Islands',
2466         'MQ': 'Martinique',
2467         'MR': 'Mauritania',
2468         'MU': 'Mauritius',
2469         'YT': 'Mayotte',
2470         'MX': 'Mexico',
2471         'FM': 'Micronesia, Federated States of',
2472         'MD': 'Moldova, Republic of',
2473         'MC': 'Monaco',
2474         'MN': 'Mongolia',
2475         'ME': 'Montenegro',
2476         'MS': 'Montserrat',
2477         'MA': 'Morocco',
2478         'MZ': 'Mozambique',
2479         'MM': 'Myanmar',
2480         'NA': 'Namibia',
2481         'NR': 'Nauru',
2482         'NP': 'Nepal',
2483         'NL': 'Netherlands',
2484         'NC': 'New Caledonia',
2485         'NZ': 'New Zealand',
2486         'NI': 'Nicaragua',
2487         'NE': 'Niger',
2488         'NG': 'Nigeria',
2489         'NU': 'Niue',
2490         'NF': 'Norfolk Island',
2491         'MP': 'Northern Mariana Islands',
2492         'NO': 'Norway',
2493         'OM': 'Oman',
2494         'PK': 'Pakistan',
2495         'PW': 'Palau',
2496         'PS': 'Palestine, State of',
2497         'PA': 'Panama',
2498         'PG': 'Papua New Guinea',
2499         'PY': 'Paraguay',
2500         'PE': 'Peru',
2501         'PH': 'Philippines',
2502         'PN': 'Pitcairn',
2503         'PL': 'Poland',
2504         'PT': 'Portugal',
2505         'PR': 'Puerto Rico',
2506         'QA': 'Qatar',
2507         'RE': 'Réunion',
2508         'RO': 'Romania',
2509         'RU': 'Russian Federation',
2510         'RW': 'Rwanda',
2511         'BL': 'Saint Barthélemy',
2512         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2513         'KN': 'Saint Kitts and Nevis',
2514         'LC': 'Saint Lucia',
2515         'MF': 'Saint Martin (French part)',
2516         'PM': 'Saint Pierre and Miquelon',
2517         'VC': 'Saint Vincent and the Grenadines',
2518         'WS': 'Samoa',
2519         'SM': 'San Marino',
2520         'ST': 'Sao Tome and Principe',
2521         'SA': 'Saudi Arabia',
2522         'SN': 'Senegal',
2523         'RS': 'Serbia',
2524         'SC': 'Seychelles',
2525         'SL': 'Sierra Leone',
2526         'SG': 'Singapore',
2527         'SX': 'Sint Maarten (Dutch part)',
2528         'SK': 'Slovakia',
2529         'SI': 'Slovenia',
2530         'SB': 'Solomon Islands',
2531         'SO': 'Somalia',
2532         'ZA': 'South Africa',
2533         'GS': 'South Georgia and the South Sandwich Islands',
2534         'SS': 'South Sudan',
2535         'ES': 'Spain',
2536         'LK': 'Sri Lanka',
2537         'SD': 'Sudan',
2538         'SR': 'Suriname',
2539         'SJ': 'Svalbard and Jan Mayen',
2540         'SZ': 'Swaziland',
2541         'SE': 'Sweden',
2542         'CH': 'Switzerland',
2543         'SY': 'Syrian Arab Republic',
2544         'TW': 'Taiwan, Province of China',
2545         'TJ': 'Tajikistan',
2546         'TZ': 'Tanzania, United Republic of',
2547         'TH': 'Thailand',
2548         'TL': 'Timor-Leste',
2549         'TG': 'Togo',
2550         'TK': 'Tokelau',
2551         'TO': 'Tonga',
2552         'TT': 'Trinidad and Tobago',
2553         'TN': 'Tunisia',
2554         'TR': 'Turkey',
2555         'TM': 'Turkmenistan',
2556         'TC': 'Turks and Caicos Islands',
2557         'TV': 'Tuvalu',
2558         'UG': 'Uganda',
2559         'UA': 'Ukraine',
2560         'AE': 'United Arab Emirates',
2561         'GB': 'United Kingdom',
2562         'US': 'United States',
2563         'UM': 'United States Minor Outlying Islands',
2564         'UY': 'Uruguay',
2565         'UZ': 'Uzbekistan',
2566         'VU': 'Vanuatu',
2567         'VE': 'Venezuela, Bolivarian Republic of',
2568         'VN': 'Viet Nam',
2569         'VG': 'Virgin Islands, British',
2570         'VI': 'Virgin Islands, U.S.',
2571         'WF': 'Wallis and Futuna',
2572         'EH': 'Western Sahara',
2573         'YE': 'Yemen',
2574         'ZM': 'Zambia',
2575         'ZW': 'Zimbabwe',
2576     }
2577
2578     @classmethod
2579     def short2full(cls, code):
2580         """Convert an ISO 3166-2 country code to the corresponding full name"""
2581         return cls._country_map.get(code.upper())
2582
2583
2584 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2585     def __init__(self, proxies=None):
2586         # Set default handlers
2587         for type in ('http', 'https'):
2588             setattr(self, '%s_open' % type,
2589                     lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2590                         meth(r, proxy, type))
2591         return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2592
2593     def proxy_open(self, req, proxy, type):
2594         req_proxy = req.headers.get('Ytdl-request-proxy')
2595         if req_proxy is not None:
2596             proxy = req_proxy
2597             del req.headers['Ytdl-request-proxy']
2598
2599         if proxy == '__noproxy__':
2600             return None  # No Proxy
2601         return compat_urllib_request.ProxyHandler.proxy_open(
2602             self, req, proxy, type)
2603
2604
2605 def ohdave_rsa_encrypt(data, exponent, modulus):
2606     '''
2607     Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
2608
2609     Input:
2610         data: data to encrypt, bytes-like object
2611         exponent, modulus: parameter e and N of RSA algorithm, both integer
2612     Output: hex string of encrypted data
2613
2614     Limitation: supports one block encryption only
2615     '''
2616
2617     payload = int(binascii.hexlify(data[::-1]), 16)
2618     encrypted = pow(payload, exponent, modulus)
2619     return '%x' % encrypted
2620
2621
2622 def encode_base_n(num, n, table=None):
2623     FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
2624     if not table:
2625         table = FULL_TABLE[:n]
2626
2627     if n > len(table):
2628         raise ValueError('base %d exceeds table length %d' % (n, len(table)))
2629
2630     if num == 0:
2631         return table[0]
2632
2633     ret = ''
2634     while num:
2635         ret = table[num % n] + ret
2636         num = num // n
2637     return ret
2638
2639
2640 def decode_packed_codes(code):
2641     mobj = re.search(
2642         r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
2643         code)
2644     obfucasted_code, base, count, symbols = mobj.groups()
2645     base = int(base)
2646     count = int(count)
2647     symbols = symbols.split('|')
2648     symbol_table = {}
2649
2650     while count:
2651         count -= 1
2652         base_n_count = encode_base_n(count, base)
2653         symbol_table[base_n_count] = symbols[count] or base_n_count
2654
2655     return re.sub(
2656         r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
2657         obfucasted_code)