_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import functools
  14 import gzip
  15 import itertools
  16 import io
  17 import json
  18 import locale
  19 import math
  20 import operator
  21 import os
  22 import pipes
  23 import platform
  24 import re
  25 import ssl
  26 import socket
  27 import struct
  28 import subprocess
  29 import sys
  30 import tempfile
  31 import traceback
  32 import xml.etree.ElementTree
  33 import zlib
  34
  35 from .compat import (
  36     compat_basestring,
  37     compat_chr,
  38     compat_html_entities,
  39     compat_http_client,
  40     compat_parse_qs,
  41     compat_socket_create_connection,
  42     compat_str,
  43     compat_urllib_error,
  44     compat_urllib_parse,
  45     compat_urllib_parse_urlparse,
  46     compat_urllib_request,
  47     compat_urlparse,
  48     shlex_quote,
  49 )
  50
  51
  52 # This is not clearly defined otherwise
  53 compiled_regex_type = type(re.compile(''))
  54
  55 std_headers = {
  56     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
  57     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  58     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  59     'Accept-Encoding': 'gzip, deflate',
  60     'Accept-Language': 'en-us,en;q=0.5',
  61 }
  62
  63
  64 ENGLISH_MONTH_NAMES = [
  65     'January', 'February', 'March', 'April', 'May', 'June',
  66     'July', 'August', 'September', 'October', 'November', 'December']
  67
  68
  69 def preferredencoding():
  70     """Get preferred encoding.
  71
  72     Returns the best encoding scheme for the system, based on
  73     locale.getpreferredencoding() and some further tweaks.
  74     """
  75     try:
  76         pref = locale.getpreferredencoding()
  77         'TEST'.encode(pref)
  78     except Exception:
  79         pref = 'UTF-8'
  80
  81     return pref
  82
  83
  84 def write_json_file(obj, fn):
  85     """ Encode obj as JSON and write it to fn, atomically if possible """
  86
  87     fn = encodeFilename(fn)
  88     if sys.version_info < (3, 0) and sys.platform != 'win32':
  89         encoding = get_filesystem_encoding()
  90         # os.path.basename returns a bytes object, but NamedTemporaryFile
  91         # will fail if the filename contains non ascii characters unless we
  92         # use a unicode object
  93         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  94         # the same for os.path.dirname
  95         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  96     else:
  97         path_basename = os.path.basename
  98         path_dirname = os.path.dirname
  99
 100     args = {
 101         'suffix': '.tmp',
 102         'prefix': path_basename(fn) + '.',
 103         'dir': path_dirname(fn),
 104         'delete': False,
 105     }
 106
 107     # In Python 2.x, json.dump expects a bytestream.
 108     # In Python 3.x, it writes to a character stream
 109     if sys.version_info < (3, 0):
 110         args['mode'] = 'wb'
 111     else:
 112         args.update({
 113             'mode': 'w',
 114             'encoding': 'utf-8',
 115         })
 116
 117     tf = tempfile.NamedTemporaryFile(**args)
 118
 119     try:
 120         with tf:
 121             json.dump(obj, tf)
 122         if sys.platform == 'win32':
 123             # Need to remove existing file on Windows, else os.rename raises
 124             # WindowsError or FileExistsError.
 125             try:
 126                 os.unlink(fn)
 127             except OSError:
 128                 pass
 129         os.rename(tf.name, fn)
 130     except Exception:
 131         try:
 132             os.remove(tf.name)
 133         except OSError:
 134             pass
 135         raise
 136
 137
 138 if sys.version_info >= (2, 7):
 139     def find_xpath_attr(node, xpath, key, val):
 140         """ Find the xpath xpath[@key=val] """
 141         assert re.match(r'^[a-zA-Z-]+$', key)
 142         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 143         expr = xpath + "[@%s='%s']" % (key, val)
 144         return node.find(expr)
 145 else:
 146     def find_xpath_attr(node, xpath, key, val):
 147         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 148         # .//node does not match if a node is a direct child of . !
 149         if isinstance(xpath, compat_str):
 150             xpath = xpath.encode('ascii')
 151
 152         for f in node.findall(xpath):
 153             if f.attrib.get(key) == val:
 154                 return f
 155         return None
 156
 157 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 158 # the namespace parameter
 159
 160
 161 def xpath_with_ns(path, ns_map):
 162     components = [c.split(':') for c in path.split('/')]
 163     replaced = []
 164     for c in components:
 165         if len(c) == 1:
 166             replaced.append(c[0])
 167         else:
 168             ns, tag = c
 169             replaced.append('{%s}%s' % (ns_map[ns], tag))
 170     return '/'.join(replaced)
 171
 172
 173 def xpath_text(node, xpath, name=None, fatal=False):
 174     if sys.version_info < (2, 7):  # Crazy 2.6
 175         xpath = xpath.encode('ascii')
 176
 177     n = node.find(xpath)
 178     if n is None or n.text is None:
 179         if fatal:
 180             name = xpath if name is None else name
 181             raise ExtractorError('Could not find XML element %s' % name)
 182         else:
 183             return None
 184     return n.text
 185
 186
 187 def get_element_by_id(id, html):
 188     """Return the content of the tag with the specified ID in the passed HTML document"""
 189     return get_element_by_attribute("id", id, html)
 190
 191
 192 def get_element_by_attribute(attribute, value, html):
 193     """Return the content of the tag with the specified attribute in the passed HTML document"""
 194
 195     m = re.search(r'''(?xs)
 196         <([a-zA-Z0-9:._-]+)
 197          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 198          \s+%s=['"]?%s['"]?
 199          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 200         \s*>
 201         (?P<content>.*?)
 202         </\1>
 203     ''' % (re.escape(attribute), re.escape(value)), html)
 204
 205     if not m:
 206         return None
 207     res = m.group('content')
 208
 209     if res.startswith('"') or res.startswith("'"):
 210         res = res[1:-1]
 211
 212     return unescapeHTML(res)
 213
 214
 215 def clean_html(html):
 216     """Clean an HTML snippet into a readable string"""
 217
 218     if html is None:  # Convenience for sanitizing descriptions etc.
 219         return html
 220
 221     # Newline vs <br />
 222     html = html.replace('\n', ' ')
 223     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 224     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 225     # Strip html tags
 226     html = re.sub('<.*?>', '', html)
 227     # Replace html entities
 228     html = unescapeHTML(html)
 229     return html.strip()
 230
 231
 232 def sanitize_open(filename, open_mode):
 233     """Try to open the given filename, and slightly tweak it if this fails.
 234
 235     Attempts to open the given filename. If this fails, it tries to change
 236     the filename slightly, step by step, until it's either able to open it
 237     or it fails and raises a final exception, like the standard open()
 238     function.
 239
 240     It returns the tuple (stream, definitive_file_name).
 241     """
 242     try:
 243         if filename == '-':
 244             if sys.platform == 'win32':
 245                 import msvcrt
 246                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 247             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 248         stream = open(encodeFilename(filename), open_mode)
 249         return (stream, filename)
 250     except (IOError, OSError) as err:
 251         if err.errno in (errno.EACCES,):
 252             raise
 253
 254         # In case of error, try to remove win32 forbidden chars
 255         alt_filename = sanitize_path(filename)
 256         if alt_filename == filename:
 257             raise
 258         else:
 259             # An exception here should be caught in the caller
 260             stream = open(encodeFilename(alt_filename), open_mode)
 261             return (stream, alt_filename)
 262
 263
 264 def timeconvert(timestr):
 265     """Convert RFC 2822 defined time string into system timestamp"""
 266     timestamp = None
 267     timetuple = email.utils.parsedate_tz(timestr)
 268     if timetuple is not None:
 269         timestamp = email.utils.mktime_tz(timetuple)
 270     return timestamp
 271
 272
 273 def sanitize_filename(s, restricted=False, is_id=False):
 274     """Sanitizes a string so it could be used as part of a filename.
 275     If restricted is set, use a stricter subset of allowed characters.
 276     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 277     """
 278     def replace_insane(char):
 279         if char == '?' or ord(char) < 32 or ord(char) == 127:
 280             return ''
 281         elif char == '"':
 282             return '' if restricted else '\''
 283         elif char == ':':
 284             return '_-' if restricted else ' -'
 285         elif char in '\\/|*<>':
 286             return '_'
 287         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 288             return '_'
 289         if restricted and ord(char) > 127:
 290             return '_'
 291         return char
 292
 293     # Handle timestamps
 294     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 295     result = ''.join(map(replace_insane, s))
 296     if not is_id:
 297         while '__' in result:
 298             result = result.replace('__', '_')
 299         result = result.strip('_')
 300         # Common case of "Foreign band name - English song title"
 301         if restricted and result.startswith('-_'):
 302             result = result[2:]
 303         if result.startswith('-'):
 304             result = '_' + result[len('-'):]
 305         result = result.lstrip('.')
 306         if not result:
 307             result = '_'
 308     return result
 309
 310
 311 def sanitize_path(s):
 312     """Sanitizes and normalizes path on Windows"""
 313     if sys.platform != 'win32':
 314         return s
 315     drive_or_unc, _ = os.path.splitdrive(s)
 316     if sys.version_info < (2, 7) and not drive_or_unc:
 317         drive_or_unc, _ = os.path.splitunc(s)
 318     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 319     if drive_or_unc:
 320         norm_path.pop(0)
 321     sanitized_path = [
 322         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
 323         for path_part in norm_path]
 324     if drive_or_unc:
 325         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 326     return os.path.join(*sanitized_path)
 327
 328
 329 def sanitize_url_path_consecutive_slashes(url):
 330     """Collapses consecutive slashes in URLs' path"""
 331     parsed_url = list(compat_urlparse.urlparse(url))
 332     parsed_url[2] = re.sub(r'/{2,}', '/', parsed_url[2])
 333     return compat_urlparse.urlunparse(parsed_url)
 334
 335
 336 def orderedSet(iterable):
 337     """ Remove all duplicates from the input iterable """
 338     res = []
 339     for el in iterable:
 340         if el not in res:
 341             res.append(el)
 342     return res
 343
 344
 345 def _htmlentity_transform(entity):
 346     """Transforms an HTML entity to a character."""
 347     # Known non-numeric HTML entity
 348     if entity in compat_html_entities.name2codepoint:
 349         return compat_chr(compat_html_entities.name2codepoint[entity])
 350
 351     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 352     if mobj is not None:
 353         numstr = mobj.group(1)
 354         if numstr.startswith('x'):
 355             base = 16
 356             numstr = '0%s' % numstr
 357         else:
 358             base = 10
 359         return compat_chr(int(numstr, base))
 360
 361     # Unknown entity in name, return its literal representation
 362     return ('&%s;' % entity)
 363
 364
 365 def unescapeHTML(s):
 366     if s is None:
 367         return None
 368     assert type(s) == compat_str
 369
 370     return re.sub(
 371         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 372
 373
 374 def encodeFilename(s, for_subprocess=False):
 375     """
 376     @param s The name of the file
 377     """
 378
 379     assert type(s) == compat_str
 380
 381     # Python 3 has a Unicode API
 382     if sys.version_info >= (3, 0):
 383         return s
 384
 385     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 386         # Pass '' directly to use Unicode APIs on Windows 2000 and up
 387         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 388         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 389         if not for_subprocess:
 390             return s
 391         else:
 392             # For subprocess calls, encode with locale encoding
 393             # Refer to http://stackoverflow.com/a/9951851/35070
 394             encoding = preferredencoding()
 395     else:
 396         encoding = sys.getfilesystemencoding()
 397     if encoding is None:
 398         encoding = 'utf-8'
 399     return s.encode(encoding, 'ignore')
 400
 401
 402 def encodeArgument(s):
 403     if not isinstance(s, compat_str):
 404         # Legacy code that uses byte strings
 405         # Uncomment the following line after fixing all post processors
 406         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 407         s = s.decode('ascii')
 408     return encodeFilename(s, True)
 409
 410
 411 def decodeOption(optval):
 412     if optval is None:
 413         return optval
 414     if isinstance(optval, bytes):
 415         optval = optval.decode(preferredencoding())
 416
 417     assert isinstance(optval, compat_str)
 418     return optval
 419
 420
 421 def formatSeconds(secs):
 422     if secs > 3600:
 423         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 424     elif secs > 60:
 425         return '%d:%02d' % (secs // 60, secs % 60)
 426     else:
 427         return '%d' % secs
 428
 429
 430 def make_HTTPS_handler(params, **kwargs):
 431     opts_no_check_certificate = params.get('nocheckcertificate', False)
 432     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 433         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 434         if opts_no_check_certificate:
 435             context.check_hostname = False
 436             context.verify_mode = ssl.CERT_NONE
 437         try:
 438             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 439         except TypeError:
 440             # Python 2.7.8
 441             # (create_default_context present but HTTPSHandler has no context=)
 442             pass
 443
 444     if sys.version_info < (3, 2):
 445         return YoutubeDLHTTPSHandler(params, **kwargs)
 446     else:  # Python < 3.4
 447         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 448         context.verify_mode = (ssl.CERT_NONE
 449                                if opts_no_check_certificate
 450                                else ssl.CERT_REQUIRED)
 451         context.set_default_verify_paths()
 452         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 453
 454
 455 def bug_reports_message():
 456     if ytdl_is_updateable():
 457         update_cmd = 'type  youtube-dl -U  to update'
 458     else:
 459         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 460     msg = '; please report this issue on https://yt-dl.org/bug .'
 461     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 462     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 463     return msg
 464
 465
 466 class ExtractorError(Exception):
 467     """Error during info extraction."""
 468
 469     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 470         """ tb, if given, is the original traceback (so that it can be printed out).
 471         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 472         """
 473
 474         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 475             expected = True
 476         if video_id is not None:
 477             msg = video_id + ': ' + msg
 478         if cause:
 479             msg += ' (caused by %r)' % cause
 480         if not expected:
 481             msg += bug_reports_message()
 482         super(ExtractorError, self).__init__(msg)
 483
 484         self.traceback = tb
 485         self.exc_info = sys.exc_info()  # preserve original exception
 486         self.cause = cause
 487         self.video_id = video_id
 488
 489     def format_traceback(self):
 490         if self.traceback is None:
 491             return None
 492         return ''.join(traceback.format_tb(self.traceback))
 493
 494
 495 class UnsupportedError(ExtractorError):
 496     def __init__(self, url):
 497         super(UnsupportedError, self).__init__(
 498             'Unsupported URL: %s' % url, expected=True)
 499         self.url = url
 500
 501
 502 class RegexNotFoundError(ExtractorError):
 503     """Error when a regex didn't match"""
 504     pass
 505
 506
 507 class DownloadError(Exception):
 508     """Download Error exception.
 509
 510     This exception may be thrown by FileDownloader objects if they are not
 511     configured to continue on errors. They will contain the appropriate
 512     error message.
 513     """
 514
 515     def __init__(self, msg, exc_info=None):
 516         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 517         super(DownloadError, self).__init__(msg)
 518         self.exc_info = exc_info
 519
 520
 521 class SameFileError(Exception):
 522     """Same File exception.
 523
 524     This exception will be thrown by FileDownloader objects if they detect
 525     multiple files would have to be downloaded to the same file on disk.
 526     """
 527     pass
 528
 529
 530 class PostProcessingError(Exception):
 531     """Post Processing exception.
 532
 533     This exception may be raised by PostProcessor's .run() method to
 534     indicate an error in the postprocessing task.
 535     """
 536
 537     def __init__(self, msg):
 538         self.msg = msg
 539
 540
 541 class MaxDownloadsReached(Exception):
 542     """ --max-downloads limit has been reached. """
 543     pass
 544
 545
 546 class UnavailableVideoError(Exception):
 547     """Unavailable Format exception.
 548
 549     This exception will be thrown when a video is requested
 550     in a format that is not available for that video.
 551     """
 552     pass
 553
 554
 555 class ContentTooShortError(Exception):
 556     """Content Too Short exception.
 557
 558     This exception may be raised by FileDownloader objects when a file they
 559     download is too small for what the server announced first, indicating
 560     the connection was probably interrupted.
 561     """
 562     # Both in bytes
 563     downloaded = None
 564     expected = None
 565
 566     def __init__(self, downloaded, expected):
 567         self.downloaded = downloaded
 568         self.expected = expected
 569
 570
 571 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 572     hc = http_class(*args, **kwargs)
 573     source_address = ydl_handler._params.get('source_address')
 574     if source_address is not None:
 575         sa = (source_address, 0)
 576         if hasattr(hc, 'source_address'):  # Python 2.7+
 577             hc.source_address = sa
 578         else:  # Python 2.6
 579             def _hc_connect(self, *args, **kwargs):
 580                 sock = compat_socket_create_connection(
 581                     (self.host, self.port), self.timeout, sa)
 582                 if is_https:
 583                     self.sock = ssl.wrap_socket(
 584                         sock, self.key_file, self.cert_file,
 585                         ssl_version=ssl.PROTOCOL_TLSv1)
 586                 else:
 587                     self.sock = sock
 588             hc.connect = functools.partial(_hc_connect, hc)
 589
 590     return hc
 591
 592
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """params is stored as self._params; _create_http_connection reads
        its 'source_address' entry. All other arguments are passed on to
        HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req through a connection built by _create_http_connection."""
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate-encoded data.

        Decompression is first attempted with negative window bits and, if
        zlib rejects that, retried with zlib's default settings.
        """
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response object that carries the status code."""
        # Where addinfourl has a getcode attribute, its constructor is given
        # the code directly
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        # Otherwise set the attribute by hand
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to req and apply the no-compression marker."""
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        # The marker header asks for an uncompressed response: drop
        # Accept-Encoding, then the marker itself
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate encoded body decompressed.

        The whole body is read into memory and wrapped in a new response
        object that reuses the headers, URL, code and msg of the original.
        Responses with any other encoding are returned unchanged.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off until
                # decompression works
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No attempt succeeded: report the first error
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS requests and responses get the same treatment
    https_request = http_request
    https_response = http_response
 684
 685
 686 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 687     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 688         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 689         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 690         self._params = params
 691
 692     def https_open(self, req):
 693         kwargs = {}
 694         if hasattr(self, '_context'):  # python > 2.6
 695             kwargs['context'] = self._context
 696         if hasattr(self, '_check_hostname'):  # python 3.x
 697             kwargs['check_hostname'] = self._check_hostname
 698         return self.do_open(functools.partial(
 699             _create_http_connection, self, self._https_conn_class, True),
 700             req, **kwargs)
 701
 702
 703 def parse_iso8601(date_str, delimiter='T', timezone=None):
 704     """ Return a UNIX timestamp from the given date """
 705
 706     if date_str is None:
 707         return None
 708
 709     if timezone is None:
 710         m = re.search(
 711             r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 712             date_str)
 713         if not m:
 714             timezone = datetime.timedelta()
 715         else:
 716             date_str = date_str[:-len(m.group(0))]
 717             if not m.group('sign'):
 718                 timezone = datetime.timedelta()
 719             else:
 720                 sign = 1 if m.group('sign') == '+' else -1
 721                 timezone = datetime.timedelta(
 722                     hours=sign * int(m.group('hours')),
 723                     minutes=sign * int(m.group('minutes')))
 724     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 725     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 726     return calendar.timegm(dt.timetuple())
 727
 728
 729 def unified_strdate(date_str, day_first=True):
 730     """Return a string with the date in the format YYYYMMDD"""
 731
 732     if date_str is None:
 733         return None
 734     upload_date = None
 735     # Replace commas
 736     date_str = date_str.replace(',', ' ')
 737     # %z (UTC offset) is only supported in python>=3.2
 738     if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
 739         date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 740     # Remove AM/PM + timezone
 741     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
 742
 743     format_expressions = [
 744         '%d %B %Y',
 745         '%d %b %Y',
 746         '%B %d %Y',
 747         '%b %d %Y',
 748         '%b %dst %Y %I:%M%p',
 749         '%b %dnd %Y %I:%M%p',
 750         '%b %dth %Y %I:%M%p',
 751         '%Y %m %d',
 752         '%Y-%m-%d',
 753         '%Y/%m/%d',
 754         '%Y/%m/%d %H:%M:%S',
 755         '%Y-%m-%d %H:%M:%S',
 756         '%Y-%m-%d %H:%M:%S.%f',
 757         '%d.%m.%Y %H:%M',
 758         '%d.%m.%Y %H.%M',
 759         '%Y-%m-%dT%H:%M:%SZ',
 760         '%Y-%m-%dT%H:%M:%S.%fZ',
 761         '%Y-%m-%dT%H:%M:%S.%f0Z',
 762         '%Y-%m-%dT%H:%M:%S',
 763         '%Y-%m-%dT%H:%M:%S.%f',
 764         '%Y-%m-%dT%H:%M',
 765     ]
 766     if day_first:
 767         format_expressions.extend([
 768             '%d-%m-%Y',
 769             '%d.%m.%Y',
 770             '%d/%m/%Y',
 771             '%d/%m/%y',
 772             '%d/%m/%Y %H:%M:%S',
 773         ])
 774     else:
 775         format_expressions.extend([
 776             '%m-%d-%Y',
 777             '%m.%d.%Y',
 778             '%m/%d/%Y',
 779             '%m/%d/%y',
 780             '%m/%d/%Y %H:%M:%S',
 781         ])
 782     for expression in format_expressions:
 783         try:
 784             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 785         except ValueError:
 786             pass
 787     if upload_date is None:
 788         timetuple = email.utils.parsedate_tz(date_str)
 789         if timetuple:
 790             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 791     return upload_date
 792
 793
 794 def determine_ext(url, default_ext='unknown_video'):
 795     if url is None:
 796         return default_ext
 797     guess = url.partition('?')[0].rpartition('.')[2]
 798     if re.match(r'^[A-Za-z0-9]+$', guess):
 799         return guess
 800     else:
 801         return default_ext
 802
 803
 804 def subtitles_filename(filename, sub_lang, sub_format):
 805     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 806
 807
 808 def date_from_str(date_str):
 809     """
 810     Return a datetime object from a string in the format YYYYMMDD or
 811     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 812     today = datetime.date.today()
 813     if date_str in ('now', 'today'):
 814         return today
 815     if date_str == 'yesterday':
 816         return today - datetime.timedelta(days=1)
 817     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 818     if match is not None:
 819         sign = match.group('sign')
 820         time = int(match.group('time'))
 821         if sign == '-':
 822             time = -time
 823         unit = match.group('unit')
 824         # A bad aproximation?
 825         if unit == 'month':
 826             unit = 'day'
 827             time *= 30
 828         elif unit == 'year':
 829             unit = 'day'
 830             time *= 365
 831         unit += 's'
 832         delta = datetime.timedelta(**{unit: time})
 833         return today + delta
 834     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 835
 836
 837 def hyphenate_date(date_str):
 838     """
 839     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 840     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 841     if match is not None:
 842         return '-'.join(match.groups())
 843     else:
 844         return date_str
 845
 846
 847 class DateRange(object):
 848     """Represents a time interval between two dates"""
 849
 850     def __init__(self, start=None, end=None):
 851         """start and end must be strings in the format accepted by date"""
 852         if start is not None:
 853             self.start = date_from_str(start)
 854         else:
 855             self.start = datetime.datetime.min.date()
 856         if end is not None:
 857             self.end = date_from_str(end)
 858         else:
 859             self.end = datetime.datetime.max.date()
 860         if self.start > self.end:
 861             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 862
 863     @classmethod
 864     def day(cls, day):
 865         """Returns a range that only contains the given day"""
 866         return cls(day, day)
 867
 868     def __contains__(self, date):
 869         """Check if the date is in the range"""
 870         if not isinstance(date, datetime.date):
 871             date = date_from_str(date)
 872         return self.start <= date <= self.end
 873
 874     def __str__(self):
 875         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 876
 877
 878 def platform_name():
 879     """ Returns the platform name as a compat_str """
 880     res = platform.platform()
 881     if isinstance(res, bytes):
 882         res = res.decode(preferredencoding())
 883
 884     assert isinstance(res, compat_str)
 885     return res
 886
 887
def _windows_write_string(s, out):
    """ Try to write s straight to the Windows console behind out.

    Returns True if the string was written using special methods
    (WriteConsoleW), False if it has yet to be written out by the caller.
    Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor number -> value handed to GetStdHandle
    # (-11 / -12 are the values Win32 documents as STD_OUTPUT_HANDLE /
    # STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        # Only descriptors 1 and 2 are handled here
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters WriteConsoleW reports as written
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is missing/invalid, is not a character device
        # (ignoring the REMOTE bit), or GetConsoleMode fails on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call, stopping before the next
        # non-BMP character; count is 0 when s starts with such a character
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            # Two units were requested for it, but it is a single element of s
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Drop exactly what the console accepted and retry with the rest
            s = s[written.value:]
    return True
 961
 962
 963 def write_string(s, out=None, encoding=None):
 964     if out is None:
 965         out = sys.stderr
 966     assert type(s) == compat_str
 967
 968     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 969         if _windows_write_string(s, out):
 970             return
 971
 972     if ('b' in getattr(out, 'mode', '') or
 973             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 974         byt = s.encode(encoding or preferredencoding(), 'ignore')
 975         out.write(byt)
 976     elif hasattr(out, 'buffer'):
 977         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 978         byt = s.encode(enc, 'ignore')
 979         out.buffer.write(byt)
 980     else:
 981         out.write(s)
 982     out.flush()
 983
 984
 985 def bytes_to_intlist(bs):
 986     if not bs:
 987         return []
 988     if isinstance(bs[0], int):  # Python 3
 989         return list(bs)
 990     else:
 991         return [ord(c) for c in bs]
 992
 993
 994 def intlist_to_bytes(xs):
 995     if not xs:
 996         return b''
 997     return struct_pack('%dB' % len(xs), *xs)
 998
 999
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) using LockFileEx /
# UnlockFileEx on win32 and fcntl.flock everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure passed by pointer as the last argument of
        # LockFileEx / UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low / high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f from offset 0; raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so _unlock_file can pass the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags: 0x2 for an exclusive lock, 0x0 otherwise
        # (0x2 is LOCKFILE_EXCLUSIVE_LOCK per the Win32 LockFileEx reference)
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """flock f with LOCK_EX when exclusive is true, LOCK_SH otherwise."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
1063
1064
class locked_file(object):
    """File wrapper that holds a lock for the duration of a with-block.

    The file is opened on construction; entering the context takes a shared
    lock for mode 'r' and an exclusive lock for 'a' / 'w', and leaving it
    unlocks and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        """Open filename with io.open; mode must be 'r', 'a' or 'w'."""
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is not an IOError on
            # Python 2; catch both so the file is never left open on failure
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Close the file even if unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        """Forward to the underlying file's write()."""
        return self.f.write(*args)

    def read(self, *args):
        """Forward to the underlying file's read()."""
        return self.f.read(*args)
1094
1095
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1099
1100
def shell_quote(args):
    """Quote every argument for a shell and join them with spaces.

    Byte-string arguments are decoded with the filesystem encoding first.
    """
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # shlex_quote (already imported from .compat and used by args_to_str)
        # replaces the undocumented pipes.quote; the pipes module is
        # deprecated and no longer present in recent Python releases
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
1110
1111
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    The data is JSON-encoded and appended to url as a fragment. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse.urlencode(payload)
1118
1119
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    URLs without a '#__youtubedl_smuggle' fragment are returned unchanged
    together with default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        encoded = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(encoded)
    return smug_url, default
1127
1128
def format_bytes(bytes):
    """Format a byte count with a binary-prefix suffix, e.g. '1.50KiB'.

    bytes may be a number, a numeric string, or None (rendered as 'N/A').
    Magnitudes below one byte are shown in 'B', values beyond the largest
    suffix stay in 'YiB', and negative counts keep their sign.
    """
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    magnitude = abs(bytes)
    if magnitude < 1.0:
        # log() would be negative (or undefined for 0) here, and a negative
        # exponent would index the suffix list from its end
        exponent = 0
    else:
        # Clamp so counts of 1024**9 and above do not run off the suffix list
        exponent = min(int(math.log(magnitude, 1024.0)), len(suffixes) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1141
1142
def parse_filesize(s):
    """Parse a size such as '1.5MB' or '2,5 KiB' into a number of bytes.

    Returns None when s is None or does not start with a number followed by
    a known unit.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1}
    for power, letter in enumerate('KMGTPEZY', 1):
        binary, decimal = 1024 ** power, 1000 ** power
        unit_table[letter + 'iB'] = binary
        unit_table[letter + 'B'] = decimal
        unit_table[letter.lower() + 'B'] = binary
        unit_table[letter + 'b'] = decimal

    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % '|'.join(
        re.escape(unit) for unit in unit_table)
    found = re.match(pattern, s)
    if not found:
        return None

    # A comma is accepted as the decimal separator
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1195
1196
def month_by_name(name):
    """ Return the number (1-12) of a month by its English name, independent
    of the locale, or None if the name is not recognised """

    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1204
1205
def month_by_abbreviation(abbrev):
    """ Return the number (1-12) of a month by its three-letter English
        abbreviation, independent of the locale, or None if unknown """

    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
1214
1215
def fix_xml_ampersands(xml_str):
    """Escape as '&amp;' every '&' that does not start a known entity.

    The five predefined XML entities and short numeric character references
    are left untouched.
    """
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1222
1223
def setproctitle(title):
    """Best-effort attempt to set the process title through libc's prctl.

    Does nothing when libc.so.6 cannot be loaded or has no prctl symbol.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Initialising the buffer from the bytes makes it one byte longer than
    # the title, so the string handed to prctl is always NUL-terminated
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        # Option 15 is PR_SET_NAME on Linux
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1237
1238
def remove_start(s, start):
    """Return s without the prefix start, or s unchanged if it is absent."""
    return s[len(start):] if s.startswith(start) else s
1243
1244
def remove_end(s, end):
    """Return s without the suffix end, or s unchanged if it is absent."""
    # An empty suffix must be skipped explicitly: s[:-0] is s[:0], which
    # would wipe the whole string
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1249
1250
def url_basename(url):
    """Return the last segment of the URL's path, ignoring outer slashes."""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
1254
1255
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always reported as HEAD."""

    def get_method(self):
        return 'HEAD'
1259
1260
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, multiplied by invscale and floor-divided by scale.

    When get_attr is given, that attribute of v is converted instead.
    None and the empty string yield default.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1268
1269
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1272
1273
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are dropped before converting; None is passed through """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1280
1281
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float times invscale over scale; None yields default."""
    if v is None:
        return default
    return float(v) * invscale / scale
1284
1285
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Returns None for non-string input and for strings that do not match any
    supported form. Minute-only and hour-only forms return a float; other
    forms return an int unless a fractional seconds part is present.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None

    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    # Seconds contributed by one unit of each integer group
    seconds_per_unit = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    total = 0
    for group_name, factor in seconds_per_unit:
        value = m.group(group_name)
        if value:
            total += int(value) * factor
    if m.group('ms'):
        # The 'ms' group holds the fractional part including its leading dot
        total += float(m.group('ms'))
    return total
1330
1331
def prepend_extension(filename, ext):
    """Insert ext in front of the filename's existing extension."""
    stem, current_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, current_ext)
1335
1336
def check_executable(exe, args=[]):
    """ Checks if the given binary can be started, and returns its name
    (False if starting it raises OSError).
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1345
1346
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable as reported by
    detect_exe_version on its combined output,
    or False if the executable cannot be started """
    try:
        process = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):
        # Pipe output arrives as bytes; keep only its ASCII part
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1360
1361
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    Returns group 1 of the first match of version_re (a generic
    'version X' pattern by default), or unrecognized when nothing matches.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1371
1372
class PagedList(object):
    """Base for paged lists; subclasses are expected to provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1377
1378
class OnDemandPagedList(PagedList):
    """Paged list that calls pagefunc(pagenum) for pages only as needed."""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list."""
        collected = []
        size = self._pagesize
        for pagenum in itertools.count(start // size):
            page_first = pagenum * size
            next_first = page_first + size
            if start >= next_first:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry inside this page
            lo = start % size if page_first <= start < next_first else 0

            # Cut-off inside this page when the requested range ends here
            hi = None
            if end is not None and page_first <= end <= next_first:
                hi = ((end - 1) % size) + 1

            if lo != 0 or hi is not None:
                entries = entries[lo:hi]
            collected.extend(entries)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(entries) + lo < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == next_first:
                break
        return collected
1420
1421
class InAdvancePagedList(PagedList):
    """Paged list whose total number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list.

        pagefunc is never called with a page number of pagecount or above.
        """
        res = []
        start_page = start // self._pagesize
        # Clamp to the known page count so an end beyond the last entry does
        # not trigger requests for pages that do not exist
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page has leading entries to drop
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the requested range
                    res.extend(page[:only_more])
                    break
            res.extend(page)
        return res
1449
1450
def uppercase_escape(s):
    r"""Replace every literal \UXXXXXXXX escape in s by its character."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1457
1458
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 the text is passed to quote() as UTF-8 bytes
    needs_bytes = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_bytes:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1464
1465
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Only the path, params, query and fragment components are escaped."""
    parts = compat_urllib_parse_urlparse(url)
    escaped = dict(
        (field, escape_rfc3986(getattr(parts, field)))
        for field in ('path', 'params', 'query', 'fragment'))
    return parts._replace(**escaped).geturl()
1475
# Select struct_pack / struct_unpack: probe whether struct.pack accepts the
# format spec given as a text literal and fall back to wrappers if it does not
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """struct.pack that first encodes a text format spec to ASCII bytes."""
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        """struct.unpack that first encodes a text format spec to ASCII bytes."""
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # The probe succeeded, so the stdlib functions can be used directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1492
1493
def read_batch_urls(batch_fd):
    """Read URLs, one per line, from batch_fd and close it afterwards.

    Byte lines are decoded as UTF-8. A leading byte order mark and
    surrounding whitespace are removed; empty lines and lines starting
    with '#', ';' or ']' are skipped.
    """
    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        # '\ufeff' is the BOM of correctly decoded text; '\xef\xbb\xbf' is
        # how its UTF-8 bytes look when the text was decoded as Latin-1
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1508
1509
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1512
1513
# etree_iter(node) iterates over an element tree, using Element.iter where
# it exists
if hasattr(xml.etree.ElementTree.Element, 'iter'):
    etree_iter = xml.etree.ElementTree.Element.iter
else:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1518
1519
def parse_xml(s):
    """Parse the XML text s and return its root element.

    Doctype declarations are ignored. On Python 2 byte-string element
    texts are decoded as UTF-8.
    """
    class DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=DoctypeIgnoringBuilder())
    if sys.version_info >= (2, 7):
        tree = xml.etree.ElementTree.XML(s.encode('utf-8'), parser=parser)
    else:
        tree = xml.etree.ElementTree.XML(s.encode('utf-8'))
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return tree
1535
1536
# Age limit for each US rating label; parse_age_limit falls back to this
# table when its argument is not a plain number
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1544
1545
def parse_age_limit(s):
    """Parse an age limit given as '18', '18+' or a US rating label.

    Returns an int, or None for None and for unrecognised input.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
1551
1552
def strip_jsonp(code):
    """Strip a JSONP wrapper 'callback(...)' and return the wrapped payload.

    Input that does not look like a JSONP call is returned unchanged.
    """
    jsonp_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_re, r'\1', code)
1556
1557
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Single-quoted strings and bare identifiers become double-quoted
    strings, and trailing commas before ']' or '}' are dropped.
    """
    # Rewrites applied inside a single-quoted string body
    requote = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            inner = token[1:-1]
            token = re.sub(
                r"\\\\|\\'|\"", lambda esc: requote[esc.group(0)], inner)
        return '"%s"' % token

    converted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*[\]}])', lambda m: m.group(1), converted)
1581
1582
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values:
    the returned function maps an id to its index, or -1 if it is unknown """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1591
1592
# Default output template: a %-style format string with the named fields
# title, id and ext
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1594
1595
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Returns None for None. The result is never longer than length; when
    length leaves no room for the ellipses the string is simply cut. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length < len(ELLIPSES):
        # A negative slice bound here would keep almost the whole string and
        # then append the ellipses, exceeding the requested length
        return s[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
1604
1605
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
1608
1609
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    When version is empty or either version cannot be parsed, the answer
    is the negation of assume_new.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
    return outdated
1617
1618
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U: true when this module
    was loaded by a zipimporter or sys has a 'frozen' attribute """
    from zipimport import zipimporter

    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return loaded_from_zip or hasattr(sys, 'frozen')
1624
1625
def args_to_str(args):
    """Get a short string representation for a subprocess command."""
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
1629
1630
def mimetype2ext(mt):
    """Map a MIME type to an extension: the part after the last '/',
    with a few subtypes translated."""
    subtype = mt.rpartition('/')[2]
    known = {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
    }
    return known.get(subtype, subtype)
1638
1639
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the response headers of url_handle.

    The filename of an attachment Content-Disposition is preferred;
    otherwise the Content-Type is mapped with mimetype2ext.
    """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(h):
            return url_handle.headers[h]

    disposition = getheader('Content-Disposition')
    if disposition:
        found = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if found:
            ext = determine_ext(found.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
1656
1657
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set by the user, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
1666
1667
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    A leading byte order mark selects the decoding (UTF-8 otherwise).
    Returns a match object if the text starts with optional whitespace
    followed by '<', else None. """

    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    encoding, offset = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, offset = enc, len(bom)
            break
    text = first_bytes[offset:].decode(encoding, 'replace')

    return re.match(r'^\s*<', text)
1686
1687
def determine_protocol(info_dict):
    """Return the download protocol for info_dict.

    An explicit 'protocol' entry wins; otherwise it is derived from the
    start of the URL, then from its extension, and finally from its scheme.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
1708
1709
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values, as left-aligned
    text columns with the header row first """
    table = [header_row] + data
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*table)]
    # Every column but the last is padded to its width plus one
    padded = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    format_str = ' '.join(padded) + '%s'
    return '\n'.join(format_str % tuple(row) for row in table)
1716
1717
def _match_one(filter_part, dct):
    """Evaluate one filter expression against the dictionary dct.

    Supports comparisons 'key OP value' (a '?' after the operator makes a
    missing key count as a match) and the unary forms 'key' / '!key'.
    Raises ValueError for expressions that cannot be parsed.
    """
    comparison_ops = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    comparison_re = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, comparison_ops.keys())))
    found = comparison_re.search(filter_part)
    if found:
        compare = comparison_ops[found.group('op')]
        strval = found.group('strval')
        if strval is not None:
            if found.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % found.group('op'))
            expected = strval
        else:
            intval = found.group('intval')
            try:
                expected = int(intval)
            except ValueError:
                # Not a plain integer: try it as a file size, with and
                # without an implied trailing 'B'
                expected = parse_filesize(intval)
                if expected is None:
                    expected = parse_filesize(intval + 'B')
                if expected is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            intval, filter_part))
        actual = dct.get(found.group('key'))
        if actual is None:
            return found.group('none_inclusive')
        return compare(actual, expected)

    unary_ops = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    unary_re = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, unary_ops.keys())))
    found = unary_re.search(filter_part)
    if found:
        check = unary_ops[found.group('op')]
        return check(dct.get(found.group('key')))

    raise ValueError('Invalid filter part %r' % filter_part)
1775
1776
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax: the '&'-separated
    parts must all match. Returns True (=passes filter) or False """

    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
1782
1783
def match_filter_func(filter_str):
    """Build a function that checks an info dict against filter_str.

    The returned function gives None when the dict passes the filter and
    a message explaining the skip otherwise.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
1792
1793
1794 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
1795     def __init__(self, proxies=None):
1796         # Set default handlers
1797         for type in ('http', 'https'):
1798             setattr(self, '%s_open' % type,
1799                     lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
1800                         meth(r, proxy, type))
1801         return compat_urllib_request.ProxyHandler.__init__(self, proxies)
1802
1803     def proxy_open(self, req, proxy, type):
1804         req_proxy = req.headers.get('Ytdl-request-proxy')
1805         if req_proxy is not None:
1806             proxy = req_proxy
1807             del req.headers['Ytdl-request-proxy']
1808
1809         if proxy == '__noproxy__':
1810             return None  # No Proxy
1811         return compat_urllib_request.ProxyHandler.proxy_open(
1812             self, req, proxy, type)