_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import functools
  14 import gzip
  15 import itertools
  16 import io
  17 import json
  18 import locale
  19 import math
  20 import operator
  21 import os
  22 import pipes
  23 import platform
  24 import re
  25 import ssl
  26 import socket
  27 import struct
  28 import subprocess
  29 import sys
  30 import tempfile
  31 import traceback
  32 import xml.etree.ElementTree
  33 import zlib
  34
  35 from .compat import (
  36     compat_basestring,
  37     compat_chr,
  38     compat_html_entities,
  39     compat_http_client,
  40     compat_parse_qs,
  41     compat_socket_create_connection,
  42     compat_str,
  43     compat_urllib_error,
  44     compat_urllib_parse,
  45     compat_urllib_parse_urlparse,
  46     compat_urllib_request,
  47     compat_urlparse,
  48     shlex_quote,
  49 )
  50
  51
# This is not clearly defined otherwise
# (the type of a compiled regular expression, taken from compiling an empty
# pattern)
compiled_regex_type = type(re.compile(''))

# Default HTTP headers; YoutubeDLHandler.http_request adds each entry to an
# outgoing request that does not already carry that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# English month names in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
  67
  68
  69 def preferredencoding():
  70     """Get preferred encoding.
  71
  72     Returns the best encoding scheme for the system, based on
  73     locale.getpreferredencoding() and some further tweaks.
  74     """
  75     try:
  76         pref = locale.getpreferredencoding()
  77         'TEST'.encode(pref)
  78     except:
  79         pref = 'UTF-8'
  80
  81     return pref
  82
  83
  84 def write_json_file(obj, fn):
  85     """ Encode obj as JSON and write it to fn, atomically if possible """
  86
  87     fn = encodeFilename(fn)
  88     if sys.version_info < (3, 0) and sys.platform != 'win32':
  89         encoding = get_filesystem_encoding()
  90         # os.path.basename returns a bytes object, but NamedTemporaryFile
  91         # will fail if the filename contains non ascii characters unless we
  92         # use a unicode object
  93         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  94         # the same for os.path.dirname
  95         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  96     else:
  97         path_basename = os.path.basename
  98         path_dirname = os.path.dirname
  99
 100     args = {
 101         'suffix': '.tmp',
 102         'prefix': path_basename(fn) + '.',
 103         'dir': path_dirname(fn),
 104         'delete': False,
 105     }
 106
 107     # In Python 2.x, json.dump expects a bytestream.
 108     # In Python 3.x, it writes to a character stream
 109     if sys.version_info < (3, 0):
 110         args['mode'] = 'wb'
 111     else:
 112         args.update({
 113             'mode': 'w',
 114             'encoding': 'utf-8',
 115         })
 116
 117     tf = tempfile.NamedTemporaryFile(**args)
 118
 119     try:
 120         with tf:
 121             json.dump(obj, tf)
 122         if sys.platform == 'win32':
 123             # Need to remove existing file on Windows, else os.rename raises
 124             # WindowsError or FileExistsError.
 125             try:
 126                 os.unlink(fn)
 127             except OSError:
 128                 pass
 129         os.rename(tf.name, fn)
 130     except:
 131         try:
 132             os.remove(tf.name)
 133         except OSError:
 134             pass
 135         raise
 136
 137
 138 if sys.version_info >= (2, 7):
 139     def find_xpath_attr(node, xpath, key, val):
 140         """ Find the xpath xpath[@key=val] """
 141         assert re.match(r'^[a-zA-Z-]+$', key)
 142         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 143         expr = xpath + "[@%s='%s']" % (key, val)
 144         return node.find(expr)
 145 else:
 146     def find_xpath_attr(node, xpath, key, val):
 147         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 148         # .//node does not match if a node is a direct child of . !
 149         if isinstance(xpath, compat_str):
 150             xpath = xpath.encode('ascii')
 151
 152         for f in node.findall(xpath):
 153             if f.attrib.get(key) == val:
 154                 return f
 155         return None
 156
 157 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 158 # the namespace parameter
 159
 160
 161 def xpath_with_ns(path, ns_map):
 162     components = [c.split(':') for c in path.split('/')]
 163     replaced = []
 164     for c in components:
 165         if len(c) == 1:
 166             replaced.append(c[0])
 167         else:
 168             ns, tag = c
 169             replaced.append('{%s}%s' % (ns_map[ns], tag))
 170     return '/'.join(replaced)
 171
 172
 173 def xpath_text(node, xpath, name=None, fatal=False):
 174     if sys.version_info < (2, 7):  # Crazy 2.6
 175         xpath = xpath.encode('ascii')
 176
 177     n = node.find(xpath)
 178     if n is None or n.text is None:
 179         if fatal:
 180             name = xpath if name is None else name
 181             raise ExtractorError('Could not find XML element %s' % name)
 182         else:
 183             return None
 184     return n.text
 185
 186
 187 def get_element_by_id(id, html):
 188     """Return the content of the tag with the specified ID in the passed HTML document"""
 189     return get_element_by_attribute("id", id, html)
 190
 191
 192 def get_element_by_attribute(attribute, value, html):
 193     """Return the content of the tag with the specified attribute in the passed HTML document"""
 194
 195     m = re.search(r'''(?xs)
 196         <([a-zA-Z0-9:._-]+)
 197          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 198          \s+%s=['"]?%s['"]?
 199          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 200         \s*>
 201         (?P<content>.*?)
 202         </\1>
 203     ''' % (re.escape(attribute), re.escape(value)), html)
 204
 205     if not m:
 206         return None
 207     res = m.group('content')
 208
 209     if res.startswith('"') or res.startswith("'"):
 210         res = res[1:-1]
 211
 212     return unescapeHTML(res)
 213
 214
 215 def clean_html(html):
 216     """Clean an HTML snippet into a readable string"""
 217
 218     if html is None:  # Convenience for sanitizing descriptions etc.
 219         return html
 220
 221     # Newline vs <br />
 222     html = html.replace('\n', ' ')
 223     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 224     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 225     # Strip html tags
 226     html = re.sub('<.*?>', '', html)
 227     # Replace html entities
 228     html = unescapeHTML(html)
 229     return html.strip()
 230
 231
 232 def sanitize_open(filename, open_mode):
 233     """Try to open the given filename, and slightly tweak it if this fails.
 234
 235     Attempts to open the given filename. If this fails, it tries to change
 236     the filename slightly, step by step, until it's either able to open it
 237     or it fails and raises a final exception, like the standard open()
 238     function.
 239
 240     It returns the tuple (stream, definitive_file_name).
 241     """
 242     try:
 243         if filename == '-':
 244             if sys.platform == 'win32':
 245                 import msvcrt
 246                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 247             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 248         stream = open(encodeFilename(filename), open_mode)
 249         return (stream, filename)
 250     except (IOError, OSError) as err:
 251         if err.errno in (errno.EACCES,):
 252             raise
 253
 254         # In case of error, try to remove win32 forbidden chars
 255         alt_filename = sanitize_path(filename)
 256         if alt_filename == filename:
 257             raise
 258         else:
 259             # An exception here should be caught in the caller
 260             stream = open(encodeFilename(alt_filename), open_mode)
 261             return (stream, alt_filename)
 262
 263
 264 def timeconvert(timestr):
 265     """Convert RFC 2822 defined time string into system timestamp"""
 266     timestamp = None
 267     timetuple = email.utils.parsedate_tz(timestr)
 268     if timetuple is not None:
 269         timestamp = email.utils.mktime_tz(timetuple)
 270     return timestamp
 271
 272
 273 def sanitize_filename(s, restricted=False, is_id=False):
 274     """Sanitizes a string so it could be used as part of a filename.
 275     If restricted is set, use a stricter subset of allowed characters.
 276     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 277     """
 278     def replace_insane(char):
 279         if char == '?' or ord(char) < 32 or ord(char) == 127:
 280             return ''
 281         elif char == '"':
 282             return '' if restricted else '\''
 283         elif char == ':':
 284             return '_-' if restricted else ' -'
 285         elif char in '\\/|*<>':
 286             return '_'
 287         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 288             return '_'
 289         if restricted and ord(char) > 127:
 290             return '_'
 291         return char
 292
 293     # Handle timestamps
 294     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 295     result = ''.join(map(replace_insane, s))
 296     if not is_id:
 297         while '__' in result:
 298             result = result.replace('__', '_')
 299         result = result.strip('_')
 300         # Common case of "Foreign band name - English song title"
 301         if restricted and result.startswith('-_'):
 302             result = result[2:]
 303         if result.startswith('-'):
 304             result = '_' + result[len('-'):]
 305         result = result.lstrip('.')
 306         if not result:
 307             result = '_'
 308     return result
 309
 310
 311 def sanitize_path(s):
 312     """Sanitizes and normalizes path on Windows"""
 313     if sys.platform != 'win32':
 314         return s
 315     drive, _ = os.path.splitdrive(s)
 316     unc, _ = os.path.splitunc(s)
 317     unc_or_drive = unc or drive
 318     norm_path = os.path.normpath(remove_start(s, unc_or_drive)).split(os.path.sep)
 319     if unc_or_drive:
 320         norm_path.pop(0)
 321     sanitized_path = [
 322         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
 323         for path_part in norm_path]
 324     if unc_or_drive:
 325         sanitized_path.insert(0, unc_or_drive + os.path.sep)
 326     return os.path.join(*sanitized_path)
 327
 328
 329 def orderedSet(iterable):
 330     """ Remove all duplicates from the input iterable """
 331     res = []
 332     for el in iterable:
 333         if el not in res:
 334             res.append(el)
 335     return res
 336
 337
 338 def _htmlentity_transform(entity):
 339     """Transforms an HTML entity to a character."""
 340     # Known non-numeric HTML entity
 341     if entity in compat_html_entities.name2codepoint:
 342         return compat_chr(compat_html_entities.name2codepoint[entity])
 343
 344     mobj = re.match(r'#(x?[0-9]+)', entity)
 345     if mobj is not None:
 346         numstr = mobj.group(1)
 347         if numstr.startswith('x'):
 348             base = 16
 349             numstr = '0%s' % numstr
 350         else:
 351             base = 10
 352         return compat_chr(int(numstr, base))
 353
 354     # Unknown entity in name, return its literal representation
 355     return ('&%s;' % entity)
 356
 357
 358 def unescapeHTML(s):
 359     if s is None:
 360         return None
 361     assert type(s) == compat_str
 362
 363     return re.sub(
 364         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 365
 366
 367 def encodeFilename(s, for_subprocess=False):
 368     """
 369     @param s The name of the file
 370     """
 371
 372     assert type(s) == compat_str
 373
 374     # Python 3 has a Unicode API
 375     if sys.version_info >= (3, 0):
 376         return s
 377
 378     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 379         # Pass '' directly to use Unicode APIs on Windows 2000 and up
 380         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 381         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 382         if not for_subprocess:
 383             return s
 384         else:
 385             # For subprocess calls, encode with locale encoding
 386             # Refer to http://stackoverflow.com/a/9951851/35070
 387             encoding = preferredencoding()
 388     else:
 389         encoding = sys.getfilesystemencoding()
 390     if encoding is None:
 391         encoding = 'utf-8'
 392     return s.encode(encoding, 'ignore')
 393
 394
 395 def encodeArgument(s):
 396     if not isinstance(s, compat_str):
 397         # Legacy code that uses byte strings
 398         # Uncomment the following line after fixing all post processors
 399         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 400         s = s.decode('ascii')
 401     return encodeFilename(s, True)
 402
 403
 404 def decodeOption(optval):
 405     if optval is None:
 406         return optval
 407     if isinstance(optval, bytes):
 408         optval = optval.decode(preferredencoding())
 409
 410     assert isinstance(optval, compat_str)
 411     return optval
 412
 413
 414 def formatSeconds(secs):
 415     if secs > 3600:
 416         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 417     elif secs > 60:
 418         return '%d:%02d' % (secs // 60, secs % 60)
 419     else:
 420         return '%d' % secs
 421
 422
 423 def make_HTTPS_handler(params, **kwargs):
 424     opts_no_check_certificate = params.get('nocheckcertificate', False)
 425     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 426         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 427         if opts_no_check_certificate:
 428             context.check_hostname = False
 429             context.verify_mode = ssl.CERT_NONE
 430         try:
 431             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 432         except TypeError:
 433             # Python 2.7.8
 434             # (create_default_context present but HTTPSHandler has no context=)
 435             pass
 436
 437     if sys.version_info < (3, 2):
 438         return YoutubeDLHTTPSHandler(params, **kwargs)
 439     else:  # Python < 3.4
 440         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 441         context.verify_mode = (ssl.CERT_NONE
 442                                if opts_no_check_certificate
 443                                else ssl.CERT_REQUIRED)
 444         context.set_default_verify_paths()
 445         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 446
 447
 448 class ExtractorError(Exception):
 449     """Error during info extraction."""
 450
 451     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 452         """ tb, if given, is the original traceback (so that it can be printed out).
 453         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 454         """
 455
 456         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 457             expected = True
 458         if video_id is not None:
 459             msg = video_id + ': ' + msg
 460         if cause:
 461             msg += ' (caused by %r)' % cause
 462         if not expected:
 463             if ytdl_is_updateable():
 464                 update_cmd = 'type  youtube-dl -U  to update'
 465             else:
 466                 update_cmd = 'see  https://yt-dl.org/update  on how to update'
 467             msg += '; please report this issue on https://yt-dl.org/bug .'
 468             msg += ' Make sure you are using the latest version; %s.' % update_cmd
 469             msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 470         super(ExtractorError, self).__init__(msg)
 471
 472         self.traceback = tb
 473         self.exc_info = sys.exc_info()  # preserve original exception
 474         self.cause = cause
 475         self.video_id = video_id
 476
 477     def format_traceback(self):
 478         if self.traceback is None:
 479             return None
 480         return ''.join(traceback.format_tb(self.traceback))
 481
 482
 483 class UnsupportedError(ExtractorError):
 484     def __init__(self, url):
 485         super(UnsupportedError, self).__init__(
 486             'Unsupported URL: %s' % url, expected=True)
 487         self.url = url
 488
 489
 490 class RegexNotFoundError(ExtractorError):
 491     """Error when a regex didn't match"""
 492     pass
 493
 494
 495 class DownloadError(Exception):
 496     """Download Error exception.
 497
 498     This exception may be thrown by FileDownloader objects if they are not
 499     configured to continue on errors. They will contain the appropriate
 500     error message.
 501     """
 502
 503     def __init__(self, msg, exc_info=None):
 504         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 505         super(DownloadError, self).__init__(msg)
 506         self.exc_info = exc_info
 507
 508
 509 class SameFileError(Exception):
 510     """Same File exception.
 511
 512     This exception will be thrown by FileDownloader objects if they detect
 513     multiple files would have to be downloaded to the same file on disk.
 514     """
 515     pass
 516
 517
 518 class PostProcessingError(Exception):
 519     """Post Processing exception.
 520
 521     This exception may be raised by PostProcessor's .run() method to
 522     indicate an error in the postprocessing task.
 523     """
 524
 525     def __init__(self, msg):
 526         self.msg = msg
 527
 528
 529 class MaxDownloadsReached(Exception):
 530     """ --max-downloads limit has been reached. """
 531     pass
 532
 533
 534 class UnavailableVideoError(Exception):
 535     """Unavailable Format exception.
 536
 537     This exception will be thrown when a video is requested
 538     in a format that is not available for that video.
 539     """
 540     pass
 541
 542
 543 class ContentTooShortError(Exception):
 544     """Content Too Short exception.
 545
 546     This exception may be raised by FileDownloader objects when a file they
 547     download is too small for what the server announced first, indicating
 548     the connection was probably interrupted.
 549     """
 550     # Both in bytes
 551     downloaded = None
 552     expected = None
 553
 554     def __init__(self, downloaded, expected):
 555         self.downloaded = downloaded
 556         self.expected = expected
 557
 558
 559 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 560     hc = http_class(*args, **kwargs)
 561     source_address = ydl_handler._params.get('source_address')
 562     if source_address is not None:
 563         sa = (source_address, 0)
 564         if hasattr(hc, 'source_address'):  # Python 2.7+
 565             hc.source_address = sa
 566         else:  # Python 2.6
 567             def _hc_connect(self, *args, **kwargs):
 568                 sock = compat_socket_create_connection(
 569                     (self.host, self.port), self.timeout, sa)
 570                 if is_https:
 571                     self.sock = ssl.wrap_socket(
 572                         sock, self.key_file, self.cert_file,
 573                         ssl_version=ssl.PROTOCOL_TLSv1)
 574                 else:
 575                     self.sock = sock
 576             hc.connect = functools.partial(_hc_connect, hc)
 577
 578     return hc
 579
 580
 581 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 582     """Handler for HTTP requests and responses.
 583
 584     This class, when installed with an OpenerDirector, automatically adds
 585     the standard headers to every HTTP request and handles gzipped and
 586     deflated responses from web servers. If compression is to be avoided in
 587     a particular request, the original request in the program code only has
 588     to include the HTTP header "Youtubedl-No-Compression", which will be
 589     removed before making the real request.
 590
 591     Part of this code was copied from:
 592
 593     http://techknack.net/python-urllib2-handlers/
 594
 595     Andrew Rowls, the author of that code, agreed to release it to the
 596     public domain.
 597     """
 598
 599     def __init__(self, params, *args, **kwargs):
 600         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
 601         self._params = params
 602
 603     def http_open(self, req):
 604         return self.do_open(functools.partial(
 605             _create_http_connection, self, compat_http_client.HTTPConnection, False),
 606             req)
 607
 608     @staticmethod
 609     def deflate(data):
 610         try:
 611             return zlib.decompress(data, -zlib.MAX_WBITS)
 612         except zlib.error:
 613             return zlib.decompress(data)
 614
 615     @staticmethod
 616     def addinfourl_wrapper(stream, headers, url, code):
 617         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 618             return compat_urllib_request.addinfourl(stream, headers, url, code)
 619         ret = compat_urllib_request.addinfourl(stream, headers, url)
 620         ret.code = code
 621         return ret
 622
 623     def http_request(self, req):
 624         for h, v in std_headers.items():
 625             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
 626             # The dict keys are capitalized because of this bug by urllib
 627             if h.capitalize() not in req.headers:
 628                 req.add_header(h, v)
 629         if 'Youtubedl-no-compression' in req.headers:
 630             if 'Accept-encoding' in req.headers:
 631                 del req.headers['Accept-encoding']
 632             del req.headers['Youtubedl-no-compression']
 633
 634         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 635             # Python 2.6 is brain-dead when it comes to fragments
 636             req._Request__original = req._Request__original.partition('#')[0]
 637             req._Request__r_type = req._Request__r_type.partition('#')[0]
 638
 639         return req
 640
 641     def http_response(self, req, resp):
 642         old_resp = resp
 643         # gzip
 644         if resp.headers.get('Content-encoding', '') == 'gzip':
 645             content = resp.read()
 646             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 647             try:
 648                 uncompressed = io.BytesIO(gz.read())
 649             except IOError as original_ioerror:
 650                 # There may be junk add the end of the file
 651                 # See http://stackoverflow.com/q/4928560/35070 for details
 652                 for i in range(1, 1024):
 653                     try:
 654                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 655                         uncompressed = io.BytesIO(gz.read())
 656                     except IOError:
 657                         continue
 658                     break
 659                 else:
 660                     raise original_ioerror
 661             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 662             resp.msg = old_resp.msg
 663         # deflate
 664         if resp.headers.get('Content-encoding', '') == 'deflate':
 665             gz = io.BytesIO(self.deflate(resp.read()))
 666             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 667             resp.msg = old_resp.msg
 668         return resp
 669
 670     https_request = http_request
 671     https_response = http_response
 672
 673
 674 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 675     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 676         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 677         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 678         self._params = params
 679
 680     def https_open(self, req):
 681         kwargs = {}
 682         if hasattr(self, '_context'):  # python > 2.6
 683             kwargs['context'] = self._context
 684         if hasattr(self, '_check_hostname'):  # python 3.x
 685             kwargs['check_hostname'] = self._check_hostname
 686         return self.do_open(functools.partial(
 687             _create_http_connection, self, self._https_conn_class, True),
 688             req, **kwargs)
 689
 690
 691 def parse_iso8601(date_str, delimiter='T', timezone=None):
 692     """ Return a UNIX timestamp from the given date """
 693
 694     if date_str is None:
 695         return None
 696
 697     if timezone is None:
 698         m = re.search(
 699             r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 700             date_str)
 701         if not m:
 702             timezone = datetime.timedelta()
 703         else:
 704             date_str = date_str[:-len(m.group(0))]
 705             if not m.group('sign'):
 706                 timezone = datetime.timedelta()
 707             else:
 708                 sign = 1 if m.group('sign') == '+' else -1
 709                 timezone = datetime.timedelta(
 710                     hours=sign * int(m.group('hours')),
 711                     minutes=sign * int(m.group('minutes')))
 712     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 713     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 714     return calendar.timegm(dt.timetuple())
 715
 716
 717 def unified_strdate(date_str, day_first=True):
 718     """Return a string with the date in the format YYYYMMDD"""
 719
 720     if date_str is None:
 721         return None
 722     upload_date = None
 723     # Replace commas
 724     date_str = date_str.replace(',', ' ')
 725     # %z (UTC offset) is only supported in python>=3.2
 726     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 727     # Remove AM/PM + timezone
 728     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
 729
 730     format_expressions = [
 731         '%d %B %Y',
 732         '%d %b %Y',
 733         '%B %d %Y',
 734         '%b %d %Y',
 735         '%b %dst %Y %I:%M%p',
 736         '%b %dnd %Y %I:%M%p',
 737         '%b %dth %Y %I:%M%p',
 738         '%Y %m %d',
 739         '%Y-%m-%d',
 740         '%Y/%m/%d',
 741         '%Y/%m/%d %H:%M:%S',
 742         '%Y-%m-%d %H:%M:%S',
 743         '%Y-%m-%d %H:%M:%S.%f',
 744         '%d.%m.%Y %H:%M',
 745         '%d.%m.%Y %H.%M',
 746         '%Y-%m-%dT%H:%M:%SZ',
 747         '%Y-%m-%dT%H:%M:%S.%fZ',
 748         '%Y-%m-%dT%H:%M:%S.%f0Z',
 749         '%Y-%m-%dT%H:%M:%S',
 750         '%Y-%m-%dT%H:%M:%S.%f',
 751         '%Y-%m-%dT%H:%M',
 752     ]
 753     if day_first:
 754         format_expressions.extend([
 755             '%d.%m.%Y',
 756             '%d/%m/%Y',
 757             '%d/%m/%y',
 758             '%d/%m/%Y %H:%M:%S',
 759         ])
 760     else:
 761         format_expressions.extend([
 762             '%m.%d.%Y',
 763             '%m/%d/%Y',
 764             '%m/%d/%y',
 765             '%m/%d/%Y %H:%M:%S',
 766         ])
 767     for expression in format_expressions:
 768         try:
 769             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 770         except ValueError:
 771             pass
 772     if upload_date is None:
 773         timetuple = email.utils.parsedate_tz(date_str)
 774         if timetuple:
 775             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 776     return upload_date
 777
 778
 779 def determine_ext(url, default_ext='unknown_video'):
 780     if url is None:
 781         return default_ext
 782     guess = url.partition('?')[0].rpartition('.')[2]
 783     if re.match(r'^[A-Za-z0-9]+$', guess):
 784         return guess
 785     else:
 786         return default_ext
 787
 788
 789 def subtitles_filename(filename, sub_lang, sub_format):
 790     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 791
 792
 793 def date_from_str(date_str):
 794     """
 795     Return a datetime object from a string in the format YYYYMMDD or
 796     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 797     today = datetime.date.today()
 798     if date_str in ('now', 'today'):
 799         return today
 800     if date_str == 'yesterday':
 801         return today - datetime.timedelta(days=1)
 802     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 803     if match is not None:
 804         sign = match.group('sign')
 805         time = int(match.group('time'))
 806         if sign == '-':
 807             time = -time
 808         unit = match.group('unit')
 809         # A bad aproximation?
 810         if unit == 'month':
 811             unit = 'day'
 812             time *= 30
 813         elif unit == 'year':
 814             unit = 'day'
 815             time *= 365
 816         unit += 's'
 817         delta = datetime.timedelta(**{unit: time})
 818         return today + delta
 819     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 820
 821
 822 def hyphenate_date(date_str):
 823     """
 824     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 825     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 826     if match is not None:
 827         return '-'.join(match.groups())
 828     else:
 829         return date_str
 830
 831
 832 class DateRange(object):
 833     """Represents a time interval between two dates"""
 834
 835     def __init__(self, start=None, end=None):
 836         """start and end must be strings in the format accepted by date"""
 837         if start is not None:
 838             self.start = date_from_str(start)
 839         else:
 840             self.start = datetime.datetime.min.date()
 841         if end is not None:
 842             self.end = date_from_str(end)
 843         else:
 844             self.end = datetime.datetime.max.date()
 845         if self.start > self.end:
 846             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 847
 848     @classmethod
 849     def day(cls, day):
 850         """Returns a range that only contains the given day"""
 851         return cls(day, day)
 852
 853     def __contains__(self, date):
 854         """Check if the date is in the range"""
 855         if not isinstance(date, datetime.date):
 856             date = date_from_str(date)
 857         return self.start <= date <= self.end
 858
 859     def __str__(self):
 860         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 861
 862
 863 def platform_name():
 864     """ Returns the platform name as a compat_str """
 865     res = platform.platform()
 866     if isinstance(res, bytes):
 867         res = res.decode(preferredencoding())
 868
 869     assert isinstance(res, compat_str)
 870     return res
 871
 872
def _windows_write_string(s, out):
    """ Write s to the Windows console behind out using WriteConsoleW.

    Returns True if the string was written using special methods,
    False if it has yet to be written out (out has no usable file
    descriptor, is neither descriptor 1 nor 2, or its handle is not a
    console).
    Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> id handed to GetStdHandle
    # (-11 / -12 are Win32's STD_OUTPUT_HANDLE / STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Prototype and bind the kernel32 functions used below
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters each WriteConsoleW call wrote
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for invalid handles, for handles that are not character
        # devices (ignoring the REMOTE flag) and for handles on which
        # GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call and stop right before the
        # first non-BMP character; count == 0 means s starts with one
        count = min(next_nonbmp_pos(s), 1024)

        # A leading non-BMP character is written as 2 units
        # (presumably its UTF-16 surrogate pair)
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part was actually written
            s = s[written.value:]
    return True
 946
 947
 948 def write_string(s, out=None, encoding=None):
 949     if out is None:
 950         out = sys.stderr
 951     assert type(s) == compat_str
 952
 953     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 954         if _windows_write_string(s, out):
 955             return
 956
 957     if ('b' in getattr(out, 'mode', '') or
 958             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 959         byt = s.encode(encoding or preferredencoding(), 'ignore')
 960         out.write(byt)
 961     elif hasattr(out, 'buffer'):
 962         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 963         byt = s.encode(enc, 'ignore')
 964         out.buffer.write(byt)
 965     else:
 966         out.write(s)
 967     out.flush()
 968
 969
 970 def bytes_to_intlist(bs):
 971     if not bs:
 972         return []
 973     if isinstance(bs[0], int):  # Python 3
 974         return list(bs)
 975     else:
 976         return [ord(c) for c in bs]
 977
 978
 979 def intlist_to_bytes(xs):
 980     if not xs:
 981         return b''
 982     return struct_pack('%dB' % len(xs), *xs)
 983
 984
# Cross-platform file locking
# Both branches define _lock_file(f, exclusive) and _unlock_file(f):
# win32 goes through kernel32's LockFileEx/UnlockFileEx, every other
# platform through fcntl.flock.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes layout of the OVERLAPPED structure expected by the two calls
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high halves of the byte count passed to both calls; together
    # with the zero offset below they are meant to cover the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can pass the very
        # same structure to UnlockFileEx
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is Win32's LOCKFILE_EXCLUSIVE_LOCK, 0x0 a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive (LOCK_EX) or shared (LOCK_SH) flock on f."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Drop the flock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
1048
1049
class locked_file(object):
    """File wrapper that keeps the file locked inside a with-block.

    The file is opened by the constructor. Entering the context takes a
    shared lock for mode 'r' and an exclusive one for 'a' and 'w';
    leaving it unlocks and closes the file."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.mode = mode
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        try:
            # Readers share the lock, writers need it exclusively
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leave the file open if it could not be locked
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1079
1080
def get_filesystem_encoding():
    """ Return sys.getfilesystemencoding(), or 'utf-8' if that is None """
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1084
1085
def shell_quote(args):
    """ Quote every argument for a shell and join them with spaces.

    args is an iterable of text or byte strings; byte strings are decoded
    with the filesystem encoding before quoting.

    Quoting goes through shlex_quote (imported from .compat at the top of
    this file and also used by args_to_str) rather than the undocumented
    pipes.quote, because the pipes module is deprecated and no longer part
    of the standard library as of Python 3.13."""
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
1095
1096
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the first element that
        fails the predicate before stopping """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1104
1105
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialised as JSON, URL-encoded under the key
    '__youtubedl_smuggle' and appended to url after a '#'. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse.urlencode(payload)
1112
1113
def unsmuggle_url(smug_url, default=None):
    """ Split a URL into (url, data), undoing smuggle_url.

    URLs without a '#__youtubedl_smuggle' part are returned unchanged,
    paired with default. """
    if '#__youtubedl_smuggle' in smug_url:
        # Everything after the last '#' is the URL-encoded JSON payload
        url, _, sdata = smug_url.rpartition('#')
        payload = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(payload)
    return smug_url, default
1121
1122
def format_bytes(bytes):
    """ Format a byte count as a human-readable string such as '1.50KiB'.

    bytes may be None (giving 'N/A'), a number, or anything else float()
    accepts, e.g. a numeric string. The value is scaled by the largest
    power of 1024 that fits, limited to the known suffixes: magnitudes
    below one byte are shown in 'B', magnitudes of 1024 YiB and more in
    'YiB'. Negative values keep their sign. """
    if bytes is None:
        return 'N/A'
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    value = float(bytes)
    if value == 0.0:
        exponent = 0
    else:
        # abs() keeps math.log defined for negative sizes
        exponent = int(math.log(abs(value), 1024.0))
        # Clamp so that tiny values cannot index from the end of the list
        # and huge values cannot run past it
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = value / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1135
1136
1137 def parse_filesize(s):
1138     if s is None:
1139         return None
1140
1141     # The lower-case forms are of course incorrect and inofficial,
1142     # but we support those too
1143     _UNIT_TABLE = {
1144         'B': 1,
1145         'b': 1,
1146         'KiB': 1024,
1147         'KB': 1000,
1148         'kB': 1024,
1149         'Kb': 1000,
1150         'MiB': 1024 ** 2,
1151         'MB': 1000 ** 2,
1152         'mB': 1024 ** 2,
1153         'Mb': 1000 ** 2,
1154         'GiB': 1024 ** 3,
1155         'GB': 1000 ** 3,
1156         'gB': 1024 ** 3,
1157         'Gb': 1000 ** 3,
1158         'TiB': 1024 ** 4,
1159         'TB': 1000 ** 4,
1160         'tB': 1024 ** 4,
1161         'Tb': 1000 ** 4,
1162         'PiB': 1024 ** 5,
1163         'PB': 1000 ** 5,
1164         'pB': 1024 ** 5,
1165         'Pb': 1000 ** 5,
1166         'EiB': 1024 ** 6,
1167         'EB': 1000 ** 6,
1168         'eB': 1024 ** 6,
1169         'Eb': 1000 ** 6,
1170         'ZiB': 1024 ** 7,
1171         'ZB': 1000 ** 7,
1172         'zB': 1024 ** 7,
1173         'Zb': 1000 ** 7,
1174         'YiB': 1024 ** 8,
1175         'YB': 1000 ** 8,
1176         'yB': 1024 ** 8,
1177         'Yb': 1000 ** 8,
1178     }
1179
1180     units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1181     m = re.match(
1182         r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1183     if not m:
1184         return None
1185
1186     num_str = m.group('num').replace(',', '.')
1187     mult = _UNIT_TABLE[m.group('unit')]
1188     return int(float(num_str) * mult)
1189
1190
def month_by_name(name):
    """ Return the number (1-12) of a month given its (locale-independent)
        English name, or None for an unknown name """

    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1198
1199
def month_by_abbreviation(abbrev):
    """ Return the number (1-12) of a month given the first three letters of
        its (locale-independent) English name, or None if nothing matches """

    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
1208
1209
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in XML by '&amp;'

    An ampersand that already starts one of the predefined entities or a
    short numeric character reference is left alone."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1216
1217
def setproctitle(title):
    """ Set the process name to title via libc's prctl, where available.

    title must be a compat_str. Nothing happens (and nothing is raised)
    when libc.so.6 cannot be loaded or does not provide prctl. """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Passing the bytes themselves makes the buffer one byte larger than the
    # title, so it always ends in NUL. A buffer of exactly len(title_bytes)
    # has no terminator, and prctl would read past its end.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME in Linux's prctl interface
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1231
1232
def remove_start(s, start):
    """ Return s without its leading start, or s itself if it does not
        begin with start """
    return s[len(start):] if s.startswith(start) else s
1237
1238
def remove_end(s, end):
    """ Return s without its trailing end, or s itself if it does not
        finish with end (or if end is empty) """
    # An empty end must be excluded explicitly: every string ends with '',
    # and s[:-0] would be the empty string rather than s
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1243
1244
def url_basename(url):
    """ Return the last segment of the URL's path, ignoring leading and
        trailing slashes """
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1248
1249
class HEADRequest(compat_urllib_request.Request):
    """ Request whose get_method() always reports HEAD """

    def get_method(self):
        return 'HEAD'
1253
1254
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to an int, multiplied by invscale and floor-divided by
    scale; return default if v is None or the empty string.

    With get_attr set, the attribute of that name is read from v first
    (a missing attribute counts as None). """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1262
1263
def str_or_none(v, default=None):
    """ Return v converted with compat_str, or default if v is None """
    if v is None:
        return default
    return compat_str(v)
1266
1267
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are dropped before the conversion; None is passed through """
    if int_str is None:
        return None
    return int(re.sub(r'[,\.\+]', '', int_str))
1274
1275
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to a float, multiplied by invscale and divided by scale;
    return default if v is None """
    if v is None:
        return default
    return float(v) * invscale / scale
1278
1279
def parse_duration(s):
    """ Parse a textual duration into a number of seconds.

    Returns None if s is not a string or is not matched by the pattern
    below. Inputs consisting only of minutes or only of hours yield a
    float; otherwise the result is an int, or a float when fractional
    seconds are present. """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None

    # The two single-unit forms may carry a decimal point
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    # Seconds contributed by one unit of each integer group
    seconds_per_unit = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    res = 0
    for group_name, factor in seconds_per_unit:
        if m.group(group_name):
            res += int(m.group(group_name)) * factor
    # The fractional part ('.123') is added last, turning res into a float
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
1324
1325
def prepend_extension(filename, ext):
    """ Insert ext in front of the file's real extension
    ('a.mp4', 'temp' gives 'a.temp.mp4') """
    stem, real_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, real_ext)
1329
1330
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False if the binary could not be started. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Wait for the process and discard whatever it printed
        process.communicate()
    except OSError:
        return False
    return exe
1339
1340
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.

    The executable is run with args; its combined stdout and stderr are
    handed to detect_exe_version together with version_re and
    unrecognized. """
    try:
        process = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = process.communicate()
    except OSError:
        return False
    # The pipe delivers bytes; non-ASCII bytes are dropped
    if isinstance(out, bytes):
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1354
1355
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Extract a version from a program's output.

    Returns the first group of version_re (by default the token following
    the word 'version') or unrecognized if the pattern is not found. """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1365
1366
class PagedList(object):
    """ Base of the paged lists; relies on a getslice() method that is not
    defined here """

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1371
1372
class OnDemandPagedList(PagedList):
    """ Paged list that fetches pages lazily, without knowing in advance how
    many pages exist. """

    def __init__(self, pagefunc, pagesize):
        # pagefunc(pagenum) is called with a zero-based page number and must
        # return an iterable with the entries of that page; pagesize is the
        # number of entries on a full page
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with indices in [start, end) as a list,
        fetching only pages from the one containing start onwards.

        With end=None, pages are fetched until one comes back short. """
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # Global indices of the first entry of this page and of the next
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of start within this page (0 on all later pages)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry if end falls on this
            # page, otherwise None (take the page up to its end)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1414
1415
class InAdvancePagedList(PagedList):
    """ Paged list for sources whose number of pages is known up front. """

    def __init__(self, pagefunc, pagecount, pagesize):
        # pagefunc(pagenum) returns an iterable with the entries of the
        # zero-based page pagenum
        self._pagefunc, self._pagecount, self._pagesize = (
            pagefunc, pagecount, pagesize)

    def getslice(self, start=0, end=None):
        """ Return the entries with indices in [start, end) as a list;
        end=None reads up to the last page. """
        # Page holding the first wanted entry, and that entry's offset in it
        first_page, to_skip = divmod(start, self._pagesize)
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start

        collected = []
        for pagenum in range(first_page, stop_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                # Only the first fetched page starts in the middle
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested slice
                    collected.extend(page[:remaining])
                    break
                remaining -= len(page)
            collected.extend(page)
        return collected
1443
1444
def uppercase_escape(s):
    """ Replace every eight-digit '\\UXXXXXXXX' escape sequence in s by the
    character it denotes """
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        # The decoder returns (text, consumed length)
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
1451
1452
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, text is passed to quote() as UTF-8 bytes
    py2_text = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if py2_text:
        s = s.encode('utf-8')
    # The second argument lists the characters quote() must leave untouched
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1458
1459
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Path, params, query and fragment are escaped one by one with
    escape_rfc3986; the remaining components are kept as parsed."""
    parts = compat_urllib_parse_urlparse(url)
    escaped_path = escape_rfc3986(parts.path)
    escaped_params = escape_rfc3986(parts.params)
    escaped_query = escape_rfc3986(parts.query)
    escaped_fragment = escape_rfc3986(parts.fragment)
    rebuilt = parts._replace(
        path=escaped_path,
        params=escaped_params,
        query=escaped_query,
        fragment=escaped_fragment)
    return rebuilt.geturl()

1469
# Probe whether struct accepts a text format string and define
# struct_pack / struct_unpack accordingly
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument,
    # so text formats are encoded before being passed on
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
1486
1487
def read_batch_urls(batch_fd):
    """ Read the URLs of a batch file, one per line, and close the file.

    batch_fd is an iterable of text or byte lines with a close() method.
    Byte lines are decoded as UTF-8, a leading byte order mark is removed,
    surrounding whitespace is stripped, and empty lines as well as lines
    starting with '#', ';' or ']' are skipped. Returns a list of text
    strings. """
    # '\ufeff' is the byte order mark of properly decoded text (str.strip()
    # does not remove it); '\xef\xbb\xbf' is what its UTF-8 bytes look like
    # after having been decoded as Latin-1
    boms = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in boms:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1502
1503
def urlencode_postdata(*args, **kargs):
    """ URL-encode the arguments like urlencode() and return the result as
    ASCII bytes """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1506
1507
# Element.iter where the running Python provides it, a findall()-based
# stand-in otherwise
if hasattr(xml.etree.ElementTree.Element, 'iter'):
    etree_iter = xml.etree.ElementTree.Element.iter
else:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
1512
1513
def parse_xml(s):
    """ Parse the text s as XML and return the root element.

    Doctype declarations are ignored. On Python 2, element texts that come
    back as byte strings are decoded as UTF-8. """
    class _DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=_DoctypeIgnoringBuilder())
    # The parser is only passed along from Python 2.7 on
    extra_args = {}
    if sys.version_info >= (2, 7):
        extra_args['parser'] = parser
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **extra_args)

    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is None or isinstance(node.text, compat_str):
                continue
            node.text = node.text.decode('utf-8')
    return tree
1529
1530
# Age limit (in years) for each US rating label; parse_age_limit falls back
# to this table for values that are not plain numbers
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1538
1539
def parse_age_limit(s):
    """ Turn an age rating into a number of years.

    Accepts one or two digits with an optional trailing '+' (e.g. '18+') or
    a key of US_RATINGS; returns None for None and for unknown values. """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
1545
1546
def strip_jsonp(code):
    """ Remove a JSONP wrapper ('callback(...);', optionally followed by
    '//' comments) and return what is inside the parentheses; input that
    does not look like JSONP is returned unchanged """
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_wrapper, r'\1', code)
1550
1551
def js_to_json(code):
    """ Convert a JavaScript object literal into JSON text.

    Single-quoted strings and bare identifiers become double-quoted
    strings, and a comma directly in front of a closing ']' is removed. """
    def fix_kv(m):
        token = m.group(0)
        # JSON literals and double-quoted strings are already fine
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            # Drop the single quotes and adapt the escaping to double quotes
            replacements = {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: replacements[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    quoted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), quoted)
1575
1576
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps an id to its position in quality_ids and
    every unknown id to -1. """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1585
1586
# Default output template: "<title>-<id>.<ext>", written as a %-style format
# string with named fields (presumably filled from a video's info dict --
# confirm against the code that consumes it)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1588
1589
def limit_length(s, length):
    """ Shorten strings longer than length characters, ending them in
    ellipses; None and short enough strings are returned unchanged """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    # Leave room for the ellipses so that the result is length characters
    return s[:length - len(ELLIPSES)] + ELLIPSES
1598
1599
def version_tuple(v):
    """ Split a version string at '.' and '-' into a tuple of ints;
    raises ValueError if a component is not an integer """
    return tuple(map(int, re.split(r'[-.]', v)))
1602
1603
def is_outdated_version(version, limit, assume_new=True):
    """ Tell whether version is older than limit.

    If version is empty or one of the two cannot be parsed, the answer is
    governed by assume_new: not outdated when it is true. """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
1611
1612
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U

    That is the case when this module was loaded by a zipimporter or when
    sys has a 'frozen' attribute. """
    from zipimport import zipimporter

    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return loaded_from_zip or hasattr(sys, 'frozen')
1618
1619
def args_to_str(args):
    """ Get a short string representation for a subprocess command: the
    arguments quoted with shlex_quote and separated by spaces """
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
1623
1624
def mimetype2ext(mt):
    """ Derive a file extension from a MIME type: the part after the last
    '/', translated for the few subtypes that differ from their extension """
    subtype = mt.rpartition('/')[2]
    special_cases = {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
    }
    return special_cases.get(subtype, subtype)
1632
1633
def urlhandle_detect_ext(url_handle):
    """ Guess a file extension from the headers of an opened URL.

    The filename of an 'attachment' Content-Disposition header wins if it
    has an extension; otherwise the Content-Type is mapped with
    mimetype2ext. """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(h):
            return url_handle.headers[h]

    disposition = getheader('Content-Disposition')
    if disposition:
        found = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if found:
            ext = determine_ext(found.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
1650
1651
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked when no limit is set or when the content is
    # available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
1660
1661
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    The bytes are decoded according to their byte order mark (UTF-8 if
    there is none); the result is the match object of an optional-whitespace
    plus '<' prefix, or None. """

    # The four-byte UTF-32 marks come before the UTF-16 ones they start with
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    encoding, bom_length = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, bom_length = enc, len(bom)
            break
    text = first_bytes[bom_length:].decode(encoding, 'replace')

    return re.match(r'^\s*<', text)
1680
1681
def determine_protocol(info_dict):
    """ Work out the protocol of the download described by info_dict.

    An explicit 'protocol' entry wins; otherwise it is derived from the
    start of the 'url' entry, then from its extension, and finally from
    its scheme. """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for streaming_prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(streaming_prefix):
            return streaming_prefix

    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    return compat_urllib_parse_urlparse(url).scheme
1702
1703
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values

    Every column except the last is left-aligned and padded to the width
    of its longest cell plus one; the rows are joined by newlines. """
    rows = [header_row] + data
    widths = []
    for column in zip(*rows):
        widths.append(max(len(compat_str(cell)) for cell in column))
    # The last column is printed as is, hence no padding for it
    padded_cells = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded_cells) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
1710
1711
1712 def _match_one(filter_part, dct):
1713     COMPARISON_OPERATORS = {
1714         '<': operator.lt,
1715         '<=': operator.le,
1716         '>': operator.gt,
1717         '>=': operator.ge,
1718         '=': operator.eq,
1719         '!=': operator.ne,
1720     }
1721     operator_rex = re.compile(r'''(?x)\s*
1722         (?P<key>[a-z_]+)
1723         \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1724         (?:
1725             (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1726             (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1727         )
1728         \s*$
1729         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1730     m = operator_rex.search(filter_part)
1731     if m:
1732         op = COMPARISON_OPERATORS[m.group('op')]
1733         if m.group('strval') is not None:
1734             if m.group('op') not in ('=', '!='):
1735                 raise ValueError(
1736                     'Operator %s does not support string values!' % m.group('op'))
1737             comparison_value = m.group('strval')
1738         else:
1739             try:
1740                 comparison_value = int(m.group('intval'))
1741             except ValueError:
1742                 comparison_value = parse_filesize(m.group('intval'))
1743                 if comparison_value is None:
1744                     comparison_value = parse_filesize(m.group('intval') + 'B')
1745                 if comparison_value is None:
1746                     raise ValueError(
1747                         'Invalid integer value %r in filter part %r' % (
1748                             m.group('intval'), filter_part))
1749         actual_value = dct.get(m.group('key'))
1750         if actual_value is None:
1751             return m.group('none_inclusive')
1752         return op(actual_value, comparison_value)
1753
1754     UNARY_OPERATORS = {
1755         '': lambda v: v is not None,
1756         '!': lambda v: v is None,
1757     }
1758     operator_rex = re.compile(r'''(?x)\s*
1759         (?P<op>%s)\s*(?P<key>[a-z_]+)
1760         \s*$
1761         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1762     m = operator_rex.search(filter_part)
1763     if m:
1764         op = UNARY_OPERATORS[m.group('op')]
1765         actual_value = dct.get(m.group('key'))
1766         return op(actual_value)
1767
1768     raise ValueError('Invalid filter part %r' % filter_part)
1769
1770
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false

    filter_str consists of parts separated by '&'; all of them have to
    match, and evaluation stops at the first part that does not. """

    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
1776
1777
def match_filter_func(filter_str):
    """ Build a function that checks an info dict against filter_str.

    The returned function gives None if the dict passes the filter and a
    message explaining the skip otherwise. """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        # Name the video by title, falling back to its id
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
1786
1787
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ ProxyHandler that lets a single request choose its own proxy through
    the 'Ytdl-request-proxy' header. """

    def __init__(self, proxies=None):
        def _default_opener(scheme):
            # The defaults bind the scheme and the bound method at creation
            # time, so that each handler keeps its own scheme
            return (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                meth(r, proxy, type))

        # Set default handlers
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme, _default_opener(scheme))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A proxy named in the request header overrides the configured one;
        # the header is removed from the request afterwards
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
1807
1808
def url_sanitize_consecutive_slashes(url):
    """Collapse runs of slashes in the path of a URL

    Both
        http://hostname/foo//bar/filename.html
    and
        http://hostname//foo/bar/filename.html
    become
        http://hostname/foo/bar/filename.html
    """
    PATH = 2  # position of the path in the parsed URL
    components = list(compat_urlparse.urlparse(url))
    components[PATH] = re.sub(r'/{2,}', '/', components[PATH])
    return compat_urlparse.urlunparse(components)