_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import functools
  14 import gzip
  15 import itertools
  16 import io
  17 import json
  18 import locale
  19 import math
  20 import os
  21 import pipes
  22 import platform
  23 import re
  24 import ssl
  25 import socket
  26 import struct
  27 import subprocess
  28 import sys
  29 import tempfile
  30 import traceback
  31 import xml.etree.ElementTree
  32 import zlib
  33
  34 from .compat import (
  35     compat_chr,
  36     compat_getenv,
  37     compat_html_entities,
  38     compat_http_client,
  39     compat_parse_qs,
  40     compat_socket_create_connection,
  41     compat_str,
  42     compat_urllib_error,
  43     compat_urllib_parse,
  44     compat_urllib_parse_urlparse,
  45     compat_urllib_request,
  46     compat_urlparse,
  47     shlex_quote,
  48 )
  49
  50
# The type of compiled patterns is not clearly defined under a public name
# otherwise, so derive it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Default HTTP headers. YoutubeDLHandler.http_request adds each of them to
# an outgoing request unless the request already carries that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
  61
  62
  63 def preferredencoding():
  64     """Get preferred encoding.
  65
  66     Returns the best encoding scheme for the system, based on
  67     locale.getpreferredencoding() and some further tweaks.
  68     """
  69     try:
  70         pref = locale.getpreferredencoding()
  71         'TEST'.encode(pref)
  72     except:
  73         pref = 'UTF-8'
  74
  75     return pref
  76
  77
  78 def write_json_file(obj, fn):
  79     """ Encode obj as JSON and write it to fn, atomically if possible """
  80
  81     fn = encodeFilename(fn)
  82     if sys.version_info < (3, 0) and sys.platform != 'win32':
  83         encoding = get_filesystem_encoding()
  84         # os.path.basename returns a bytes object, but NamedTemporaryFile
  85         # will fail if the filename contains non ascii characters unless we
  86         # use a unicode object
  87         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  88         # the same for os.path.dirname
  89         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  90     else:
  91         path_basename = os.path.basename
  92         path_dirname = os.path.dirname
  93
  94     args = {
  95         'suffix': '.tmp',
  96         'prefix': path_basename(fn) + '.',
  97         'dir': path_dirname(fn),
  98         'delete': False,
  99     }
 100
 101     # In Python 2.x, json.dump expects a bytestream.
 102     # In Python 3.x, it writes to a character stream
 103     if sys.version_info < (3, 0):
 104         args['mode'] = 'wb'
 105     else:
 106         args.update({
 107             'mode': 'w',
 108             'encoding': 'utf-8',
 109         })
 110
 111     tf = tempfile.NamedTemporaryFile(**args)
 112
 113     try:
 114         with tf:
 115             json.dump(obj, tf)
 116         if sys.platform == 'win32':
 117             # Need to remove existing file on Windows, else os.rename raises
 118             # WindowsError or FileExistsError.
 119             try:
 120                 os.unlink(fn)
 121             except OSError:
 122                 pass
 123         os.rename(tf.name, fn)
 124     except:
 125         try:
 126             os.remove(tf.name)
 127         except OSError:
 128             pass
 129         raise
 130
 131
 132 if sys.version_info >= (2, 7):
 133     def find_xpath_attr(node, xpath, key, val):
 134         """ Find the xpath xpath[@key=val] """
 135         assert re.match(r'^[a-zA-Z-]+$', key)
 136         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 137         expr = xpath + "[@%s='%s']" % (key, val)
 138         return node.find(expr)
 139 else:
 140     def find_xpath_attr(node, xpath, key, val):
 141         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 142         # .//node does not match if a node is a direct child of . !
 143         if isinstance(xpath, unicode):
 144             xpath = xpath.encode('ascii')
 145
 146         for f in node.findall(xpath):
 147             if f.attrib.get(key) == val:
 148                 return f
 149         return None
 150
 151 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 152 # the namespace parameter
 153
 154
 155 def xpath_with_ns(path, ns_map):
 156     components = [c.split(':') for c in path.split('/')]
 157     replaced = []
 158     for c in components:
 159         if len(c) == 1:
 160             replaced.append(c[0])
 161         else:
 162             ns, tag = c
 163             replaced.append('{%s}%s' % (ns_map[ns], tag))
 164     return '/'.join(replaced)
 165
 166
 167 def xpath_text(node, xpath, name=None, fatal=False):
 168     if sys.version_info < (2, 7):  # Crazy 2.6
 169         xpath = xpath.encode('ascii')
 170
 171     n = node.find(xpath)
 172     if n is None or n.text is None:
 173         if fatal:
 174             name = xpath if name is None else name
 175             raise ExtractorError('Could not find XML element %s' % name)
 176         else:
 177             return None
 178     return n.text
 179
 180
 181 def get_element_by_id(id, html):
 182     """Return the content of the tag with the specified ID in the passed HTML document"""
 183     return get_element_by_attribute("id", id, html)
 184
 185
 186 def get_element_by_attribute(attribute, value, html):
 187     """Return the content of the tag with the specified attribute in the passed HTML document"""
 188
 189     m = re.search(r'''(?xs)
 190         <([a-zA-Z0-9:._-]+)
 191          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 192          \s+%s=['"]?%s['"]?
 193          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 194         \s*>
 195         (?P<content>.*?)
 196         </\1>
 197     ''' % (re.escape(attribute), re.escape(value)), html)
 198
 199     if not m:
 200         return None
 201     res = m.group('content')
 202
 203     if res.startswith('"') or res.startswith("'"):
 204         res = res[1:-1]
 205
 206     return unescapeHTML(res)
 207
 208
 209 def clean_html(html):
 210     """Clean an HTML snippet into a readable string"""
 211
 212     if html is None:  # Convenience for sanitizing descriptions etc.
 213         return html
 214
 215     # Newline vs <br />
 216     html = html.replace('\n', ' ')
 217     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 218     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 219     # Strip html tags
 220     html = re.sub('<.*?>', '', html)
 221     # Replace html entities
 222     html = unescapeHTML(html)
 223     return html.strip()
 224
 225
 226 def sanitize_open(filename, open_mode):
 227     """Try to open the given filename, and slightly tweak it if this fails.
 228
 229     Attempts to open the given filename. If this fails, it tries to change
 230     the filename slightly, step by step, until it's either able to open it
 231     or it fails and raises a final exception, like the standard open()
 232     function.
 233
 234     It returns the tuple (stream, definitive_file_name).
 235     """
 236     try:
 237         if filename == '-':
 238             if sys.platform == 'win32':
 239                 import msvcrt
 240                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 241             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 242         stream = open(encodeFilename(filename), open_mode)
 243         return (stream, filename)
 244     except (IOError, OSError) as err:
 245         if err.errno in (errno.EACCES,):
 246             raise
 247
 248         # In case of error, try to remove win32 forbidden chars
 249         alt_filename = os.path.join(
 250             re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
 251             for path_part in os.path.split(filename)
 252         )
 253         if alt_filename == filename:
 254             raise
 255         else:
 256             # An exception here should be caught in the caller
 257             stream = open(encodeFilename(filename), open_mode)
 258             return (stream, alt_filename)
 259
 260
 261 def timeconvert(timestr):
 262     """Convert RFC 2822 defined time string into system timestamp"""
 263     timestamp = None
 264     timetuple = email.utils.parsedate_tz(timestr)
 265     if timetuple is not None:
 266         timestamp = email.utils.mktime_tz(timetuple)
 267     return timestamp
 268
 269
 270 def sanitize_filename(s, restricted=False, is_id=False):
 271     """Sanitizes a string so it could be used as part of a filename.
 272     If restricted is set, use a stricter subset of allowed characters.
 273     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 274     """
 275     def replace_insane(char):
 276         if char == '?' or ord(char) < 32 or ord(char) == 127:
 277             return ''
 278         elif char == '"':
 279             return '' if restricted else '\''
 280         elif char == ':':
 281             return '_-' if restricted else ' -'
 282         elif char in '\\/|*<>':
 283             return '_'
 284         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 285             return '_'
 286         if restricted and ord(char) > 127:
 287             return '_'
 288         return char
 289
 290     # Handle timestamps
 291     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 292     result = ''.join(map(replace_insane, s))
 293     if not is_id:
 294         while '__' in result:
 295             result = result.replace('__', '_')
 296         result = result.strip('_')
 297         # Common case of "Foreign band name - English song title"
 298         if restricted and result.startswith('-_'):
 299             result = result[2:]
 300         if not result:
 301             result = '_'
 302     return result
 303
 304
 305 def orderedSet(iterable):
 306     """ Remove all duplicates from the input iterable """
 307     res = []
 308     for el in iterable:
 309         if el not in res:
 310             res.append(el)
 311     return res
 312
 313
 314 def _htmlentity_transform(entity):
 315     """Transforms an HTML entity to a character."""
 316     # Known non-numeric HTML entity
 317     if entity in compat_html_entities.name2codepoint:
 318         return compat_chr(compat_html_entities.name2codepoint[entity])
 319
 320     mobj = re.match(r'#(x?[0-9]+)', entity)
 321     if mobj is not None:
 322         numstr = mobj.group(1)
 323         if numstr.startswith('x'):
 324             base = 16
 325             numstr = '0%s' % numstr
 326         else:
 327             base = 10
 328         return compat_chr(int(numstr, base))
 329
 330     # Unknown entity in name, return its literal representation
 331     return ('&%s;' % entity)
 332
 333
 334 def unescapeHTML(s):
 335     if s is None:
 336         return None
 337     assert type(s) == compat_str
 338
 339     return re.sub(
 340         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 341
 342
 343 def encodeFilename(s, for_subprocess=False):
 344     """
 345     @param s The name of the file
 346     """
 347
 348     assert type(s) == compat_str
 349
 350     # Python 3 has a Unicode API
 351     if sys.version_info >= (3, 0):
 352         return s
 353
 354     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 355         # Pass '' directly to use Unicode APIs on Windows 2000 and up
 356         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 357         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 358         if not for_subprocess:
 359             return s
 360         else:
 361             # For subprocess calls, encode with locale encoding
 362             # Refer to http://stackoverflow.com/a/9951851/35070
 363             encoding = preferredencoding()
 364     else:
 365         encoding = sys.getfilesystemencoding()
 366     if encoding is None:
 367         encoding = 'utf-8'
 368     return s.encode(encoding, 'ignore')
 369
 370
 371 def encodeArgument(s):
 372     if not isinstance(s, compat_str):
 373         # Legacy code that uses byte strings
 374         # Uncomment the following line after fixing all post processors
 375         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 376         s = s.decode('ascii')
 377     return encodeFilename(s, True)
 378
 379
 380 def decodeOption(optval):
 381     if optval is None:
 382         return optval
 383     if isinstance(optval, bytes):
 384         optval = optval.decode(preferredencoding())
 385
 386     assert isinstance(optval, compat_str)
 387     return optval
 388
 389
 390 def formatSeconds(secs):
 391     if secs > 3600:
 392         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 393     elif secs > 60:
 394         return '%d:%02d' % (secs // 60, secs % 60)
 395     else:
 396         return '%d' % secs
 397
 398
def make_HTTPS_handler(params, **kwargs):
    """ Build a YoutubeDLHTTPSHandler suited to the running Python version

    params is the options dictionary; its 'nocheckcertificate' entry
    (default False) turns certificate verification off in the branches that
    build an SSL context. Extra keyword arguments are forwarded to
    YoutubeDLHTTPSHandler.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname is switched off before verify_mode is relaxed
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            # Fall through to the version checks below.
            pass

    if sys.version_info < (3, 2):
        import httplib

        # Connection class that negotiates TLSv1 first and retries with
        # SSLv23 when that fails with an SSLError.
        # NOTE: opts_no_check_certificate is not consulted in this branch.
        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    # Set up the tunnel on the plain socket before wrapping it
                    self.sock = sock
                    self._tunnel()
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        return YoutubeDLHTTPSHandler(params, https_conn_class=HTTPSConnectionV3, **kwargs)
    else:  # Python < 3.4
        # Build the context by hand, using the default CA locations
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 438
 439
 440 class ExtractorError(Exception):
 441     """Error during info extraction."""
 442
 443     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 444         """ tb, if given, is the original traceback (so that it can be printed out).
 445         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 446         """
 447
 448         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 449             expected = True
 450         if video_id is not None:
 451             msg = video_id + ': ' + msg
 452         if cause:
 453             msg += ' (caused by %r)' % cause
 454         if not expected:
 455             if ytdl_is_updateable():
 456                 update_cmd = 'type  youtube-dl -U  to update'
 457             else:
 458                 update_cmd = 'see  https://yt-dl.org/update  on how to update'
 459             msg += '; please report this issue on https://yt-dl.org/bug .'
 460             msg += ' Make sure you are using the latest version; %s.' % update_cmd
 461             msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 462         super(ExtractorError, self).__init__(msg)
 463
 464         self.traceback = tb
 465         self.exc_info = sys.exc_info()  # preserve original exception
 466         self.cause = cause
 467         self.video_id = video_id
 468
 469     def format_traceback(self):
 470         if self.traceback is None:
 471             return None
 472         return ''.join(traceback.format_tb(self.traceback))
 473
 474
 475 class UnsupportedError(ExtractorError):
 476     def __init__(self, url):
 477         super(UnsupportedError, self).__init__(
 478             'Unsupported URL: %s' % url, expected=True)
 479         self.url = url
 480
 481
 482 class RegexNotFoundError(ExtractorError):
 483     """Error when a regex didn't match"""
 484     pass
 485
 486
 487 class DownloadError(Exception):
 488     """Download Error exception.
 489
 490     This exception may be thrown by FileDownloader objects if they are not
 491     configured to continue on errors. They will contain the appropriate
 492     error message.
 493     """
 494
 495     def __init__(self, msg, exc_info=None):
 496         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 497         super(DownloadError, self).__init__(msg)
 498         self.exc_info = exc_info
 499
 500
 501 class SameFileError(Exception):
 502     """Same File exception.
 503
 504     This exception will be thrown by FileDownloader objects if they detect
 505     multiple files would have to be downloaded to the same file on disk.
 506     """
 507     pass
 508
 509
 510 class PostProcessingError(Exception):
 511     """Post Processing exception.
 512
 513     This exception may be raised by PostProcessor's .run() method to
 514     indicate an error in the postprocessing task.
 515     """
 516
 517     def __init__(self, msg):
 518         self.msg = msg
 519
 520
 521 class MaxDownloadsReached(Exception):
 522     """ --max-downloads limit has been reached. """
 523     pass
 524
 525
 526 class UnavailableVideoError(Exception):
 527     """Unavailable Format exception.
 528
 529     This exception will be thrown when a video is requested
 530     in a format that is not available for that video.
 531     """
 532     pass
 533
 534
 535 class ContentTooShortError(Exception):
 536     """Content Too Short exception.
 537
 538     This exception may be raised by FileDownloader objects when a file they
 539     download is too small for what the server announced first, indicating
 540     the connection was probably interrupted.
 541     """
 542     # Both in bytes
 543     downloaded = None
 544     expected = None
 545
 546     def __init__(self, downloaded, expected):
 547         self.downloaded = downloaded
 548         self.expected = expected
 549
 550
 551 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 552     hc = http_class(*args, **kwargs)
 553     source_address = ydl_handler._params.get('source_address')
 554     if source_address is not None:
 555         sa = (source_address, 0)
 556         if hasattr(hc, 'source_address'):  # Python 2.7+
 557             hc.source_address = sa
 558         else:  # Python 2.6
 559             def _hc_connect(self, *args, **kwargs):
 560                 sock = compat_socket_create_connection(
 561                     (self.host, self.port), self.timeout, sa)
 562                 if is_https:
 563                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file)
 564                 else:
 565                     self.sock = sock
 566             hc.connect = functools.partial(_hc_connect, hc)
 567
 568     return hc
 569
 570
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """ params is the options dictionary; remaining arguments go to HTTPHandler. """
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """ Open req over a connection created by _create_http_connection. """
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """ Decompress deflate data, with or without a zlib header. """
        try:
            # A negative wbits value means a raw stream without zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """ Build an addinfourl response, also where its constructor takes no code. """
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """ Add the standard headers to req and resolve the Youtubedl-* pseudo headers. """
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        # Youtubedl-no-compression: drop Accept-encoding, then the marker itself
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Youtubedl-user-agent: its value replaces the User-agent header
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            # Cut the fragment off the request's private attributes by hand
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """ Return resp with a gzip or deflate Content-encoding already decoded. """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off; if none of
                # these attempts works, the first error is raised.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS requests and responses get exactly the same treatment
    https_request = http_request
    https_response = http_response
 667
 668
 669 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 670     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 671         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 672         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 673         self._params = params
 674
 675     def https_open(self, req):
 676         return self.do_open(functools.partial(
 677             _create_http_connection, self, self._https_conn_class, True),
 678             req)
 679
 680
 681 def parse_iso8601(date_str, delimiter='T'):
 682     """ Return a UNIX timestamp from the given date """
 683
 684     if date_str is None:
 685         return None
 686
 687     m = re.search(
 688         r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 689         date_str)
 690     if not m:
 691         timezone = datetime.timedelta()
 692     else:
 693         date_str = date_str[:-len(m.group(0))]
 694         if not m.group('sign'):
 695             timezone = datetime.timedelta()
 696         else:
 697             sign = 1 if m.group('sign') == '+' else -1
 698             timezone = datetime.timedelta(
 699                 hours=sign * int(m.group('hours')),
 700                 minutes=sign * int(m.group('minutes')))
 701     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 702     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 703     return calendar.timegm(dt.timetuple())
 704
 705
 706 def unified_strdate(date_str, day_first=True):
 707     """Return a string with the date in the format YYYYMMDD"""
 708
 709     if date_str is None:
 710         return None
 711     upload_date = None
 712     # Replace commas
 713     date_str = date_str.replace(',', ' ')
 714     # %z (UTC offset) is only supported in python>=3.2
 715     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 716     # Remove AM/PM + timezone
 717     date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
 718
 719     format_expressions = [
 720         '%d %B %Y',
 721         '%d %b %Y',
 722         '%B %d %Y',
 723         '%b %d %Y',
 724         '%b %dst %Y %I:%M%p',
 725         '%b %dnd %Y %I:%M%p',
 726         '%b %dth %Y %I:%M%p',
 727         '%Y %m %d',
 728         '%Y-%m-%d',
 729         '%Y/%m/%d',
 730         '%Y/%m/%d %H:%M:%S',
 731         '%Y-%m-%d %H:%M:%S',
 732         '%Y-%m-%d %H:%M:%S.%f',
 733         '%d.%m.%Y %H:%M',
 734         '%d.%m.%Y %H.%M',
 735         '%Y-%m-%dT%H:%M:%SZ',
 736         '%Y-%m-%dT%H:%M:%S.%fZ',
 737         '%Y-%m-%dT%H:%M:%S.%f0Z',
 738         '%Y-%m-%dT%H:%M:%S',
 739         '%Y-%m-%dT%H:%M:%S.%f',
 740         '%Y-%m-%dT%H:%M',
 741     ]
 742     if day_first:
 743         format_expressions.extend([
 744             '%d.%m.%Y',
 745             '%d/%m/%Y',
 746             '%d/%m/%y',
 747             '%d/%m/%Y %H:%M:%S',
 748         ])
 749     else:
 750         format_expressions.extend([
 751             '%m.%d.%Y',
 752             '%m/%d/%Y',
 753             '%m/%d/%y',
 754             '%m/%d/%Y %H:%M:%S',
 755         ])
 756     for expression in format_expressions:
 757         try:
 758             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 759         except ValueError:
 760             pass
 761     if upload_date is None:
 762         timetuple = email.utils.parsedate_tz(date_str)
 763         if timetuple:
 764             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 765     return upload_date
 766
 767
 768 def determine_ext(url, default_ext='unknown_video'):
 769     if url is None:
 770         return default_ext
 771     guess = url.partition('?')[0].rpartition('.')[2]
 772     if re.match(r'^[A-Za-z0-9]+$', guess):
 773         return guess
 774     else:
 775         return default_ext
 776
 777
 778 def subtitles_filename(filename, sub_lang, sub_format):
 779     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 780
 781
 782 def date_from_str(date_str):
 783     """
 784     Return a datetime object from a string in the format YYYYMMDD or
 785     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 786     today = datetime.date.today()
 787     if date_str in ('now', 'today'):
 788         return today
 789     if date_str == 'yesterday':
 790         return today - datetime.timedelta(days=1)
 791     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 792     if match is not None:
 793         sign = match.group('sign')
 794         time = int(match.group('time'))
 795         if sign == '-':
 796             time = -time
 797         unit = match.group('unit')
 798         # A bad aproximation?
 799         if unit == 'month':
 800             unit = 'day'
 801             time *= 30
 802         elif unit == 'year':
 803             unit = 'day'
 804             time *= 365
 805         unit += 's'
 806         delta = datetime.timedelta(**{unit: time})
 807         return today + delta
 808     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 809
 810
 811 def hyphenate_date(date_str):
 812     """
 813     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 814     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 815     if match is not None:
 816         return '-'.join(match.groups())
 817     else:
 818         return date_str
 819
 820
 821 class DateRange(object):
 822     """Represents a time interval between two dates"""
 823
 824     def __init__(self, start=None, end=None):
 825         """start and end must be strings in the format accepted by date"""
 826         if start is not None:
 827             self.start = date_from_str(start)
 828         else:
 829             self.start = datetime.datetime.min.date()
 830         if end is not None:
 831             self.end = date_from_str(end)
 832         else:
 833             self.end = datetime.datetime.max.date()
 834         if self.start > self.end:
 835             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 836
 837     @classmethod
 838     def day(cls, day):
 839         """Returns a range that only contains the given day"""
 840         return cls(day, day)
 841
 842     def __contains__(self, date):
 843         """Check if the date is in the range"""
 844         if not isinstance(date, datetime.date):
 845             date = date_from_str(date)
 846         return self.start <= date <= self.end
 847
 848     def __str__(self):
 849         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 850
 851
 852 def platform_name():
 853     """ Returns the platform name as a compat_str """
 854     res = platform.platform()
 855     if isinstance(res, bytes):
 856         res = res.decode(preferredencoding())
 857
 858     assert isinstance(res, compat_str)
 859     return res
 860
 861
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The "special method" is the Win32 WriteConsoleW call, used only when
    `out` is backed by file descriptor 1 or 2 and that descriptor's standard
    handle refers to a real console.  Raises OSError if WriteConsoleW
    reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 and -12 are STD_OUTPUT_HANDLE and STD_ERROR_HANDLE in the Win32 API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Bind the kernel32 functions with explicit prototypes
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters WriteConsoleW actually wrote
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        """Return True unless handle is a character device for which
        GetConsoleMode succeeds."""
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        # The FILE_TYPE_REMOTE bit is masked out before comparing the type
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        """Index of the first character above U+FFFF, or len(s) if none."""
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call and stop in front of the
        # next non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; a length of 2
        # is requested for it instead
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Drop only what was really written; the rest is retried
            s = s[written.value:]
    return True
 932
 933
 934 def write_string(s, out=None, encoding=None):
 935     if out is None:
 936         out = sys.stderr
 937     assert type(s) == compat_str
 938
 939     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 940         if _windows_write_string(s, out):
 941             return
 942
 943     if ('b' in getattr(out, 'mode', '') or
 944             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 945         byt = s.encode(encoding or preferredencoding(), 'ignore')
 946         out.write(byt)
 947     elif hasattr(out, 'buffer'):
 948         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 949         byt = s.encode(enc, 'ignore')
 950         out.buffer.write(byt)
 951     else:
 952         out.write(s)
 953     out.flush()
 954
 955
 956 def bytes_to_intlist(bs):
 957     if not bs:
 958         return []
 959     if isinstance(bs[0], int):  # Python 3
 960         return list(bs)
 961     else:
 962         return [ord(c) for c in bs]
 963
 964
 965 def intlist_to_bytes(xs):
 966     if not xs:
 967         return b''
 968     return struct_pack('%dB' % len(xs), *xs)
 969
 970
# Cross-platform file locking
# Both branches define the same pair of helpers:
#   _lock_file(f, exclusive) - lock the open file f (shared unless exclusive)
#   _unlock_file(f)          - release that lock
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # ctypes mirror of the structure passed as the last argument of
        # LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count to lock; with the zero offset set
    # below this is meant to cover the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f through LockFileEx; raises OSError if the call fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can pass the very
        # same structure back
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK in the Win32 API; 0x0 asks
        # for a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock f with flock(): LOCK_EX when exclusive, LOCK_SH otherwise."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock() lock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
1034
1035
class locked_file(object):
    """Open a file and keep it locked for the duration of a ``with`` block.

    The file is opened in __init__; the lock is taken in __enter__ (shared
    for mode 'r', exclusive for 'a' and 'w') and released in __exit__, which
    also closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows _lock_file raises OSError, which is a different
            # class from IOError on Python 2; catch both so the file handle
            # never leaks when locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even if unlocking raised
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        """Forward to the underlying file's write()."""
        return self.f.write(*args)

    def read(self, *args):
        """Forward to the underlying file's read()."""
        return self.f.read(*args)
1065
1066
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1070
1071
def shell_quote(args):
    """Return args as a single shell-escaped, space-separated command string.

    Byte-string arguments are decoded with the filesystem encoding first.
    Quoting goes through shlex_quote (the helper args_to_str also uses)
    rather than the undocumented pipes.quote.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
1081
1082
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but the first element failing pred is
        yielded as well before iteration stops """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1090
1091
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-serialized and appended to url as a URL-encoded fragment. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
1098
1099
def unsmuggle_url(smug_url, default=None):
    """ Split a URL built by smuggle_url into (url, data).

    A URL carrying no smuggled payload is returned as (smug_url, default). """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        payload = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(payload)
    return smug_url, default
1107
1108
def format_bytes(bytes):
    """Format a byte count as a human-readable string with a binary suffix.

    bytes may be a number or a numeric string (text or byte string); None
    gives 'N/A'.  The value is scaled by powers of 1024 and rendered with two
    decimals, e.g. 1536 -> '1.50KiB'.

    The exponent is kept inside the suffix table, so values of 1024 ** 9 and
    above are expressed in YiB instead of raising IndexError, and values
    below one byte stay in 'B' instead of wrapping around to 'YiB'.  Negative
    values are scaled by their magnitude.
    """
    if bytes is None:
        return 'N/A'
    SUFFIXES = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']

    size = float(bytes)
    # Repeated division by 1024 is exact in binary floating point, which
    # avoids the rounding pitfalls of computing a logarithm
    magnitude = abs(size)
    exponent = 0
    while magnitude >= 1024.0 and exponent < len(SUFFIXES) - 1:
        magnitude /= 1024.0
        exponent += 1
    converted = size / float(1024 ** exponent)
    return '%.2f%s' % (converted, SUFFIXES[exponent])
1121
1122
def parse_filesize(s):
    """Parse a size such as '1.5 KiB' or '5,2MB' into an integer byte count.

    Returns None when s is None or does not start with a number followed by
    a known unit.  A comma is accepted as the decimal separator.
    """
    if s is None:
        return None

    # For every prefix: 'XiB' and the lower-case-prefix form 'xB' are binary
    # (powers of 1024), 'XB' and 'Xb' are decimal (powers of 1000).
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {
        'B': 1,
        'b': 1,
    }
    for power, prefix in enumerate('KMGTPEZY', 1):
        unit_table[prefix + 'iB'] = 1024 ** power
        unit_table[prefix + 'B'] = 1000 ** power
        unit_table[prefix.lower() + 'B'] = 1024 ** power
        unit_table[prefix + 'b'] = 1000 ** power

    units_re = '|'.join(re.escape(unit) for unit in unit_table)
    mobj = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if mobj is None:
        return None

    number = float(mobj.group('num').replace(',', '.'))
    return int(number * unit_table[mobj.group('unit')])
1175
1176
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be found.

    The COLUMNS environment variable is used when it holds an integer;
    otherwise the second field of the output of ``stty size`` is tried.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            # Malformed COLUMNS value: fall back to asking the terminal
            pass

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, no tty, unexpected output), but
        # KeyboardInterrupt and SystemExit are no longer swallowed
        pass
    return None
1191
1192
def month_by_name(name):
    """ Return the number (1-12) of a month given its full English name,
    independently of the locale; None for any other value """

    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
1203
1204
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in XML by '&amp;'

    Ampersands that already start one of the five predefined entities or a
    numeric character reference are left alone."""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
1211
1212
def setproctitle(title):
    """Set the process name through libc's prctl(), where available.

    title must be a compat_str.  Does nothing when libc.so.6 cannot be
    loaded or when it exposes no prctl function.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one item longer than
    # the title, so the C string handed to prctl is NUL-terminated.  A buffer
    # of exactly len(title_bytes) items has no terminator.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # Option 15 is PR_SET_NAME in Linux's prctl
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1226
1227
def remove_start(s, start):
    """Return s without the prefix start; s unchanged if it has no such prefix."""
    return s[len(start):] if s.startswith(start) else s
1232
1233
def remove_end(s, end):
    """Return s without the suffix end; s unchanged if it has no such suffix.

    An empty end leaves s untouched: without the guard, s[:-0] would
    evaluate to the empty string.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1238
1239
def url_basename(url):
    """Return the last segment of the URL's path, ignoring surrounding slashes."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1243
1244
class HEADRequest(compat_urllib_request.Request):
    """Request whose get_method() always reports the HEAD method."""
    def get_method(self):
        return "HEAD"
1248
1249
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, returning default for None or an empty string.

    With get_attr, the named attribute of v is converted instead (a missing
    attribute counts as None).  The result is multiplied by invscale and
    floor-divided by scale.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1257
1258
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1261
1262
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are dropped before the conversion; None is passed through """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1269
1270
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float scaled by invscale / scale; default if v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1273
1274
def parse_duration(s):
    """Parse a textual duration into a number of seconds.

    Accepted forms include 'H:MM:SS', '3h 2m 1s', '90 min', '1.5 hours' and
    plain seconds with an optional fraction, each optionally prefixed by
    'T' or 'PT'.  Returns None for non-string input or unrecognised text.
    The result is a float when the input had a fractional part or was given
    purely in minutes or hours, and an int otherwise.
    """
    if not isinstance(s, basestring if sys.version_info < (3, 0) else compat_str):
        return None

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if m is None:
        return None

    # A lone amount of minutes or hours may carry a decimal fraction
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    duration = 0
    for group, unit_seconds in (('secs', 1), ('mins', 60), ('hours', 60 * 60)):
        value = m.group(group)
        if value:
            duration += int(value) * unit_seconds
    fraction = m.group('ms')
    if fraction:
        duration += float(fraction)
    return duration
1309
1310
def prepend_extension(filename, ext):
    """Insert ext in front of the filename's real extension ('a.mp4' -> 'a.ext.mp4')."""
    stem, original_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, original_ext)
1314
1315
def check_executable(exe, args=[]):
    """ Checks if the given binary can be launched, and returns its name
    (False when starting it raises OSError).
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1324
1325
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.

    The program is run with args; its combined stdout/stderr is handed to
    detect_exe_version together with version_re and unrecognized. """
    try:
        process = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1339
1340
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    version_re must contain one capture group; by default the token after
    the word 'version' is captured.  Returns unrecognized when the pattern
    is not found.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    mobj = re.search(pattern, output)
    return mobj.group(1) if mobj else unrecognized
1350
1351
class PagedList(object):
    """Common base of the paged lists; __len__ relies on a getslice()
    method that this class itself does not define."""
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1356
1357
class OnDemandPagedList(PagedList):
    """Paged list whose number of pages is not known in advance.

    Pages are requested one by one from pagefunc(pagenum); fetching stops at
    the first page that is not full or once the requested end is reached.
    """

    def __init__(self, pagefunc, pagesize):
        # pagefunc(pagenum) returns an iterable with the entries of one page;
        # pagesize is the number of entries on a full page
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list (to the last page if
        end is None)."""
        res = []
        # Start directly at the page containing `start`
        for pagenum in itertools.count(start // self._pagesize):
            # Global indices covered by this page: [firstid, nextfirstid)
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of `start` inside this page (0 on every later page)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry if `end` falls on this
            # page, else None (take the page up to its end)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1399
1400
class InAdvancePagedList(PagedList):
    """Paged list whose total number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        # pagefunc(pagenum) returns an iterable with the entries of one page
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list (to the last page if
        end is None).  Pages at or beyond pagecount are never requested."""
        res = []
        start_page = start // self._pagesize
        # Clamp to the known page count so an `end` past the last entry does
        # not trigger requests for pages that do not exist
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        # Entries to drop from the front of the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Number of entries still wanted (None: no upper bound)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the request
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1428
1429
def uppercase_escape(s):
    """Replace every literal '\\UXXXXXXXX' escape in s by the character it denotes."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(mobj):
        return decode_escape(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1436
1437
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986

    On Python 2 unicode input is encoded as UTF-8 before quoting."""
    running_python2 = sys.version_info < (3, 0)
    if running_python2 and isinstance(s, unicode):
        s = s.encode('utf-8')
    # Characters that are passed through without percent-encoding
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1443
1444
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Only path, params, query and fragment are escaped; the other parts of
    the URL are kept as parsed."""
    parts = compat_urllib_parse_urlparse(url)
    escaped_path = escape_rfc3986(parts.path)
    escaped_params = escape_rfc3986(parts.params)
    escaped_query = escape_rfc3986(parts.query)
    escaped_fragment = escape_rfc3986(parts.fragment)
    escaped_parts = parts._replace(
        path=escaped_path,
        params=escaped_params,
        query=escaped_query,
        fragment=escaped_fragment)
    return escaped_parts.geturl()
1454
# struct_pack / struct_unpack: wrappers around struct.pack / struct.unpack
# that also accept text format strings on interpreters where struct does not
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def _ascii_spec(spec):
        return spec.encode('ascii') if isinstance(spec, compat_str) else spec

    def struct_pack(spec, *args):
        return struct.pack(_ascii_spec(spec), *args)

    def struct_unpack(spec, *args):
        return struct.unpack(_ascii_spec(spec), *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
1471
1472
def read_batch_urls(batch_fd):
    """Read URLs from the batch file object batch_fd, one per line.

    Byte lines are decoded as UTF-8.  A leading byte order mark and
    surrounding whitespace are removed; empty lines and lines starting with
    '#', ';' or ']' are skipped.  batch_fd is closed before returning.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # The BOM shows up as '\ufeff' when the input was decoded as UTF-8
        # and as '\xef\xbb\xbf' when its three bytes were decoded one by one
        # (e.g. as Latin-1); str.strip() removes neither
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1487
1488
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments as urlencode does and return ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1491
1492
# etree_iter(node): iterate over the elements of an ElementTree node
etree_iter = getattr(xml.etree.ElementTree.Element, 'iter', None)
if etree_iter is None:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1497
1498
def parse_xml(s):
    """Parse the text string s as XML and return the root element.

    Doctype declarations are ignored.  On Python 2, byte-string element
    texts are decoded to unicode afterwards.
    """
    class _DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=_DoctypeIgnoringBuilder())
    # The parser is only passed on Python 2.7 and newer
    extra_args = {}
    if sys.version_info >= (2, 7):
        extra_args['parser'] = parser
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **extra_args)

    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return tree
1514
1515
# Age limits for US rating labels; parse_age_limit falls back to this table
# for strings that are not plain numbers
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1523
1524
def parse_age_limit(s):
    """Parse an age limit given as '18', '18+' or a US rating label.

    Returns the age as an int, or None for None and unrecognised input.
    """
    if s is None:
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    return US_RATINGS.get(s, None)
1530
1531
def strip_jsonp(code):
    """Strip a JSONP wrapper 'callback(...);' and return the wrapped payload.

    Trailing '//' comments are dropped as well; text that is not wrapped in
    a callback is returned unchanged.
    """
    jsonp_wrapper = re.compile(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$')
    return jsonp_wrapper.sub(r'\1', code)
1535
1536
def js_to_json(code):
    """Convert a JavaScript object literal to JSON text.

    Single-quoted strings and bare identifiers are turned into double-quoted
    strings (true, false and null are kept), and a trailing comma in front
    of a closing ']' is removed.
    """
    # How escapes inside a single-quoted string map into a double-quoted one
    REQUOTED = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        # JSON literals and double-quoted strings are already valid
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"", lambda esc: REQUOTED[esc.group(0)], token[1:-1])
        return '"%s"' % token

    converted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), converted)
1560
1561
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps a quality id to its position in quality_ids,
    or to -1 when the id is not listed. """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
1570
1571
# Default output template, judging by the name: a %-style format string
# with the mapping keys title, id and ext
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1573
1574
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Returns s unchanged when it has at most length characters (None is
    passed through).  Longer strings are cut so that the result, including
    the trailing '...', is exactly length characters long.  When length
    leaves no room for the ellipsis (length < 3) the string is cut to
    length characters instead, so the result never exceeds the limit. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length < len(ELLIPSES):
        # A negative slice bound here would make the result longer than s
        return s[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
1583
1584
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
1587
1588
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    A missing or unparsable version counts as outdated only when assume_new
    is false.
    """
    unknown_verdict = not assume_new
    if not version:
        return unknown_verdict
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return unknown_verdict
1596
1597
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U

    That is the case when this module was loaded by a zipimporter or when
    sys has a 'frozen' attribute. """
    from zipimport import zipimporter

    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return loaded_from_zip or hasattr(sys, 'frozen')
1603
1604
def args_to_str(args):
    """Get a short string representation for a subprocess command."""
    # Every argument is shell-quoted, then all are joined with spaces
    return ' '.join(map(shlex_quote, args))
1608
1609
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of an opened URL handle.

    The filename of an attachment Content-Disposition header is preferred;
    otherwise the subtype of the Content-Type header is returned with any
    parameters such as '; charset=...' removed.  Returns None when neither
    header yields an answer.
    """
    headers = getattr(url_handle, 'headers', None)
    if headers is not None:
        # .get() returns None for an absent header, whereas indexing raises
        # KeyError on some header objects
        getheader = headers.get
    else:
        # Handles without a headers attribute
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    content_type = getheader('Content-Type')
    if not content_type:
        return None
    # Keep only 'type/subtype', dropping parameters
    mime_type = content_type.split(';')[0].strip()
    _, slash, subtype = mime_type.partition('/')
    return subtype if slash and subtype else None
1626
1627
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    Nothing is blocked when no age limit is set or when the content has no
    limit of its own; otherwise content is blocked if age_limit is below
    content_limit. """
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
1636
1637
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    The bytes are decoded according to a leading byte order mark (UTF-8 if
    there is none); the result is a match object if the text starts, after
    optional whitespace, with '<', and None otherwise. """

    # UTF-32-LE has to be tested before UTF-16-LE, whose BOM is a prefix of it
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    text = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            text = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if text is None:
        text = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', text)