_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import functools
  14 import gzip
  15 import itertools
  16 import io
  17 import json
  18 import locale
  19 import math
  20 import os
  21 import pipes
  22 import platform
  23 import re
  24 import ssl
  25 import socket
  26 import struct
  27 import subprocess
  28 import sys
  29 import tempfile
  30 import traceback
  31 import xml.etree.ElementTree
  32 import zlib
  33
  34 from .compat import (
  35     compat_chr,
  36     compat_getenv,
  37     compat_html_entities,
  38     compat_http_client,
  39     compat_parse_qs,
  40     compat_socket_create_connection,
  41     compat_str,
  42     compat_urllib_error,
  43     compat_urllib_parse,
  44     compat_urllib_parse_urlparse,
  45     compat_urllib_request,
  46     compat_urlparse,
  47     shlex_quote,
  48 )
  49
  50
# This is not clearly defined otherwise: take the type of compiled regular
# expression objects from an actual compiled (empty) pattern.
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers. YoutubeDLHandler.http_request adds each entry
# to an outgoing request unless the request already carries that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
  61
  62
  63 def preferredencoding():
  64     """Get preferred encoding.
  65
  66     Returns the best encoding scheme for the system, based on
  67     locale.getpreferredencoding() and some further tweaks.
  68     """
  69     try:
  70         pref = locale.getpreferredencoding()
  71         'TEST'.encode(pref)
  72     except:
  73         pref = 'UTF-8'
  74
  75     return pref
  76
  77
  78 def write_json_file(obj, fn):
  79     """ Encode obj as JSON and write it to fn, atomically if possible """
  80
  81     fn = encodeFilename(fn)
  82     if sys.version_info < (3, 0) and sys.platform != 'win32':
  83         encoding = get_filesystem_encoding()
  84         # os.path.basename returns a bytes object, but NamedTemporaryFile
  85         # will fail if the filename contains non ascii characters unless we
  86         # use a unicode object
  87         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  88         # the same for os.path.dirname
  89         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  90     else:
  91         path_basename = os.path.basename
  92         path_dirname = os.path.dirname
  93
  94     args = {
  95         'suffix': '.tmp',
  96         'prefix': path_basename(fn) + '.',
  97         'dir': path_dirname(fn),
  98         'delete': False,
  99     }
 100
 101     # In Python 2.x, json.dump expects a bytestream.
 102     # In Python 3.x, it writes to a character stream
 103     if sys.version_info < (3, 0):
 104         args['mode'] = 'wb'
 105     else:
 106         args.update({
 107             'mode': 'w',
 108             'encoding': 'utf-8',
 109         })
 110
 111     tf = tempfile.NamedTemporaryFile(**args)
 112
 113     try:
 114         with tf:
 115             json.dump(obj, tf)
 116         if sys.platform == 'win32':
 117             # Need to remove existing file on Windows, else os.rename raises
 118             # WindowsError or FileExistsError.
 119             try:
 120                 os.unlink(fn)
 121             except OSError:
 122                 pass
 123         os.rename(tf.name, fn)
 124     except:
 125         try:
 126             os.remove(tf.name)
 127         except OSError:
 128             pass
 129         raise
 130
 131
 132 if sys.version_info >= (2, 7):
 133     def find_xpath_attr(node, xpath, key, val):
 134         """ Find the xpath xpath[@key=val] """
 135         assert re.match(r'^[a-zA-Z-]+$', key)
 136         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 137         expr = xpath + "[@%s='%s']" % (key, val)
 138         return node.find(expr)
 139 else:
 140     def find_xpath_attr(node, xpath, key, val):
 141         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 142         # .//node does not match if a node is a direct child of . !
 143         if isinstance(xpath, unicode):
 144             xpath = xpath.encode('ascii')
 145
 146         for f in node.findall(xpath):
 147             if f.attrib.get(key) == val:
 148                 return f
 149         return None
 150
 151 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 152 # the namespace parameter
 153
 154
 155 def xpath_with_ns(path, ns_map):
 156     components = [c.split(':') for c in path.split('/')]
 157     replaced = []
 158     for c in components:
 159         if len(c) == 1:
 160             replaced.append(c[0])
 161         else:
 162             ns, tag = c
 163             replaced.append('{%s}%s' % (ns_map[ns], tag))
 164     return '/'.join(replaced)
 165
 166
 167 def xpath_text(node, xpath, name=None, fatal=False):
 168     if sys.version_info < (2, 7):  # Crazy 2.6
 169         xpath = xpath.encode('ascii')
 170
 171     n = node.find(xpath)
 172     if n is None or n.text is None:
 173         if fatal:
 174             name = xpath if name is None else name
 175             raise ExtractorError('Could not find XML element %s' % name)
 176         else:
 177             return None
 178     return n.text
 179
 180
 181 def get_element_by_id(id, html):
 182     """Return the content of the tag with the specified ID in the passed HTML document"""
 183     return get_element_by_attribute("id", id, html)
 184
 185
 186 def get_element_by_attribute(attribute, value, html):
 187     """Return the content of the tag with the specified attribute in the passed HTML document"""
 188
 189     m = re.search(r'''(?xs)
 190         <([a-zA-Z0-9:._-]+)
 191          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 192          \s+%s=['"]?%s['"]?
 193          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 194         \s*>
 195         (?P<content>.*?)
 196         </\1>
 197     ''' % (re.escape(attribute), re.escape(value)), html)
 198
 199     if not m:
 200         return None
 201     res = m.group('content')
 202
 203     if res.startswith('"') or res.startswith("'"):
 204         res = res[1:-1]
 205
 206     return unescapeHTML(res)
 207
 208
 209 def clean_html(html):
 210     """Clean an HTML snippet into a readable string"""
 211
 212     if html is None:  # Convenience for sanitizing descriptions etc.
 213         return html
 214
 215     # Newline vs <br />
 216     html = html.replace('\n', ' ')
 217     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 218     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 219     # Strip html tags
 220     html = re.sub('<.*?>', '', html)
 221     # Replace html entities
 222     html = unescapeHTML(html)
 223     return html.strip()
 224
 225
 226 def sanitize_open(filename, open_mode):
 227     """Try to open the given filename, and slightly tweak it if this fails.
 228
 229     Attempts to open the given filename. If this fails, it tries to change
 230     the filename slightly, step by step, until it's either able to open it
 231     or it fails and raises a final exception, like the standard open()
 232     function.
 233
 234     It returns the tuple (stream, definitive_file_name).
 235     """
 236     try:
 237         if filename == '-':
 238             if sys.platform == 'win32':
 239                 import msvcrt
 240                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 241             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 242         stream = open(encodeFilename(filename), open_mode)
 243         return (stream, filename)
 244     except (IOError, OSError) as err:
 245         if err.errno in (errno.EACCES,):
 246             raise
 247
 248         # In case of error, try to remove win32 forbidden chars
 249         alt_filename = os.path.join(
 250             re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
 251             for path_part in os.path.split(filename)
 252         )
 253         if alt_filename == filename:
 254             raise
 255         else:
 256             # An exception here should be caught in the caller
 257             stream = open(encodeFilename(filename), open_mode)
 258             return (stream, alt_filename)
 259
 260
 261 def timeconvert(timestr):
 262     """Convert RFC 2822 defined time string into system timestamp"""
 263     timestamp = None
 264     timetuple = email.utils.parsedate_tz(timestr)
 265     if timetuple is not None:
 266         timestamp = email.utils.mktime_tz(timetuple)
 267     return timestamp
 268
 269
 270 def sanitize_filename(s, restricted=False, is_id=False):
 271     """Sanitizes a string so it could be used as part of a filename.
 272     If restricted is set, use a stricter subset of allowed characters.
 273     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 274     """
 275     def replace_insane(char):
 276         if char == '?' or ord(char) < 32 or ord(char) == 127:
 277             return ''
 278         elif char == '"':
 279             return '' if restricted else '\''
 280         elif char == ':':
 281             return '_-' if restricted else ' -'
 282         elif char in '\\/|*<>':
 283             return '_'
 284         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 285             return '_'
 286         if restricted and ord(char) > 127:
 287             return '_'
 288         return char
 289
 290     # Handle timestamps
 291     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 292     result = ''.join(map(replace_insane, s))
 293     if not is_id:
 294         while '__' in result:
 295             result = result.replace('__', '_')
 296         result = result.strip('_')
 297         # Common case of "Foreign band name - English song title"
 298         if restricted and result.startswith('-_'):
 299             result = result[2:]
 300         if not result:
 301             result = '_'
 302     return result
 303
 304
 305 def orderedSet(iterable):
 306     """ Remove all duplicates from the input iterable """
 307     res = []
 308     for el in iterable:
 309         if el not in res:
 310             res.append(el)
 311     return res
 312
 313
 314 def _htmlentity_transform(entity):
 315     """Transforms an HTML entity to a character."""
 316     # Known non-numeric HTML entity
 317     if entity in compat_html_entities.name2codepoint:
 318         return compat_chr(compat_html_entities.name2codepoint[entity])
 319
 320     mobj = re.match(r'#(x?[0-9]+)', entity)
 321     if mobj is not None:
 322         numstr = mobj.group(1)
 323         if numstr.startswith('x'):
 324             base = 16
 325             numstr = '0%s' % numstr
 326         else:
 327             base = 10
 328         return compat_chr(int(numstr, base))
 329
 330     # Unknown entity in name, return its literal representation
 331     return ('&%s;' % entity)
 332
 333
 334 def unescapeHTML(s):
 335     if s is None:
 336         return None
 337     assert type(s) == compat_str
 338
 339     return re.sub(
 340         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 341
 342
 343 def encodeFilename(s, for_subprocess=False):
 344     """
 345     @param s The name of the file
 346     """
 347
 348     assert type(s) == compat_str
 349
 350     # Python 3 has a Unicode API
 351     if sys.version_info >= (3, 0):
 352         return s
 353
 354     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 355         # Pass '' directly to use Unicode APIs on Windows 2000 and up
 356         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 357         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 358         if not for_subprocess:
 359             return s
 360         else:
 361             # For subprocess calls, encode with locale encoding
 362             # Refer to http://stackoverflow.com/a/9951851/35070
 363             encoding = preferredencoding()
 364     else:
 365         encoding = sys.getfilesystemencoding()
 366     if encoding is None:
 367         encoding = 'utf-8'
 368     return s.encode(encoding, 'ignore')
 369
 370
 371 def encodeArgument(s):
 372     if not isinstance(s, compat_str):
 373         # Legacy code that uses byte strings
 374         # Uncomment the following line after fixing all post processors
 375         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 376         s = s.decode('ascii')
 377     return encodeFilename(s, True)
 378
 379
 380 def decodeOption(optval):
 381     if optval is None:
 382         return optval
 383     if isinstance(optval, bytes):
 384         optval = optval.decode(preferredencoding())
 385
 386     assert isinstance(optval, compat_str)
 387     return optval
 388
 389
 390 def formatSeconds(secs):
 391     if secs > 3600:
 392         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 393     elif secs > 60:
 394         return '%d:%02d' % (secs // 60, secs % 60)
 395     else:
 396         return '%d' % secs
 397
 398
 399 def make_HTTPS_handler(params, **kwargs):
 400     opts_no_check_certificate = params.get('nocheckcertificate', False)
 401     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 402         context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
 403         if opts_no_check_certificate:
 404             context.verify_mode = ssl.CERT_NONE
 405         try:
 406             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 407         except TypeError:
 408             # Python 2.7.8
 409             # (create_default_context present but HTTPSHandler has no context=)
 410             pass
 411
 412     if sys.version_info < (3, 2):
 413         import httplib
 414
 415         class HTTPSConnectionV3(httplib.HTTPSConnection):
 416             def __init__(self, *args, **kwargs):
 417                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 418
 419             def connect(self):
 420                 sock = socket.create_connection((self.host, self.port), self.timeout)
 421                 if getattr(self, '_tunnel_host', False):
 422                     self.sock = sock
 423                     self._tunnel()
 424                 try:
 425                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
 426                 except ssl.SSLError:
 427                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 428
 429         return YoutubeDLHTTPSHandler(params, https_conn_class=HTTPSConnectionV3, **kwargs)
 430     else:  # Python < 3.4
 431         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 432         context.verify_mode = (ssl.CERT_NONE
 433                                if opts_no_check_certificate
 434                                else ssl.CERT_REQUIRED)
 435         context.set_default_verify_paths()
 436         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 437
 438
 439 class ExtractorError(Exception):
 440     """Error during info extraction."""
 441
 442     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 443         """ tb, if given, is the original traceback (so that it can be printed out).
 444         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 445         """
 446
 447         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 448             expected = True
 449         if video_id is not None:
 450             msg = video_id + ': ' + msg
 451         if cause:
 452             msg += ' (caused by %r)' % cause
 453         if not expected:
 454             if ytdl_is_updateable():
 455                 update_cmd = 'type  youtube-dl -U  to update'
 456             else:
 457                 update_cmd = 'see  https://yt-dl.org/update  on how to update'
 458             msg += '; please report this issue on https://yt-dl.org/bug .'
 459             msg += ' Make sure you are using the latest version; %s.' % update_cmd
 460             msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 461         super(ExtractorError, self).__init__(msg)
 462
 463         self.traceback = tb
 464         self.exc_info = sys.exc_info()  # preserve original exception
 465         self.cause = cause
 466         self.video_id = video_id
 467
 468     def format_traceback(self):
 469         if self.traceback is None:
 470             return None
 471         return ''.join(traceback.format_tb(self.traceback))
 472
 473
 474 class UnsupportedError(ExtractorError):
 475     def __init__(self, url):
 476         super(UnsupportedError, self).__init__(
 477             'Unsupported URL: %s' % url, expected=True)
 478         self.url = url
 479
 480
 481 class RegexNotFoundError(ExtractorError):
 482     """Error when a regex didn't match"""
 483     pass
 484
 485
 486 class DownloadError(Exception):
 487     """Download Error exception.
 488
 489     This exception may be thrown by FileDownloader objects if they are not
 490     configured to continue on errors. They will contain the appropriate
 491     error message.
 492     """
 493
 494     def __init__(self, msg, exc_info=None):
 495         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 496         super(DownloadError, self).__init__(msg)
 497         self.exc_info = exc_info
 498
 499
 500 class SameFileError(Exception):
 501     """Same File exception.
 502
 503     This exception will be thrown by FileDownloader objects if they detect
 504     multiple files would have to be downloaded to the same file on disk.
 505     """
 506     pass
 507
 508
 509 class PostProcessingError(Exception):
 510     """Post Processing exception.
 511
 512     This exception may be raised by PostProcessor's .run() method to
 513     indicate an error in the postprocessing task.
 514     """
 515
 516     def __init__(self, msg):
 517         self.msg = msg
 518
 519
 520 class MaxDownloadsReached(Exception):
 521     """ --max-downloads limit has been reached. """
 522     pass
 523
 524
 525 class UnavailableVideoError(Exception):
 526     """Unavailable Format exception.
 527
 528     This exception will be thrown when a video is requested
 529     in a format that is not available for that video.
 530     """
 531     pass
 532
 533
 534 class ContentTooShortError(Exception):
 535     """Content Too Short exception.
 536
 537     This exception may be raised by FileDownloader objects when a file they
 538     download is too small for what the server announced first, indicating
 539     the connection was probably interrupted.
 540     """
 541     # Both in bytes
 542     downloaded = None
 543     expected = None
 544
 545     def __init__(self, downloaded, expected):
 546         self.downloaded = downloaded
 547         self.expected = expected
 548
 549
 550 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 551     hc = http_class(*args, **kwargs)
 552     source_address = ydl_handler._params.get('source_address')
 553     if source_address is not None:
 554         sa = (source_address, 0)
 555         if hasattr(hc, 'source_address'):  # Python 2.7+
 556             hc.source_address = sa
 557         else:  # Python 2.6
 558             def _hc_connect(self, *args, **kwargs):
 559                 sock = compat_socket_create_connection(
 560                     (self.host, self.port), self.timeout, sa)
 561                 if is_https:
 562                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file)
 563                 else:
 564                     self.sock = sock
 565             hc.connect = functools.partial(_hc_connect, hc)
 566
 567     return hc
 568
 569
 570 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 571     """Handler for HTTP requests and responses.
 572
 573     This class, when installed with an OpenerDirector, automatically adds
 574     the standard headers to every HTTP request and handles gzipped and
 575     deflated responses from web servers. If compression is to be avoided in
 576     a particular request, the original request in the program code only has
 577     to include the HTTP header "Youtubedl-No-Compression", which will be
 578     removed before making the real request.
 579
 580     Part of this code was copied from:
 581
 582     http://techknack.net/python-urllib2-handlers/
 583
 584     Andrew Rowls, the author of that code, agreed to release it to the
 585     public domain.
 586     """
 587
 588     def __init__(self, params, *args, **kwargs):
 589         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
 590         self._params = params
 591
 592     def http_open(self, req):
 593         return self.do_open(functools.partial(
 594             _create_http_connection, self, compat_http_client.HTTPConnection, False),
 595             req)
 596
 597     @staticmethod
 598     def deflate(data):
 599         try:
 600             return zlib.decompress(data, -zlib.MAX_WBITS)
 601         except zlib.error:
 602             return zlib.decompress(data)
 603
 604     @staticmethod
 605     def addinfourl_wrapper(stream, headers, url, code):
 606         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 607             return compat_urllib_request.addinfourl(stream, headers, url, code)
 608         ret = compat_urllib_request.addinfourl(stream, headers, url)
 609         ret.code = code
 610         return ret
 611
 612     def http_request(self, req):
 613         for h, v in std_headers.items():
 614             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
 615             # The dict keys are capitalized because of this bug by urllib
 616             if h.capitalize() not in req.headers:
 617                 req.add_header(h, v)
 618         if 'Youtubedl-no-compression' in req.headers:
 619             if 'Accept-encoding' in req.headers:
 620                 del req.headers['Accept-encoding']
 621             del req.headers['Youtubedl-no-compression']
 622         if 'Youtubedl-user-agent' in req.headers:
 623             if 'User-agent' in req.headers:
 624                 del req.headers['User-agent']
 625             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 626             del req.headers['Youtubedl-user-agent']
 627
 628         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 629             # Python 2.6 is brain-dead when it comes to fragments
 630             req._Request__original = req._Request__original.partition('#')[0]
 631             req._Request__r_type = req._Request__r_type.partition('#')[0]
 632
 633         return req
 634
 635     def http_response(self, req, resp):
 636         old_resp = resp
 637         # gzip
 638         if resp.headers.get('Content-encoding', '') == 'gzip':
 639             content = resp.read()
 640             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 641             try:
 642                 uncompressed = io.BytesIO(gz.read())
 643             except IOError as original_ioerror:
 644                 # There may be junk add the end of the file
 645                 # See http://stackoverflow.com/q/4928560/35070 for details
 646                 for i in range(1, 1024):
 647                     try:
 648                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 649                         uncompressed = io.BytesIO(gz.read())
 650                     except IOError:
 651                         continue
 652                     break
 653                 else:
 654                     raise original_ioerror
 655             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 656             resp.msg = old_resp.msg
 657         # deflate
 658         if resp.headers.get('Content-encoding', '') == 'deflate':
 659             gz = io.BytesIO(self.deflate(resp.read()))
 660             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 661             resp.msg = old_resp.msg
 662         return resp
 663
 664     https_request = http_request
 665     https_response = http_response
 666
 667
 668 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 669     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 670         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 671         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 672         self._params = params
 673
 674     def https_open(self, req):
 675         return self.do_open(functools.partial(
 676             _create_http_connection, self, self._https_conn_class, True),
 677             req)
 678
 679
 680 def parse_iso8601(date_str, delimiter='T'):
 681     """ Return a UNIX timestamp from the given date """
 682
 683     if date_str is None:
 684         return None
 685
 686     m = re.search(
 687         r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 688         date_str)
 689     if not m:
 690         timezone = datetime.timedelta()
 691     else:
 692         date_str = date_str[:-len(m.group(0))]
 693         if not m.group('sign'):
 694             timezone = datetime.timedelta()
 695         else:
 696             sign = 1 if m.group('sign') == '+' else -1
 697             timezone = datetime.timedelta(
 698                 hours=sign * int(m.group('hours')),
 699                 minutes=sign * int(m.group('minutes')))
 700     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 701     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 702     return calendar.timegm(dt.timetuple())
 703
 704
 705 def unified_strdate(date_str, day_first=True):
 706     """Return a string with the date in the format YYYYMMDD"""
 707
 708     if date_str is None:
 709         return None
 710     upload_date = None
 711     # Replace commas
 712     date_str = date_str.replace(',', ' ')
 713     # %z (UTC offset) is only supported in python>=3.2
 714     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 715     # Remove AM/PM + timezone
 716     date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
 717
 718     format_expressions = [
 719         '%d %B %Y',
 720         '%d %b %Y',
 721         '%B %d %Y',
 722         '%b %d %Y',
 723         '%b %dst %Y %I:%M%p',
 724         '%b %dnd %Y %I:%M%p',
 725         '%b %dth %Y %I:%M%p',
 726         '%Y-%m-%d',
 727         '%Y/%m/%d',
 728         '%Y/%m/%d %H:%M:%S',
 729         '%Y-%m-%d %H:%M:%S',
 730         '%Y-%m-%d %H:%M:%S.%f',
 731         '%d.%m.%Y %H:%M',
 732         '%d.%m.%Y %H.%M',
 733         '%Y-%m-%dT%H:%M:%SZ',
 734         '%Y-%m-%dT%H:%M:%S.%fZ',
 735         '%Y-%m-%dT%H:%M:%S.%f0Z',
 736         '%Y-%m-%dT%H:%M:%S',
 737         '%Y-%m-%dT%H:%M:%S.%f',
 738         '%Y-%m-%dT%H:%M',
 739     ]
 740     if day_first:
 741         format_expressions.extend([
 742             '%d.%m.%Y',
 743             '%d/%m/%Y',
 744             '%d/%m/%y',
 745             '%d/%m/%Y %H:%M:%S',
 746         ])
 747     else:
 748         format_expressions.extend([
 749             '%m.%d.%Y',
 750             '%m/%d/%Y',
 751             '%m/%d/%y',
 752             '%m/%d/%Y %H:%M:%S',
 753         ])
 754     for expression in format_expressions:
 755         try:
 756             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 757         except ValueError:
 758             pass
 759     if upload_date is None:
 760         timetuple = email.utils.parsedate_tz(date_str)
 761         if timetuple:
 762             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 763     return upload_date
 764
 765
 766 def determine_ext(url, default_ext='unknown_video'):
 767     if url is None:
 768         return default_ext
 769     guess = url.partition('?')[0].rpartition('.')[2]
 770     if re.match(r'^[A-Za-z0-9]+$', guess):
 771         return guess
 772     else:
 773         return default_ext
 774
 775
 776 def subtitles_filename(filename, sub_lang, sub_format):
 777     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 778
 779
 780 def date_from_str(date_str):
 781     """
 782     Return a datetime object from a string in the format YYYYMMDD or
 783     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 784     today = datetime.date.today()
 785     if date_str in ('now', 'today'):
 786         return today
 787     if date_str == 'yesterday':
 788         return today - datetime.timedelta(days=1)
 789     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 790     if match is not None:
 791         sign = match.group('sign')
 792         time = int(match.group('time'))
 793         if sign == '-':
 794             time = -time
 795         unit = match.group('unit')
 796         # A bad aproximation?
 797         if unit == 'month':
 798             unit = 'day'
 799             time *= 30
 800         elif unit == 'year':
 801             unit = 'day'
 802             time *= 365
 803         unit += 's'
 804         delta = datetime.timedelta(**{unit: time})
 805         return today + delta
 806     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 807
 808
 809 def hyphenate_date(date_str):
 810     """
 811     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 812     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 813     if match is not None:
 814         return '-'.join(match.groups())
 815     else:
 816         return date_str
 817
 818
 819 class DateRange(object):
 820     """Represents a time interval between two dates"""
 821
 822     def __init__(self, start=None, end=None):
 823         """start and end must be strings in the format accepted by date"""
 824         if start is not None:
 825             self.start = date_from_str(start)
 826         else:
 827             self.start = datetime.datetime.min.date()
 828         if end is not None:
 829             self.end = date_from_str(end)
 830         else:
 831             self.end = datetime.datetime.max.date()
 832         if self.start > self.end:
 833             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 834
 835     @classmethod
 836     def day(cls, day):
 837         """Returns a range that only contains the given day"""
 838         return cls(day, day)
 839
 840     def __contains__(self, date):
 841         """Check if the date is in the range"""
 842         if not isinstance(date, datetime.date):
 843             date = date_from_str(date)
 844         return self.start <= date <= self.end
 845
 846     def __str__(self):
 847         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 848
 849
 850 def platform_name():
 851     """ Returns the platform name as a compat_str """
 852     res = platform.platform()
 853     if isinstance(res, bytes):
 854         res = res.decode(preferredencoding())
 855
 856     assert isinstance(res, compat_str)
 857     return res
 858
 859
def _windows_write_string(s, out):
    """ Write s through the Windows console API (WriteConsoleW).

    Returns True if the string was written using special methods,
    False if it has yet to be written out (out has no fileno(), is not
    file descriptor 1 or 2, or its handle is not a console).
    Raises OSError when WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 / -12 are STD_OUTPUT_HANDLE / STD_ERROR_HANDLE in the Win32 API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Build typed prototypes for the kernel32 functions used below
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters WriteConsoleW reports as written
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        """ True unless handle is a character device for which GetConsoleMode succeeds """
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        """ Index of the first character above U+FFFF in s, or len(s) if there is none """
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters, stopping each chunk just
    # before a non-BMP character; such a character is then written on its own
    # with a length of 2
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part of the chunk was actually written
            s = s[written.value:]
    return True
 930
 931
 932 def write_string(s, out=None, encoding=None):
 933     if out is None:
 934         out = sys.stderr
 935     assert type(s) == compat_str
 936
 937     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 938         if _windows_write_string(s, out):
 939             return
 940
 941     if ('b' in getattr(out, 'mode', '') or
 942             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 943         byt = s.encode(encoding or preferredencoding(), 'ignore')
 944         out.write(byt)
 945     elif hasattr(out, 'buffer'):
 946         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 947         byt = s.encode(enc, 'ignore')
 948         out.buffer.write(byt)
 949     else:
 950         out.write(s)
 951     out.flush()
 952
 953
 954 def bytes_to_intlist(bs):
 955     if not bs:
 956         return []
 957     if isinstance(bs[0], int):  # Python 3
 958         return list(bs)
 959     else:
 960         return [ord(c) for c in bs]
 961
 962
 963 def intlist_to_bytes(xs):
 964     if not xs:
 965         return b''
 966     return struct_pack('%dB' % len(xs), *xs)
 967
 968
# Cross-platform file locking: both branches define _lock_file(f, exclusive)
# and _unlock_file(f); win32 uses LockFileEx/UnlockFileEx, everything else
# uses fcntl.flock
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        """ ctypes layout of the structure passed as the last argument of LockFileEx / UnlockFileEx """
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low / high halves of the byte count to lock, passed as
    # nNumberOfBytesToLockLow / nNumberOfBytesToLockHigh
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """ Lock f from offset 0 via LockFileEx; raises OSError on failure """
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object because _unlock_file passes the same pointer back
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 requests an exclusive lock (LOCKFILE_EXCLUSIVE_LOCK in
        # the Win32 API), 0x0 a shared one
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """ Release the lock taken by _lock_file on f; raises OSError on failure """
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """ Take an exclusive (LOCK_EX) or shared (LOCK_SH) flock on f """
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """ Drop the flock held on f """
        fcntl.flock(f, fcntl.LOCK_UN)
1032
1033
class locked_file(object):
    """ Context manager wrapping a file that is locked while the block runs.

    The file is opened in __init__ with io.open; entering the context takes
    a shared lock for mode 'r' and an exclusive lock for 'a' / 'w'; leaving
    it releases the lock and closes the file. Iteration, read() and write()
    are forwarded to the underlying file object. """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is unrelated to
            # IOError on Python 2; catch both so the handle never leaks when
            # locking fails. The original exception is re-raised untouched.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Close the file even if unlocking raises
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1063
1064
def get_filesystem_encoding():
    """ Return sys.getfilesystemencoding(), falling back to 'utf-8' when it is None """
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1068
1069
def shell_quote(args):
    """ Quote every argument for the shell and join them with spaces """
    fs_encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(fs_encoding) if isinstance(arg, bytes) else arg

    return ' '.join([pipes.quote(as_text(arg)) for arg in args])
1079
1080
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the first element for
        which pred(e) is false before stopping """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1088
1089
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded and appended to url as a URL-encoded fragment
    under the key '__youtubedl_smuggle'. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
1096
1097
def unsmuggle_url(smug_url, default=None):
    """ Split a URL produced by smuggle_url into (url, data).

    URLs without smuggled data are returned unchanged together with default. """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        encoded = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(encoded)
    return smug_url, default
1105
1106
def format_bytes(bytes):
    """ Format a byte count as a human-readable string with a binary suffix.

    bytes may be a number, a numeric string, or None (rendered as 'N/A').
    The value is scaled by the largest power of 1024 that does not exceed
    its magnitude, limited to the range B .. YiB, and printed with two
    decimals, e.g. 1536 -> '1.50KiB'. """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        # Use the magnitude so negative values do not make math.log raise
        exponent = int(math.log(abs(bytes), 1024.0))
        # Clamp: values below 1/1024 gave a negative exponent (indexing the
        # suffix list from the end) and values beyond YiB ran off the list
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1119
1120
def parse_filesize(s):
    """ Parse a size such as '5 MiB' or '1,5kB' into a number of bytes.

    Returns None when s is None or does not start with a number followed
    by a known unit. A comma in the number is read as a decimal point. """
    if s is None:
        return None

    # For every prefix letter, 'XiB' and the lower-case-prefix form 'xB' are
    # binary (1024-based), 'XB' and 'Xb' are decimal (1000-based). The
    # lower-case forms are incorrect and unofficial, but supported too.
    unit_table = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        unit_table[prefix + 'iB'] = binary
        unit_table[prefix + 'B'] = decimal
        unit_table[prefix.lower() + 'B'] = binary
        unit_table[prefix + 'b'] = decimal

    units_re = '|'.join(re.escape(unit) for unit in unit_table)
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if not found:
        return None

    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1173
1174
def get_term_width():
    """ Return the terminal width in columns, or None if it cannot be found.

    The COLUMNS environment variable wins when set and non-empty;
    otherwise the second field printed by `stty size` is used. """
    env_columns = compat_getenv('COLUMNS', None)
    if env_columns:
        return int(env_columns)

    try:
        stty = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout_data, _ = stty.communicate()
        return int(stdout_data.split()[1])
    except:
        # Any failure (no stty, unexpected output, ...) means "unknown"
        return None
1189
1190
def month_by_name(name):
    """ Return the number (1-12) of a month given its English name,
    independently of the locale; None if the name is not recognised """

    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    if name in ENGLISH_NAMES:
        return ENGLISH_NAMES.index(name) + 1
    return None
1201
1202
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not already start one of the predefined
    XML entities or a numeric character reference by '&amp;'"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1209
1210
def setproctitle(title):
    """ Set the process name through libc's prctl, if available.

    title must be a compat_str. Nothing happens when libc.so.6 cannot be
    loaded or the loaded library has no prctl. """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialise the buffer from the bytes so that ctypes appends a NUL
    # terminator. Sizing it to exactly len(title_bytes) left the string
    # unterminated, so prctl could read past the end of the buffer.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # Option 15 is PR_SET_NAME in the Linux prctl(2) interface
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1224
1225
def remove_start(s, start):
    """ Return s without the prefix start; s is unchanged if it has no such prefix """
    if not s.startswith(start):
        return s
    prefix_len = len(start)
    return s[prefix_len:]
1230
1231
def remove_end(s, end):
    """ Return s without the suffix end; s is unchanged if it has no such suffix """
    # An empty suffix must be skipped explicitly: every string "ends with"
    # '', and s[:-0] is s[:0], which would wipe the whole string
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1236
1237
def url_basename(url):
    """ Return the last segment of the URL's path, ignoring leading and trailing slashes """
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1241
1242
class HEADRequest(compat_urllib_request.Request):
    """ Request subclass whose get_method() always reports HEAD """

    def get_method(self):
        return 'HEAD'
1246
1247
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to an int, returning default for None or ''.

    With get_attr, the named attribute of v is converted instead (a missing
    attribute counts as None). The integer is multiplied by invscale and
    then floor-divided by scale. """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1255
1256
def str_or_none(v, default=None):
    """ Convert v to compat_str, returning default when v is None """
    if v is None:
        return default
    return compat_str(v)
1259
1260
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are stripped from the string before conversion """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1267
1268
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to a float multiplied by invscale and divided by scale;
    return default when v is None """
    if v is None:
        return default
    return float(v) * invscale / scale
1271
1272
def parse_duration(s):
    """ Parse a duration string into a number of seconds.

    Accepts forms such as '3 min', '2.5 hours', '1:02:03', '1h 2m 3s' or
    '90.5 s', optionally prefixed with 'T'. Returns None for non-string
    input or text that does not match. """
    string_type = basestring if sys.version_info < (3, 0) else compat_str
    if not isinstance(s, string_type):
        return None

    m = re.match(
        r'''(?ix)T?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if not m:
        return None

    # A lone "N minutes" / "N hours" value may be fractional
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    total = 0
    for group, seconds_per_unit in (('secs', 1), ('mins', 60), ('hours', 60 * 60)):
        if m.group(group):
            total += int(m.group(group)) * seconds_per_unit
    if m.group('ms'):
        # The fractional part (including its leading dot) makes the result a float
        total += float(m.group('ms'))
    return total
1307
1308
def prepend_extension(filename, ext):
    """ Insert ext in front of the filename's existing extension, e.g.
    ('video.mp4', 'temp') -> 'video.temp.mp4' """
    stem, original_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, original_ext)
1312
1313
def check_executable(exe, args=[]):
    """ Check whether the given binary can be launched, and return its name
    if so, False otherwise.
    args can be a list of arguments for a short output (like -version) """
    try:
        proc = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
1322
1323
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Run the executable with args and return the version found in its
    output (stderr is merged into stdout) via detect_exe_version,
    or False if the executable is not present """
    try:
        proc = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = proc.communicate()
    except OSError:
        return False
    if isinstance(output, bytes):
        # Raw pipe output; non-ASCII bytes are dropped
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1337
1338
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Extract a version from a program's output.

    Returns group 1 of the first match of version_re (by default the word
    following 'version'), or unrecognized when nothing matches. """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    if not found:
        return unrecognized
    return found.group(1)
1348
1349
class PagedList(object):
    """ Base class for paged lists; relies on a getslice() method on the instance """

    def __len__(self):
        # Fetches the complete slice, so this is only useful for tests
        everything = self.getslice()
        return len(everything)
1354
1355
class OnDemandPagedList(PagedList):
    """ Paged list that requests pages from pagefunc one at a time, stopping
    as soon as the requested slice is complete or a short page is seen """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with indices in [start, end) as a list """
        collected = []
        for pagenum in itertools.count(start // self._pagesize):
            page_first = pagenum * self._pagesize
            page_stop = page_first + self._pagesize
            if start >= page_stop:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry; non-zero only on the page
            # that contains start
            if page_first <= start < page_stop:
                lower = start % self._pagesize
            else:
                lower = 0

            # Cut-off inside the page that contains end
            if end is not None and page_first <= end <= page_stop:
                upper = ((end - 1) % self._pagesize) + 1
            else:
                upper = None

            if lower != 0 or upper is not None:
                entries = entries[lower:upper]
            collected.extend(entries)

            # A page that is not "full" is assumed to be the last one, so
            # there is no need to query any further
            if len(entries) + lower < self._pagesize:
                break

            # The page was complete but the slice ends exactly at its
            # boundary, so the next page is not interesting
            if end == page_stop:
                break
        return collected
1397
1398
class InAdvancePagedList(PagedList):
    """ Paged list for sources whose number of pages is known up front """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with indices in [start, end) as a list """
        first_page = start // self._pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start
        # Entries to drop from the front of the first page fetched
        to_skip = start - first_page * self._pagesize

        collected = []
        for pagenum in range(first_page, stop_page):
            entries = list(self._pagefunc(pagenum))
            if to_skip:
                entries = entries[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(entries) >= remaining:
                    # This page completes the slice
                    collected.extend(entries[:remaining])
                    break
                remaining -= len(entries)
            collected.extend(entries)
        return collected
1426
1427
def uppercase_escape(s):
    """ Replace every literal \\UXXXXXXXX escape sequence in s by the
    character it denotes """
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (text, length consumed); keep only the text
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1434
1435
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, text is encoded to UTF-8 bytes before quoting
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, unicode)
    if needs_encoding:
        s = s.encode('utf-8')
    # Characters listed in the second argument are left unescaped
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1441
1442
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    The path, params, query and fragment components are escaped with
    escape_rfc3986; the remaining components are kept as parsed."""
    parts = compat_urllib_parse_urlparse(url)
    safe_path = escape_rfc3986(parts.path)
    safe_params = escape_rfc3986(parts.params)
    safe_query = escape_rfc3986(parts.query)
    safe_fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=safe_path,
        params=safe_params,
        query=safe_query,
        fragment=safe_fragment)
    return escaped.geturl()
1452
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes
    # argument, so text format specs are encoded before being passed on
    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)

    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)
else:
    # struct accepts text format specs here; use it directly
    struct_unpack = struct.unpack
    struct_pack = struct.pack
1469
1470
def read_batch_urls(batch_fd):
    """ Read URLs, one per line, from the file-like object batch_fd.

    Byte lines are decoded as UTF-8, a leading byte order mark and
    surrounding whitespace are removed, and empty lines as well as lines
    starting with '#', ';' or ']' are skipped. batch_fd is closed
    afterwards. Returns the list of remaining URLs. """
    # A UTF-8 BOM appears as '\ufeff' in text decoded as UTF-8, and as the
    # three characters '\xef\xbb\xbf' in text decoded with a single-byte
    # charset. Only the latter was stripped before, so a UTF-8 batch file
    # left the BOM glued to its first URL.
    BOMS = ('\ufeff', '\xef\xbb\xbf')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1485
1486
def urlencode_postdata(*args, **kargs):
    """ URL-encode the given data (same arguments as urlencode) and return
    the result as ASCII bytes """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1489
1490
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    # Element.iter is missing there; fall back to a findall over all descendants
    def etree_iter(n):
        return n.findall('.//*')
1495
1496
def parse_xml(s):
    """ Parse the XML text s (encoded as UTF-8 first) and return its root
    element; doctype declarations are ignored """
    class DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    custom_parser = xml.etree.ElementTree.XMLParser(
        target=DoctypeIgnoringBuilder())
    # The parser argument is only passed on Python 2.7 and newer
    if sys.version_info >= (2, 7):
        extra = {'parser': custom_parser}
    else:
        extra = {}
    root = xml.etree.ElementTree.XML(s.encode('utf-8'), **extra)

    # Fix up XML parser in Python 2.x: make sure element text is unicode
    if sys.version_info < (3, 0):
        for node in etree_iter(root):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return root
1512
1513
# Age limit associated with each US rating label; consulted by
# parse_age_limit for strings that are not plain ages
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """ Turn an age string such as '18' or '16+', or a US rating label,
    into an integer age limit; None when s is None or not recognised """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s)
1528
1529
def strip_jsonp(code):
    """ Strip a JSONP wrapper 'callback(...);' (optionally followed by //
    comments) and return the payload; other input is returned unchanged """
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_wrapper, r'\1', code)
1533
1534
def js_to_json(code):
    """ Convert a JavaScript object literal into JSON text.

    Single-quoted strings and bare identifiers become double-quoted
    strings (true, false and null are kept), and a trailing comma before
    a closing ']' is removed. """
    # How each escape found inside a single-quoted string is rewritten
    requoted = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: requoted[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    # Matches double-quoted strings, single-quoted strings and identifiers
    converted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', r'\1', converted)
1558
1559
def qualities(quality_ids):
    """ Return a function mapping a quality id to its position in
    quality_ids, or -1 for ids that are not listed """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1568
1569
# Default output template: %-style named placeholders for title, id and ext
# (presumably filled in from a video info dict by the downloader — confirm
# against callers)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1571
1572
def limit_length(s, length):
    """ Shorten strings longer than length to exactly length characters,
    ending in an ellipsis; None and short strings are returned as they are """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    kept = length - len(ELLIPSES)
    return s[:kept] + ELLIPSES
1581
1582
def version_tuple(v):
    """ Split a version string on '.' and '-' and return its parts as a
    tuple of ints; raises ValueError for non-numeric parts """
    pieces = re.split(r'[-.]', v)
    return tuple(map(int, pieces))
1585
1586
def is_outdated_version(version, limit, assume_new=True):
    """ Tell whether version is older than limit.

    When version is empty or either string cannot be parsed, the answer is
    the negation of assume_new. """
    unknown_answer = not assume_new
    if not version:
        return unknown_answer
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return unknown_answer
1594
1595
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U, i.e. this module was
    loaded by a zipimporter or the interpreter has a sys.frozen attribute """
    from zipimport import zipimporter

    if hasattr(sys, 'frozen'):
        return True
    module_loader = globals().get('__loader__')
    return isinstance(module_loader, zipimporter)
1601
1602
def args_to_str(args):
    """ Get a short string representation for a subprocess command """
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
1606
1607
def urlhandle_detect_ext(url_handle):
    """ Guess a file extension from the Content-Type header of a response.

    Returns the subtype of the media type, e.g. 'mp4' for 'video/mp4'.
    Parameters such as '; charset=UTF-8' are ignored. """
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    content_type = getheader('Content-Type')
    # Keep only the media type itself; anything after ';' is a parameter and
    # would otherwise end up in the returned extension
    media_type = content_type.split(';')[0].strip()
    return media_type.split("/")[1]
1616
1617
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked, i.e. both limits
    are set and age_limit is below content_limit """

    # No limit configured, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit