_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import functools
  14 import gzip
  15 import itertools
  16 import io
  17 import json
  18 import locale
  19 import math
  20 import os
  21 import pipes
  22 import platform
  23 import re
  24 import ssl
  25 import socket
  26 import struct
  27 import subprocess
  28 import sys
  29 import tempfile
  30 import traceback
  31 import xml.etree.ElementTree
  32 import zlib
  33
  34 from .compat import (
  35     compat_chr,
  36     compat_getenv,
  37     compat_html_entities,
  38     compat_http_client,
  39     compat_parse_qs,
  40     compat_socket_create_connection,
  41     compat_str,
  42     compat_urllib_error,
  43     compat_urllib_parse,
  44     compat_urllib_parse_urlparse,
  45     compat_urllib_request,
  46     compat_urlparse,
  47     shlex_quote,
  48 )
  49
  50
# This is not clearly defined otherwise: derive the type of a compiled
# regular expression object from an actual instance.
compiled_regex_type = type(re.compile(''))

# Default HTTP headers. YoutubeDLHandler.http_request adds each of these to
# an outgoing request that does not already carry a header of that name.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
  61
  62
  63 def preferredencoding():
  64     """Get preferred encoding.
  65
  66     Returns the best encoding scheme for the system, based on
  67     locale.getpreferredencoding() and some further tweaks.
  68     """
  69     try:
  70         pref = locale.getpreferredencoding()
  71         'TEST'.encode(pref)
  72     except:
  73         pref = 'UTF-8'
  74
  75     return pref
  76
  77
  78 def write_json_file(obj, fn):
  79     """ Encode obj as JSON and write it to fn, atomically if possible """
  80
  81     fn = encodeFilename(fn)
  82     if sys.version_info < (3, 0) and sys.platform != 'win32':
  83         encoding = get_filesystem_encoding()
  84         # os.path.basename returns a bytes object, but NamedTemporaryFile
  85         # will fail if the filename contains non ascii characters unless we
  86         # use a unicode object
  87         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  88         # the same for os.path.dirname
  89         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  90     else:
  91         path_basename = os.path.basename
  92         path_dirname = os.path.dirname
  93
  94     args = {
  95         'suffix': '.tmp',
  96         'prefix': path_basename(fn) + '.',
  97         'dir': path_dirname(fn),
  98         'delete': False,
  99     }
 100
 101     # In Python 2.x, json.dump expects a bytestream.
 102     # In Python 3.x, it writes to a character stream
 103     if sys.version_info < (3, 0):
 104         args['mode'] = 'wb'
 105     else:
 106         args.update({
 107             'mode': 'w',
 108             'encoding': 'utf-8',
 109         })
 110
 111     tf = tempfile.NamedTemporaryFile(**args)
 112
 113     try:
 114         with tf:
 115             json.dump(obj, tf)
 116         if sys.platform == 'win32':
 117             # Need to remove existing file on Windows, else os.rename raises
 118             # WindowsError or FileExistsError.
 119             try:
 120                 os.unlink(fn)
 121             except OSError:
 122                 pass
 123         os.rename(tf.name, fn)
 124     except:
 125         try:
 126             os.remove(tf.name)
 127         except OSError:
 128             pass
 129         raise
 130
 131
 132 if sys.version_info >= (2, 7):
 133     def find_xpath_attr(node, xpath, key, val):
 134         """ Find the xpath xpath[@key=val] """
 135         assert re.match(r'^[a-zA-Z-]+$', key)
 136         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 137         expr = xpath + "[@%s='%s']" % (key, val)
 138         return node.find(expr)
 139 else:
 140     def find_xpath_attr(node, xpath, key, val):
 141         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 142         # .//node does not match if a node is a direct child of . !
 143         if isinstance(xpath, unicode):
 144             xpath = xpath.encode('ascii')
 145
 146         for f in node.findall(xpath):
 147             if f.attrib.get(key) == val:
 148                 return f
 149         return None
 150
 151 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 152 # the namespace parameter
 153
 154
 155 def xpath_with_ns(path, ns_map):
 156     components = [c.split(':') for c in path.split('/')]
 157     replaced = []
 158     for c in components:
 159         if len(c) == 1:
 160             replaced.append(c[0])
 161         else:
 162             ns, tag = c
 163             replaced.append('{%s}%s' % (ns_map[ns], tag))
 164     return '/'.join(replaced)
 165
 166
 167 def xpath_text(node, xpath, name=None, fatal=False):
 168     if sys.version_info < (2, 7):  # Crazy 2.6
 169         xpath = xpath.encode('ascii')
 170
 171     n = node.find(xpath)
 172     if n is None or n.text is None:
 173         if fatal:
 174             name = xpath if name is None else name
 175             raise ExtractorError('Could not find XML element %s' % name)
 176         else:
 177             return None
 178     return n.text
 179
 180
 181 def get_element_by_id(id, html):
 182     """Return the content of the tag with the specified ID in the passed HTML document"""
 183     return get_element_by_attribute("id", id, html)
 184
 185
 186 def get_element_by_attribute(attribute, value, html):
 187     """Return the content of the tag with the specified attribute in the passed HTML document"""
 188
 189     m = re.search(r'''(?xs)
 190         <([a-zA-Z0-9:._-]+)
 191          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 192          \s+%s=['"]?%s['"]?
 193          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 194         \s*>
 195         (?P<content>.*?)
 196         </\1>
 197     ''' % (re.escape(attribute), re.escape(value)), html)
 198
 199     if not m:
 200         return None
 201     res = m.group('content')
 202
 203     if res.startswith('"') or res.startswith("'"):
 204         res = res[1:-1]
 205
 206     return unescapeHTML(res)
 207
 208
 209 def clean_html(html):
 210     """Clean an HTML snippet into a readable string"""
 211
 212     if html is None:  # Convenience for sanitizing descriptions etc.
 213         return html
 214
 215     # Newline vs <br />
 216     html = html.replace('\n', ' ')
 217     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 218     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 219     # Strip html tags
 220     html = re.sub('<.*?>', '', html)
 221     # Replace html entities
 222     html = unescapeHTML(html)
 223     return html.strip()
 224
 225
 226 def sanitize_open(filename, open_mode):
 227     """Try to open the given filename, and slightly tweak it if this fails.
 228
 229     Attempts to open the given filename. If this fails, it tries to change
 230     the filename slightly, step by step, until it's either able to open it
 231     or it fails and raises a final exception, like the standard open()
 232     function.
 233
 234     It returns the tuple (stream, definitive_file_name).
 235     """
 236     try:
 237         if filename == '-':
 238             if sys.platform == 'win32':
 239                 import msvcrt
 240                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 241             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 242         stream = open(encodeFilename(filename), open_mode)
 243         return (stream, filename)
 244     except (IOError, OSError) as err:
 245         if err.errno in (errno.EACCES,):
 246             raise
 247
 248         # In case of error, try to remove win32 forbidden chars
 249         alt_filename = os.path.join(
 250             re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
 251             for path_part in os.path.split(filename)
 252         )
 253         if alt_filename == filename:
 254             raise
 255         else:
 256             # An exception here should be caught in the caller
 257             stream = open(encodeFilename(filename), open_mode)
 258             return (stream, alt_filename)
 259
 260
 261 def timeconvert(timestr):
 262     """Convert RFC 2822 defined time string into system timestamp"""
 263     timestamp = None
 264     timetuple = email.utils.parsedate_tz(timestr)
 265     if timetuple is not None:
 266         timestamp = email.utils.mktime_tz(timetuple)
 267     return timestamp
 268
 269
 270 def sanitize_filename(s, restricted=False, is_id=False):
 271     """Sanitizes a string so it could be used as part of a filename.
 272     If restricted is set, use a stricter subset of allowed characters.
 273     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 274     """
 275     def replace_insane(char):
 276         if char == '?' or ord(char) < 32 or ord(char) == 127:
 277             return ''
 278         elif char == '"':
 279             return '' if restricted else '\''
 280         elif char == ':':
 281             return '_-' if restricted else ' -'
 282         elif char in '\\/|*<>':
 283             return '_'
 284         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 285             return '_'
 286         if restricted and ord(char) > 127:
 287             return '_'
 288         return char
 289
 290     # Handle timestamps
 291     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 292     result = ''.join(map(replace_insane, s))
 293     if not is_id:
 294         while '__' in result:
 295             result = result.replace('__', '_')
 296         result = result.strip('_')
 297         # Common case of "Foreign band name - English song title"
 298         if restricted and result.startswith('-_'):
 299             result = result[2:]
 300         if not result:
 301             result = '_'
 302     return result
 303
 304
 305 def orderedSet(iterable):
 306     """ Remove all duplicates from the input iterable """
 307     res = []
 308     for el in iterable:
 309         if el not in res:
 310             res.append(el)
 311     return res
 312
 313
 314 def _htmlentity_transform(entity):
 315     """Transforms an HTML entity to a character."""
 316     # Known non-numeric HTML entity
 317     if entity in compat_html_entities.name2codepoint:
 318         return compat_chr(compat_html_entities.name2codepoint[entity])
 319
 320     mobj = re.match(r'#(x?[0-9]+)', entity)
 321     if mobj is not None:
 322         numstr = mobj.group(1)
 323         if numstr.startswith('x'):
 324             base = 16
 325             numstr = '0%s' % numstr
 326         else:
 327             base = 10
 328         return compat_chr(int(numstr, base))
 329
 330     # Unknown entity in name, return its literal representation
 331     return ('&%s;' % entity)
 332
 333
 334 def unescapeHTML(s):
 335     if s is None:
 336         return None
 337     assert type(s) == compat_str
 338
 339     return re.sub(
 340         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 341
 342
 343 def encodeFilename(s, for_subprocess=False):
 344     """
 345     @param s The name of the file
 346     """
 347
 348     assert type(s) == compat_str
 349
 350     # Python 3 has a Unicode API
 351     if sys.version_info >= (3, 0):
 352         return s
 353
 354     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 355         # Pass '' directly to use Unicode APIs on Windows 2000 and up
 356         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 357         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 358         if not for_subprocess:
 359             return s
 360         else:
 361             # For subprocess calls, encode with locale encoding
 362             # Refer to http://stackoverflow.com/a/9951851/35070
 363             encoding = preferredencoding()
 364     else:
 365         encoding = sys.getfilesystemencoding()
 366     if encoding is None:
 367         encoding = 'utf-8'
 368     return s.encode(encoding, 'ignore')
 369
 370
 371 def encodeArgument(s):
 372     if not isinstance(s, compat_str):
 373         # Legacy code that uses byte strings
 374         # Uncomment the following line after fixing all post processors
 375         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 376         s = s.decode('ascii')
 377     return encodeFilename(s, True)
 378
 379
 380 def decodeOption(optval):
 381     if optval is None:
 382         return optval
 383     if isinstance(optval, bytes):
 384         optval = optval.decode(preferredencoding())
 385
 386     assert isinstance(optval, compat_str)
 387     return optval
 388
 389
 390 def formatSeconds(secs):
 391     if secs > 3600:
 392         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 393     elif secs > 60:
 394         return '%d:%02d' % (secs // 60, secs % 60)
 395     else:
 396         return '%d' % secs
 397
 398
 399 def make_HTTPS_handler(params, **kwargs):
 400     opts_no_check_certificate = params.get('nocheckcertificate', False)
 401     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 402         context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
 403         if opts_no_check_certificate:
 404             context.verify_mode = ssl.CERT_NONE
 405         try:
 406             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 407         except TypeError:
 408             # Python 2.7.8
 409             # (create_default_context present but HTTPSHandler has no context=)
 410             pass
 411
 412     if sys.version_info < (3, 2):
 413         import httplib
 414
 415         class HTTPSConnectionV3(httplib.HTTPSConnection):
 416             def __init__(self, *args, **kwargs):
 417                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 418
 419             def connect(self):
 420                 sock = socket.create_connection((self.host, self.port), self.timeout)
 421                 if getattr(self, '_tunnel_host', False):
 422                     self.sock = sock
 423                     self._tunnel()
 424                 try:
 425                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
 426                 except ssl.SSLError:
 427                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 428
 429         return YoutubeDLHTTPSHandler(params, https_conn_class=HTTPSConnectionV3, **kwargs)
 430     else:  # Python < 3.4
 431         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 432         context.verify_mode = (ssl.CERT_NONE
 433                                if opts_no_check_certificate
 434                                else ssl.CERT_REQUIRED)
 435         context.set_default_verify_paths()
 436         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 437
 438
 439 class ExtractorError(Exception):
 440     """Error during info extraction."""
 441
 442     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 443         """ tb, if given, is the original traceback (so that it can be printed out).
 444         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 445         """
 446
 447         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 448             expected = True
 449         if video_id is not None:
 450             msg = video_id + ': ' + msg
 451         if cause:
 452             msg += ' (caused by %r)' % cause
 453         if not expected:
 454             if ytdl_is_updateable():
 455                 update_cmd = 'type  youtube-dl -U  to update'
 456             else:
 457                 update_cmd = 'see  https://yt-dl.org/update  on how to update'
 458             msg += '; please report this issue on https://yt-dl.org/bug .'
 459             msg += ' Make sure you are using the latest version; %s.' % update_cmd
 460             msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 461         super(ExtractorError, self).__init__(msg)
 462
 463         self.traceback = tb
 464         self.exc_info = sys.exc_info()  # preserve original exception
 465         self.cause = cause
 466         self.video_id = video_id
 467
 468     def format_traceback(self):
 469         if self.traceback is None:
 470             return None
 471         return ''.join(traceback.format_tb(self.traceback))
 472
 473
 474 class UnsupportedError(ExtractorError):
 475     def __init__(self, url):
 476         super(UnsupportedError, self).__init__(
 477             'Unsupported URL: %s' % url, expected=True)
 478         self.url = url
 479
 480
 481 class RegexNotFoundError(ExtractorError):
 482     """Error when a regex didn't match"""
 483     pass
 484
 485
 486 class DownloadError(Exception):
 487     """Download Error exception.
 488
 489     This exception may be thrown by FileDownloader objects if they are not
 490     configured to continue on errors. They will contain the appropriate
 491     error message.
 492     """
 493
 494     def __init__(self, msg, exc_info=None):
 495         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 496         super(DownloadError, self).__init__(msg)
 497         self.exc_info = exc_info
 498
 499
 500 class SameFileError(Exception):
 501     """Same File exception.
 502
 503     This exception will be thrown by FileDownloader objects if they detect
 504     multiple files would have to be downloaded to the same file on disk.
 505     """
 506     pass
 507
 508
 509 class PostProcessingError(Exception):
 510     """Post Processing exception.
 511
 512     This exception may be raised by PostProcessor's .run() method to
 513     indicate an error in the postprocessing task.
 514     """
 515
 516     def __init__(self, msg):
 517         self.msg = msg
 518
 519
 520 class MaxDownloadsReached(Exception):
 521     """ --max-downloads limit has been reached. """
 522     pass
 523
 524
 525 class UnavailableVideoError(Exception):
 526     """Unavailable Format exception.
 527
 528     This exception will be thrown when a video is requested
 529     in a format that is not available for that video.
 530     """
 531     pass
 532
 533
 534 class ContentTooShortError(Exception):
 535     """Content Too Short exception.
 536
 537     This exception may be raised by FileDownloader objects when a file they
 538     download is too small for what the server announced first, indicating
 539     the connection was probably interrupted.
 540     """
 541     # Both in bytes
 542     downloaded = None
 543     expected = None
 544
 545     def __init__(self, downloaded, expected):
 546         self.downloaded = downloaded
 547         self.expected = expected
 548
 549
 550 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 551     hc = http_class(*args, **kwargs)
 552     source_address = ydl_handler._params.get('source_address')
 553     if source_address is not None:
 554         sa = (source_address, 0)
 555         if hasattr(hc, 'source_address'):  # Python 2.7+
 556             hc.source_address = sa
 557         else:  # Python 2.6
 558             def _hc_connect(self, *args, **kwargs):
 559                 sock = compat_socket_create_connection(
 560                     (self.host, self.port), self.timeout, sa)
 561                 if is_https:
 562                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file)
 563                 else:
 564                     self.sock = sock
 565             hc.connect = functools.partial(_hc_connect, hc)
 566
 567     return hc
 568
 569
 570 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 571     """Handler for HTTP requests and responses.
 572
 573     This class, when installed with an OpenerDirector, automatically adds
 574     the standard headers to every HTTP request and handles gzipped and
 575     deflated responses from web servers. If compression is to be avoided in
 576     a particular request, the original request in the program code only has
 577     to include the HTTP header "Youtubedl-No-Compression", which will be
 578     removed before making the real request.
 579
 580     Part of this code was copied from:
 581
 582     http://techknack.net/python-urllib2-handlers/
 583
 584     Andrew Rowls, the author of that code, agreed to release it to the
 585     public domain.
 586     """
 587
 588     def __init__(self, params, *args, **kwargs):
 589         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
 590         self._params = params
 591
 592     def http_open(self, req):
 593         return self.do_open(functools.partial(
 594             _create_http_connection, self, compat_http_client.HTTPConnection, False),
 595             req)
 596
 597     @staticmethod
 598     def deflate(data):
 599         try:
 600             return zlib.decompress(data, -zlib.MAX_WBITS)
 601         except zlib.error:
 602             return zlib.decompress(data)
 603
 604     @staticmethod
 605     def addinfourl_wrapper(stream, headers, url, code):
 606         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 607             return compat_urllib_request.addinfourl(stream, headers, url, code)
 608         ret = compat_urllib_request.addinfourl(stream, headers, url)
 609         ret.code = code
 610         return ret
 611
 612     def http_request(self, req):
 613         for h, v in std_headers.items():
 614             if h not in req.headers:
 615                 req.add_header(h, v)
 616         if 'Youtubedl-no-compression' in req.headers:
 617             if 'Accept-encoding' in req.headers:
 618                 del req.headers['Accept-encoding']
 619             del req.headers['Youtubedl-no-compression']
 620         if 'Youtubedl-user-agent' in req.headers:
 621             if 'User-agent' in req.headers:
 622                 del req.headers['User-agent']
 623             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 624             del req.headers['Youtubedl-user-agent']
 625
 626         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 627             # Python 2.6 is brain-dead when it comes to fragments
 628             req._Request__original = req._Request__original.partition('#')[0]
 629             req._Request__r_type = req._Request__r_type.partition('#')[0]
 630
 631         return req
 632
 633     def http_response(self, req, resp):
 634         old_resp = resp
 635         # gzip
 636         if resp.headers.get('Content-encoding', '') == 'gzip':
 637             content = resp.read()
 638             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 639             try:
 640                 uncompressed = io.BytesIO(gz.read())
 641             except IOError as original_ioerror:
 642                 # There may be junk add the end of the file
 643                 # See http://stackoverflow.com/q/4928560/35070 for details
 644                 for i in range(1, 1024):
 645                     try:
 646                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 647                         uncompressed = io.BytesIO(gz.read())
 648                     except IOError:
 649                         continue
 650                     break
 651                 else:
 652                     raise original_ioerror
 653             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 654             resp.msg = old_resp.msg
 655         # deflate
 656         if resp.headers.get('Content-encoding', '') == 'deflate':
 657             gz = io.BytesIO(self.deflate(resp.read()))
 658             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 659             resp.msg = old_resp.msg
 660         return resp
 661
 662     https_request = http_request
 663     https_response = http_response
 664
 665
 666 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 667     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 668         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 669         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 670         self._params = params
 671
 672     def https_open(self, req):
 673         return self.do_open(functools.partial(
 674             _create_http_connection, self, self._https_conn_class, True),
 675             req)
 676
 677
 678 def parse_iso8601(date_str, delimiter='T'):
 679     """ Return a UNIX timestamp from the given date """
 680
 681     if date_str is None:
 682         return None
 683
 684     m = re.search(
 685         r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 686         date_str)
 687     if not m:
 688         timezone = datetime.timedelta()
 689     else:
 690         date_str = date_str[:-len(m.group(0))]
 691         if not m.group('sign'):
 692             timezone = datetime.timedelta()
 693         else:
 694             sign = 1 if m.group('sign') == '+' else -1
 695             timezone = datetime.timedelta(
 696                 hours=sign * int(m.group('hours')),
 697                 minutes=sign * int(m.group('minutes')))
 698     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 699     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 700     return calendar.timegm(dt.timetuple())
 701
 702
 703 def unified_strdate(date_str, day_first=True):
 704     """Return a string with the date in the format YYYYMMDD"""
 705
 706     if date_str is None:
 707         return None
 708     upload_date = None
 709     # Replace commas
 710     date_str = date_str.replace(',', ' ')
 711     # %z (UTC offset) is only supported in python>=3.2
 712     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 713     # Remove AM/PM + timezone
 714     date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
 715
 716     format_expressions = [
 717         '%d %B %Y',
 718         '%d %b %Y',
 719         '%B %d %Y',
 720         '%b %d %Y',
 721         '%b %dst %Y %I:%M%p',
 722         '%b %dnd %Y %I:%M%p',
 723         '%b %dth %Y %I:%M%p',
 724         '%Y %m %d',
 725         '%Y-%m-%d',
 726         '%Y/%m/%d',
 727         '%Y/%m/%d %H:%M:%S',
 728         '%Y-%m-%d %H:%M:%S',
 729         '%Y-%m-%d %H:%M:%S.%f',
 730         '%d.%m.%Y %H:%M',
 731         '%d.%m.%Y %H.%M',
 732         '%Y-%m-%dT%H:%M:%SZ',
 733         '%Y-%m-%dT%H:%M:%S.%fZ',
 734         '%Y-%m-%dT%H:%M:%S.%f0Z',
 735         '%Y-%m-%dT%H:%M:%S',
 736         '%Y-%m-%dT%H:%M:%S.%f',
 737         '%Y-%m-%dT%H:%M',
 738     ]
 739     if day_first:
 740         format_expressions.extend([
 741             '%d.%m.%Y',
 742             '%d/%m/%Y',
 743             '%d/%m/%y',
 744             '%d/%m/%Y %H:%M:%S',
 745         ])
 746     else:
 747         format_expressions.extend([
 748             '%m.%d.%Y',
 749             '%m/%d/%Y',
 750             '%m/%d/%y',
 751             '%m/%d/%Y %H:%M:%S',
 752         ])
 753     for expression in format_expressions:
 754         try:
 755             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 756         except ValueError:
 757             pass
 758     if upload_date is None:
 759         timetuple = email.utils.parsedate_tz(date_str)
 760         if timetuple:
 761             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 762     return upload_date
 763
 764
 765 def determine_ext(url, default_ext='unknown_video'):
 766     if url is None:
 767         return default_ext
 768     guess = url.partition('?')[0].rpartition('.')[2]
 769     if re.match(r'^[A-Za-z0-9]+$', guess):
 770         return guess
 771     else:
 772         return default_ext
 773
 774
 775 def subtitles_filename(filename, sub_lang, sub_format):
 776     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 777
 778
 779 def date_from_str(date_str):
 780     """
 781     Return a datetime object from a string in the format YYYYMMDD or
 782     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 783     today = datetime.date.today()
 784     if date_str in ('now', 'today'):
 785         return today
 786     if date_str == 'yesterday':
 787         return today - datetime.timedelta(days=1)
 788     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 789     if match is not None:
 790         sign = match.group('sign')
 791         time = int(match.group('time'))
 792         if sign == '-':
 793             time = -time
 794         unit = match.group('unit')
 795         # A bad aproximation?
 796         if unit == 'month':
 797             unit = 'day'
 798             time *= 30
 799         elif unit == 'year':
 800             unit = 'day'
 801             time *= 365
 802         unit += 's'
 803         delta = datetime.timedelta(**{unit: time})
 804         return today + delta
 805     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 806
 807
 808 def hyphenate_date(date_str):
 809     """
 810     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 811     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 812     if match is not None:
 813         return '-'.join(match.groups())
 814     else:
 815         return date_str
 816
 817
 818 class DateRange(object):
 819     """Represents a time interval between two dates"""
 820
 821     def __init__(self, start=None, end=None):
 822         """start and end must be strings in the format accepted by date"""
 823         if start is not None:
 824             self.start = date_from_str(start)
 825         else:
 826             self.start = datetime.datetime.min.date()
 827         if end is not None:
 828             self.end = date_from_str(end)
 829         else:
 830             self.end = datetime.datetime.max.date()
 831         if self.start > self.end:
 832             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 833
 834     @classmethod
 835     def day(cls, day):
 836         """Returns a range that only contains the given day"""
 837         return cls(day, day)
 838
 839     def __contains__(self, date):
 840         """Check if the date is in the range"""
 841         if not isinstance(date, datetime.date):
 842             date = date_from_str(date)
 843         return self.start <= date <= self.end
 844
 845     def __str__(self):
 846         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 847
 848
 849 def platform_name():
 850     """ Returns the platform name as a compat_str """
 851     res = platform.platform()
 852     if isinstance(res, bytes):
 853         res = res.decode(preferredencoding())
 854
 855     assert isinstance(res, compat_str)
 856     return res
 857
 858
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s is the text to write and out the stream it is destined for. False is
    returned when out has no fileno(), is not file descriptor 1 or 2, or the
    matching standard handle is not a console. Raises OSError if
    WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> id passed to GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE in the Win32 API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Bind the kernel32 entry points by name with explicit prototypes
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Out-parameter: receives how many units WriteConsoleW actually wrote
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for a missing/invalid handle, for anything whose file type
        # (with the REMOTE flag masked out) is not a character device, and
        # for handles on which GetConsoleMode fails.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters, stopping each chunk just
    # before the next non-BMP character.
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; that single
        # character is written as 2 units (presumably its UTF-16 surrogate
        # pair - confirm against the WriteConsoleW documentation).
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            # Partial writes are allowed: drop only what was really written
            assert written.value > 0
            s = s[written.value:]
    return True
 929
 930
 931 def write_string(s, out=None, encoding=None):
 932     if out is None:
 933         out = sys.stderr
 934     assert type(s) == compat_str
 935
 936     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 937         if _windows_write_string(s, out):
 938             return
 939
 940     if ('b' in getattr(out, 'mode', '') or
 941             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 942         byt = s.encode(encoding or preferredencoding(), 'ignore')
 943         out.write(byt)
 944     elif hasattr(out, 'buffer'):
 945         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 946         byt = s.encode(enc, 'ignore')
 947         out.buffer.write(byt)
 948     else:
 949         out.write(s)
 950     out.flush()
 951
 952
 953 def bytes_to_intlist(bs):
 954     if not bs:
 955         return []
 956     if isinstance(bs[0], int):  # Python 3
 957         return list(bs)
 958     else:
 959         return [ord(c) for c in bs]
 960
 961
 962 def intlist_to_bytes(xs):
 963     if not xs:
 964         return b''
 965     return struct_pack('%dB' % len(xs), *xs)
 966
 967
# Cross-platform file locking
# Both branches define the same pair of helpers:
#   _lock_file(f, exclusive) - lock the open file f, exclusively or shared
#   _unlock_file(f)          - release a lock taken by _lock_file
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # Structure whose pointer is passed as the last argument of
    # LockFileEx / UnlockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count to (un)lock; starting at offset 0
    # this is meant to cover the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f via LockFileEx; raises OSError if the call fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so _unlock_file can pass the very same
        # structure back
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 is LOCKFILE_EXCLUSIVE_LOCK in the Win32 API; 0x0 = shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock on f via UnlockFileEx; raises OSError on failure."""
        # Only valid after a successful _lock_file on the same object
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared flock() lock on f."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Drop the flock() lock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
1031
1032
class locked_file(object):
    """Context manager around io.open that holds a file lock while active.

    The file is opened on construction and locked on entering the 'with'
    block: shared for mode 'r', exclusive for 'a' and 'w'. On exit it is
    unlocked and closed.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is not an IOError
            # on Python 2; catch both so the handle is never leaked.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1062
1063
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1067
1068
def shell_quote(args):
    """Return args joined by spaces, each one quoted for use in a shell.

    Byte-string arguments (we may get a filename encoded with
    'encodeFilename') are decoded with the filesystem encoding first.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            a = a.decode(encoding)
        # Use the shlex_quote compat helper this module already imports
        # (as args_to_str does) instead of the deprecated pipes module.
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
1078
1079
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yields the element that ends the
        run (the first element e for which pred(e) is false) """
    remaining = iter(seq)
    while True:
        try:
            item = next(remaining)
        except StopIteration:
            return
        yield item
        if not pred(item):
            return
1087
1088
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded and appended to url after a '#' as the
    url-encoded '__youtubedl_smuggle' field. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse.urlencode(payload)
1095
1096
def unsmuggle_url(smug_url, default=None):
    """ Split a URL built by smuggle_url into (url, data).

    URLs without the smuggle marker come back as (smug_url, default). """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # The marker check above guarantees there is a '#' to split on
    url, sdata = smug_url.rsplit('#', 1)
    payload = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1104
1105
def format_bytes(bytes):
    """Format a byte count as a string with a binary unit, e.g. '1.46KiB'.

    bytes may be a number, a numeric string, or None (giving 'N/A').
    Values below one byte are shown in 'B' and values beyond the largest
    unit in 'YiB'. Negative values raise ValueError from math.log.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the suffix table: a negative exponent (value < 1) would
        # index from the end, a too large one would raise IndexError.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1118
1119
def parse_filesize(s):
    """Parse a size such as '5 KiB' or '1,5MB' into a number of bytes.

    Returns None for None and for strings that do not start with a number
    followed by a known unit. A comma is read as the decimal separator.
    """
    if s is None:
        return None

    # '<X>iB' and the unofficial lower-case-prefix form '<x>B' are powers
    # of 1024; '<X>B' and '<X>b' are powers of 1000.
    unit_table = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        unit_table[prefix + 'iB'] = 1024 ** power
        unit_table[prefix + 'B'] = 1000 ** power
        unit_table[prefix.lower() + 'B'] = 1024 ** power
        unit_table[prefix + 'b'] = 1000 ** power

    units_re = '|'.join(re.escape(unit) for unit in unit_table)
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if not found:
        return None

    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1172
1173
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable wins when it holds a number;
    otherwise the second field of 'stty size' output is used.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Garbage in COLUMNS - fall back to asking stty

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (no stty, no tty, unexpected output), but let
        # KeyboardInterrupt and SystemExit through.
        return None
1188
1189
def month_by_name(name):
    """ Return the number (1-12) of the month with the given English name,
    independently of the locale; None if the name is not recognised """

    english_names = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December')
    for number, month in enumerate(english_names, 1):
        if month == name:
            return number
    return None
1200
1201
def fix_xml_ampersands(xml_str):
    """Replace every '&' by '&amp;' in XML, except those that already start
    one of the five predefined entities or a numeric character reference"""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
1208
1209
def setproctitle(title):
    """Set the process name shown by tools like ps to title (best effort).

    Quietly does nothing when libc.so.6 cannot be loaded or has no prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one item larger and
    # NUL-terminated; a buffer of exactly len(title_bytes) would hand
    # prctl an unterminated string.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1223
1224
def remove_start(s, start):
    """Return s without the prefix start; s unchanged if it lacks it."""
    return s[len(start):] if s.startswith(start) else s
1229
1230
def remove_end(s, end):
    """Return s without the suffix end; s unchanged if it lacks it."""
    # An empty suffix must be a no-op: s[:-0] would be the empty string.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1235
1236
def url_basename(url):
    """Return the last segment of the URL's path, ignoring leading and
    trailing slashes ('' when the path is empty)."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1240
1241
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always reported as HEAD."""

    def get_method(self):
        return 'HEAD'
1245
1246
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Return int(v) * invscale // scale, or default when v is None or ''.

    With get_attr set, the attribute of that name is read from v first;
    a missing attribute counts as None.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '' or v is None:
        return default
    return int(v) * invscale // scale
1254
1255
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1258
1259
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: ',', '.' and '+' characters
    are dropped before conversion. None stays None. """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1266
1267
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1270
1271
def parse_duration(s):
    """Parse a duration such as '1:02:03', '3 min' or '1h 2m 3.5s'.

    Returns the number of seconds, or None when s is not a string or does
    not look like a duration.
    """
    text_type = basestring if sys.version_info < (3, 0) else compat_str
    if not isinstance(s, text_type):
        return None

    m = re.match(
        r'''(?ix)T?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if not m:
        return None

    fields = m.groupdict()
    # A lone minutes or hours figure may be fractional
    if fields['only_mins']:
        return float_or_none(fields['only_mins'], invscale=60)
    if fields['only_hours']:
        return float_or_none(fields['only_hours'], invscale=60 * 60)

    total = 0
    for group, seconds_per_unit in (
            ('secs', 1), ('mins', 60), ('hours', 60 * 60)):
        if fields[group]:
            total += int(fields[group]) * seconds_per_unit
    if fields['ms']:
        # The 'ms' group holds the fractional part including its dot
        total += float(fields['ms'])
    return total
1306
1307
def prepend_extension(filename, ext):
    """Insert ext in front of filename's own extension:
    ('video.mp4', 'temp') gives 'video.temp.mp4'."""
    pieces = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(pieces[0], ext, pieces[1])
1311
1312
def check_executable(exe, args=[]):
    """ Checks whether the given binary can be launched and returns its name
    if so, False otherwise.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1321
1322
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Runs the executable with args and returns the version found in its
    combined stdout/stderr via detect_exe_version,
    or False if the executable is not present """
    try:
        process = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):
        # Undecodable bytes are dropped rather than failing the lookup
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1336
1337
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Return group 1 of the first version_re match in output.

    version_re defaults to a pattern matching 'version <something>';
    unrecognized is returned when nothing matches.
    """
    assert isinstance(output, compat_str)
    pattern = version_re
    if pattern is None:
        pattern = r'version\s+([-0-9._a-zA-Z]+)'
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1347
1348
class PagedList(object):
    """Base for lists served page by page; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())


class OnDemandPagedList(PagedList):
    """Paged list of unknown length: pages are requested from pagefunc one
    at a time until the wanted slice is complete or a short page shows up."""

    def __init__(self, pagefunc, pagesize):
        # pagefunc(pagenum) returns an iterable with the entries of that
        # (0-based) page
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return entries start..end (end exclusive, None = to the end)."""
        # An empty or inverted window selects nothing. Without this guard a
        # window starting on a page boundary returned that whole page, and
        # an inverted one kept fetching pages.
        if end is not None and end <= start:
            return []

        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry, if the window ends
            # within this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res


class InAdvancePagedList(PagedList):
    """Paged list whose number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        # pagefunc(pagenum) returns an iterable with the entries of that
        # (0-based) page
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return entries start..end (end exclusive, None = to the end)."""
        res = []
        start_page = start // self._pagesize
        # Never ask for pages past the known page count, however large the
        # requested end is.
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page needs its head trimmed
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1425
1426
def uppercase_escape(s):
    """Expand every literal '\\UXXXXXXXX' escape (8 hex digits) in s into
    the character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1433
1434
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Characters that are passed through unquoted
    keep_unquoted = b"%/;:@&=+$,!~*'()?#[]"
    if sys.version_info < (3, 0) and isinstance(s, unicode):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, keep_unquoted)
1440
1441
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Only path, params, query and fragment are escaped; the other
    components are kept as parsed."""
    parts = compat_urllib_parse_urlparse(url)
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1451
# struct_pack / struct_unpack: struct.pack / struct.unpack, wrapped when the
# struct module rejects text format strings.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1468
1469
def read_batch_urls(batch_fd):
    """Read the URLs listed in batch_fd, one per line, and close it.

    Lines may be bytes (decoded as UTF-8) or text. Blank lines and lines
    starting with '#', ';' or ']' are skipped, and a leading byte order
    mark is removed.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM is U+FEFF when decoded properly (as above), and the
        # three characters below when the file was read as Latin-1.
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1484
1485
def urlencode_postdata(*args, **kargs):
    """Url-encode the arguments like urlencode does and return the result
    as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1488
1489
# etree_iter(node): iterate over an element tree node's subtree
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1494
1495
def parse_xml(s):
    """Parse the XML text s, ignoring doctype declarations, and return the
    root element. On Python 2, byte-string element texts are decoded from
    UTF-8."""
    etree = xml.etree.ElementTree

    class DoctypeIgnoringBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=DoctypeIgnoringBuilder())
    # The custom parser is only handed over on Python 2.7 and newer
    parser_args = {}
    if sys.version_info >= (2, 7):
        parser_args['parser'] = parser
    root = etree.XML(s.encode('utf-8'), **parser_args)

    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(root):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return root
1511
1512
# Age limit, in years, that each US rating label is mapped to
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Turn an age limit such as '18', '16+' or a US_RATINGS label into an
    int. Returns None for None and for anything unrecognised."""
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
1527
1528
def strip_jsonp(code):
    """Strip a JSONP wrapper 'callback( ... );' (optionally followed by
    '//' comments) and return what is between the parentheses. Text that
    does not have this shape is returned unchanged."""
    jsonp_wrapper = re.compile(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$')
    return jsonp_wrapper.sub(r'\1', code)
1532
1533
def js_to_json(code):
    """Rewrite a JavaScript object literal so that it parses as JSON.

    Single-quoted strings and bare identifiers are turned into
    double-quoted strings, and a comma directly before a closing ']' is
    dropped.
    """
    def to_json_token(match):
        token = match.group(0)
        # JSON literals and double-quoted strings are valid as they are
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            # Drop the single quotes and re-escape for double quotes
            replacements = {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: replacements[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    def drop_comma(match):
        return match.group(1)

    converted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', to_json_token, code)
    return re.sub(r',(\s*\])', drop_comma, converted)
1557
1558
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps a quality id to its position in
    quality_ids, or to -1 when the id is not listed. """
    def rank(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return rank
1567
1568
# Default output filename template: a %-style format string with the named
# fields 'title', 'id' and 'ext' (filled in by code outside this section).
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1570
1571
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Returns s cut down to at most length characters, ending in '...' when
    something was cut off. None stays None. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length < len(ELLIPSES):
        # No room for the full ellipsis. The negative slice used below
        # would return more than length characters, so give back as much
        # of the ellipsis as fits.
        return ELLIPSES[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
1580
1581
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints.

    Raises ValueError if any component is not an integer."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
1584
1585
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    When version is empty or either string cannot be parsed by
    version_tuple, the answer is 'not assume_new'."""
    when_unknown = not assume_new
    if not version:
        return when_unknown
    try:
        current = version_tuple(version)
        minimum = version_tuple(limit)
    except ValueError:
        return when_unknown
    return current < minimum
1593
1594
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U, i.e. whether this
    module was loaded by a zipimporter or sys has a 'frozen' attribute """
    from zipimport import zipimporter

    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
1600
1601
def args_to_str(args):
    """Get a short string representation for a subprocess command: every
    argument shell-quoted, joined by single spaces."""
    return ' '.join(map(shlex_quote, args))
1605
1606
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the Content-Type header of url_handle.

    Returns the subtype of the media type ('mp4' for 'video/mp4'); any
    parameters after ';' are ignored.
    """
    try:
        url_handle.headers

        def getheader(h):
            return url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    # Drop parameters such as '; charset=utf-8' so that they do not end up
    # in the extension.
    mimetype = getheader('Content-Type').split(';')[0]
    return mimetype.split("/")[1].strip()
1615
1616
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    Nothing is blocked when no age limit is set (age_limit is None) or the
    content is available for everyone (content_limit is None). """
    return (
        age_limit is not None and
        content_limit is not None and
        age_limit < content_limit)