_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import functools
  14 import gzip
  15 import itertools
  16 import io
  17 import json
  18 import locale
  19 import math
  20 import os
  21 import pipes
  22 import platform
  23 import re
  24 import ssl
  25 import socket
  26 import struct
  27 import subprocess
  28 import sys
  29 import tempfile
  30 import traceback
  31 import xml.etree.ElementTree
  32 import zlib
  33
  34 from .compat import (
  35     compat_chr,
  36     compat_getenv,
  37     compat_html_entities,
  38     compat_http_client,
  39     compat_parse_qs,
  40     compat_socket_create_connection,
  41     compat_str,
  42     compat_urllib_error,
  43     compat_urllib_parse,
  44     compat_urllib_parse_urlparse,
  45     compat_urllib_request,
  46     compat_urlparse,
  47     shlex_quote,
  48 )
  49
  50
# This is not clearly defined otherwise
# (take the type of a pattern object produced by compiling an empty regex)
compiled_regex_type = type(re.compile(''))

# Default HTTP headers. YoutubeDLHandler.http_request adds each of these to an
# outgoing request unless the request already carries a header of that name.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
  61
  62
  63 def preferredencoding():
  64     """Get preferred encoding.
  65
  66     Returns the best encoding scheme for the system, based on
  67     locale.getpreferredencoding() and some further tweaks.
  68     """
  69     try:
  70         pref = locale.getpreferredencoding()
  71         'TEST'.encode(pref)
  72     except:
  73         pref = 'UTF-8'
  74
  75     return pref
  76
  77
  78 def write_json_file(obj, fn):
  79     """ Encode obj as JSON and write it to fn, atomically if possible """
  80
  81     fn = encodeFilename(fn)
  82     if sys.version_info < (3, 0) and sys.platform != 'win32':
  83         encoding = get_filesystem_encoding()
  84         # os.path.basename returns a bytes object, but NamedTemporaryFile
  85         # will fail if the filename contains non ascii characters unless we
  86         # use a unicode object
  87         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  88         # the same for os.path.dirname
  89         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  90     else:
  91         path_basename = os.path.basename
  92         path_dirname = os.path.dirname
  93
  94     args = {
  95         'suffix': '.tmp',
  96         'prefix': path_basename(fn) + '.',
  97         'dir': path_dirname(fn),
  98         'delete': False,
  99     }
 100
 101     # In Python 2.x, json.dump expects a bytestream.
 102     # In Python 3.x, it writes to a character stream
 103     if sys.version_info < (3, 0):
 104         args['mode'] = 'wb'
 105     else:
 106         args.update({
 107             'mode': 'w',
 108             'encoding': 'utf-8',
 109         })
 110
 111     tf = tempfile.NamedTemporaryFile(**args)
 112
 113     try:
 114         with tf:
 115             json.dump(obj, tf)
 116         if sys.platform == 'win32':
 117             # Need to remove existing file on Windows, else os.rename raises
 118             # WindowsError or FileExistsError.
 119             try:
 120                 os.unlink(fn)
 121             except OSError:
 122                 pass
 123         os.rename(tf.name, fn)
 124     except:
 125         try:
 126             os.remove(tf.name)
 127         except OSError:
 128             pass
 129         raise
 130
 131
 132 if sys.version_info >= (2, 7):
 133     def find_xpath_attr(node, xpath, key, val):
 134         """ Find the xpath xpath[@key=val] """
 135         assert re.match(r'^[a-zA-Z-]+$', key)
 136         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 137         expr = xpath + "[@%s='%s']" % (key, val)
 138         return node.find(expr)
 139 else:
 140     def find_xpath_attr(node, xpath, key, val):
 141         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 142         # .//node does not match if a node is a direct child of . !
 143         if isinstance(xpath, unicode):
 144             xpath = xpath.encode('ascii')
 145
 146         for f in node.findall(xpath):
 147             if f.attrib.get(key) == val:
 148                 return f
 149         return None
 150
 151 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 152 # the namespace parameter
 153
 154
 155 def xpath_with_ns(path, ns_map):
 156     components = [c.split(':') for c in path.split('/')]
 157     replaced = []
 158     for c in components:
 159         if len(c) == 1:
 160             replaced.append(c[0])
 161         else:
 162             ns, tag = c
 163             replaced.append('{%s}%s' % (ns_map[ns], tag))
 164     return '/'.join(replaced)
 165
 166
 167 def xpath_text(node, xpath, name=None, fatal=False):
 168     if sys.version_info < (2, 7):  # Crazy 2.6
 169         xpath = xpath.encode('ascii')
 170
 171     n = node.find(xpath)
 172     if n is None or n.text is None:
 173         if fatal:
 174             name = xpath if name is None else name
 175             raise ExtractorError('Could not find XML element %s' % name)
 176         else:
 177             return None
 178     return n.text
 179
 180
 181 def get_element_by_id(id, html):
 182     """Return the content of the tag with the specified ID in the passed HTML document"""
 183     return get_element_by_attribute("id", id, html)
 184
 185
 186 def get_element_by_attribute(attribute, value, html):
 187     """Return the content of the tag with the specified attribute in the passed HTML document"""
 188
 189     m = re.search(r'''(?xs)
 190         <([a-zA-Z0-9:._-]+)
 191          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 192          \s+%s=['"]?%s['"]?
 193          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 194         \s*>
 195         (?P<content>.*?)
 196         </\1>
 197     ''' % (re.escape(attribute), re.escape(value)), html)
 198
 199     if not m:
 200         return None
 201     res = m.group('content')
 202
 203     if res.startswith('"') or res.startswith("'"):
 204         res = res[1:-1]
 205
 206     return unescapeHTML(res)
 207
 208
 209 def clean_html(html):
 210     """Clean an HTML snippet into a readable string"""
 211
 212     if html is None:  # Convenience for sanitizing descriptions etc.
 213         return html
 214
 215     # Newline vs <br />
 216     html = html.replace('\n', ' ')
 217     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 218     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 219     # Strip html tags
 220     html = re.sub('<.*?>', '', html)
 221     # Replace html entities
 222     html = unescapeHTML(html)
 223     return html.strip()
 224
 225
 226 def sanitize_open(filename, open_mode):
 227     """Try to open the given filename, and slightly tweak it if this fails.
 228
 229     Attempts to open the given filename. If this fails, it tries to change
 230     the filename slightly, step by step, until it's either able to open it
 231     or it fails and raises a final exception, like the standard open()
 232     function.
 233
 234     It returns the tuple (stream, definitive_file_name).
 235     """
 236     try:
 237         if filename == '-':
 238             if sys.platform == 'win32':
 239                 import msvcrt
 240                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 241             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 242         stream = open(encodeFilename(filename), open_mode)
 243         return (stream, filename)
 244     except (IOError, OSError) as err:
 245         if err.errno in (errno.EACCES,):
 246             raise
 247
 248         # In case of error, try to remove win32 forbidden chars
 249         alt_filename = os.path.join(
 250             re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
 251             for path_part in os.path.split(filename)
 252         )
 253         if alt_filename == filename:
 254             raise
 255         else:
 256             # An exception here should be caught in the caller
 257             stream = open(encodeFilename(filename), open_mode)
 258             return (stream, alt_filename)
 259
 260
 261 def timeconvert(timestr):
 262     """Convert RFC 2822 defined time string into system timestamp"""
 263     timestamp = None
 264     timetuple = email.utils.parsedate_tz(timestr)
 265     if timetuple is not None:
 266         timestamp = email.utils.mktime_tz(timetuple)
 267     return timestamp
 268
 269
 270 def sanitize_filename(s, restricted=False, is_id=False):
 271     """Sanitizes a string so it could be used as part of a filename.
 272     If restricted is set, use a stricter subset of allowed characters.
 273     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 274     """
 275     def replace_insane(char):
 276         if char == '?' or ord(char) < 32 or ord(char) == 127:
 277             return ''
 278         elif char == '"':
 279             return '' if restricted else '\''
 280         elif char == ':':
 281             return '_-' if restricted else ' -'
 282         elif char in '\\/|*<>':
 283             return '_'
 284         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 285             return '_'
 286         if restricted and ord(char) > 127:
 287             return '_'
 288         return char
 289
 290     # Handle timestamps
 291     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 292     result = ''.join(map(replace_insane, s))
 293     if not is_id:
 294         while '__' in result:
 295             result = result.replace('__', '_')
 296         result = result.strip('_')
 297         # Common case of "Foreign band name - English song title"
 298         if restricted and result.startswith('-_'):
 299             result = result[2:]
 300         if not result:
 301             result = '_'
 302     return result
 303
 304
 305 def orderedSet(iterable):
 306     """ Remove all duplicates from the input iterable """
 307     res = []
 308     for el in iterable:
 309         if el not in res:
 310             res.append(el)
 311     return res
 312
 313
 314 def _htmlentity_transform(entity):
 315     """Transforms an HTML entity to a character."""
 316     # Known non-numeric HTML entity
 317     if entity in compat_html_entities.name2codepoint:
 318         return compat_chr(compat_html_entities.name2codepoint[entity])
 319
 320     mobj = re.match(r'#(x?[0-9]+)', entity)
 321     if mobj is not None:
 322         numstr = mobj.group(1)
 323         if numstr.startswith('x'):
 324             base = 16
 325             numstr = '0%s' % numstr
 326         else:
 327             base = 10
 328         return compat_chr(int(numstr, base))
 329
 330     # Unknown entity in name, return its literal representation
 331     return ('&%s;' % entity)
 332
 333
 334 def unescapeHTML(s):
 335     if s is None:
 336         return None
 337     assert type(s) == compat_str
 338
 339     return re.sub(
 340         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 341
 342
 343 def encodeFilename(s, for_subprocess=False):
 344     """
 345     @param s The name of the file
 346     """
 347
 348     assert type(s) == compat_str
 349
 350     # Python 3 has a Unicode API
 351     if sys.version_info >= (3, 0):
 352         return s
 353
 354     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 355         # Pass '' directly to use Unicode APIs on Windows 2000 and up
 356         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 357         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 358         if not for_subprocess:
 359             return s
 360         else:
 361             # For subprocess calls, encode with locale encoding
 362             # Refer to http://stackoverflow.com/a/9951851/35070
 363             encoding = preferredencoding()
 364     else:
 365         encoding = sys.getfilesystemencoding()
 366     if encoding is None:
 367         encoding = 'utf-8'
 368     return s.encode(encoding, 'ignore')
 369
 370
 371 def encodeArgument(s):
 372     if not isinstance(s, compat_str):
 373         # Legacy code that uses byte strings
 374         # Uncomment the following line after fixing all post processors
 375         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 376         s = s.decode('ascii')
 377     return encodeFilename(s, True)
 378
 379
 380 def decodeOption(optval):
 381     if optval is None:
 382         return optval
 383     if isinstance(optval, bytes):
 384         optval = optval.decode(preferredencoding())
 385
 386     assert isinstance(optval, compat_str)
 387     return optval
 388
 389
 390 def formatSeconds(secs):
 391     if secs > 3600:
 392         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 393     elif secs > 60:
 394         return '%d:%02d' % (secs // 60, secs % 60)
 395     else:
 396         return '%d' % secs
 397
 398
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler suited to the running Python version.

    params is the options dict; its 'nocheckcertificate' entry (default
    False) disables certificate verification. Extra keyword arguments are
    forwarded to YoutubeDLHTTPSHandler.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname must be switched off before verify_mode can be
            # set to CERT_NONE
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    # Reached when create_default_context is missing, or when the handler
    # rejected the context= argument above.
    if sys.version_info < (3, 2):
        # No SSL context is passed to the handler on these versions
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python >= 3.2 that did not return above
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 422
 423
 424 class ExtractorError(Exception):
 425     """Error during info extraction."""
 426
 427     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 428         """ tb, if given, is the original traceback (so that it can be printed out).
 429         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 430         """
 431
 432         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 433             expected = True
 434         if video_id is not None:
 435             msg = video_id + ': ' + msg
 436         if cause:
 437             msg += ' (caused by %r)' % cause
 438         if not expected:
 439             if ytdl_is_updateable():
 440                 update_cmd = 'type  youtube-dl -U  to update'
 441             else:
 442                 update_cmd = 'see  https://yt-dl.org/update  on how to update'
 443             msg += '; please report this issue on https://yt-dl.org/bug .'
 444             msg += ' Make sure you are using the latest version; %s.' % update_cmd
 445             msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 446         super(ExtractorError, self).__init__(msg)
 447
 448         self.traceback = tb
 449         self.exc_info = sys.exc_info()  # preserve original exception
 450         self.cause = cause
 451         self.video_id = video_id
 452
 453     def format_traceback(self):
 454         if self.traceback is None:
 455             return None
 456         return ''.join(traceback.format_tb(self.traceback))
 457
 458
 459 class UnsupportedError(ExtractorError):
 460     def __init__(self, url):
 461         super(UnsupportedError, self).__init__(
 462             'Unsupported URL: %s' % url, expected=True)
 463         self.url = url
 464
 465
 466 class RegexNotFoundError(ExtractorError):
 467     """Error when a regex didn't match"""
 468     pass
 469
 470
 471 class DownloadError(Exception):
 472     """Download Error exception.
 473
 474     This exception may be thrown by FileDownloader objects if they are not
 475     configured to continue on errors. They will contain the appropriate
 476     error message.
 477     """
 478
 479     def __init__(self, msg, exc_info=None):
 480         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 481         super(DownloadError, self).__init__(msg)
 482         self.exc_info = exc_info
 483
 484
 485 class SameFileError(Exception):
 486     """Same File exception.
 487
 488     This exception will be thrown by FileDownloader objects if they detect
 489     multiple files would have to be downloaded to the same file on disk.
 490     """
 491     pass
 492
 493
 494 class PostProcessingError(Exception):
 495     """Post Processing exception.
 496
 497     This exception may be raised by PostProcessor's .run() method to
 498     indicate an error in the postprocessing task.
 499     """
 500
 501     def __init__(self, msg):
 502         self.msg = msg
 503
 504
 505 class MaxDownloadsReached(Exception):
 506     """ --max-downloads limit has been reached. """
 507     pass
 508
 509
 510 class UnavailableVideoError(Exception):
 511     """Unavailable Format exception.
 512
 513     This exception will be thrown when a video is requested
 514     in a format that is not available for that video.
 515     """
 516     pass
 517
 518
 519 class ContentTooShortError(Exception):
 520     """Content Too Short exception.
 521
 522     This exception may be raised by FileDownloader objects when a file they
 523     download is too small for what the server announced first, indicating
 524     the connection was probably interrupted.
 525     """
 526     # Both in bytes
 527     downloaded = None
 528     expected = None
 529
 530     def __init__(self, downloaded, expected):
 531         self.downloaded = downloaded
 532         self.expected = expected
 533
 534
 535 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 536     hc = http_class(*args, **kwargs)
 537     source_address = ydl_handler._params.get('source_address')
 538     if source_address is not None:
 539         sa = (source_address, 0)
 540         if hasattr(hc, 'source_address'):  # Python 2.7+
 541             hc.source_address = sa
 542         else:  # Python 2.6
 543             def _hc_connect(self, *args, **kwargs):
 544                 sock = compat_socket_create_connection(
 545                     (self.host, self.port), self.timeout, sa)
 546                 if is_https:
 547                     self.sock = ssl.wrap_socket(
 548                         sock, self.key_file, self.cert_file,
 549                         ssl_version=ssl.PROTOCOL_TLSv1)
 550                 else:
 551                     self.sock = sock
 552             hc.connect = functools.partial(_hc_connect, hc)
 553
 554     return hc
 555
 556
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Store params (the options dict) and initialise the base handler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req using connections built by _create_http_connection."""
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a zlib-wrapped one second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code.

        When addinfourl has no getcode attribute, the object is created
        without the code argument and the code is set on it afterwards.
        """
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to req and apply the Youtubedl-* pseudo headers."""
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        # Youtubedl-no-compression: drop Accept-encoding, then the marker itself
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Youtubedl-user-agent: its value replaces the User-agent header
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            # (cut the fragment off the request's private URL attributes)
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate encoded body replaced by the decoded data."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; if no attempt
                # succeeds, re-raise the error from the first attempt.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS requests and responses get the same treatment
    https_request = http_request
    https_response = http_response
 653
 654
 655 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 656     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 657         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 658         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 659         self._params = params
 660
 661     def https_open(self, req):
 662         return self.do_open(functools.partial(
 663             _create_http_connection, self, self._https_conn_class, True),
 664             req)
 665
 666
 667 def parse_iso8601(date_str, delimiter='T'):
 668     """ Return a UNIX timestamp from the given date """
 669
 670     if date_str is None:
 671         return None
 672
 673     m = re.search(
 674         r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 675         date_str)
 676     if not m:
 677         timezone = datetime.timedelta()
 678     else:
 679         date_str = date_str[:-len(m.group(0))]
 680         if not m.group('sign'):
 681             timezone = datetime.timedelta()
 682         else:
 683             sign = 1 if m.group('sign') == '+' else -1
 684             timezone = datetime.timedelta(
 685                 hours=sign * int(m.group('hours')),
 686                 minutes=sign * int(m.group('minutes')))
 687     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 688     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 689     return calendar.timegm(dt.timetuple())
 690
 691
 692 def unified_strdate(date_str, day_first=True):
 693     """Return a string with the date in the format YYYYMMDD"""
 694
 695     if date_str is None:
 696         return None
 697     upload_date = None
 698     # Replace commas
 699     date_str = date_str.replace(',', ' ')
 700     # %z (UTC offset) is only supported in python>=3.2
 701     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 702     # Remove AM/PM + timezone
 703     date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
 704
 705     format_expressions = [
 706         '%d %B %Y',
 707         '%d %b %Y',
 708         '%B %d %Y',
 709         '%b %d %Y',
 710         '%b %dst %Y %I:%M%p',
 711         '%b %dnd %Y %I:%M%p',
 712         '%b %dth %Y %I:%M%p',
 713         '%Y %m %d',
 714         '%Y-%m-%d',
 715         '%Y/%m/%d',
 716         '%Y/%m/%d %H:%M:%S',
 717         '%Y-%m-%d %H:%M:%S',
 718         '%Y-%m-%d %H:%M:%S.%f',
 719         '%d.%m.%Y %H:%M',
 720         '%d.%m.%Y %H.%M',
 721         '%Y-%m-%dT%H:%M:%SZ',
 722         '%Y-%m-%dT%H:%M:%S.%fZ',
 723         '%Y-%m-%dT%H:%M:%S.%f0Z',
 724         '%Y-%m-%dT%H:%M:%S',
 725         '%Y-%m-%dT%H:%M:%S.%f',
 726         '%Y-%m-%dT%H:%M',
 727     ]
 728     if day_first:
 729         format_expressions.extend([
 730             '%d.%m.%Y',
 731             '%d/%m/%Y',
 732             '%d/%m/%y',
 733             '%d/%m/%Y %H:%M:%S',
 734         ])
 735     else:
 736         format_expressions.extend([
 737             '%m.%d.%Y',
 738             '%m/%d/%Y',
 739             '%m/%d/%y',
 740             '%m/%d/%Y %H:%M:%S',
 741         ])
 742     for expression in format_expressions:
 743         try:
 744             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 745         except ValueError:
 746             pass
 747     if upload_date is None:
 748         timetuple = email.utils.parsedate_tz(date_str)
 749         if timetuple:
 750             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 751     return upload_date
 752
 753
 754 def determine_ext(url, default_ext='unknown_video'):
 755     if url is None:
 756         return default_ext
 757     guess = url.partition('?')[0].rpartition('.')[2]
 758     if re.match(r'^[A-Za-z0-9]+$', guess):
 759         return guess
 760     else:
 761         return default_ext
 762
 763
 764 def subtitles_filename(filename, sub_lang, sub_format):
 765     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 766
 767
 768 def date_from_str(date_str):
 769     """
 770     Return a datetime object from a string in the format YYYYMMDD or
 771     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 772     today = datetime.date.today()
 773     if date_str in ('now', 'today'):
 774         return today
 775     if date_str == 'yesterday':
 776         return today - datetime.timedelta(days=1)
 777     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 778     if match is not None:
 779         sign = match.group('sign')
 780         time = int(match.group('time'))
 781         if sign == '-':
 782             time = -time
 783         unit = match.group('unit')
 784         # A bad aproximation?
 785         if unit == 'month':
 786             unit = 'day'
 787             time *= 30
 788         elif unit == 'year':
 789             unit = 'day'
 790             time *= 365
 791         unit += 's'
 792         delta = datetime.timedelta(**{unit: time})
 793         return today + delta
 794     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 795
 796
 797 def hyphenate_date(date_str):
 798     """
 799     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 800     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 801     if match is not None:
 802         return '-'.join(match.groups())
 803     else:
 804         return date_str
 805
 806
 807 class DateRange(object):
 808     """Represents a time interval between two dates"""
 809
 810     def __init__(self, start=None, end=None):
 811         """start and end must be strings in the format accepted by date"""
 812         if start is not None:
 813             self.start = date_from_str(start)
 814         else:
 815             self.start = datetime.datetime.min.date()
 816         if end is not None:
 817             self.end = date_from_str(end)
 818         else:
 819             self.end = datetime.datetime.max.date()
 820         if self.start > self.end:
 821             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 822
 823     @classmethod
 824     def day(cls, day):
 825         """Returns a range that only contains the given day"""
 826         return cls(day, day)
 827
 828     def __contains__(self, date):
 829         """Check if the date is in the range"""
 830         if not isinstance(date, datetime.date):
 831             date = date_from_str(date)
 832         return self.start <= date <= self.end
 833
 834     def __str__(self):
 835         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 836
 837
 838 def platform_name():
 839     """ Returns the platform name as a compat_str """
 840     res = platform.platform()
 841     if isinstance(res, bytes):
 842         res = res.decode(preferredencoding())
 843
 844     assert isinstance(res, compat_str)
 845     return res
 846
 847
def _windows_write_string(s, out):
    """ Write the text s to the Windows console behind the stream out.

    Returns True if the string was written using special methods,
    False if it has yet to be written out (out has no usable fileno,
    is neither file descriptor 1 nor 2, or its handle is not a console).
    Raises OSError when WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> nStdHandle value handed to GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE and -12 is STD_ERROR_HANDLE in the Win32 API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Bind the kernel32 functions by name with explicit prototypes
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the count reported back by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A missing or invalid handle can never be a console
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        # The FILE_TYPE_REMOTE bit is masked out before comparing with
        # FILE_TYPE_CHAR; a handle for which GetConsoleMode fails (returns 0)
        # is treated as not being a console either
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call and stop in front of the
        # next non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; a length of 2 is
        # passed for it - presumably its two UTF-16 code units (TODO confirm)
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue with whatever has not been reported as written
            s = s[written.value:]
    return True
 921
 922
 923 def write_string(s, out=None, encoding=None):
 924     if out is None:
 925         out = sys.stderr
 926     assert type(s) == compat_str
 927
 928     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 929         if _windows_write_string(s, out):
 930             return
 931
 932     if ('b' in getattr(out, 'mode', '') or
 933             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 934         byt = s.encode(encoding or preferredencoding(), 'ignore')
 935         out.write(byt)
 936     elif hasattr(out, 'buffer'):
 937         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 938         byt = s.encode(enc, 'ignore')
 939         out.buffer.write(byt)
 940     else:
 941         out.write(s)
 942     out.flush()
 943
 944
 945 def bytes_to_intlist(bs):
 946     if not bs:
 947         return []
 948     if isinstance(bs[0], int):  # Python 3
 949         return list(bs)
 950     else:
 951         return [ord(c) for c in bs]
 952
 953
 954 def intlist_to_bytes(xs):
 955     if not xs:
 956         return b''
 957     return struct_pack('%dB' % len(xs), *xs)
 958
 959
 960 # Cross-platform file locking
 961 if sys.platform == 'win32':
 962     import ctypes.wintypes
 963     import msvcrt
 964
 965     class OVERLAPPED(ctypes.Structure):
 966         _fields_ = [
 967             ('Internal', ctypes.wintypes.LPVOID),
 968             ('InternalHigh', ctypes.wintypes.LPVOID),
 969             ('Offset', ctypes.wintypes.DWORD),
 970             ('OffsetHigh', ctypes.wintypes.DWORD),
 971             ('hEvent', ctypes.wintypes.HANDLE),
 972         ]
 973
 974     kernel32 = ctypes.windll.kernel32
 975     LockFileEx = kernel32.LockFileEx
 976     LockFileEx.argtypes = [
 977         ctypes.wintypes.HANDLE,     # hFile
 978         ctypes.wintypes.DWORD,      # dwFlags
 979         ctypes.wintypes.DWORD,      # dwReserved
 980         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
 981         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
 982         ctypes.POINTER(OVERLAPPED)  # Overlapped
 983     ]
 984     LockFileEx.restype = ctypes.wintypes.BOOL
 985     UnlockFileEx = kernel32.UnlockFileEx
 986     UnlockFileEx.argtypes = [
 987         ctypes.wintypes.HANDLE,     # hFile
 988         ctypes.wintypes.DWORD,      # dwReserved
 989         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
 990         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
 991         ctypes.POINTER(OVERLAPPED)  # Overlapped
 992     ]
 993     UnlockFileEx.restype = ctypes.wintypes.BOOL
 994     whole_low = 0xffffffff
 995     whole_high = 0x7fffffff
 996
 997     def _lock_file(f, exclusive):
 998         overlapped = OVERLAPPED()
 999         overlapped.Offset = 0
1000         overlapped.OffsetHigh = 0
1001         overlapped.hEvent = 0
1002         f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1003         handle = msvcrt.get_osfhandle(f.fileno())
1004         if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1005                           whole_low, whole_high, f._lock_file_overlapped_p):
1006             raise OSError('Locking file failed: %r' % ctypes.FormatError())
1007
1008     def _unlock_file(f):
1009         assert f._lock_file_overlapped_p
1010         handle = msvcrt.get_osfhandle(f.fileno())
1011         if not UnlockFileEx(handle, 0,
1012                             whole_low, whole_high, f._lock_file_overlapped_p):
1013             raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1014
1015 else:
1016     import fcntl
1017
1018     def _lock_file(f, exclusive):
1019         fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1020
1021     def _unlock_file(f):
1022         fcntl.flock(f, fcntl.LOCK_UN)
1023
1024
class locked_file(object):
    """File wrapper that holds a lock on the file while used as a context manager.

    The file is opened immediately by the constructor.  Entering the context
    takes the lock (shared for mode 'r', exclusive for 'a' and 'w'); leaving
    it releases the lock and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 implementation of _lock_file raises OSError, which is
            # not an IOError on Python 2; close the file in both cases so a
            # failed lock does not leak the open handle.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1054
1055
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1059
1060
def shell_quote(args):
    """Return args joined into a single shell-escaped command line string.

    Byte string arguments are decoded with the filesystem encoding first.
    Quoting goes through shlex_quote, the same helper args_to_str uses,
    instead of the undocumented pipes.quote.
    """
    encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join(shlex_quote(as_text(a)) for a in args)
1070
1071
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the last evaluated element
        (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1079
1080
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized to JSON and appended to url as a URL-encoded
    '__youtubedl_smuggle' fragment. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse.urlencode(payload)
1087
1088
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    URLs without a '#__youtubedl_smuggle' fragment are returned unchanged,
    paired with default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        smuggled = compat_parse_qs(sdata)['__youtubedl_smuggle']
        return url, json.loads(smuggled[0])
    return smug_url, default
1096
1097
def format_bytes(bytes):
    """Format a byte count as a human-readable string such as '1.50MiB'.

    bytes may be a number, a numeric string or None (rendered as 'N/A').
    Values below one byte are shown in 'B' and values beyond the largest
    known unit are shown in 'YiB', rather than picking a wrong suffix or
    failing with IndexError.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, (str, compat_str)):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the exponent inside the suffix table: a negative exponent
        # would silently index from the end of the list, a too large one
        # would raise IndexError.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1110
1111
def parse_filesize(s):
    """Parse a size such as '5 KiB' or '1,5MB' into a number of bytes.

    The string must start with a number (',' or '.' as decimal separator)
    followed by a known unit.  Returns None for None or when there is no
    such match.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary, decimal = 1024 ** power, 1000 ** power
        unit_table[prefix + 'iB'] = binary
        unit_table[prefix + 'B'] = decimal
        unit_table[prefix.lower() + 'B'] = binary
        unit_table[prefix + 'b'] = decimal

    units_re = '|'.join(re.escape(unit) for unit in unit_table)
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re
    found = re.match(pattern, s)
    if found is None:
        return None

    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1164
1165
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be found.

    The COLUMNS environment variable is used when it holds an integer;
    otherwise the second field of the output of `stty size` is used.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            # COLUMNS is not a number; fall back to asking the terminal
            pass

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (no stty, no terminal, unexpected output), but
        # unlike a bare except this lets KeyboardInterrupt and SystemExit
        # through
        return None
1180
1181
def month_by_name(name):
    """ Return the number (1-12) of the month with the given English name,
    independently of the locale, or None if name is not such a name """

    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
1192
1193
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in XML by '&amp;'.

    Ampersands that already start one of the entities listed in the
    pattern (named, decimal or hexadecimal) are left alone.
    """
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1200
1201
def setproctitle(title):
    """Try to set the process name to title through prctl in libc.so.6.

    Does nothing when libc.so.6 cannot be loaded or has no prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes ctypes allocate one extra
    # item for the terminating NUL.  A buffer of exactly len(title_bytes)
    # has no terminator, so prctl could read past its end.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1215
1216
def remove_start(s, start):
    """Return s without the prefix start; s is returned as-is if absent."""
    return s[len(start):] if s.startswith(start) else s
1221
1222
def remove_end(s, end):
    """Return s without the suffix end; s is returned as-is if absent.

    An empty end leaves s untouched: s[:-0] would otherwise yield ''.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1227
1228
def url_basename(url):
    """Return the last segment of the path of url ('' for an empty path)."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1232
1233
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        """Return the method name used for this request."""
        return 'HEAD'
1237
1238
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int scaled by invscale // scale, or return default.

    default is returned when v is None or ''.  With get_attr, the attribute
    of that name is read from v first (a missing attribute counts as None).
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1246
1247
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1250
1251
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: ',', '.' and '+' characters
    are removed from int_str before it is converted. None stays None. """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1258
1259
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1262
1263
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Handles forms such as '1:23:45', '3 min', '2 hours', '1h 2m 3s' and
    'PT1M30S'.  Returns None when s is not a string or does not match.
    """
    string_type = basestring if sys.version_info < (3, 0) else compat_str
    if not isinstance(s, string_type):
        return None

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if not m:
        return None

    # A lone (possibly fractional) amount of minutes or hours
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    total = 0
    for part, seconds_per_unit in (('secs', 1), ('mins', 60), ('hours', 60 * 60)):
        if m.group(part):
            total += int(m.group(part)) * seconds_per_unit
    if m.group('ms'):
        # The group includes the leading '.', so this adds the fraction
        total += float(m.group('ms'))
    return total
1298
1299
def prepend_extension(filename, ext):
    """Insert ext in front of the existing extension of filename.

    'video.mp4' with 'temp' becomes 'video.temp.mp4'.
    """
    stem, current_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, current_ext)
1303
1304
def check_executable(exe, args=[]):
    """ Checks if the given binary can be started, and returns its name
    (False otherwise).
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    else:
        return exe
1313
1314
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Runs the executable with args and returns the version found in its
    combined stdout/stderr output by detect_exe_version,
    or False if the executable is not present """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = process.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1328
1329
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of a program.

    The first group of version_re (by default whatever follows the word
    'version') is returned; unrecognized is returned if nothing matches.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1339
1340
class PagedList(object):
    """Base of the paged lists; __len__ relies on a getslice() method."""

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1345
1346
class OnDemandPagedList(PagedList):
    """Paged list that asks pagefunc for one page at a time while a slice
    is being assembled, stopping as soon as no further page is needed."""

    def __init__(self, pagefunc, pagesize):
        # Called with a 0-based page number; the result is turned into a list
        self._pagefunc = pagefunc
        # Number of entries a full page is assumed to hold
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries from index start up to, but not including,
        index end as a list (end=None: until a page comes back short)."""
        res = []
        # Begin with the page that contains index start
        for pagenum in itertools.count(start // self._pagesize):
            # Overall indices of the first entry of this and of the next page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                # Page ends before start: nothing to take from it
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of start inside this page (0 on any later page)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset inside this page at which to stop, if end falls into it
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1388
1389
class InAdvancePagedList(PagedList):
    """Paged list for which the number of pages is given up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries from index start up to, but not including,
        index end as a list (end=None: through the last known page)."""
        collected = []
        first_page = start // self._pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start
        # Entries to drop from the front of the first fetched page
        to_skip = start - first_page * self._pagesize
        for pagenum in range(first_page, stop_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested slice
                    collected.extend(page[:remaining])
                    break
                remaining -= len(page)
            collected.extend(page)
        return collected
1417
1418
def uppercase_escape(s):
    """Replace every \\UXXXXXXXX escape in s by the character it denotes."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1425
1426
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    is_py2_unicode = sys.version_info < (3, 0) and isinstance(s, unicode)
    if is_py2_unicode:
        s = s.encode('utf-8')
    # Second argument: characters that are left unescaped
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1432
1433
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Only the path, params, query and fragment parts are escaped."""
    parts = compat_urllib_parse_urlparse(url)
    escaped_path = escape_rfc3986(parts.path)
    escaped_params = escape_rfc3986(parts.params)
    escaped_query = escape_rfc3986(parts.query)
    escaped_fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=escaped_path,
        params=escaped_params,
        query=escaped_query,
        fragment=escaped_fragment)
    return escaped.geturl()
1443
# struct_pack / struct_unpack: struct.pack / struct.unpack that also accept a
# text format string on interpreters whose struct module rejects it
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(spec, *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
1460
1461
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file and close it.

    batch_fd is iterated line by line.  Byte lines are decoded as UTF-8, a
    leading byte order mark and surrounding whitespace are removed, and
    empty lines as well as lines starting with '#', ';' or ']' are skipped.
    Returns the remaining lines as a list.
    """
    # '\ufeff' is a properly decoded byte order mark; '\xef\xbb\xbf' is what
    # the three UTF-8 BOM bytes turn into when each byte was decoded as a
    # character of its own.
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1476
1477
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode and return ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1480
1481
# etree_iter(element): Element.iter where the interpreter has it
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1486
1487
def parse_xml(s):
    """Parse the XML text s and return its root element.

    Doctype declarations are ignored.  On Python 2, byte string element
    texts are decoded as UTF-8 afterwards.
    """
    etree = xml.etree.ElementTree

    class DoctypeIgnoringBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=DoctypeIgnoringBuilder())
    if sys.version_info >= (2, 7):
        tree = etree.XML(s.encode('utf-8'), parser=parser)
    else:
        tree = etree.XML(s.encode('utf-8'))
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None and not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
    return tree
1503
1504
# Age limits that parse_age_limit returns for these rating labels
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Turn an age string ('18', '16+') or a US_RATINGS label into a number.

    Returns None for None and for anything that is not recognized.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s)
1519
1520
def strip_jsonp(code):
    """Strip a JSONP wrapper such as 'callback(...);' from code.

    Text that does not look like a JSONP call is returned unchanged.
    """
    jsonp_call = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_call, r'\1', code)
1524
1525
def js_to_json(code):
    """Rewrite a JavaScript object literal so that it reads as JSON.

    Single-quoted strings and bare identifiers become double-quoted strings
    (true, false and null are kept), and a comma directly in front of a
    closing ']' is dropped.
    """
    # Escape sequences of a single-quoted string -> double-quoted equivalent
    requoted = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def requote(m):
        return requoted[m.group(0)]

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null') or v.startswith('"'):
            return v
        if v.startswith("'"):
            v = re.sub(r"\\\\|\\'|\"", requote, v[1:-1])
        return '"%s"' % v

    def keep_bracket(m):
        return m.group(1)

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', keep_bracket, res)
1549
1550
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps a quality id to its position in quality_ids,
    or to -1 when the id is not listed. """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
1559
1560
# %-style template with the named fields 'title', 'id' and 'ext';
# presumably the default output file name template (verify against its users)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1562
1563
def limit_length(s, length):
    """ Shorten s to length characters, ending in '...', if it is longer
    than that. None is passed through. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
1572
1573
def version_tuple(v):
    """Split the version string v at '-' and '.' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
1576
1577
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    When version is empty or either version cannot be parsed, the answer is
    the opposite of assume_new.
    """
    if not version:
        return not assume_new
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        outdated = not assume_new
    return outdated
1585
1586
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U, i.e. whether this module
    was loaded by a zipimporter or sys has a 'frozen' attribute """
    from zipimport import zipimporter

    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return loaded_from_zip or hasattr(sys, 'frozen')
1592
1593
def args_to_str(args):
    """Get a short string representation for a subprocess command"""
    return ' '.join(map(shlex_quote, args))
1597
1598
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of an opened URL.

    The filename of a Content-Disposition attachment is preferred.
    Otherwise the subtype of the Content-Type header is returned, without
    parameters such as '; charset=utf-8'.  Returns None when neither header
    yields an extension.
    """
    try:
        # .get yields None for an absent header instead of raising KeyError
        getheader = url_handle.headers.get
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    content_type = getheader('Content-Type')
    if not content_type:
        return None
    # Drop the parameters before taking the part after the '/'
    mime_type = content_type.split(';')[0].strip()
    parts = mime_type.split('/')
    return parts[1] if len(parts) > 1 else None
1615
1616
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked, i.e. both limits are
    set and age_limit is below content_limit """

    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
1625
1626
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    The bytes are decoded according to a leading byte order mark (UTF-8
    otherwise); the result is the match of optional whitespace followed by
    '<' at the start of the text, or None. """

    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # First entry whose mark starts the data; the list order matters because
    # the UTF-16 LE mark is a prefix of the UTF-32 LE one
    detected = next(
        ((bom, enc) for bom, enc in BOMS if first_bytes.startswith(bom)),
        None)
    if detected is None:
        text = first_bytes.decode('utf-8', 'replace')
    else:
        bom, enc = detected
        text = first_bytes[len(bom):].decode(enc, 'replace')

    return re.match(r'^\s*<', text)
1645
1646
def determine_protocol(info_dict):
    """Return the protocol of the format described by info_dict.

    An explicit 'protocol' entry wins.  Otherwise it is derived from the
    start of the 'url' entry, then from its extension, and finally from the
    URL scheme.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    return compat_urllib_parse_urlparse(url).scheme