_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import gzip
  14 import itertools
  15 import io
  16 import json
  17 import locale
  18 import math
  19 import os
  20 import pipes
  21 import platform
  22 import re
  23 import ssl
  24 import socket
  25 import struct
  26 import subprocess
  27 import sys
  28 import tempfile
  29 import traceback
  30 import xml.etree.ElementTree
  31 import zlib
  32
  33 from .compat import (
  34     compat_chr,
  35     compat_getenv,
  36     compat_html_entities,
  37     compat_parse_qs,
  38     compat_str,
  39     compat_urllib_error,
  40     compat_urllib_parse,
  41     compat_urllib_parse_urlparse,
  42     compat_urllib_request,
  43     compat_urlparse,
  44     shlex_quote,
  45 )
  46
  47
  48 # This is not clearly defined otherwise
  49 compiled_regex_type = type(re.compile(''))
  50
  51 std_headers = {
  52     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
  53     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  54     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  55     'Accept-Encoding': 'gzip, deflate',
  56     'Accept-Language': 'en-us,en;q=0.5',
  57 }
  58
  59
  60 def preferredencoding():
  61     """Get preferred encoding.
  62
  63     Returns the best encoding scheme for the system, based on
  64     locale.getpreferredencoding() and some further tweaks.
  65     """
  66     try:
  67         pref = locale.getpreferredencoding()
  68         'TEST'.encode(pref)
  69     except:
  70         pref = 'UTF-8'
  71
  72     return pref
  73
  74
  75 def write_json_file(obj, fn):
  76     """ Encode obj as JSON and write it to fn, atomically if possible """
  77
  78     fn = encodeFilename(fn)
  79     if sys.version_info < (3, 0) and sys.platform != 'win32':
  80         encoding = get_filesystem_encoding()
  81         # os.path.basename returns a bytes object, but NamedTemporaryFile
  82         # will fail if the filename contains non ascii characters unless we
  83         # use a unicode object
  84         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  85         # the same for os.path.dirname
  86         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  87     else:
  88         path_basename = os.path.basename
  89         path_dirname = os.path.dirname
  90
  91     args = {
  92         'suffix': '.tmp',
  93         'prefix': path_basename(fn) + '.',
  94         'dir': path_dirname(fn),
  95         'delete': False,
  96     }
  97
  98     # In Python 2.x, json.dump expects a bytestream.
  99     # In Python 3.x, it writes to a character stream
 100     if sys.version_info < (3, 0):
 101         args['mode'] = 'wb'
 102     else:
 103         args.update({
 104             'mode': 'w',
 105             'encoding': 'utf-8',
 106         })
 107
 108     tf = tempfile.NamedTemporaryFile(**args)
 109
 110     try:
 111         with tf:
 112             json.dump(obj, tf)
 113         if sys.platform == 'win32':
 114             # Need to remove existing file on Windows, else os.rename raises
 115             # WindowsError or FileExistsError.
 116             try:
 117                 os.unlink(fn)
 118             except OSError:
 119                 pass
 120         os.rename(tf.name, fn)
 121     except:
 122         try:
 123             os.remove(tf.name)
 124         except OSError:
 125             pass
 126         raise
 127
 128
 129 if sys.version_info >= (2, 7):
 130     def find_xpath_attr(node, xpath, key, val):
 131         """ Find the xpath xpath[@key=val] """
 132         assert re.match(r'^[a-zA-Z-]+$', key)
 133         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 134         expr = xpath + "[@%s='%s']" % (key, val)
 135         return node.find(expr)
 136 else:
 137     def find_xpath_attr(node, xpath, key, val):
 138         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 139         # .//node does not match if a node is a direct child of . !
 140         if isinstance(xpath, unicode):
 141             xpath = xpath.encode('ascii')
 142
 143         for f in node.findall(xpath):
 144             if f.attrib.get(key) == val:
 145                 return f
 146         return None
 147
 148 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 149 # the namespace parameter
 150
 151
 152 def xpath_with_ns(path, ns_map):
 153     components = [c.split(':') for c in path.split('/')]
 154     replaced = []
 155     for c in components:
 156         if len(c) == 1:
 157             replaced.append(c[0])
 158         else:
 159             ns, tag = c
 160             replaced.append('{%s}%s' % (ns_map[ns], tag))
 161     return '/'.join(replaced)
 162
 163
 164 def xpath_text(node, xpath, name=None, fatal=False):
 165     if sys.version_info < (2, 7):  # Crazy 2.6
 166         xpath = xpath.encode('ascii')
 167
 168     n = node.find(xpath)
 169     if n is None or n.text is None:
 170         if fatal:
 171             name = xpath if name is None else name
 172             raise ExtractorError('Could not find XML element %s' % name)
 173         else:
 174             return None
 175     return n.text
 176
 177
 178 def get_element_by_id(id, html):
 179     """Return the content of the tag with the specified ID in the passed HTML document"""
 180     return get_element_by_attribute("id", id, html)
 181
 182
 183 def get_element_by_attribute(attribute, value, html):
 184     """Return the content of the tag with the specified attribute in the passed HTML document"""
 185
 186     m = re.search(r'''(?xs)
 187         <([a-zA-Z0-9:._-]+)
 188          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 189          \s+%s=['"]?%s['"]?
 190          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 191         \s*>
 192         (?P<content>.*?)
 193         </\1>
 194     ''' % (re.escape(attribute), re.escape(value)), html)
 195
 196     if not m:
 197         return None
 198     res = m.group('content')
 199
 200     if res.startswith('"') or res.startswith("'"):
 201         res = res[1:-1]
 202
 203     return unescapeHTML(res)
 204
 205
 206 def clean_html(html):
 207     """Clean an HTML snippet into a readable string"""
 208
 209     if html is None:  # Convenience for sanitizing descriptions etc.
 210         return html
 211
 212     # Newline vs <br />
 213     html = html.replace('\n', ' ')
 214     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 215     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 216     # Strip html tags
 217     html = re.sub('<.*?>', '', html)
 218     # Replace html entities
 219     html = unescapeHTML(html)
 220     return html.strip()
 221
 222
 223 def sanitize_open(filename, open_mode):
 224     """Try to open the given filename, and slightly tweak it if this fails.
 225
 226     Attempts to open the given filename. If this fails, it tries to change
 227     the filename slightly, step by step, until it's either able to open it
 228     or it fails and raises a final exception, like the standard open()
 229     function.
 230
 231     It returns the tuple (stream, definitive_file_name).
 232     """
 233     try:
 234         if filename == '-':
 235             if sys.platform == 'win32':
 236                 import msvcrt
 237                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 238             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 239         stream = open(encodeFilename(filename), open_mode)
 240         return (stream, filename)
 241     except (IOError, OSError) as err:
 242         if err.errno in (errno.EACCES,):
 243             raise
 244
 245         # In case of error, try to remove win32 forbidden chars
 246         alt_filename = os.path.join(
 247             re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
 248             for path_part in os.path.split(filename)
 249         )
 250         if alt_filename == filename:
 251             raise
 252         else:
 253             # An exception here should be caught in the caller
 254             stream = open(encodeFilename(filename), open_mode)
 255             return (stream, alt_filename)
 256
 257
 258 def timeconvert(timestr):
 259     """Convert RFC 2822 defined time string into system timestamp"""
 260     timestamp = None
 261     timetuple = email.utils.parsedate_tz(timestr)
 262     if timetuple is not None:
 263         timestamp = email.utils.mktime_tz(timetuple)
 264     return timestamp
 265
 266
 267 def sanitize_filename(s, restricted=False, is_id=False):
 268     """Sanitizes a string so it could be used as part of a filename.
 269     If restricted is set, use a stricter subset of allowed characters.
 270     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 271     """
 272     def replace_insane(char):
 273         if char == '?' or ord(char) < 32 or ord(char) == 127:
 274             return ''
 275         elif char == '"':
 276             return '' if restricted else '\''
 277         elif char == ':':
 278             return '_-' if restricted else ' -'
 279         elif char in '\\/|*<>':
 280             return '_'
 281         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 282             return '_'
 283         if restricted and ord(char) > 127:
 284             return '_'
 285         return char
 286
 287     result = ''.join(map(replace_insane, s))
 288     if not is_id:
 289         while '__' in result:
 290             result = result.replace('__', '_')
 291         result = result.strip('_')
 292         # Common case of "Foreign band name - English song title"
 293         if restricted and result.startswith('-_'):
 294             result = result[2:]
 295         if not result:
 296             result = '_'
 297     return result
 298
 299
 300 def orderedSet(iterable):
 301     """ Remove all duplicates from the input iterable """
 302     res = []
 303     for el in iterable:
 304         if el not in res:
 305             res.append(el)
 306     return res
 307
 308
 309 def _htmlentity_transform(entity):
 310     """Transforms an HTML entity to a character."""
 311     # Known non-numeric HTML entity
 312     if entity in compat_html_entities.name2codepoint:
 313         return compat_chr(compat_html_entities.name2codepoint[entity])
 314
 315     mobj = re.match(r'#(x?[0-9]+)', entity)
 316     if mobj is not None:
 317         numstr = mobj.group(1)
 318         if numstr.startswith('x'):
 319             base = 16
 320             numstr = '0%s' % numstr
 321         else:
 322             base = 10
 323         return compat_chr(int(numstr, base))
 324
 325     # Unknown entity in name, return its literal representation
 326     return ('&%s;' % entity)
 327
 328
 329 def unescapeHTML(s):
 330     if s is None:
 331         return None
 332     assert type(s) == compat_str
 333
 334     return re.sub(
 335         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 336
 337
 338 def encodeFilename(s, for_subprocess=False):
 339     """
 340     @param s The name of the file
 341     """
 342
 343     assert type(s) == compat_str
 344
 345     # Python 3 has a Unicode API
 346     if sys.version_info >= (3, 0):
 347         return s
 348
 349     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 350         # Pass '' directly to use Unicode APIs on Windows 2000 and up
 351         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 352         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 353         if not for_subprocess:
 354             return s
 355         else:
 356             # For subprocess calls, encode with locale encoding
 357             # Refer to http://stackoverflow.com/a/9951851/35070
 358             encoding = preferredencoding()
 359     else:
 360         encoding = sys.getfilesystemencoding()
 361     if encoding is None:
 362         encoding = 'utf-8'
 363     return s.encode(encoding, 'ignore')
 364
 365
 366 def encodeArgument(s):
 367     if not isinstance(s, compat_str):
 368         # Legacy code that uses byte strings
 369         # Uncomment the following line after fixing all post processors
 370         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 371         s = s.decode('ascii')
 372     return encodeFilename(s, True)
 373
 374
 375 def decodeOption(optval):
 376     if optval is None:
 377         return optval
 378     if isinstance(optval, bytes):
 379         optval = optval.decode(preferredencoding())
 380
 381     assert isinstance(optval, compat_str)
 382     return optval
 383
 384
 385 def formatSeconds(secs):
 386     if secs > 3600:
 387         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 388     elif secs > 60:
 389         return '%d:%02d' % (secs // 60, secs % 60)
 390     else:
 391         return '%d' % secs
 392
 393
 394 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 395     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 396         context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
 397         if opts_no_check_certificate:
 398             context.verify_mode = ssl.CERT_NONE
 399         try:
 400             return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 401         except TypeError:
 402             # Python 2.7.8
 403             # (create_default_context present but HTTPSHandler has no context=)
 404             pass
 405
 406     if sys.version_info < (3, 2):
 407         import httplib
 408
 409         class HTTPSConnectionV3(httplib.HTTPSConnection):
 410             def __init__(self, *args, **kwargs):
 411                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 412
 413             def connect(self):
 414                 sock = socket.create_connection((self.host, self.port), self.timeout)
 415                 if getattr(self, '_tunnel_host', False):
 416                     self.sock = sock
 417                     self._tunnel()
 418                 try:
 419                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
 420                 except ssl.SSLError:
 421                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 422
 423         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 424             def https_open(self, req):
 425                 return self.do_open(HTTPSConnectionV3, req)
 426         return HTTPSHandlerV3(**kwargs)
 427     else:  # Python < 3.4
 428         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 429         context.verify_mode = (ssl.CERT_NONE
 430                                if opts_no_check_certificate
 431                                else ssl.CERT_REQUIRED)
 432         context.set_default_verify_paths()
 433         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 434
 435
 436 class ExtractorError(Exception):
 437     """Error during info extraction."""
 438
 439     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 440         """ tb, if given, is the original traceback (so that it can be printed out).
 441         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 442         """
 443
 444         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 445             expected = True
 446         if video_id is not None:
 447             msg = video_id + ': ' + msg
 448         if cause:
 449             msg += ' (caused by %r)' % cause
 450         if not expected:
 451             if ytdl_is_updateable():
 452                 update_cmd = 'type  youtube-dl -U  to update'
 453             else:
 454                 update_cmd = 'see  https://yt-dl.org/update  on how to update'
 455             msg += '; please report this issue on https://yt-dl.org/bug .'
 456             msg += ' Make sure you are using the latest version; %s.' % update_cmd
 457             msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 458         super(ExtractorError, self).__init__(msg)
 459
 460         self.traceback = tb
 461         self.exc_info = sys.exc_info()  # preserve original exception
 462         self.cause = cause
 463         self.video_id = video_id
 464
 465     def format_traceback(self):
 466         if self.traceback is None:
 467             return None
 468         return ''.join(traceback.format_tb(self.traceback))
 469
 470
 471 class UnsupportedError(ExtractorError):
 472     def __init__(self, url):
 473         super(UnsupportedError, self).__init__(
 474             'Unsupported URL: %s' % url, expected=True)
 475         self.url = url
 476
 477
 478 class RegexNotFoundError(ExtractorError):
 479     """Error when a regex didn't match"""
 480     pass
 481
 482
 483 class DownloadError(Exception):
 484     """Download Error exception.
 485
 486     This exception may be thrown by FileDownloader objects if they are not
 487     configured to continue on errors. They will contain the appropriate
 488     error message.
 489     """
 490
 491     def __init__(self, msg, exc_info=None):
 492         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 493         super(DownloadError, self).__init__(msg)
 494         self.exc_info = exc_info
 495
 496
 497 class SameFileError(Exception):
 498     """Same File exception.
 499
 500     This exception will be thrown by FileDownloader objects if they detect
 501     multiple files would have to be downloaded to the same file on disk.
 502     """
 503     pass
 504
 505
 506 class PostProcessingError(Exception):
 507     """Post Processing exception.
 508
 509     This exception may be raised by PostProcessor's .run() method to
 510     indicate an error in the postprocessing task.
 511     """
 512
 513     def __init__(self, msg):
 514         self.msg = msg
 515
 516
 517 class MaxDownloadsReached(Exception):
 518     """ --max-downloads limit has been reached. """
 519     pass
 520
 521
 522 class UnavailableVideoError(Exception):
 523     """Unavailable Format exception.
 524
 525     This exception will be thrown when a video is requested
 526     in a format that is not available for that video.
 527     """
 528     pass
 529
 530
 531 class ContentTooShortError(Exception):
 532     """Content Too Short exception.
 533
 534     This exception may be raised by FileDownloader objects when a file they
 535     download is too small for what the server announced first, indicating
 536     the connection was probably interrupted.
 537     """
 538     # Both in bytes
 539     downloaded = None
 540     expected = None
 541
 542     def __init__(self, downloaded, expected):
 543         self.downloaded = downloaded
 544         self.expected = expected
 545
 546
 547 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 548     """Handler for HTTP requests and responses.
 549
 550     This class, when installed with an OpenerDirector, automatically adds
 551     the standard headers to every HTTP request and handles gzipped and
 552     deflated responses from web servers. If compression is to be avoided in
 553     a particular request, the original request in the program code only has
 554     to include the HTTP header "Youtubedl-No-Compression", which will be
 555     removed before making the real request.
 556
 557     Part of this code was copied from:
 558
 559     http://techknack.net/python-urllib2-handlers/
 560
 561     Andrew Rowls, the author of that code, agreed to release it to the
 562     public domain.
 563     """
 564
 565     @staticmethod
 566     def deflate(data):
 567         try:
 568             return zlib.decompress(data, -zlib.MAX_WBITS)
 569         except zlib.error:
 570             return zlib.decompress(data)
 571
 572     @staticmethod
 573     def addinfourl_wrapper(stream, headers, url, code):
 574         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 575             return compat_urllib_request.addinfourl(stream, headers, url, code)
 576         ret = compat_urllib_request.addinfourl(stream, headers, url)
 577         ret.code = code
 578         return ret
 579
 580     def http_request(self, req):
 581         for h, v in std_headers.items():
 582             if h not in req.headers:
 583                 req.add_header(h, v)
 584         if 'Youtubedl-no-compression' in req.headers:
 585             if 'Accept-encoding' in req.headers:
 586                 del req.headers['Accept-encoding']
 587             del req.headers['Youtubedl-no-compression']
 588         if 'Youtubedl-user-agent' in req.headers:
 589             if 'User-agent' in req.headers:
 590                 del req.headers['User-agent']
 591             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 592             del req.headers['Youtubedl-user-agent']
 593
 594         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 595             # Python 2.6 is brain-dead when it comes to fragments
 596             req._Request__original = req._Request__original.partition('#')[0]
 597             req._Request__r_type = req._Request__r_type.partition('#')[0]
 598
 599         return req
 600
 601     def http_response(self, req, resp):
 602         old_resp = resp
 603         # gzip
 604         if resp.headers.get('Content-encoding', '') == 'gzip':
 605             content = resp.read()
 606             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 607             try:
 608                 uncompressed = io.BytesIO(gz.read())
 609             except IOError as original_ioerror:
 610                 # There may be junk add the end of the file
 611                 # See http://stackoverflow.com/q/4928560/35070 for details
 612                 for i in range(1, 1024):
 613                     try:
 614                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 615                         uncompressed = io.BytesIO(gz.read())
 616                     except IOError:
 617                         continue
 618                     break
 619                 else:
 620                     raise original_ioerror
 621             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 622             resp.msg = old_resp.msg
 623         # deflate
 624         if resp.headers.get('Content-encoding', '') == 'deflate':
 625             gz = io.BytesIO(self.deflate(resp.read()))
 626             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 627             resp.msg = old_resp.msg
 628         return resp
 629
 630     https_request = http_request
 631     https_response = http_response
 632
 633
 634 def parse_iso8601(date_str, delimiter='T'):
 635     """ Return a UNIX timestamp from the given date """
 636
 637     if date_str is None:
 638         return None
 639
 640     m = re.search(
 641         r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 642         date_str)
 643     if not m:
 644         timezone = datetime.timedelta()
 645     else:
 646         date_str = date_str[:-len(m.group(0))]
 647         if not m.group('sign'):
 648             timezone = datetime.timedelta()
 649         else:
 650             sign = 1 if m.group('sign') == '+' else -1
 651             timezone = datetime.timedelta(
 652                 hours=sign * int(m.group('hours')),
 653                 minutes=sign * int(m.group('minutes')))
 654     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 655     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 656     return calendar.timegm(dt.timetuple())
 657
 658
 659 def unified_strdate(date_str, day_first=True):
 660     """Return a string with the date in the format YYYYMMDD"""
 661
 662     if date_str is None:
 663         return None
 664     upload_date = None
 665     # Replace commas
 666     date_str = date_str.replace(',', ' ')
 667     # %z (UTC offset) is only supported in python>=3.2
 668     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 669     # Remove AM/PM + timezone
 670     date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
 671
 672     format_expressions = [
 673         '%d %B %Y',
 674         '%d %b %Y',
 675         '%B %d %Y',
 676         '%b %d %Y',
 677         '%b %dst %Y %I:%M%p',
 678         '%b %dnd %Y %I:%M%p',
 679         '%b %dth %Y %I:%M%p',
 680         '%Y-%m-%d',
 681         '%Y/%m/%d',
 682         '%Y/%m/%d %H:%M:%S',
 683         '%Y-%m-%d %H:%M:%S',
 684         '%Y-%m-%d %H:%M:%S.%f',
 685         '%d.%m.%Y %H:%M',
 686         '%d.%m.%Y %H.%M',
 687         '%Y-%m-%dT%H:%M:%SZ',
 688         '%Y-%m-%dT%H:%M:%S.%fZ',
 689         '%Y-%m-%dT%H:%M:%S.%f0Z',
 690         '%Y-%m-%dT%H:%M:%S',
 691         '%Y-%m-%dT%H:%M:%S.%f',
 692         '%Y-%m-%dT%H:%M',
 693     ]
 694     if day_first:
 695         format_expressions.extend([
 696             '%d.%m.%Y',
 697             '%d/%m/%Y',
 698             '%d/%m/%y',
 699             '%d/%m/%Y %H:%M:%S',
 700         ])
 701     else:
 702         format_expressions.extend([
 703             '%m.%d.%Y',
 704             '%m/%d/%Y',
 705             '%m/%d/%y',
 706             '%m/%d/%Y %H:%M:%S',
 707         ])
 708     for expression in format_expressions:
 709         try:
 710             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 711         except ValueError:
 712             pass
 713     if upload_date is None:
 714         timetuple = email.utils.parsedate_tz(date_str)
 715         if timetuple:
 716             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 717     return upload_date
 718
 719
 720 def determine_ext(url, default_ext='unknown_video'):
 721     if url is None:
 722         return default_ext
 723     guess = url.partition('?')[0].rpartition('.')[2]
 724     if re.match(r'^[A-Za-z0-9]+$', guess):
 725         return guess
 726     else:
 727         return default_ext
 728
 729
 730 def subtitles_filename(filename, sub_lang, sub_format):
 731     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 732
 733
 734 def date_from_str(date_str):
 735     """
 736     Return a datetime object from a string in the format YYYYMMDD or
 737     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 738     today = datetime.date.today()
 739     if date_str in ('now', 'today'):
 740         return today
 741     if date_str == 'yesterday':
 742         return today - datetime.timedelta(days=1)
 743     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 744     if match is not None:
 745         sign = match.group('sign')
 746         time = int(match.group('time'))
 747         if sign == '-':
 748             time = -time
 749         unit = match.group('unit')
 750         # A bad aproximation?
 751         if unit == 'month':
 752             unit = 'day'
 753             time *= 30
 754         elif unit == 'year':
 755             unit = 'day'
 756             time *= 365
 757         unit += 's'
 758         delta = datetime.timedelta(**{unit: time})
 759         return today + delta
 760     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 761
 762
 763 def hyphenate_date(date_str):
 764     """
 765     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 766     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 767     if match is not None:
 768         return '-'.join(match.groups())
 769     else:
 770         return date_str
 771
 772
 773 class DateRange(object):
 774     """Represents a time interval between two dates"""
 775
 776     def __init__(self, start=None, end=None):
 777         """start and end must be strings in the format accepted by date"""
 778         if start is not None:
 779             self.start = date_from_str(start)
 780         else:
 781             self.start = datetime.datetime.min.date()
 782         if end is not None:
 783             self.end = date_from_str(end)
 784         else:
 785             self.end = datetime.datetime.max.date()
 786         if self.start > self.end:
 787             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 788
 789     @classmethod
 790     def day(cls, day):
 791         """Returns a range that only contains the given day"""
 792         return cls(day, day)
 793
 794     def __contains__(self, date):
 795         """Check if the date is in the range"""
 796         if not isinstance(date, datetime.date):
 797             date = date_from_str(date)
 798         return self.start <= date <= self.end
 799
 800     def __str__(self):
 801         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 802
 803
 804 def platform_name():
 805     """ Returns the platform name as a compat_str """
 806     res = platform.platform()
 807     if isinstance(res, bytes):
 808         res = res.decode(preferredencoding())
 809
 810     assert isinstance(res, compat_str)
 811     return res
 812
 813
 814 def _windows_write_string(s, out):
 815     """ Returns True if the string was written using special methods,
 816     False if it has yet to be written out."""
 817     # Adapted from http://stackoverflow.com/a/3259271/35070
 818
 819     import ctypes
 820     import ctypes.wintypes
 821
 822     WIN_OUTPUT_IDS = {
 823         1: -11,
 824         2: -12,
 825     }
 826
 827     try:
 828         fileno = out.fileno()
 829     except AttributeError:
 830         # If the output stream doesn't have a fileno, it's virtual
 831         return False
 832     if fileno not in WIN_OUTPUT_IDS:
 833         return False
 834
 835     GetStdHandle = ctypes.WINFUNCTYPE(
 836         ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
 837         (b"GetStdHandle", ctypes.windll.kernel32))
 838     h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
 839
 840     WriteConsoleW = ctypes.WINFUNCTYPE(
 841         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
 842         ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
 843         ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
 844     written = ctypes.wintypes.DWORD(0)
 845
 846     GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
 847     FILE_TYPE_CHAR = 0x0002
 848     FILE_TYPE_REMOTE = 0x8000
 849     GetConsoleMode = ctypes.WINFUNCTYPE(
 850         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
 851         ctypes.POINTER(ctypes.wintypes.DWORD))(
 852         (b"GetConsoleMode", ctypes.windll.kernel32))
 853     INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
 854
 855     def not_a_console(handle):
 856         if handle == INVALID_HANDLE_VALUE or handle is None:
 857             return True
 858         return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
 859                 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
 860
 861     if not_a_console(h):
 862         return False
 863
 864     def next_nonbmp_pos(s):
 865         try:
 866             return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
 867         except StopIteration:
 868             return len(s)
 869
 870     while s:
 871         count = min(next_nonbmp_pos(s), 1024)
 872
 873         ret = WriteConsoleW(
 874             h, s, count if count else 2, ctypes.byref(written), None)
 875         if ret == 0:
 876             raise OSError('Failed to write string')
 877         if not count:  # We just wrote a non-BMP character
 878             assert written.value == 2
 879             s = s[1:]
 880         else:
 881             assert written.value > 0
 882             s = s[written.value:]
 883     return True
 884
 885
 886 def write_string(s, out=None, encoding=None):
 887     if out is None:
 888         out = sys.stderr
 889     assert type(s) == compat_str
 890
 891     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 892         if _windows_write_string(s, out):
 893             return
 894
 895     if ('b' in getattr(out, 'mode', '') or
 896             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 897         byt = s.encode(encoding or preferredencoding(), 'ignore')
 898         out.write(byt)
 899     elif hasattr(out, 'buffer'):
 900         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 901         byt = s.encode(enc, 'ignore')
 902         out.buffer.write(byt)
 903     else:
 904         out.write(s)
 905     out.flush()
 906
 907
 908 def bytes_to_intlist(bs):
 909     if not bs:
 910         return []
 911     if isinstance(bs[0], int):  # Python 3
 912         return list(bs)
 913     else:
 914         return [ord(c) for c in bs]
 915
 916
 917 def intlist_to_bytes(xs):
 918     if not xs:
 919         return b''
 920     return struct_pack('%dB' % len(xs), *xs)
 921
 922
# Cross-platform file locking
# Both branches define the same two helpers, _lock_file(f, exclusive) and
# _unlock_file(f); which implementation is used is decided once at import
# time from sys.platform.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes layout of the structure whose pointer is passed as the last
    # argument of LockFileEx and UnlockFileEx below
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    # Declare argument and result types so ctypes converts values correctly
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high halves of the byte count passed to both calls; the same
    # range, starting at offset 0, is used for locking and unlocking
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the file object f; raise OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer on the file object: _unlock_file needs it again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # NOTE(review): flag 0x2 is presumably LOCKFILE_EXCLUSIVE_LOCK and
        # 0x0 a shared lock - confirm against the LockFileEx documentation
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Unlock f, which must have been locked with _lock_file before."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive (LOCK_EX) or shared (LOCK_SH) flock on f."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
 986
 987
 988 class locked_file(object):
 989     def __init__(self, filename, mode, encoding=None):
 990         assert mode in ['r', 'a', 'w']
 991         self.f = io.open(filename, mode, encoding=encoding)
 992         self.mode = mode
 993
 994     def __enter__(self):
 995         exclusive = self.mode != 'r'
 996         try:
 997             _lock_file(self.f, exclusive)
 998         except IOError:
 999             self.f.close()
1000             raise
1001         return self
1002
1003     def __exit__(self, etype, value, traceback):
1004         try:
1005             _unlock_file(self.f)
1006         finally:
1007             self.f.close()
1008
1009     def __iter__(self):
1010         return iter(self.f)
1011
1012     def write(self, *args):
1013         return self.f.write(*args)
1014
1015     def read(self, *args):
1016         return self.f.read(*args)
1017
1018
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1022
1023
def shell_quote(args):
    """Return args as one string of space-separated, shell-quoted words.

    Byte-string arguments are first decoded with the filesystem encoding.
    """
    fs_encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(fs_encoding) if isinstance(arg, bytes) else arg

    return ' '.join(pipes.quote(as_text(arg)) for arg in args)
1033
1034
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the element that ends the
        run (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1042
1043
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized as JSON, url-encoded under the key
    '__youtubedl_smuggle' and appended to url after a '#'.
    """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse.urlencode(payload)
1050
1051
def unsmuggle_url(smug_url, default=None):
    """ Split a URL carrying smuggled data into (url, data).

    A URL without the '#__youtubedl_smuggle' marker is returned
    unchanged, paired with default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        plain_url, _, fragment = smug_url.rpartition('#')
        encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return plain_url, json.loads(encoded)
    return smug_url, default
1059
1060
def format_bytes(bytes):
    """Format a byte count as a string with a binary (1024-based) suffix.

    bytes may be a number, a numeric string, or None (which gives 'N/A').
    The value is divided by a power of 1024 derived from its base-1024
    logarithm and printed with two decimals, e.g. 1536 -> '1.50KiB'.
    """
    SUFFIXES = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the index inside the suffix table: huge values used to raise
        # IndexError, and fractions below 1/1024 produced a negative index
        # that silently selected a suffix from the end of the list
        exponent = max(0, min(exponent, len(SUFFIXES) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, SUFFIXES[exponent])
1073
1074
def parse_filesize(s):
    """Parse a human-readable size such as '1.5 MiB' into a byte count.

    Returns an int, or None when s is None or does not start with a
    number followed by a known unit.  A comma is accepted as the decimal
    separator.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    multipliers = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary, decimal = 1024 ** power, 1000 ** power
        multipliers[prefix + 'iB'] = binary
        multipliers[prefix + 'B'] = decimal
        multipliers[prefix.lower() + 'B'] = binary
        multipliers[prefix + 'b'] = decimal

    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % '|'.join(
        re.escape(unit) for unit in multipliers)
    m = re.match(pattern, s)
    if not m:
        return None

    value = float(m.group('num').replace(',', '.'))
    return int(value * multipliers[m.group('unit')])
1127
1128
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable is consulted first; if it is unset
    or not a number, the output of `stty size` is parsed instead.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Unusable COLUMNS value, fall back to asking stty

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, not a tty, unexpected output...).
        # Unlike a bare except this lets KeyboardInterrupt and SystemExit
        # propagate.
        pass
    return None
1143
1144
def month_by_name(name):
    """ Return the number (1-12) of the month with the given English name,
    independently of the locale, or None if name is not a month name """

    months = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December')
    if name not in months:
        return None
    return months.index(name) + 1
1155
1156
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in XML by '&amp;'.

    An ampersand is left alone when it already starts one of the five
    predefined entities (amp, lt, gt, apos, quot) or a numeric character
    reference.  A reference needs at least one digit, and may have any
    number of them, so '&#128512;' is kept while '&#;' is escaped.
    """
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]+;|#[0-9]+;)',
        '&amp;',
        xml_str)
1163
1164
def setproctitle(title):
    """Try to set the process title through libc's prctl.

    title must be a text string.  Nothing happens (and no error is
    raised) when libc.so.6 cannot be loaded or offers no prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Initialising the buffer from the bytes makes ctypes allocate one
    # extra item for a terminating NUL.  A buffer of exactly len(title)
    # bytes would hand prctl an unterminated string.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1178
1179
def remove_start(s, start):
    """Return s without its leading start, or s itself if it does not
    begin with start."""
    return s[len(start):] if s.startswith(start) else s
1184
1185
def remove_end(s, end):
    """Return s without its trailing end, or s itself if it does not
    finish with end (or if end is empty)."""
    # The emptiness check matters: every string ends with '', and
    # s[:-0] would wrongly evaluate to the empty string
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1190
1191
def url_basename(url):
    """Return the last component of the path of url, ignoring leading and
    trailing slashes."""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
1195
1196
class HEADRequest(compat_urllib_request.Request):
    """Request whose get_method() reports HEAD."""

    def get_method(self):
        return 'HEAD'
1200
1201
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, or return default if v is None or ''.

    If get_attr is given, the attribute of that name is read from v
    first (a missing attribute counts as None).  The integer result is
    multiplied by invscale and then floor-divided by scale.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1209
1210
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1213
1214
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are removed before the conversion; None is passed through """
    if int_str is None:
        return None
    return int(re.sub(r'[,\.\+]', '', int_str))
1221
1222
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float multiplied by invscale and divided by scale,
    or return default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1225
1226
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Accepted forms include '1:02:03', '5m 10s', '3.5', '10 min' and
    '2 hours'.  Returns None when s is None or is not recognized.  The
    result is a float when the input has fractional seconds or uses the
    minutes-only / hours-only forms, and an int otherwise.
    """
    if s is None:
        return None

    m = re.match(
        r'''(?ix)T?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if not m:
        return None

    # The stand-alone "N minutes" / "N hours" forms may carry decimals
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    total = 0
    for group, seconds_per_unit in (
            ('secs', 1), ('mins', 60), ('hours', 60 * 60)):
        if m.group(group):
            total += int(m.group(group)) * seconds_per_unit
    if m.group('ms'):
        # Despite its name this group holds the fractional part ('.5')
        total += float(m.group('ms'))
    return total
1261
1262
def prepend_extension(filename, ext):
    """Insert ext in front of the existing extension of filename,
    e.g. ('video.mp4', 'temp') -> 'video.temp.mp4'."""
    stem, suffix = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, suffix)
1266
1267
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False if the binary could not be started. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1276
1277
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.
    The version is extracted from the combined stdout and stderr of
    running exe with args, using detect_exe_version. """
    try:
        process = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = process.communicate()
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1291
1292
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of a program.

    The first group of version_re (by default the word after 'version')
    is returned, or unrecognized if the pattern is not found in output.
    """
    assert isinstance(output, compat_str)
    pattern = (
        r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re)
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1302
1303
class PagedList(object):
    """Base for paged lists; the length is computed through self.getslice(),
    which this class itself does not define."""

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1308
1309
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages one by one, only as far as needed.

    pagefunc(pagenum) must return an iterable with the entries of the
    given 0-based page; pagesize is the number of entries of a full page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries from index start up to (not including) end,
        or up to the last entry when end is None."""
        collected = []
        size = self._pagesize
        for pagenum in itertools.count(start // size):
            page_first = pagenum * size
            page_stop = page_first + size
            if start >= page_stop:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offsets inside this page at which the requested range
            # begins and ends
            if page_first <= start < page_stop:
                offset = start % size
            else:
                offset = 0
            if end is not None and page_first <= end <= page_stop:
                limit = ((end - 1) % size) + 1
            else:
                limit = None

            if offset != 0 or limit is not None:
                entries = entries[offset:limit]
            collected.extend(entries)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(entries) + offset < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == page_stop:
                break
        return collected
1351
1352
class InAdvancePagedList(PagedList):
    """Paged list whose number of pages is known in advance.

    pagefunc(pagenum) must return an iterable with the entries of the
    given 0-based page, pagecount is the total number of pages and
    pagesize the number of entries per page.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries from index start up to (not including) end,
        or up to the last entry when end is None."""
        res = []
        start_page = start // self._pagesize
        # Never request pages beyond the known page count, even when end
        # points past the last entry
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page starts before the range
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the requested range
                    res.extend(page[:only_more])
                    break
            res.extend(page)
        return res
1380
1381
def uppercase_escape(s):
    """Replace each escape sequence made of a backslash, an uppercase U and
    eight hex digits in s by the character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1388
1389
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, unicode input is quoted as its UTF-8 bytes
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, unicode)
    quotable = s.encode('utf-8') if needs_encoding else s
    return compat_urllib_parse.quote(quotable, b"%/;:@&=+$,!~*'()?#[]")
1395
1396
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Only the path, params, query and fragment components are escaped.
    """
    parsed = compat_urllib_parse_urlparse(url)
    path = escape_rfc3986(parsed.path)
    params = escape_rfc3986(parsed.params)
    query = escape_rfc3986(parsed.query)
    fragment = escape_rfc3986(parsed.fragment)
    escaped = parsed._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1406
# struct_pack / struct_unpack: like struct.pack / struct.unpack, but usable
# with text format strings on every supported Python version
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        spec_bytes = (
            spec.encode('ascii') if isinstance(spec, compat_str) else spec)
        return struct.pack(spec_bytes, *args)

    def struct_unpack(spec, *args):
        spec_bytes = (
            spec.encode('ascii') if isinstance(spec, compat_str) else spec)
        return struct.unpack(spec_bytes, *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
1423
1424
def read_batch_urls(batch_fd):
    """Read URLs, one per line, from the file object batch_fd and close it.

    Lines may be bytes (decoded as UTF-8) or text.  A leading byte order
    mark and surrounding whitespace are removed; empty lines and lines
    starting with '#', ';' or ']' are skipped.  Returns a list of URLs.
    """
    # A UTF-8 BOM reads as '\ufeff' once decoded as UTF-8, and as
    # '\xef\xbb\xbf' when its three bytes were decoded one by one
    BOMS = ('\ufeff', '\xef\xbb\xbf')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1439
1440
def urlencode_postdata(*args, **kargs):
    """Url-encode the arguments (as compat_urllib_parse.urlencode does)
    and return the result encoded as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1443
1444
# etree_iter(node): iterate over the elements of a tree, using
# Element.iter where it exists
if hasattr(xml.etree.ElementTree.Element, 'iter'):
    etree_iter = xml.etree.ElementTree.Element.iter
else:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1449
1450
def parse_xml(s):
    """Parse the XML text s and return its root element.

    Doctype declarations are ignored.  On Python 2, element texts that
    come back as byte strings are decoded to text.
    """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    data = s.encode('utf-8')
    if sys.version_info >= (2, 7):
        tree = xml.etree.ElementTree.XML(data, parser=parser)
    else:
        tree = xml.etree.ElementTree.XML(data)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return tree
1466
1467
# Age limit associated with each US rating label; parse_age_limit falls
# back to this table when its argument is not a plain number
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1475
1476
def parse_age_limit(s):
    """Parse an age limit such as '18' or '18+' into an int.

    Any other string is looked up in US_RATINGS (None if unknown);
    None is passed through.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
1482
1483
def strip_jsonp(code):
    """Strip a JSONP wrapper 'callback(...);' (optionally followed by
    // comments) from code and return what is inside the parentheses.
    Input that does not look like JSONP is returned unchanged."""
    jsonp_pattern = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_pattern, r'\1', code)
1487
1488
def js_to_json(code):
    """Convert a JavaScript object literal to JSON, as far as possible.

    Single-quoted strings and bare identifiers become double-quoted
    strings (true, false and null are kept), and a trailing comma before
    a closing ']' is removed.
    """
    # Replacements applied inside the body of a single-quoted string
    SINGLE_QUOTED_FIXES = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: SINGLE_QUOTED_FIXES[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), res)
1512
1513
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values:
    the returned function maps an id to its index in quality_ids,
    or to -1 if it is not listed """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
1522
1523
# %-style template built from the title, id and ext fields; presumably the
# default output filename template - verify against where it is used
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1525
1526
def limit_length(s, length):
    """ Shorten strings longer than length characters, ending them with
    ellipses; shorter strings and None are returned as they are """
    ELLIPSES = '...'
    if s is None or len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
1535
1536
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
1539
1540
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is older than limit.

    A missing or unparsable version counts as outdated only when
    assume_new is false.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
1548
1549
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U, i.e. whether this
    module was loaded by a zipimporter or sys has a 'frozen' attribute """
    from zipimport import zipimporter

    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
1555
1556
def args_to_str(args):
    """Get a short string representation for a subprocess command:
    the arguments quoted with shlex_quote and joined by spaces."""
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
1560
1561
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the Content-Type of a response.

    Returns the subtype of the media type, e.g. 'mp4' for
    'video/mp4; charset=utf-8'.  As before, a missing or malformed
    Content-Type header results in an exception.
    """
    try:
        headers = url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        getheader = lambda h: headers[h]

    # Drop parameters such as '; charset=utf-8' before taking the subtype
    media_type = getheader('Content-Type').split(';')[0].strip()
    return media_type.split('/')[1]
1570
1571
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked, i.e. both limits
    are set and age_limit is below content_limit """

    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit