_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import functools
  14 import gzip
  15 import itertools
  16 import io
  17 import json
  18 import locale
  19 import math
  20 import os
  21 import pipes
  22 import platform
  23 import re
  24 import ssl
  25 import socket
  26 import struct
  27 import subprocess
  28 import sys
  29 import tempfile
  30 import traceback
  31 import xml.etree.ElementTree
  32 import zlib
  33
  34 from .compat import (
  35     compat_chr,
  36     compat_getenv,
  37     compat_html_entities,
  38     compat_http_client,
  39     compat_parse_qs,
  40     compat_socket_create_connection,
  41     compat_str,
  42     compat_urllib_error,
  43     compat_urllib_parse,
  44     compat_urllib_parse_urlparse,
  45     compat_urllib_request,
  46     compat_urlparse,
  47     shlex_quote,
  48 )
  49
  50
# This is not clearly defined otherwise: the re module exposes no public name
# for the type of a compiled pattern, so take it from an actual instance.
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers. YoutubeDLHandler.http_request adds each of
# these to an outgoing request that does not already carry that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
  61
  62
  63 def preferredencoding():
  64     """Get preferred encoding.
  65
  66     Returns the best encoding scheme for the system, based on
  67     locale.getpreferredencoding() and some further tweaks.
  68     """
  69     try:
  70         pref = locale.getpreferredencoding()
  71         'TEST'.encode(pref)
  72     except:
  73         pref = 'UTF-8'
  74
  75     return pref
  76
  77
  78 def write_json_file(obj, fn):
  79     """ Encode obj as JSON and write it to fn, atomically if possible """
  80
  81     fn = encodeFilename(fn)
  82     if sys.version_info < (3, 0) and sys.platform != 'win32':
  83         encoding = get_filesystem_encoding()
  84         # os.path.basename returns a bytes object, but NamedTemporaryFile
  85         # will fail if the filename contains non ascii characters unless we
  86         # use a unicode object
  87         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  88         # the same for os.path.dirname
  89         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  90     else:
  91         path_basename = os.path.basename
  92         path_dirname = os.path.dirname
  93
  94     args = {
  95         'suffix': '.tmp',
  96         'prefix': path_basename(fn) + '.',
  97         'dir': path_dirname(fn),
  98         'delete': False,
  99     }
 100
 101     # In Python 2.x, json.dump expects a bytestream.
 102     # In Python 3.x, it writes to a character stream
 103     if sys.version_info < (3, 0):
 104         args['mode'] = 'wb'
 105     else:
 106         args.update({
 107             'mode': 'w',
 108             'encoding': 'utf-8',
 109         })
 110
 111     tf = tempfile.NamedTemporaryFile(**args)
 112
 113     try:
 114         with tf:
 115             json.dump(obj, tf)
 116         if sys.platform == 'win32':
 117             # Need to remove existing file on Windows, else os.rename raises
 118             # WindowsError or FileExistsError.
 119             try:
 120                 os.unlink(fn)
 121             except OSError:
 122                 pass
 123         os.rename(tf.name, fn)
 124     except:
 125         try:
 126             os.remove(tf.name)
 127         except OSError:
 128             pass
 129         raise
 130
 131
 132 if sys.version_info >= (2, 7):
 133     def find_xpath_attr(node, xpath, key, val):
 134         """ Find the xpath xpath[@key=val] """
 135         assert re.match(r'^[a-zA-Z-]+$', key)
 136         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 137         expr = xpath + "[@%s='%s']" % (key, val)
 138         return node.find(expr)
 139 else:
 140     def find_xpath_attr(node, xpath, key, val):
 141         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 142         # .//node does not match if a node is a direct child of . !
 143         if isinstance(xpath, unicode):
 144             xpath = xpath.encode('ascii')
 145
 146         for f in node.findall(xpath):
 147             if f.attrib.get(key) == val:
 148                 return f
 149         return None
 150
 151 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 152 # the namespace parameter
 153
 154
 155 def xpath_with_ns(path, ns_map):
 156     components = [c.split(':') for c in path.split('/')]
 157     replaced = []
 158     for c in components:
 159         if len(c) == 1:
 160             replaced.append(c[0])
 161         else:
 162             ns, tag = c
 163             replaced.append('{%s}%s' % (ns_map[ns], tag))
 164     return '/'.join(replaced)
 165
 166
 167 def xpath_text(node, xpath, name=None, fatal=False):
 168     if sys.version_info < (2, 7):  # Crazy 2.6
 169         xpath = xpath.encode('ascii')
 170
 171     n = node.find(xpath)
 172     if n is None or n.text is None:
 173         if fatal:
 174             name = xpath if name is None else name
 175             raise ExtractorError('Could not find XML element %s' % name)
 176         else:
 177             return None
 178     return n.text
 179
 180
 181 def get_element_by_id(id, html):
 182     """Return the content of the tag with the specified ID in the passed HTML document"""
 183     return get_element_by_attribute("id", id, html)
 184
 185
 186 def get_element_by_attribute(attribute, value, html):
 187     """Return the content of the tag with the specified attribute in the passed HTML document"""
 188
 189     m = re.search(r'''(?xs)
 190         <([a-zA-Z0-9:._-]+)
 191          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 192          \s+%s=['"]?%s['"]?
 193          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 194         \s*>
 195         (?P<content>.*?)
 196         </\1>
 197     ''' % (re.escape(attribute), re.escape(value)), html)
 198
 199     if not m:
 200         return None
 201     res = m.group('content')
 202
 203     if res.startswith('"') or res.startswith("'"):
 204         res = res[1:-1]
 205
 206     return unescapeHTML(res)
 207
 208
 209 def clean_html(html):
 210     """Clean an HTML snippet into a readable string"""
 211
 212     if html is None:  # Convenience for sanitizing descriptions etc.
 213         return html
 214
 215     # Newline vs <br />
 216     html = html.replace('\n', ' ')
 217     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 218     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 219     # Strip html tags
 220     html = re.sub('<.*?>', '', html)
 221     # Replace html entities
 222     html = unescapeHTML(html)
 223     return html.strip()
 224
 225
 226 def sanitize_open(filename, open_mode):
 227     """Try to open the given filename, and slightly tweak it if this fails.
 228
 229     Attempts to open the given filename. If this fails, it tries to change
 230     the filename slightly, step by step, until it's either able to open it
 231     or it fails and raises a final exception, like the standard open()
 232     function.
 233
 234     It returns the tuple (stream, definitive_file_name).
 235     """
 236     try:
 237         if filename == '-':
 238             if sys.platform == 'win32':
 239                 import msvcrt
 240                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 241             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 242         stream = open(encodeFilename(filename), open_mode)
 243         return (stream, filename)
 244     except (IOError, OSError) as err:
 245         if err.errno in (errno.EACCES,):
 246             raise
 247
 248         # In case of error, try to remove win32 forbidden chars
 249         alt_filename = os.path.join(
 250             re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
 251             for path_part in os.path.split(filename)
 252         )
 253         if alt_filename == filename:
 254             raise
 255         else:
 256             # An exception here should be caught in the caller
 257             stream = open(encodeFilename(filename), open_mode)
 258             return (stream, alt_filename)
 259
 260
 261 def timeconvert(timestr):
 262     """Convert RFC 2822 defined time string into system timestamp"""
 263     timestamp = None
 264     timetuple = email.utils.parsedate_tz(timestr)
 265     if timetuple is not None:
 266         timestamp = email.utils.mktime_tz(timetuple)
 267     return timestamp
 268
 269
 270 def sanitize_filename(s, restricted=False, is_id=False):
 271     """Sanitizes a string so it could be used as part of a filename.
 272     If restricted is set, use a stricter subset of allowed characters.
 273     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 274     """
 275     def replace_insane(char):
 276         if char == '?' or ord(char) < 32 or ord(char) == 127:
 277             return ''
 278         elif char == '"':
 279             return '' if restricted else '\''
 280         elif char == ':':
 281             return '_-' if restricted else ' -'
 282         elif char in '\\/|*<>':
 283             return '_'
 284         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 285             return '_'
 286         if restricted and ord(char) > 127:
 287             return '_'
 288         return char
 289
 290     # Handle timestamps
 291     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 292     result = ''.join(map(replace_insane, s))
 293     if not is_id:
 294         while '__' in result:
 295             result = result.replace('__', '_')
 296         result = result.strip('_')
 297         # Common case of "Foreign band name - English song title"
 298         if restricted and result.startswith('-_'):
 299             result = result[2:]
 300         if not result:
 301             result = '_'
 302     return result
 303
 304
 305 def orderedSet(iterable):
 306     """ Remove all duplicates from the input iterable """
 307     res = []
 308     for el in iterable:
 309         if el not in res:
 310             res.append(el)
 311     return res
 312
 313
 314 def _htmlentity_transform(entity):
 315     """Transforms an HTML entity to a character."""
 316     # Known non-numeric HTML entity
 317     if entity in compat_html_entities.name2codepoint:
 318         return compat_chr(compat_html_entities.name2codepoint[entity])
 319
 320     mobj = re.match(r'#(x?[0-9]+)', entity)
 321     if mobj is not None:
 322         numstr = mobj.group(1)
 323         if numstr.startswith('x'):
 324             base = 16
 325             numstr = '0%s' % numstr
 326         else:
 327             base = 10
 328         return compat_chr(int(numstr, base))
 329
 330     # Unknown entity in name, return its literal representation
 331     return ('&%s;' % entity)
 332
 333
 334 def unescapeHTML(s):
 335     if s is None:
 336         return None
 337     assert type(s) == compat_str
 338
 339     return re.sub(
 340         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 341
 342
 343 def encodeFilename(s, for_subprocess=False):
 344     """
 345     @param s The name of the file
 346     """
 347
 348     assert type(s) == compat_str
 349
 350     # Python 3 has a Unicode API
 351     if sys.version_info >= (3, 0):
 352         return s
 353
 354     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 355         # Pass '' directly to use Unicode APIs on Windows 2000 and up
 356         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 357         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 358         if not for_subprocess:
 359             return s
 360         else:
 361             # For subprocess calls, encode with locale encoding
 362             # Refer to http://stackoverflow.com/a/9951851/35070
 363             encoding = preferredencoding()
 364     else:
 365         encoding = sys.getfilesystemencoding()
 366     if encoding is None:
 367         encoding = 'utf-8'
 368     return s.encode(encoding, 'ignore')
 369
 370
 371 def encodeArgument(s):
 372     if not isinstance(s, compat_str):
 373         # Legacy code that uses byte strings
 374         # Uncomment the following line after fixing all post processors
 375         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 376         s = s.decode('ascii')
 377     return encodeFilename(s, True)
 378
 379
 380 def decodeOption(optval):
 381     if optval is None:
 382         return optval
 383     if isinstance(optval, bytes):
 384         optval = optval.decode(preferredencoding())
 385
 386     assert isinstance(optval, compat_str)
 387     return optval
 388
 389
 390 def formatSeconds(secs):
 391     if secs > 3600:
 392         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 393     elif secs > 60:
 394         return '%d:%02d' % (secs // 60, secs % 60)
 395     else:
 396         return '%d' % secs
 397
 398
 399 def make_HTTPS_handler(params, **kwargs):
 400     opts_no_check_certificate = params.get('nocheckcertificate', False)
 401     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 402         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 403         if opts_no_check_certificate:
 404             context.check_hostname = False
 405             context.verify_mode = ssl.CERT_NONE
 406         try:
 407             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 408         except TypeError:
 409             # Python 2.7.8
 410             # (create_default_context present but HTTPSHandler has no context=)
 411             pass
 412
 413     if sys.version_info < (3, 2):
 414         return YoutubeDLHTTPSHandler(params, **kwargs)
 415     else:  # Python < 3.4
 416         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 417         context.verify_mode = (ssl.CERT_NONE
 418                                if opts_no_check_certificate
 419                                else ssl.CERT_REQUIRED)
 420         context.set_default_verify_paths()
 421         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 422
 423
 424 class ExtractorError(Exception):
 425     """Error during info extraction."""
 426
 427     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 428         """ tb, if given, is the original traceback (so that it can be printed out).
 429         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 430         """
 431
 432         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 433             expected = True
 434         if video_id is not None:
 435             msg = video_id + ': ' + msg
 436         if cause:
 437             msg += ' (caused by %r)' % cause
 438         if not expected:
 439             if ytdl_is_updateable():
 440                 update_cmd = 'type  youtube-dl -U  to update'
 441             else:
 442                 update_cmd = 'see  https://yt-dl.org/update  on how to update'
 443             msg += '; please report this issue on https://yt-dl.org/bug .'
 444             msg += ' Make sure you are using the latest version; %s.' % update_cmd
 445             msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 446         super(ExtractorError, self).__init__(msg)
 447
 448         self.traceback = tb
 449         self.exc_info = sys.exc_info()  # preserve original exception
 450         self.cause = cause
 451         self.video_id = video_id
 452
 453     def format_traceback(self):
 454         if self.traceback is None:
 455             return None
 456         return ''.join(traceback.format_tb(self.traceback))
 457
 458
 459 class UnsupportedError(ExtractorError):
 460     def __init__(self, url):
 461         super(UnsupportedError, self).__init__(
 462             'Unsupported URL: %s' % url, expected=True)
 463         self.url = url
 464
 465
 466 class RegexNotFoundError(ExtractorError):
 467     """Error when a regex didn't match"""
 468     pass
 469
 470
 471 class DownloadError(Exception):
 472     """Download Error exception.
 473
 474     This exception may be thrown by FileDownloader objects if they are not
 475     configured to continue on errors. They will contain the appropriate
 476     error message.
 477     """
 478
 479     def __init__(self, msg, exc_info=None):
 480         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 481         super(DownloadError, self).__init__(msg)
 482         self.exc_info = exc_info
 483
 484
 485 class SameFileError(Exception):
 486     """Same File exception.
 487
 488     This exception will be thrown by FileDownloader objects if they detect
 489     multiple files would have to be downloaded to the same file on disk.
 490     """
 491     pass
 492
 493
 494 class PostProcessingError(Exception):
 495     """Post Processing exception.
 496
 497     This exception may be raised by PostProcessor's .run() method to
 498     indicate an error in the postprocessing task.
 499     """
 500
 501     def __init__(self, msg):
 502         self.msg = msg
 503
 504
 505 class MaxDownloadsReached(Exception):
 506     """ --max-downloads limit has been reached. """
 507     pass
 508
 509
 510 class UnavailableVideoError(Exception):
 511     """Unavailable Format exception.
 512
 513     This exception will be thrown when a video is requested
 514     in a format that is not available for that video.
 515     """
 516     pass
 517
 518
 519 class ContentTooShortError(Exception):
 520     """Content Too Short exception.
 521
 522     This exception may be raised by FileDownloader objects when a file they
 523     download is too small for what the server announced first, indicating
 524     the connection was probably interrupted.
 525     """
 526     # Both in bytes
 527     downloaded = None
 528     expected = None
 529
 530     def __init__(self, downloaded, expected):
 531         self.downloaded = downloaded
 532         self.expected = expected
 533
 534
 535 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 536     hc = http_class(*args, **kwargs)
 537     source_address = ydl_handler._params.get('source_address')
 538     if source_address is not None:
 539         sa = (source_address, 0)
 540         if hasattr(hc, 'source_address'):  # Python 2.7+
 541             hc.source_address = sa
 542         else:  # Python 2.6
 543             def _hc_connect(self, *args, **kwargs):
 544                 sock = compat_socket_create_connection(
 545                     (self.host, self.port), self.timeout, sa)
 546                 if is_https:
 547                     self.sock = ssl.wrap_socket(
 548                         sock, self.key_file, self.cert_file,
 549                         ssl_version=ssl.PROTOCOL_TLSv1)
 550                 else:
 551                     self.sock = sock
 552             hc.connect = functools.partial(_hc_connect, hc)
 553
 554     return hc
 555
 556
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """params is stored as self._params; all other arguments go to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req over a connection created by _create_http_connection."""
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflated data, with or without a zlib header."""
        try:
            # Negative wbits: raw deflate stream without zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to a regular zlib-wrapped stream
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code.

        If addinfourl has no getcode attribute, the constructor is called
        without the code and the code is set as an attribute afterwards.
        """
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the missing standard headers to req and handle the no-compression marker."""
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        # The marker header turns compression off for this request and is
        # itself removed before the request goes out
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            # Cut the fragment off the request's private (name-mangled) URL
            # attributes
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate encoded body replaced by the decoded one."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off and keep the
                # first attempt that decompresses
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No attempt worked: report the error of the first one
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS requests and responses get the same treatment
    https_request = http_request
    https_response = http_response
 648
 649
 650 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 651     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 652         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 653         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 654         self._params = params
 655
 656     def https_open(self, req):
 657         return self.do_open(functools.partial(
 658             _create_http_connection, self, self._https_conn_class, True),
 659             req)
 660
 661
 662 def parse_iso8601(date_str, delimiter='T'):
 663     """ Return a UNIX timestamp from the given date """
 664
 665     if date_str is None:
 666         return None
 667
 668     m = re.search(
 669         r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 670         date_str)
 671     if not m:
 672         timezone = datetime.timedelta()
 673     else:
 674         date_str = date_str[:-len(m.group(0))]
 675         if not m.group('sign'):
 676             timezone = datetime.timedelta()
 677         else:
 678             sign = 1 if m.group('sign') == '+' else -1
 679             timezone = datetime.timedelta(
 680                 hours=sign * int(m.group('hours')),
 681                 minutes=sign * int(m.group('minutes')))
 682     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 683     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 684     return calendar.timegm(dt.timetuple())
 685
 686
 687 def unified_strdate(date_str, day_first=True):
 688     """Return a string with the date in the format YYYYMMDD"""
 689
 690     if date_str is None:
 691         return None
 692     upload_date = None
 693     # Replace commas
 694     date_str = date_str.replace(',', ' ')
 695     # %z (UTC offset) is only supported in python>=3.2
 696     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 697     # Remove AM/PM + timezone
 698     date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
 699
 700     format_expressions = [
 701         '%d %B %Y',
 702         '%d %b %Y',
 703         '%B %d %Y',
 704         '%b %d %Y',
 705         '%b %dst %Y %I:%M%p',
 706         '%b %dnd %Y %I:%M%p',
 707         '%b %dth %Y %I:%M%p',
 708         '%Y %m %d',
 709         '%Y-%m-%d',
 710         '%Y/%m/%d',
 711         '%Y/%m/%d %H:%M:%S',
 712         '%Y-%m-%d %H:%M:%S',
 713         '%Y-%m-%d %H:%M:%S.%f',
 714         '%d.%m.%Y %H:%M',
 715         '%d.%m.%Y %H.%M',
 716         '%Y-%m-%dT%H:%M:%SZ',
 717         '%Y-%m-%dT%H:%M:%S.%fZ',
 718         '%Y-%m-%dT%H:%M:%S.%f0Z',
 719         '%Y-%m-%dT%H:%M:%S',
 720         '%Y-%m-%dT%H:%M:%S.%f',
 721         '%Y-%m-%dT%H:%M',
 722     ]
 723     if day_first:
 724         format_expressions.extend([
 725             '%d.%m.%Y',
 726             '%d/%m/%Y',
 727             '%d/%m/%y',
 728             '%d/%m/%Y %H:%M:%S',
 729         ])
 730     else:
 731         format_expressions.extend([
 732             '%m.%d.%Y',
 733             '%m/%d/%Y',
 734             '%m/%d/%y',
 735             '%m/%d/%Y %H:%M:%S',
 736         ])
 737     for expression in format_expressions:
 738         try:
 739             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 740         except ValueError:
 741             pass
 742     if upload_date is None:
 743         timetuple = email.utils.parsedate_tz(date_str)
 744         if timetuple:
 745             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 746     return upload_date
 747
 748
 749 def determine_ext(url, default_ext='unknown_video'):
 750     if url is None:
 751         return default_ext
 752     guess = url.partition('?')[0].rpartition('.')[2]
 753     if re.match(r'^[A-Za-z0-9]+$', guess):
 754         return guess
 755     else:
 756         return default_ext
 757
 758
 759 def subtitles_filename(filename, sub_lang, sub_format):
 760     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 761
 762
 763 def date_from_str(date_str):
 764     """
 765     Return a datetime object from a string in the format YYYYMMDD or
 766     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 767     today = datetime.date.today()
 768     if date_str in ('now', 'today'):
 769         return today
 770     if date_str == 'yesterday':
 771         return today - datetime.timedelta(days=1)
 772     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 773     if match is not None:
 774         sign = match.group('sign')
 775         time = int(match.group('time'))
 776         if sign == '-':
 777             time = -time
 778         unit = match.group('unit')
 779         # A bad aproximation?
 780         if unit == 'month':
 781             unit = 'day'
 782             time *= 30
 783         elif unit == 'year':
 784             unit = 'day'
 785             time *= 365
 786         unit += 's'
 787         delta = datetime.timedelta(**{unit: time})
 788         return today + delta
 789     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 790
 791
 792 def hyphenate_date(date_str):
 793     """
 794     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 795     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 796     if match is not None:
 797         return '-'.join(match.groups())
 798     else:
 799         return date_str
 800
 801
 802 class DateRange(object):
 803     """Represents a time interval between two dates"""
 804
 805     def __init__(self, start=None, end=None):
 806         """start and end must be strings in the format accepted by date"""
 807         if start is not None:
 808             self.start = date_from_str(start)
 809         else:
 810             self.start = datetime.datetime.min.date()
 811         if end is not None:
 812             self.end = date_from_str(end)
 813         else:
 814             self.end = datetime.datetime.max.date()
 815         if self.start > self.end:
 816             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 817
 818     @classmethod
 819     def day(cls, day):
 820         """Returns a range that only contains the given day"""
 821         return cls(day, day)
 822
 823     def __contains__(self, date):
 824         """Check if the date is in the range"""
 825         if not isinstance(date, datetime.date):
 826             date = date_from_str(date)
 827         return self.start <= date <= self.end
 828
 829     def __str__(self):
 830         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 831
 832
 833 def platform_name():
 834     """ Returns the platform name as a compat_str """
 835     res = platform.platform()
 836     if isinstance(res, bytes):
 837         res = res.decode(preferredencoding())
 838
 839     assert isinstance(res, compat_str)
 840     return res
 841
 842
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the WriteConsoleW function of kernel32; it is only
    used if out is file descriptor 1 or 2 and is attached to a console.
    Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 and -12 are presumably STD_OUTPUT_HANDLE and STD_ERROR_HANDLE;
    # confirm against the Win32 documentation)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Prototypes for the kernel32 functions used below
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the count reported back by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A console handle is valid, of character file type (ignoring the
        # remote flag), and GetConsoleMode succeeds on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call, stopping in front of the
        # next non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # A count of 0 means s starts with a non-BMP character, which is
        # written on its own with a length of 2
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever was actually written
            s = s[written.value:]
    return True
 916
 917
 918 def write_string(s, out=None, encoding=None):
 919     if out is None:
 920         out = sys.stderr
 921     assert type(s) == compat_str
 922
 923     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 924         if _windows_write_string(s, out):
 925             return
 926
 927     if ('b' in getattr(out, 'mode', '') or
 928             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 929         byt = s.encode(encoding or preferredencoding(), 'ignore')
 930         out.write(byt)
 931     elif hasattr(out, 'buffer'):
 932         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 933         byt = s.encode(enc, 'ignore')
 934         out.buffer.write(byt)
 935     else:
 936         out.write(s)
 937     out.flush()
 938
 939
 940 def bytes_to_intlist(bs):
 941     if not bs:
 942         return []
 943     if isinstance(bs[0], int):  # Python 3
 944         return list(bs)
 945     else:
 946         return [ord(c) for c in bs]
 947
 948
 949 def intlist_to_bytes(xs):
 950     if not xs:
 951         return b''
 952     return struct_pack('%dB' % len(xs), *xs)
 953
 954
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx via ctypes on win32, fcntl.flock elsewhere.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # Structure passed to LockFileEx/UnlockFileEx as their last argument
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count handed to LockFileEx/UnlockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can pass the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # Flag 0x2 is used for exclusive locks
        # (presumably LOCKFILE_EXCLUSIVE_LOCK), 0x0 for shared ones
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock the open file f with flock (LOCK_EX if exclusive, else LOCK_SH)."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock lock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
1018
1019
class locked_file(object):
    """Context manager that opens a file and keeps it locked inside the block.

    The file is opened in __init__ with io.open. Entering the context locks
    it (shared for mode 'r', exclusive for 'a' and 'w'); leaving the context
    unlocks and closes it. Iteration, read() and write() are forwarded to the
    underlying file object.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 implementation of _lock_file raises OSError, which is
            # not an IOError on Python 2; catch both so that the file handle
            # is never leaked when locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking raised
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1049
1050
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1054
1055
def shell_quote(args):
    """Return the arguments quoted for a shell and joined by single spaces.

    Byte-string arguments are decoded with the filesystem encoding first.
    """
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # Use the shlex_quote compat helper (as args_to_str does) rather than
        # the deprecated pipes.quote
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
1065
1066
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yields the first element for which
        pred is false before stopping """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1074
1075
def smuggle_url(url, data):
    """ Append JSON-encoded data to a URL as a fragment, for internal use. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
1082
1083
def unsmuggle_url(smug_url, default=None):
    """ Split a smuggled URL into (url, data).

    URLs without the smuggle marker are returned unchanged together with
    default. """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        encoded = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(encoded)
    return smug_url, default
1091
1092
def format_bytes(bytes):
    """Format a byte count as a string with a binary unit suffix.

    bytes may be a number, a numeric string, or None (rendered as 'N/A').
    The value is scaled by the largest power of 1024 not exceeding it and
    printed with two decimals, e.g. 1536 -> '1.50KiB'.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the exponent inside the suffix table: huge values would raise
        # IndexError, and tiny positive values give a negative exponent that
        # would silently select a suffix from the end of the list.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1105
1106
1107 def parse_filesize(s):
1108     if s is None:
1109         return None
1110
1111     # The lower-case forms are of course incorrect and inofficial,
1112     # but we support those too
1113     _UNIT_TABLE = {
1114         'B': 1,
1115         'b': 1,
1116         'KiB': 1024,
1117         'KB': 1000,
1118         'kB': 1024,
1119         'Kb': 1000,
1120         'MiB': 1024 ** 2,
1121         'MB': 1000 ** 2,
1122         'mB': 1024 ** 2,
1123         'Mb': 1000 ** 2,
1124         'GiB': 1024 ** 3,
1125         'GB': 1000 ** 3,
1126         'gB': 1024 ** 3,
1127         'Gb': 1000 ** 3,
1128         'TiB': 1024 ** 4,
1129         'TB': 1000 ** 4,
1130         'tB': 1024 ** 4,
1131         'Tb': 1000 ** 4,
1132         'PiB': 1024 ** 5,
1133         'PB': 1000 ** 5,
1134         'pB': 1024 ** 5,
1135         'Pb': 1000 ** 5,
1136         'EiB': 1024 ** 6,
1137         'EB': 1000 ** 6,
1138         'eB': 1024 ** 6,
1139         'Eb': 1000 ** 6,
1140         'ZiB': 1024 ** 7,
1141         'ZB': 1000 ** 7,
1142         'zB': 1024 ** 7,
1143         'Zb': 1000 ** 7,
1144         'YiB': 1024 ** 8,
1145         'YB': 1000 ** 8,
1146         'yB': 1024 ** 8,
1147         'Yb': 1000 ** 8,
1148     }
1149
1150     units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1151     m = re.match(
1152         r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1153     if not m:
1154         return None
1155
1156     num_str = m.group('num').replace(',', '.')
1157     mult = _UNIT_TABLE[m.group('unit')]
1158     return int(float(num_str) * mult)
1159
1160
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be found.

    The COLUMNS environment variable takes precedence; otherwise the output
    of `stty size` is used on a best-effort basis.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        # `stty size` prints "<rows> <columns>"
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, no tty, unparsable output), but do
        # not swallow KeyboardInterrupt/SystemExit like a bare except would
        pass
    return None
1175
1176
def month_by_name(name):
    """ Return the month number (1-12) for an English month name, independent
    of the locale; None if the name is not recognized """

    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    if name in ENGLISH_NAMES:
        return ENGLISH_NAMES.index(name) + 1
    return None
1187
1188
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start a known entity by '&amp;'"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1195
1196
def setproctitle(title):
    """Set the process name through libc's prctl, when available.

    Does nothing if libc.so.6 cannot be loaded or has no prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Initialising the buffer from the bytes makes it one byte longer than
    # the title and NUL-terminated. A buffer sized exactly len(title) gets no
    # terminator, so prctl could read past its end.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        # Option 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1210
1211
def remove_start(s, start):
    """Return s without the prefix start, or s unchanged if it has no such prefix."""
    return s[len(start):] if s.startswith(start) else s
1216
1217
def remove_end(s, end):
    """Return s without the suffix end, or s unchanged if it has no such suffix."""
    # An empty suffix must be skipped explicitly: s[:-0] is s[:0], i.e. the
    # empty string, not s.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1222
1223
def url_basename(url):
    """Return the last segment of the URL's path, ignoring surrounding slashes."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1227
1228
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
1232
1233
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, multiply by invscale and floor-divide by scale.

    When get_attr is given, the value is first read from that attribute of v
    (missing attributes count as None). None and the empty string yield
    default.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1241
1242
def str_or_none(v, default=None):
    """Return compat_str(v), or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1245
1246
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are dropped before conversion """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1253
1254
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, multiply by invscale and divide by scale.

    Returns default when v is None.
    """
    if v is None:
        return default
    return float(v) * invscale / scale
1257
1258
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Returns None for non-string input or when the string is not in one of
    the formats accepted by the pattern below. "N minutes" and "N hours"
    inputs give a float; clock-like inputs give an int, or a float when a
    fractional seconds part is present.
    """
    string_type = basestring if sys.version_info < (3, 0) else compat_str
    if not isinstance(s, string_type):
        return None

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if not m:
        return None

    # Durations given purely in minutes or hours
    only_mins = m.group('only_mins')
    if only_mins:
        return float_or_none(only_mins, invscale=60)
    only_hours = m.group('only_hours')
    if only_hours:
        return float_or_none(only_hours, invscale=60 * 60)

    # Clock-like durations: sum up every component that is present
    total = 0
    for group, factor in (('secs', 1), ('mins', 60), ('hours', 60 * 60)):
        component = m.group(group)
        if component:
            total += int(component) * factor
    fraction = m.group('ms')
    if fraction:
        total += float(fraction)
    return total
1293
1294
def prepend_extension(filename, ext):
    """Insert ext in front of the filename's real extension (a.mp4 -> a.ext.mp4)."""
    stem, real_ext = os.path.splitext(filename)
    parts = (stem, ext, real_ext)
    return '{0}.{1}{2}'.format(*parts)
1298
1299
def check_executable(exe, args=[]):
    """ Runs the given binary and returns its name, or False if it could not
    be started (OSError).
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1308
1309
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Runs the executable and returns the version found in its output by
    detect_exe_version, or False if the executable could not be started """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1323
1324
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Return the first group of version_re found in output.

    version_re defaults to a pattern matching "version <something>"; when
    nothing matches, unrecognized is returned.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    if not found:
        return unrecognized
    return found.group(1)
1334
1335
class PagedList(object):
    """Base class for paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1340
1341
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages one by one until a page comes back short.

    pagefunc(pagenum) must return an iterable with the entries of that page;
    pagesize is the number of entries on a full page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list (to the end if end is None)."""
        pagesize = self._pagesize
        collected = []
        for pagenum in itertools.count(start // pagesize):
            firstid = pagenum * pagesize
            nextfirstid = firstid + pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            if firstid <= start < nextfirstid:
                startv = start % pagesize
            else:
                startv = 0

            # Offset just past the last wanted entry, if the slice ends here
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % pagesize) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            collected.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return collected
1383
1384
class InAdvancePagedList(PagedList):
    """Paged list whose number of pages is known in advance.

    pagefunc(pagenum) must return an iterable with the entries of that page;
    pagecount is the total number of pages and pagesize the number of entries
    on a full page.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list (to the end if end is None)."""
        res = []
        start_page = start // self._pagesize
        # Never go past the last known page, even when end points beyond the
        # end of the list; otherwise pagefunc is called for pages that do
        # not exist.
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        # Entries to drop from the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Entries still wanted; None means "everything"
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1412
1413
def uppercase_escape(s):
    """Replace every literal \\UXXXXXXXX escape in s by the character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (text, consumed); only the text is needed
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1420
1421
def escape_rfc3986(s):
    """Percent-escape non-ASCII characters as suggested by RFC 3986"""
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, unicode)
    if needs_encoding:
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1427
1428
def escape_url(url):
    """Escape the path, params, query and fragment of a URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1438
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """struct.pack accepting a text format spec."""
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        """struct.unpack accepting a text format spec."""
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    # struct handles text format specs natively; use it directly
    struct_pack, struct_unpack = struct.pack, struct.unpack
1455
1456
def read_batch_urls(batch_fd):
    """Read the URLs from a batch file object and close it.

    Each line is decoded from UTF-8 if it is a byte string, stripped of a
    leading byte order mark and surrounding whitespace, and dropped if it is
    empty or starts with '#', ';' or ']'. Returns the list of remaining URLs.
    """
    # The BOM shows up as U+FEFF when the data was decoded as UTF-8, or as
    # the three characters below when its bytes were decoded one by one.
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1471
1472
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1475
1476
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        """Fallback for Element.iter built on findall."""
        return n.findall('.//*')
1481
1482
def parse_xml(s):
    """Parse the XML text s and return its root element.

    Doctype declarations are ignored. On Python 2, byte-string text nodes
    are decoded from UTF-8.
    """
    class DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=DoctypeIgnoringBuilder())
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            text = node.text
            if text is not None and not isinstance(text, compat_str):
                node.text = text.decode('utf-8')
    return tree
1498
1499
# Numeric age limit for each rating label; parse_age_limit looks up
# non-numeric inputs here
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1507
1508
def parse_age_limit(s):
    """Turn an age limit such as '18', '16+' or a US_RATINGS label into an int.

    Returns None for None and for unrecognized values.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
1514
1515
def strip_jsonp(code):
    """Remove a JSONP callback wrapper, returning just the wrapped payload."""
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_wrapper, r'\1', code)
1519
1520
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Single-quoted strings and bare identifiers are turned into double-quoted
    strings (true, false and null are kept), and a trailing comma before a
    closing bracket is removed.
    """
    # How escape sequences inside a single-quoted string are rewritten
    requoted = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def requote(esc):
        return requoted[esc.group(0)]

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(r"\\\\|\\'|\"", requote, token[1:-1])
        return '"%s"' % token

    converted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), converted)
1544
1545
def qualities(quality_ids):
    """ Build a function mapping a quality id to its position in quality_ids
    (-1 for unknown ids) """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1554
1555
# %-style template with title, id and ext keys
# (presumably the default output filename template)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1557
1558
def limit_length(s, length):
    """ Shorten strings longer than length, ending them with an ellipsis """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    keep = length - len(ELLIPSES)
    return s[:keep] + ELLIPSES
1567
1568
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
1571
1572
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    Missing or unparsable versions count as new when assume_new is true and
    as outdated otherwise.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
    return outdated
1580
1581
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U, i.e. this module was
    loaded by a zipimporter or sys has a 'frozen' attribute """
    from zipimport import zipimporter

    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
1587
1588
def args_to_str(args):
    """Return a short, shell-quoted string representation of a subprocess command."""
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
1592
1593
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of an opened URL handle.

    The filename of an attachment Content-Disposition header is preferred;
    otherwise the subtype of the Content-Type header is returned (without
    parameters such as "; charset=..."). Returns None if there is no usable
    Content-Type header.
    """
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    content_type = getheader('Content-Type')
    if not content_type:
        # Header missing: nothing to derive an extension from
        return None
    # Drop parameters so that "text/html; charset=utf-8" gives "html"
    mimetype = content_type.split(';')[0].strip()
    type_parts = mimetype.split('/')
    return type_parts[1] if len(type_parts) > 1 else None
1610
1611
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked, i.e. both limits are
    set and age_limit is below content_limit """

    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
1620
1621
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes:
    after decoding (honouring a byte order mark, UTF-8 otherwise), the first
    non-whitespace character must be '<'. Returns a match object or None. """

    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    encoding, payload = 'utf-8', first_bytes
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, payload = enc, first_bytes[len(bom):]
            break
    text = payload.decode(encoding, 'replace')

    return re.match(r'^\s*<', text)
1640
1641
def determine_protocol(info_dict):
    """Return the download protocol for an info dict.

    An explicit 'protocol' entry wins; otherwise the protocol is derived
    from the URL prefix (rtmp, mms, rtsp), then from the extension
    (m3u8, f4m), and finally from the URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    return compat_urllib_parse_urlparse(url).scheme