_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import functools
  14 import gzip
  15 import itertools
  16 import io
  17 import json
  18 import locale
  19 import math
  20 import os
  21 import pipes
  22 import platform
  23 import re
  24 import ssl
  25 import socket
  26 import struct
  27 import subprocess
  28 import sys
  29 import tempfile
  30 import traceback
  31 import xml.etree.ElementTree
  32 import zlib
  33
  34 from .compat import (
  35     compat_basestring,
  36     compat_chr,
  37     compat_getenv,
  38     compat_html_entities,
  39     compat_http_client,
  40     compat_parse_qs,
  41     compat_socket_create_connection,
  42     compat_str,
  43     compat_urllib_error,
  44     compat_urllib_parse,
  45     compat_urllib_parse_urlparse,
  46     compat_urllib_request,
  47     compat_urlparse,
  48     shlex_quote,
  49 )
  50
  51
  52 # This is not clearly defined otherwise
  53 compiled_regex_type = type(re.compile(''))
  54
  55 std_headers = {
  56     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
  57     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  58     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  59     'Accept-Encoding': 'gzip, deflate',
  60     'Accept-Language': 'en-us,en;q=0.5',
  61 }
  62
  63
  64 def preferredencoding():
  65     """Get preferred encoding.
  66
  67     Returns the best encoding scheme for the system, based on
  68     locale.getpreferredencoding() and some further tweaks.
  69     """
  70     try:
  71         pref = locale.getpreferredencoding()
  72         'TEST'.encode(pref)
  73     except:
  74         pref = 'UTF-8'
  75
  76     return pref
  77
  78
  79 def write_json_file(obj, fn):
  80     """ Encode obj as JSON and write it to fn, atomically if possible """
  81
  82     fn = encodeFilename(fn)
  83     if sys.version_info < (3, 0) and sys.platform != 'win32':
  84         encoding = get_filesystem_encoding()
  85         # os.path.basename returns a bytes object, but NamedTemporaryFile
  86         # will fail if the filename contains non ascii characters unless we
  87         # use a unicode object
  88         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  89         # the same for os.path.dirname
  90         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  91     else:
  92         path_basename = os.path.basename
  93         path_dirname = os.path.dirname
  94
  95     args = {
  96         'suffix': '.tmp',
  97         'prefix': path_basename(fn) + '.',
  98         'dir': path_dirname(fn),
  99         'delete': False,
 100     }
 101
 102     # In Python 2.x, json.dump expects a bytestream.
 103     # In Python 3.x, it writes to a character stream
 104     if sys.version_info < (3, 0):
 105         args['mode'] = 'wb'
 106     else:
 107         args.update({
 108             'mode': 'w',
 109             'encoding': 'utf-8',
 110         })
 111
 112     tf = tempfile.NamedTemporaryFile(**args)
 113
 114     try:
 115         with tf:
 116             json.dump(obj, tf)
 117         if sys.platform == 'win32':
 118             # Need to remove existing file on Windows, else os.rename raises
 119             # WindowsError or FileExistsError.
 120             try:
 121                 os.unlink(fn)
 122             except OSError:
 123                 pass
 124         os.rename(tf.name, fn)
 125     except:
 126         try:
 127             os.remove(tf.name)
 128         except OSError:
 129             pass
 130         raise
 131
 132
 133 if sys.version_info >= (2, 7):
 134     def find_xpath_attr(node, xpath, key, val):
 135         """ Find the xpath xpath[@key=val] """
 136         assert re.match(r'^[a-zA-Z-]+$', key)
 137         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 138         expr = xpath + "[@%s='%s']" % (key, val)
 139         return node.find(expr)
 140 else:
 141     def find_xpath_attr(node, xpath, key, val):
 142         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 143         # .//node does not match if a node is a direct child of . !
 144         if isinstance(xpath, compat_str):
 145             xpath = xpath.encode('ascii')
 146
 147         for f in node.findall(xpath):
 148             if f.attrib.get(key) == val:
 149                 return f
 150         return None
 151
 152 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 153 # the namespace parameter
 154
 155
 156 def xpath_with_ns(path, ns_map):
 157     components = [c.split(':') for c in path.split('/')]
 158     replaced = []
 159     for c in components:
 160         if len(c) == 1:
 161             replaced.append(c[0])
 162         else:
 163             ns, tag = c
 164             replaced.append('{%s}%s' % (ns_map[ns], tag))
 165     return '/'.join(replaced)
 166
 167
 168 def xpath_text(node, xpath, name=None, fatal=False):
 169     if sys.version_info < (2, 7):  # Crazy 2.6
 170         xpath = xpath.encode('ascii')
 171
 172     n = node.find(xpath)
 173     if n is None or n.text is None:
 174         if fatal:
 175             name = xpath if name is None else name
 176             raise ExtractorError('Could not find XML element %s' % name)
 177         else:
 178             return None
 179     return n.text
 180
 181
 182 def get_element_by_id(id, html):
 183     """Return the content of the tag with the specified ID in the passed HTML document"""
 184     return get_element_by_attribute("id", id, html)
 185
 186
 187 def get_element_by_attribute(attribute, value, html):
 188     """Return the content of the tag with the specified attribute in the passed HTML document"""
 189
 190     m = re.search(r'''(?xs)
 191         <([a-zA-Z0-9:._-]+)
 192          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 193          \s+%s=['"]?%s['"]?
 194          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 195         \s*>
 196         (?P<content>.*?)
 197         </\1>
 198     ''' % (re.escape(attribute), re.escape(value)), html)
 199
 200     if not m:
 201         return None
 202     res = m.group('content')
 203
 204     if res.startswith('"') or res.startswith("'"):
 205         res = res[1:-1]
 206
 207     return unescapeHTML(res)
 208
 209
 210 def clean_html(html):
 211     """Clean an HTML snippet into a readable string"""
 212
 213     if html is None:  # Convenience for sanitizing descriptions etc.
 214         return html
 215
 216     # Newline vs <br />
 217     html = html.replace('\n', ' ')
 218     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 219     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 220     # Strip html tags
 221     html = re.sub('<.*?>', '', html)
 222     # Replace html entities
 223     html = unescapeHTML(html)
 224     return html.strip()
 225
 226
 227 def sanitize_open(filename, open_mode):
 228     """Try to open the given filename, and slightly tweak it if this fails.
 229
 230     Attempts to open the given filename. If this fails, it tries to change
 231     the filename slightly, step by step, until it's either able to open it
 232     or it fails and raises a final exception, like the standard open()
 233     function.
 234
 235     It returns the tuple (stream, definitive_file_name).
 236     """
 237     try:
 238         if filename == '-':
 239             if sys.platform == 'win32':
 240                 import msvcrt
 241                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 242             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 243         stream = open(encodeFilename(filename), open_mode)
 244         return (stream, filename)
 245     except (IOError, OSError) as err:
 246         if err.errno in (errno.EACCES,):
 247             raise
 248
 249         # In case of error, try to remove win32 forbidden chars
 250         alt_filename = os.path.join(
 251             re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
 252             for path_part in os.path.split(filename)
 253         )
 254         if alt_filename == filename:
 255             raise
 256         else:
 257             # An exception here should be caught in the caller
 258             stream = open(encodeFilename(filename), open_mode)
 259             return (stream, alt_filename)
 260
 261
 262 def timeconvert(timestr):
 263     """Convert RFC 2822 defined time string into system timestamp"""
 264     timestamp = None
 265     timetuple = email.utils.parsedate_tz(timestr)
 266     if timetuple is not None:
 267         timestamp = email.utils.mktime_tz(timetuple)
 268     return timestamp
 269
 270
 271 def sanitize_filename(s, restricted=False, is_id=False):
 272     """Sanitizes a string so it could be used as part of a filename.
 273     If restricted is set, use a stricter subset of allowed characters.
 274     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 275     """
 276     def replace_insane(char):
 277         if char == '?' or ord(char) < 32 or ord(char) == 127:
 278             return ''
 279         elif char == '"':
 280             return '' if restricted else '\''
 281         elif char == ':':
 282             return '_-' if restricted else ' -'
 283         elif char in '\\/|*<>':
 284             return '_'
 285         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 286             return '_'
 287         if restricted and ord(char) > 127:
 288             return '_'
 289         return char
 290
 291     # Handle timestamps
 292     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 293     result = ''.join(map(replace_insane, s))
 294     if not is_id:
 295         while '__' in result:
 296             result = result.replace('__', '_')
 297         result = result.strip('_')
 298         # Common case of "Foreign band name - English song title"
 299         if restricted and result.startswith('-_'):
 300             result = result[2:]
 301         if not result:
 302             result = '_'
 303     return result
 304
 305
 306 def orderedSet(iterable):
 307     """ Remove all duplicates from the input iterable """
 308     res = []
 309     for el in iterable:
 310         if el not in res:
 311             res.append(el)
 312     return res
 313
 314
 315 def _htmlentity_transform(entity):
 316     """Transforms an HTML entity to a character."""
 317     # Known non-numeric HTML entity
 318     if entity in compat_html_entities.name2codepoint:
 319         return compat_chr(compat_html_entities.name2codepoint[entity])
 320
 321     mobj = re.match(r'#(x?[0-9]+)', entity)
 322     if mobj is not None:
 323         numstr = mobj.group(1)
 324         if numstr.startswith('x'):
 325             base = 16
 326             numstr = '0%s' % numstr
 327         else:
 328             base = 10
 329         return compat_chr(int(numstr, base))
 330
 331     # Unknown entity in name, return its literal representation
 332     return ('&%s;' % entity)
 333
 334
 335 def unescapeHTML(s):
 336     if s is None:
 337         return None
 338     assert type(s) == compat_str
 339
 340     return re.sub(
 341         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 342
 343
 344 def encodeFilename(s, for_subprocess=False):
 345     """
 346     @param s The name of the file
 347     """
 348
 349     assert type(s) == compat_str
 350
 351     # Python 3 has a Unicode API
 352     if sys.version_info >= (3, 0):
 353         return s
 354
 355     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 356         # Pass '' directly to use Unicode APIs on Windows 2000 and up
 357         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 358         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 359         if not for_subprocess:
 360             return s
 361         else:
 362             # For subprocess calls, encode with locale encoding
 363             # Refer to http://stackoverflow.com/a/9951851/35070
 364             encoding = preferredencoding()
 365     else:
 366         encoding = sys.getfilesystemencoding()
 367     if encoding is None:
 368         encoding = 'utf-8'
 369     return s.encode(encoding, 'ignore')
 370
 371
 372 def encodeArgument(s):
 373     if not isinstance(s, compat_str):
 374         # Legacy code that uses byte strings
 375         # Uncomment the following line after fixing all post processors
 376         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 377         s = s.decode('ascii')
 378     return encodeFilename(s, True)
 379
 380
 381 def decodeOption(optval):
 382     if optval is None:
 383         return optval
 384     if isinstance(optval, bytes):
 385         optval = optval.decode(preferredencoding())
 386
 387     assert isinstance(optval, compat_str)
 388     return optval
 389
 390
 391 def formatSeconds(secs):
 392     if secs > 3600:
 393         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 394     elif secs > 60:
 395         return '%d:%02d' % (secs // 60, secs % 60)
 396     else:
 397         return '%d' % secs
 398
 399
 400 def make_HTTPS_handler(params, **kwargs):
 401     opts_no_check_certificate = params.get('nocheckcertificate', False)
 402     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 403         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 404         if opts_no_check_certificate:
 405             context.check_hostname = False
 406             context.verify_mode = ssl.CERT_NONE
 407         try:
 408             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 409         except TypeError:
 410             # Python 2.7.8
 411             # (create_default_context present but HTTPSHandler has no context=)
 412             pass
 413
 414     if sys.version_info < (3, 2):
 415         return YoutubeDLHTTPSHandler(params, **kwargs)
 416     else:  # Python < 3.4
 417         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 418         context.verify_mode = (ssl.CERT_NONE
 419                                if opts_no_check_certificate
 420                                else ssl.CERT_REQUIRED)
 421         context.set_default_verify_paths()
 422         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 423
 424
 425 class ExtractorError(Exception):
 426     """Error during info extraction."""
 427
 428     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 429         """ tb, if given, is the original traceback (so that it can be printed out).
 430         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 431         """
 432
 433         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 434             expected = True
 435         if video_id is not None:
 436             msg = video_id + ': ' + msg
 437         if cause:
 438             msg += ' (caused by %r)' % cause
 439         if not expected:
 440             if ytdl_is_updateable():
 441                 update_cmd = 'type  youtube-dl -U  to update'
 442             else:
 443                 update_cmd = 'see  https://yt-dl.org/update  on how to update'
 444             msg += '; please report this issue on https://yt-dl.org/bug .'
 445             msg += ' Make sure you are using the latest version; %s.' % update_cmd
 446             msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 447         super(ExtractorError, self).__init__(msg)
 448
 449         self.traceback = tb
 450         self.exc_info = sys.exc_info()  # preserve original exception
 451         self.cause = cause
 452         self.video_id = video_id
 453
 454     def format_traceback(self):
 455         if self.traceback is None:
 456             return None
 457         return ''.join(traceback.format_tb(self.traceback))
 458
 459
 460 class UnsupportedError(ExtractorError):
 461     def __init__(self, url):
 462         super(UnsupportedError, self).__init__(
 463             'Unsupported URL: %s' % url, expected=True)
 464         self.url = url
 465
 466
 467 class RegexNotFoundError(ExtractorError):
 468     """Error when a regex didn't match"""
 469     pass
 470
 471
 472 class DownloadError(Exception):
 473     """Download Error exception.
 474
 475     This exception may be thrown by FileDownloader objects if they are not
 476     configured to continue on errors. They will contain the appropriate
 477     error message.
 478     """
 479
 480     def __init__(self, msg, exc_info=None):
 481         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 482         super(DownloadError, self).__init__(msg)
 483         self.exc_info = exc_info
 484
 485
 486 class SameFileError(Exception):
 487     """Same File exception.
 488
 489     This exception will be thrown by FileDownloader objects if they detect
 490     multiple files would have to be downloaded to the same file on disk.
 491     """
 492     pass
 493
 494
 495 class PostProcessingError(Exception):
 496     """Post Processing exception.
 497
 498     This exception may be raised by PostProcessor's .run() method to
 499     indicate an error in the postprocessing task.
 500     """
 501
 502     def __init__(self, msg):
 503         self.msg = msg
 504
 505
 506 class MaxDownloadsReached(Exception):
 507     """ --max-downloads limit has been reached. """
 508     pass
 509
 510
 511 class UnavailableVideoError(Exception):
 512     """Unavailable Format exception.
 513
 514     This exception will be thrown when a video is requested
 515     in a format that is not available for that video.
 516     """
 517     pass
 518
 519
 520 class ContentTooShortError(Exception):
 521     """Content Too Short exception.
 522
 523     This exception may be raised by FileDownloader objects when a file they
 524     download is too small for what the server announced first, indicating
 525     the connection was probably interrupted.
 526     """
 527     # Both in bytes
 528     downloaded = None
 529     expected = None
 530
 531     def __init__(self, downloaded, expected):
 532         self.downloaded = downloaded
 533         self.expected = expected
 534
 535
 536 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 537     hc = http_class(*args, **kwargs)
 538     source_address = ydl_handler._params.get('source_address')
 539     if source_address is not None:
 540         sa = (source_address, 0)
 541         if hasattr(hc, 'source_address'):  # Python 2.7+
 542             hc.source_address = sa
 543         else:  # Python 2.6
 544             def _hc_connect(self, *args, **kwargs):
 545                 sock = compat_socket_create_connection(
 546                     (self.host, self.port), self.timeout, sa)
 547                 if is_https:
 548                     self.sock = ssl.wrap_socket(
 549                         sock, self.key_file, self.cert_file,
 550                         ssl_version=ssl.PROTOCOL_TLSv1)
 551                 else:
 552                     self.sock = sock
 553             hc.connect = functools.partial(_hc_connect, hc)
 554
 555     return hc
 556
 557
 558 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 559     """Handler for HTTP requests and responses.
 560
 561     This class, when installed with an OpenerDirector, automatically adds
 562     the standard headers to every HTTP request and handles gzipped and
 563     deflated responses from web servers. If compression is to be avoided in
 564     a particular request, the original request in the program code only has
 565     to include the HTTP header "Youtubedl-No-Compression", which will be
 566     removed before making the real request.
 567
 568     Part of this code was copied from:
 569
 570     http://techknack.net/python-urllib2-handlers/
 571
 572     Andrew Rowls, the author of that code, agreed to release it to the
 573     public domain.
 574     """
 575
 576     def __init__(self, params, *args, **kwargs):
 577         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
 578         self._params = params
 579
 580     def http_open(self, req):
 581         return self.do_open(functools.partial(
 582             _create_http_connection, self, compat_http_client.HTTPConnection, False),
 583             req)
 584
 585     @staticmethod
 586     def deflate(data):
 587         try:
 588             return zlib.decompress(data, -zlib.MAX_WBITS)
 589         except zlib.error:
 590             return zlib.decompress(data)
 591
 592     @staticmethod
 593     def addinfourl_wrapper(stream, headers, url, code):
 594         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 595             return compat_urllib_request.addinfourl(stream, headers, url, code)
 596         ret = compat_urllib_request.addinfourl(stream, headers, url)
 597         ret.code = code
 598         return ret
 599
 600     def http_request(self, req):
 601         for h, v in std_headers.items():
 602             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
 603             # The dict keys are capitalized because of this bug by urllib
 604             if h.capitalize() not in req.headers:
 605                 req.add_header(h, v)
 606         if 'Youtubedl-no-compression' in req.headers:
 607             if 'Accept-encoding' in req.headers:
 608                 del req.headers['Accept-encoding']
 609             del req.headers['Youtubedl-no-compression']
 610
 611         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 612             # Python 2.6 is brain-dead when it comes to fragments
 613             req._Request__original = req._Request__original.partition('#')[0]
 614             req._Request__r_type = req._Request__r_type.partition('#')[0]
 615
 616         return req
 617
 618     def http_response(self, req, resp):
 619         old_resp = resp
 620         # gzip
 621         if resp.headers.get('Content-encoding', '') == 'gzip':
 622             content = resp.read()
 623             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 624             try:
 625                 uncompressed = io.BytesIO(gz.read())
 626             except IOError as original_ioerror:
 627                 # There may be junk add the end of the file
 628                 # See http://stackoverflow.com/q/4928560/35070 for details
 629                 for i in range(1, 1024):
 630                     try:
 631                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 632                         uncompressed = io.BytesIO(gz.read())
 633                     except IOError:
 634                         continue
 635                     break
 636                 else:
 637                     raise original_ioerror
 638             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 639             resp.msg = old_resp.msg
 640         # deflate
 641         if resp.headers.get('Content-encoding', '') == 'deflate':
 642             gz = io.BytesIO(self.deflate(resp.read()))
 643             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 644             resp.msg = old_resp.msg
 645         return resp
 646
 647     https_request = http_request
 648     https_response = http_response
 649
 650
 651 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 652     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 653         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 654         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 655         self._params = params
 656
 657     def https_open(self, req):
 658         kwargs = {}
 659         if hasattr(self, '_context'):  # python > 2.6
 660             kwargs['context'] = self._context
 661         if hasattr(self, '_check_hostname'):  # python 3.x
 662             kwargs['check_hostname'] = self._check_hostname
 663         return self.do_open(functools.partial(
 664             _create_http_connection, self, self._https_conn_class, True),
 665             req, **kwargs)
 666
 667
 668 def parse_iso8601(date_str, delimiter='T'):
 669     """ Return a UNIX timestamp from the given date """
 670
 671     if date_str is None:
 672         return None
 673
 674     m = re.search(
 675         r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 676         date_str)
 677     if not m:
 678         timezone = datetime.timedelta()
 679     else:
 680         date_str = date_str[:-len(m.group(0))]
 681         if not m.group('sign'):
 682             timezone = datetime.timedelta()
 683         else:
 684             sign = 1 if m.group('sign') == '+' else -1
 685             timezone = datetime.timedelta(
 686                 hours=sign * int(m.group('hours')),
 687                 minutes=sign * int(m.group('minutes')))
 688     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 689     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 690     return calendar.timegm(dt.timetuple())
 691
 692
 693 def unified_strdate(date_str, day_first=True):
 694     """Return a string with the date in the format YYYYMMDD"""
 695
 696     if date_str is None:
 697         return None
 698     upload_date = None
 699     # Replace commas
 700     date_str = date_str.replace(',', ' ')
 701     # %z (UTC offset) is only supported in python>=3.2
 702     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 703     # Remove AM/PM + timezone
 704     date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
 705
 706     format_expressions = [
 707         '%d %B %Y',
 708         '%d %b %Y',
 709         '%B %d %Y',
 710         '%b %d %Y',
 711         '%b %dst %Y %I:%M%p',
 712         '%b %dnd %Y %I:%M%p',
 713         '%b %dth %Y %I:%M%p',
 714         '%Y %m %d',
 715         '%Y-%m-%d',
 716         '%Y/%m/%d',
 717         '%Y/%m/%d %H:%M:%S',
 718         '%Y-%m-%d %H:%M:%S',
 719         '%Y-%m-%d %H:%M:%S.%f',
 720         '%d.%m.%Y %H:%M',
 721         '%d.%m.%Y %H.%M',
 722         '%Y-%m-%dT%H:%M:%SZ',
 723         '%Y-%m-%dT%H:%M:%S.%fZ',
 724         '%Y-%m-%dT%H:%M:%S.%f0Z',
 725         '%Y-%m-%dT%H:%M:%S',
 726         '%Y-%m-%dT%H:%M:%S.%f',
 727         '%Y-%m-%dT%H:%M',
 728     ]
 729     if day_first:
 730         format_expressions.extend([
 731             '%d.%m.%Y',
 732             '%d/%m/%Y',
 733             '%d/%m/%y',
 734             '%d/%m/%Y %H:%M:%S',
 735         ])
 736     else:
 737         format_expressions.extend([
 738             '%m.%d.%Y',
 739             '%m/%d/%Y',
 740             '%m/%d/%y',
 741             '%m/%d/%Y %H:%M:%S',
 742         ])
 743     for expression in format_expressions:
 744         try:
 745             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 746         except ValueError:
 747             pass
 748     if upload_date is None:
 749         timetuple = email.utils.parsedate_tz(date_str)
 750         if timetuple:
 751             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 752     return upload_date
 753
 754
 755 def determine_ext(url, default_ext='unknown_video'):
 756     if url is None:
 757         return default_ext
 758     guess = url.partition('?')[0].rpartition('.')[2]
 759     if re.match(r'^[A-Za-z0-9]+$', guess):
 760         return guess
 761     else:
 762         return default_ext
 763
 764
 765 def subtitles_filename(filename, sub_lang, sub_format):
 766     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 767
 768
 769 def date_from_str(date_str):
 770     """
 771     Return a datetime object from a string in the format YYYYMMDD or
 772     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 773     today = datetime.date.today()
 774     if date_str in ('now', 'today'):
 775         return today
 776     if date_str == 'yesterday':
 777         return today - datetime.timedelta(days=1)
 778     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 779     if match is not None:
 780         sign = match.group('sign')
 781         time = int(match.group('time'))
 782         if sign == '-':
 783             time = -time
 784         unit = match.group('unit')
 785         # A bad aproximation?
 786         if unit == 'month':
 787             unit = 'day'
 788             time *= 30
 789         elif unit == 'year':
 790             unit = 'day'
 791             time *= 365
 792         unit += 's'
 793         delta = datetime.timedelta(**{unit: time})
 794         return today + delta
 795     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 796
 797
 798 def hyphenate_date(date_str):
 799     """
 800     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 801     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 802     if match is not None:
 803         return '-'.join(match.groups())
 804     else:
 805         return date_str
 806
 807
 808 class DateRange(object):
 809     """Represents a time interval between two dates"""
 810
 811     def __init__(self, start=None, end=None):
 812         """start and end must be strings in the format accepted by date"""
 813         if start is not None:
 814             self.start = date_from_str(start)
 815         else:
 816             self.start = datetime.datetime.min.date()
 817         if end is not None:
 818             self.end = date_from_str(end)
 819         else:
 820             self.end = datetime.datetime.max.date()
 821         if self.start > self.end:
 822             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 823
 824     @classmethod
 825     def day(cls, day):
 826         """Returns a range that only contains the given day"""
 827         return cls(day, day)
 828
 829     def __contains__(self, date):
 830         """Check if the date is in the range"""
 831         if not isinstance(date, datetime.date):
 832             date = date_from_str(date)
 833         return self.start <= date <= self.end
 834
 835     def __str__(self):
 836         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 837
 838
 839 def platform_name():
 840     """ Returns the platform name as a compat_str """
 841     res = platform.platform()
 842     if isinstance(res, bytes):
 843         res = res.decode(preferredencoding())
 844
 845     assert isinstance(res, compat_str)
 846     return res
 847
 848
def _windows_write_string(s, out):
    """ Write s to a Windows console through the WriteConsoleW API.

    Returns True if the string was written using special methods,
    False if it has yet to be written out (out has no usable file
    descriptor, is not stdout/stderr, or is not attached to a console).
    Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps the C runtime file descriptor (1 = stdout, 2 = stderr) to the
    # argument of GetStdHandle (STD_OUTPUT_HANDLE / STD_ERROR_HANDLE).
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Build typed foreign-function prototypes for the kernel32 calls below
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is invalid, is not a character device
        # (ignoring the "remote" flag), or GetConsoleMode fails on it.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters that stop before the next
    # non-BMP character; a leading non-BMP character (count == 0) is
    # written on its own with a length of 2.
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Only drop what was actually written; the rest is retried
            s = s[written.value:]
    return True
 922
 923
 924 def write_string(s, out=None, encoding=None):
 925     if out is None:
 926         out = sys.stderr
 927     assert type(s) == compat_str
 928
 929     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 930         if _windows_write_string(s, out):
 931             return
 932
 933     if ('b' in getattr(out, 'mode', '') or
 934             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 935         byt = s.encode(encoding or preferredencoding(), 'ignore')
 936         out.write(byt)
 937     elif hasattr(out, 'buffer'):
 938         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 939         byt = s.encode(enc, 'ignore')
 940         out.buffer.write(byt)
 941     else:
 942         out.write(s)
 943     out.flush()
 944
 945
 946 def bytes_to_intlist(bs):
 947     if not bs:
 948         return []
 949     if isinstance(bs[0], int):  # Python 3
 950         return list(bs)
 951     else:
 952         return [ord(c) for c in bs]
 953
 954
 955 def intlist_to_bytes(xs):
 956     if not xs:
 957         return b''
 958     return struct_pack('%dB' % len(xs), *xs)
 959
 960
# Cross-platform file locking
# Both branches define the same pair of helpers:
#   _lock_file(f, exclusive) - lock the open file f (shared unless exclusive)
#   _unlock_file(f)          - release a lock taken by _lock_file
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # ctypes layout of the structure passed to LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count to lock, starting at offset 0
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f through LockFileEx; raise OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED pointer on the file object: _unlock_file
        # passes the same pointer to UnlockFileEx.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Unlock f, previously locked by _lock_file; raise OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock f with flock(): LOCK_EX if exclusive, LOCK_SH otherwise."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock() lock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
1024
1025
class locked_file(object):
    """ Context manager around a file that is locked while in use.

    The file is opened in the constructor with io.open(). Entering the
    context locks it (shared for mode 'r', exclusive for 'a' and 'w');
    leaving the context unlocks and closes it. Iteration, read() and
    write() are forwarded to the underlying file object.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows implementation of _lock_file raises OSError,
            # which is not an IOError on Python 2; catch both so the file
            # handle is never leaked when locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1055
1056
def get_filesystem_encoding():
    """ Return sys.getfilesystemencoding(), or 'utf-8' when it is None. """
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1060
1061
def shell_quote(args):
    """ Quote every argument for the shell and join them with spaces.

    Byte string arguments are decoded with the filesystem encoding first.
    """
    fs_encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(fs_encoding) if isinstance(arg, bytes) else arg

    return ' '.join(pipes.quote(as_text(arg)) for arg in args)
1071
1072
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the last evaluated element
        (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        # The element that failed the predicate has already been yielded
        break
1080
1081
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized as JSON and appended to url as a URL-encoded
    fragment under the '__youtubedl_smuggle' key.
    """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
1088
1089
def unsmuggle_url(smug_url, default=None):
    """ Split a smuggled URL into (url, data).

    Returns (smug_url, default) when the URL carries no smuggled data.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, fragment = smug_url.rpartition('#')
        serialized = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return url, json.loads(serialized)
    return smug_url, default
1097
1098
def format_bytes(bytes):
    """ Format a byte count as a human readable string with a binary suffix.

    bytes may be a number or a numeric string; None yields 'N/A'.
    The value is scaled by powers of 1024 and printed with two decimals,
    e.g. 1536 -> '1.50KiB'. Values below 1024 (including fractions and
    negative numbers) use the plain 'B' suffix and values beyond the
    largest known suffix are expressed in 'YiB'.
    """
    if bytes is None:
        return 'N/A'
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    value = float(bytes)
    exponent = 0
    # Divide step by step instead of taking a floating point logarithm:
    # dividing by 1024 is exact, and the exponent can neither go negative
    # nor run past the end of the suffix table.
    while abs(value) >= 1024.0 and exponent < len(suffixes) - 1:
        value /= 1024.0
        exponent += 1
    return '%.2f%s' % (value, suffixes[exponent])
1111
1112
def parse_filesize(s):
    """ Parse a file size such as '5 KiB' or '1,5MB' into a number of bytes.

    Returns None when s is None or does not start with a number followed
    by a known unit.
    """
    if s is None:
        return None

    # For every prefix: 'XiB' is binary and 'XB' is decimal. The lower-case
    # forms are of course incorrect and unofficial, but we support those
    # too: 'xB' is read as binary and 'Xb' as decimal.
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        _UNIT_TABLE[prefix + 'iB'] = 1024 ** power
        _UNIT_TABLE[prefix + 'B'] = 1000 ** power
        _UNIT_TABLE[prefix.lower() + 'B'] = 1024 ** power
        _UNIT_TABLE[prefix + 'b'] = 1000 ** power

    units_re = '|'.join(re.escape(unit) for unit in _UNIT_TABLE)
    size_re = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re
    m = re.match(size_re, s)
    if not m:
        return None

    # A comma is accepted as the decimal separator
    number = float(m.group('num').replace(',', '.'))
    return int(number * _UNIT_TABLE[m.group('unit')])
1165
1166
def get_term_width():
    """ Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable takes precedence; otherwise the
    output of `stty size` is used on a best-effort basis.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only: stty may be missing or print nothing usable.
        # Unlike a bare except, this lets KeyboardInterrupt and SystemExit
        # propagate.
        pass
    return None
1181
1182
def month_by_name(name):
    """ Return the number (1-12) of a month given its English name,
    independently of the locale, or None if the name is not recognized """

    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
1193
1194
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start a known entity or a
    character reference by '&amp;'"""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
1201
1202
def setproctitle(title):
    """ Set the name of the current process to title (best effort).

    Uses prctl() from libc.so.6; silently does nothing when that library
    cannot be loaded or does not provide prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initializing the buffer from the bytes makes it one item larger than
    # the title, so it is NUL-terminated. A buffer of exactly
    # len(title_bytes) has no terminator and prctl could read past its end.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1215         return  # Strange libc, just skip this
1216
1217
def remove_start(s, start):
    """ Return s without the prefix start; s unchanged if it has no such prefix. """
    return s[len(start):] if s.startswith(start) else s
1222
1223
def remove_end(s, end):
    """ Return s without the suffix end; s unchanged if it has no such suffix. """
    # An empty suffix must be special-cased: s[:-0] is s[:0], i.e. ''
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1228
1229
def url_basename(url):
    """ Return the last component of the path of url, ignoring leading
    and trailing slashes. """
    path_segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return path_segments[-1]
1233
1234
class HEADRequest(compat_urllib_request.Request):
    """ Request whose HTTP method is always HEAD. """

    def get_method(self):
        return "HEAD"
1238
1239
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to an int, returning default for None and ''.

    With get_attr, the attribute of that name is read from v first (a
    missing attribute counts as None). The result is multiplied by
    invscale and floor-divided by scale.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1247
1248
def str_or_none(v, default=None):
    """ Convert v to a compat_str, returning default when v is None. """
    if v is None:
        return default
    return compat_str(v)
1251
1252
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus
    signs are dropped before the conversion """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1259
1260
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to a float multiplied by invscale and divided by scale;
    return default when v is None. """
    if v is None:
        return default
    return float(v) * invscale / scale
1263
1264
def parse_duration(s):
    """ Parse a duration string into a number of seconds.

    Accepts a lone amount of minutes or hours ('3 min', '1.5 hours') as
    well as [[[days:]hours:]minutes:]seconds[.fraction] written with
    colons or unit names, optionally prefixed by 'T' or 'PT'.
    Returns None for anything that is not a string or does not match.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None

    # A lone amount of minutes or hours may be fractional
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    total = 0
    seconds_per_unit = (
        ('secs', 1),
        ('mins', 60),
        ('hours', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    for group_name, factor in seconds_per_unit:
        amount = m.group(group_name)
        if amount:
            total += int(amount) * factor
    # The 'ms' group holds the fractional part of the seconds, dot included
    fraction = m.group('ms')
    if fraction:
        total += float(fraction)
    return total
1304
1305
def prepend_extension(filename, ext):
    """ Insert ext before the existing extension of filename,
    e.g. ('video.mp4', 'temp') gives 'video.temp.mp4'. """
    stem, current_ext = os.path.splitext(filename)
    pieces = (stem, ext, current_ext)
    return '{0}.{1}{2}'.format(*pieces)
1309
1310
def check_executable(exe, args=[]):
    """ Check whether the given binary can be run, and return its name
    if so (False otherwise).
    args can be a list of arguments for a short output (like -version) """
    try:
        process = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        # Typically: the executable could not be found or started
        return False
    return exe
1319
1320
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Run the executable with args and return the version found in its
    combined stdout/stderr output (see detect_exe_version),
    or False if the executable could not be started """
    try:
        process = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):
        # Undecodable bytes are dropped
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1334
1335
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Return the first group of version_re found in output, or
    unrecognized if the pattern does not match.

    Without version_re, the token following the word 'version' is used.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1345
1346
class PagedList(object):
    """ Base for paged lists; __len__ relies on a getslice() method that
    this class does not define itself. """

    def __len__(self):
        # This is only useful for tests
        # (it materializes the whole list through getslice())
        return len(self.getslice())
1351
1352
class OnDemandPagedList(PagedList):
    """ Paged list that fetches pages one by one until a page comes back
    short or the requested slice is complete.

    pagefunc(pagenum) must return an iterable with the entries of the
    zero-based page pagenum; pagesize is the number of entries of a full
    page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries start (inclusive) to end (exclusive) as a
        list; end=None means up to the last available entry. """
        res = []
        # Start with the page that contains index start
        for pagenum in itertools.count(start // self._pagesize):
            # Global indices covered by this page: [firstid, nextfirstid)
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of start inside this page (0 for every later page)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset of end inside this page, or None if end lies beyond it
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1394
1395
class InAdvancePagedList(PagedList):
    """ Paged list for which the number of pages is known in advance.

    pagefunc(pagenum) must return an iterable with the entries of the
    zero-based page pagenum, pagecount is the total number of pages and
    pagesize the number of entries per page.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries start (inclusive) to end (exclusive) as a
        list; end=None means up to the last entry of the last page. """
        res = []
        start_page = start // self._pagesize
        # Never ask for pages past the known page count, even when end
        # points beyond the last entry
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        # Entries to drop from the beginning of the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Entries still wanted (None: no upper bound)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the slice
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1423
1424
def uppercase_escape(s):
    """ Replace every \\UXXXXXXXX escape sequence (8 hex digits) in s by
    the character it denotes. """
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns a (text, consumed length) pair
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1431
1432
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, text is encoded as UTF-8 before being quoted
    encode_first = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if encode_first:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1438
1439
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Only the path, params, query and fragment components are escaped."""
    parsed = compat_urllib_parse_urlparse(url)
    escaped_path = escape_rfc3986(parsed.path)
    escaped_params = escape_rfc3986(parsed.params)
    escaped_query = escape_rfc3986(parsed.query)
    escaped_fragment = escape_rfc3986(parsed.fragment)
    rebuilt = parsed._replace(
        path=escaped_path,
        params=escaped_params,
        query=escaped_query,
        fragment=escaped_fragment)
    return rebuilt.geturl()
1449
# struct_pack / struct_unpack: drop-in replacements for struct.pack and
# struct.unpack that accept a text format string on every supported
# Python version. The probe call below detects whether wrappers are needed.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        # Encode a text format string before delegating to struct.pack
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        # Encode a text format string before delegating to struct.unpack
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Text format strings work out of the box
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1466
1467
def read_batch_urls(batch_fd):
    """ Read the URLs listed in a batch file, one per line.

    batch_fd is an iterable of lines (text or UTF-8 encoded bytes) and is
    closed before returning. Surrounding whitespace and a leading byte
    order mark are removed; empty lines and lines starting with '#', ';'
    or ']' are skipped. Returns the list of remaining URLs.
    """
    # A UTF-8 BOM shows up as '\ufeff' when the line was decoded as UTF-8,
    # and as '\xef\xbb\xbf' when its bytes were decoded one by one
    # (e.g. as latin-1); strip either form.
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1482
1483
def urlencode_postdata(*args, **kargs):
    """ URL-encode the arguments like urlencode() and return the result
    as ASCII bytes, ready to be used as POST data. """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1486
1487
# etree_iter(node) iterates over the elements of an ElementTree node
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1492
1493
def parse_xml(s):
    """ Parse the XML document in the text s, ignoring any doctype, and
    return its root element. """
    class DoctypeIgnoringTreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(
        target=DoctypeIgnoringTreeBuilder())
    # The parser argument is only passed on Python 2.7 and later
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is None or isinstance(node.text, compat_str):
                continue
            node.text = node.text.decode('utf-8')
    return tree
1509
1510
# Age limit associated with each US rating label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """ Turn an age limit such as '18' or '16+', or a US rating label,
    into a number of years; return None for None or unknown values. """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
1525
1526
def strip_jsonp(code):
    """ Strip a JSONP wrapper such as callback(...); from code and return
    the wrapped payload; code without such a wrapper is returned as is. """
    jsonp_wrapper_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_wrapper_re, r'\1', code)
1530
1531
def js_to_json(code):
    """ Convert a JavaScript object/array literal into JSON text.

    Single-quoted strings become double-quoted ones, bare identifiers
    (other than true, false and null) are quoted, and trailing commas
    before a closing ']' or '}' are removed.
    """
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            # Already a JSON string
            return v
        if v.startswith("'"):
            v = v[1:-1]
            # Keep escaped backslashes, unescape \' and escape bare "
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # JSON allows no trailing comma, neither in arrays nor in objects
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
    return res
1555
1556
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps a quality id to its position in
    quality_ids, or to -1 if the id is not listed.
    """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1565
1566
# %-style template with named fields: '<title>-<id>.<ext>'
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1568
1569
def limit_length(s, length):
    """ Shorten strings longer than length characters to exactly length
    characters, the last three being '...'; None is passed through """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    kept = length - len(ELLIPSES)
    return s[:kept] + ELLIPSES
1578
1579
def version_tuple(v):
    """ Split a version string on '.' and '-' into a tuple of ints. """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
1582
1583
def is_outdated_version(version, limit, assume_new=True):
    """ Tell whether version is older than limit.

    When version is empty or one of the versions cannot be parsed, the
    answer is the opposite of assume_new.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
    return outdated
1591
1592
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U

    That is the case when this module was loaded by a zipimporter or when
    sys has a 'frozen' attribute. """
    from zipimport import zipimporter

    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
1598
1599
def args_to_str(args):
    """ Get a short string representation for a subprocess command """
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
1603
1604
def urlhandle_detect_ext(url_handle):
    """ Guess a file extension from the headers of an opened URL.

    The filename of a Content-Disposition attachment header is preferred;
    otherwise the subtype of the Content-Type header is returned, without
    any media type parameters ('text/html; charset=utf-8' gives 'html').
    Returns None when neither header yields an extension.
    """
    try:
        # Probe for the attribute; handles without it use the old API
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    content_type = getheader('Content-Type')
    if not content_type:
        return None
    # Drop parameters such as '; charset=utf-8' before splitting the type
    mime_parts = content_type.split(';')[0].strip().split('/')
    return mime_parts[1] if len(mime_parts) > 1 else None
1621
1622
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set by the user, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
1631
1632
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Returns a match object (truthy) if the decoded text starts with '<'
    after optional whitespace, None otherwise. """

    # The 4-byte UTF-32 marks must be tested before the UTF-16 ones they
    # start with
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Without a byte order mark, assume UTF-8
    encoding, payload = 'utf-8', first_bytes
    for bom, bom_encoding in BOMS:
        if first_bytes.startswith(bom):
            encoding, payload = bom_encoding, first_bytes[len(bom):]
            break

    text = payload.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
1651
1652
def determine_protocol(info_dict):
    """ Return the download protocol of a format.

    An explicit 'protocol' entry wins; otherwise the protocol is derived
    from the beginning of the URL, from its extension (m3u8, f4m) or,
    finally, from its scheme.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
1673
1674
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values

    Every column except the last one is left-aligned and padded to the
    width of its longest value plus one. """
    rows = [header_row] + data
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*rows)]
    padded_columns = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded_columns) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)