_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import gzip
  14 import itertools
  15 import io
  16 import json
  17 import locale
  18 import math
  19 import os
  20 import pipes
  21 import platform
  22 import re
  23 import ssl
  24 import socket
  25 import struct
  26 import subprocess
  27 import sys
  28 import tempfile
  29 import traceback
  30 import xml.etree.ElementTree
  31 import zlib
  32
  33 from .compat import (
  34     compat_chr,
  35     compat_getenv,
  36     compat_html_entities,
  37     compat_parse_qs,
  38     compat_str,
  39     compat_urllib_error,
  40     compat_urllib_parse,
  41     compat_urllib_parse_urlparse,
  42     compat_urllib_request,
  43     compat_urlparse,
  44     compat_WINFUNCTYPE,
  45     shlex_quote,
  46 )
  47
  48
  49 # This is not clearly defined otherwise
  50 compiled_regex_type = type(re.compile(''))
  51
  52 std_headers = {
  53     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
  54     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  55     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  56     'Accept-Encoding': 'gzip, deflate',
  57     'Accept-Language': 'en-us,en;q=0.5',
  58 }
  59
  60
  61 def preferredencoding():
  62     """Get preferred encoding.
  63
  64     Returns the best encoding scheme for the system, based on
  65     locale.getpreferredencoding() and some further tweaks.
  66     """
  67     try:
  68         pref = locale.getpreferredencoding()
  69         'TEST'.encode(pref)
  70     except:
  71         pref = 'UTF-8'
  72
  73     return pref
  74
  75
  76 def write_json_file(obj, fn):
  77     """ Encode obj as JSON and write it to fn, atomically if possible """
  78
  79     fn = encodeFilename(fn)
  80     if sys.version_info < (3, 0) and sys.platform != 'win32':
  81         encoding = get_filesystem_encoding()
  82         # os.path.basename returns a bytes object, but NamedTemporaryFile
  83         # will fail if the filename contains non ascii characters unless we
  84         # use a unicode object
  85         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  86         # the same for os.path.dirname
  87         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  88     else:
  89         path_basename = os.path.basename
  90         path_dirname = os.path.dirname
  91
  92     args = {
  93         'suffix': '.tmp',
  94         'prefix': path_basename(fn) + '.',
  95         'dir': path_dirname(fn),
  96         'delete': False,
  97     }
  98
  99     # In Python 2.x, json.dump expects a bytestream.
 100     # In Python 3.x, it writes to a character stream
 101     if sys.version_info < (3, 0):
 102         args['mode'] = 'wb'
 103     else:
 104         args.update({
 105             'mode': 'w',
 106             'encoding': 'utf-8',
 107         })
 108
 109     tf = tempfile.NamedTemporaryFile(**args)
 110
 111     try:
 112         with tf:
 113             json.dump(obj, tf)
 114         if sys.platform == 'win32':
 115             # Need to remove existing file on Windows, else os.rename raises
 116             # WindowsError or FileExistsError.
 117             try:
 118                 os.unlink(fn)
 119             except OSError:
 120                 pass
 121         os.rename(tf.name, fn)
 122     except:
 123         try:
 124             os.remove(tf.name)
 125         except OSError:
 126             pass
 127         raise
 128
 129
 130 if sys.version_info >= (2, 7):
 131     def find_xpath_attr(node, xpath, key, val):
 132         """ Find the xpath xpath[@key=val] """
 133         assert re.match(r'^[a-zA-Z-]+$', key)
 134         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 135         expr = xpath + "[@%s='%s']" % (key, val)
 136         return node.find(expr)
 137 else:
 138     def find_xpath_attr(node, xpath, key, val):
 139         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 140         # .//node does not match if a node is a direct child of . !
 141         if isinstance(xpath, unicode):
 142             xpath = xpath.encode('ascii')
 143
 144         for f in node.findall(xpath):
 145             if f.attrib.get(key) == val:
 146                 return f
 147         return None
 148
 149 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 150 # the namespace parameter
 151
 152
 153 def xpath_with_ns(path, ns_map):
 154     components = [c.split(':') for c in path.split('/')]
 155     replaced = []
 156     for c in components:
 157         if len(c) == 1:
 158             replaced.append(c[0])
 159         else:
 160             ns, tag = c
 161             replaced.append('{%s}%s' % (ns_map[ns], tag))
 162     return '/'.join(replaced)
 163
 164
 165 def xpath_text(node, xpath, name=None, fatal=False):
 166     if sys.version_info < (2, 7):  # Crazy 2.6
 167         xpath = xpath.encode('ascii')
 168
 169     n = node.find(xpath)
 170     if n is None or n.text is None:
 171         if fatal:
 172             name = xpath if name is None else name
 173             raise ExtractorError('Could not find XML element %s' % name)
 174         else:
 175             return None
 176     return n.text
 177
 178
 179 def get_element_by_id(id, html):
 180     """Return the content of the tag with the specified ID in the passed HTML document"""
 181     return get_element_by_attribute("id", id, html)
 182
 183
 184 def get_element_by_attribute(attribute, value, html):
 185     """Return the content of the tag with the specified attribute in the passed HTML document"""
 186
 187     m = re.search(r'''(?xs)
 188         <([a-zA-Z0-9:._-]+)
 189          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 190          \s+%s=['"]?%s['"]?
 191          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 192         \s*>
 193         (?P<content>.*?)
 194         </\1>
 195     ''' % (re.escape(attribute), re.escape(value)), html)
 196
 197     if not m:
 198         return None
 199     res = m.group('content')
 200
 201     if res.startswith('"') or res.startswith("'"):
 202         res = res[1:-1]
 203
 204     return unescapeHTML(res)
 205
 206
 207 def clean_html(html):
 208     """Clean an HTML snippet into a readable string"""
 209     # Newline vs <br />
 210     html = html.replace('\n', ' ')
 211     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 212     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 213     # Strip html tags
 214     html = re.sub('<.*?>', '', html)
 215     # Replace html entities
 216     html = unescapeHTML(html)
 217     return html.strip()
 218
 219
 220 def sanitize_open(filename, open_mode):
 221     """Try to open the given filename, and slightly tweak it if this fails.
 222
 223     Attempts to open the given filename. If this fails, it tries to change
 224     the filename slightly, step by step, until it's either able to open it
 225     or it fails and raises a final exception, like the standard open()
 226     function.
 227
 228     It returns the tuple (stream, definitive_file_name).
 229     """
 230     try:
 231         if filename == '-':
 232             if sys.platform == 'win32':
 233                 import msvcrt
 234                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 235             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 236         stream = open(encodeFilename(filename), open_mode)
 237         return (stream, filename)
 238     except (IOError, OSError) as err:
 239         if err.errno in (errno.EACCES,):
 240             raise
 241
 242         # In case of error, try to remove win32 forbidden chars
 243         alt_filename = os.path.join(
 244             re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
 245             for path_part in os.path.split(filename)
 246         )
 247         if alt_filename == filename:
 248             raise
 249         else:
 250             # An exception here should be caught in the caller
 251             stream = open(encodeFilename(filename), open_mode)
 252             return (stream, alt_filename)
 253
 254
 255 def timeconvert(timestr):
 256     """Convert RFC 2822 defined time string into system timestamp"""
 257     timestamp = None
 258     timetuple = email.utils.parsedate_tz(timestr)
 259     if timetuple is not None:
 260         timestamp = email.utils.mktime_tz(timetuple)
 261     return timestamp
 262
 263
 264 def sanitize_filename(s, restricted=False, is_id=False):
 265     """Sanitizes a string so it could be used as part of a filename.
 266     If restricted is set, use a stricter subset of allowed characters.
 267     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 268     """
 269     def replace_insane(char):
 270         if char == '?' or ord(char) < 32 or ord(char) == 127:
 271             return ''
 272         elif char == '"':
 273             return '' if restricted else '\''
 274         elif char == ':':
 275             return '_-' if restricted else ' -'
 276         elif char in '\\/|*<>':
 277             return '_'
 278         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 279             return '_'
 280         if restricted and ord(char) > 127:
 281             return '_'
 282         return char
 283
 284     result = ''.join(map(replace_insane, s))
 285     if not is_id:
 286         while '__' in result:
 287             result = result.replace('__', '_')
 288         result = result.strip('_')
 289         # Common case of "Foreign band name - English song title"
 290         if restricted and result.startswith('-_'):
 291             result = result[2:]
 292         if not result:
 293             result = '_'
 294     return result
 295
 296
 297 def orderedSet(iterable):
 298     """ Remove all duplicates from the input iterable """
 299     res = []
 300     for el in iterable:
 301         if el not in res:
 302             res.append(el)
 303     return res
 304
 305
 306 def _htmlentity_transform(entity):
 307     """Transforms an HTML entity to a character."""
 308     # Known non-numeric HTML entity
 309     if entity in compat_html_entities.name2codepoint:
 310         return compat_chr(compat_html_entities.name2codepoint[entity])
 311
 312     mobj = re.match(r'#(x?[0-9]+)', entity)
 313     if mobj is not None:
 314         numstr = mobj.group(1)
 315         if numstr.startswith('x'):
 316             base = 16
 317             numstr = '0%s' % numstr
 318         else:
 319             base = 10
 320         return compat_chr(int(numstr, base))
 321
 322     # Unknown entity in name, return its literal representation
 323     return ('&%s;' % entity)
 324
 325
 326 def unescapeHTML(s):
 327     if s is None:
 328         return None
 329     assert type(s) == compat_str
 330
 331     return re.sub(
 332         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 333
 334
 335 def encodeFilename(s, for_subprocess=False):
 336     """
 337     @param s The name of the file
 338     """
 339
 340     assert type(s) == compat_str
 341
 342     # Python 3 has a Unicode API
 343     if sys.version_info >= (3, 0):
 344         return s
 345
 346     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 347         # Pass '' directly to use Unicode APIs on Windows 2000 and up
 348         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 349         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 350         if not for_subprocess:
 351             return s
 352         else:
 353             # For subprocess calls, encode with locale encoding
 354             # Refer to http://stackoverflow.com/a/9951851/35070
 355             encoding = preferredencoding()
 356     else:
 357         encoding = sys.getfilesystemencoding()
 358     if encoding is None:
 359         encoding = 'utf-8'
 360     return s.encode(encoding, 'ignore')
 361
 362
 363 def encodeArgument(s):
 364     if not isinstance(s, compat_str):
 365         # Legacy code that uses byte strings
 366         # Uncomment the following line after fixing all post processors
 367         #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 368         s = s.decode('ascii')
 369     return encodeFilename(s, True)
 370
 371
 372 def decodeOption(optval):
 373     if optval is None:
 374         return optval
 375     if isinstance(optval, bytes):
 376         optval = optval.decode(preferredencoding())
 377
 378     assert isinstance(optval, compat_str)
 379     return optval
 380
 381
 382 def formatSeconds(secs):
 383     if secs > 3600:
 384         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 385     elif secs > 60:
 386         return '%d:%02d' % (secs // 60, secs % 60)
 387     else:
 388         return '%d' % secs
 389
 390
 391 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 392     if sys.version_info < (3, 2):
 393         import httplib
 394
 395         class HTTPSConnectionV3(httplib.HTTPSConnection):
 396             def __init__(self, *args, **kwargs):
 397                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 398
 399             def connect(self):
 400                 sock = socket.create_connection((self.host, self.port), self.timeout)
 401                 if getattr(self, '_tunnel_host', False):
 402                     self.sock = sock
 403                     self._tunnel()
 404                 try:
 405                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
 406                 except ssl.SSLError:
 407                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 408
 409         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 410             def https_open(self, req):
 411                 return self.do_open(HTTPSConnectionV3, req)
 412         return HTTPSHandlerV3(**kwargs)
 413     elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
 414         context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
 415         context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
 416         if opts_no_check_certificate:
 417             context.verify_mode = ssl.CERT_NONE
 418         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 419     else:  # Python < 3.4
 420         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 421         context.verify_mode = (ssl.CERT_NONE
 422                                if opts_no_check_certificate
 423                                else ssl.CERT_REQUIRED)
 424         context.set_default_verify_paths()
 425         try:
 426             context.load_default_certs()
 427         except AttributeError:
 428             pass  # Python < 3.4
 429         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 430
 431
 432 class ExtractorError(Exception):
 433     """Error during info extraction."""
 434
 435     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 436         """ tb, if given, is the original traceback (so that it can be printed out).
 437         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 438         """
 439
 440         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 441             expected = True
 442         if video_id is not None:
 443             msg = video_id + ': ' + msg
 444         if cause:
 445             msg += ' (caused by %r)' % cause
 446         if not expected:
 447             if ytdl_is_updateable():
 448                 update_cmd = 'type  youtube-dl -U  to update'
 449             else:
 450                 update_cmd = 'see  https://yt-dl.org/update  on how to update'
 451             msg += '; please report this issue on https://yt-dl.org/bug .'
 452             msg += ' Make sure you are using the latest version; %s.' % update_cmd
 453             msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 454         super(ExtractorError, self).__init__(msg)
 455
 456         self.traceback = tb
 457         self.exc_info = sys.exc_info()  # preserve original exception
 458         self.cause = cause
 459         self.video_id = video_id
 460
 461     def format_traceback(self):
 462         if self.traceback is None:
 463             return None
 464         return ''.join(traceback.format_tb(self.traceback))
 465
 466
 467 class RegexNotFoundError(ExtractorError):
 468     """Error when a regex didn't match"""
 469     pass
 470
 471
 472 class DownloadError(Exception):
 473     """Download Error exception.
 474
 475     This exception may be thrown by FileDownloader objects if they are not
 476     configured to continue on errors. They will contain the appropriate
 477     error message.
 478     """
 479
 480     def __init__(self, msg, exc_info=None):
 481         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 482         super(DownloadError, self).__init__(msg)
 483         self.exc_info = exc_info
 484
 485
 486 class SameFileError(Exception):
 487     """Same File exception.
 488
 489     This exception will be thrown by FileDownloader objects if they detect
 490     multiple files would have to be downloaded to the same file on disk.
 491     """
 492     pass
 493
 494
 495 class PostProcessingError(Exception):
 496     """Post Processing exception.
 497
 498     This exception may be raised by PostProcessor's .run() method to
 499     indicate an error in the postprocessing task.
 500     """
 501
 502     def __init__(self, msg):
 503         self.msg = msg
 504
 505
 506 class MaxDownloadsReached(Exception):
 507     """ --max-downloads limit has been reached. """
 508     pass
 509
 510
 511 class UnavailableVideoError(Exception):
 512     """Unavailable Format exception.
 513
 514     This exception will be thrown when a video is requested
 515     in a format that is not available for that video.
 516     """
 517     pass
 518
 519
 520 class ContentTooShortError(Exception):
 521     """Content Too Short exception.
 522
 523     This exception may be raised by FileDownloader objects when a file they
 524     download is too small for what the server announced first, indicating
 525     the connection was probably interrupted.
 526     """
 527     # Both in bytes
 528     downloaded = None
 529     expected = None
 530
 531     def __init__(self, downloaded, expected):
 532         self.downloaded = downloaded
 533         self.expected = expected
 534
 535
 536 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 537     """Handler for HTTP requests and responses.
 538
 539     This class, when installed with an OpenerDirector, automatically adds
 540     the standard headers to every HTTP request and handles gzipped and
 541     deflated responses from web servers. If compression is to be avoided in
 542     a particular request, the original request in the program code only has
 543     to include the HTTP header "Youtubedl-No-Compression", which will be
 544     removed before making the real request.
 545
 546     Part of this code was copied from:
 547
 548     http://techknack.net/python-urllib2-handlers/
 549
 550     Andrew Rowls, the author of that code, agreed to release it to the
 551     public domain.
 552     """
 553
 554     @staticmethod
 555     def deflate(data):
 556         try:
 557             return zlib.decompress(data, -zlib.MAX_WBITS)
 558         except zlib.error:
 559             return zlib.decompress(data)
 560
 561     @staticmethod
 562     def addinfourl_wrapper(stream, headers, url, code):
 563         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 564             return compat_urllib_request.addinfourl(stream, headers, url, code)
 565         ret = compat_urllib_request.addinfourl(stream, headers, url)
 566         ret.code = code
 567         return ret
 568
 569     def http_request(self, req):
 570         for h, v in std_headers.items():
 571             if h not in req.headers:
 572                 req.add_header(h, v)
 573         if 'Youtubedl-no-compression' in req.headers:
 574             if 'Accept-encoding' in req.headers:
 575                 del req.headers['Accept-encoding']
 576             del req.headers['Youtubedl-no-compression']
 577         if 'Youtubedl-user-agent' in req.headers:
 578             if 'User-agent' in req.headers:
 579                 del req.headers['User-agent']
 580             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 581             del req.headers['Youtubedl-user-agent']
 582
 583         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 584             # Python 2.6 is brain-dead when it comes to fragments
 585             req._Request__original = req._Request__original.partition('#')[0]
 586             req._Request__r_type = req._Request__r_type.partition('#')[0]
 587
 588         return req
 589
 590     def http_response(self, req, resp):
 591         old_resp = resp
 592         # gzip
 593         if resp.headers.get('Content-encoding', '') == 'gzip':
 594             content = resp.read()
 595             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 596             try:
 597                 uncompressed = io.BytesIO(gz.read())
 598             except IOError as original_ioerror:
 599                 # There may be junk add the end of the file
 600                 # See http://stackoverflow.com/q/4928560/35070 for details
 601                 for i in range(1, 1024):
 602                     try:
 603                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 604                         uncompressed = io.BytesIO(gz.read())
 605                     except IOError:
 606                         continue
 607                     break
 608                 else:
 609                     raise original_ioerror
 610             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 611             resp.msg = old_resp.msg
 612         # deflate
 613         if resp.headers.get('Content-encoding', '') == 'deflate':
 614             gz = io.BytesIO(self.deflate(resp.read()))
 615             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 616             resp.msg = old_resp.msg
 617         return resp
 618
 619     https_request = http_request
 620     https_response = http_response
 621
 622
 623 def parse_iso8601(date_str, delimiter='T'):
 624     """ Return a UNIX timestamp from the given date """
 625
 626     if date_str is None:
 627         return None
 628
 629     m = re.search(
 630         r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 631         date_str)
 632     if not m:
 633         timezone = datetime.timedelta()
 634     else:
 635         date_str = date_str[:-len(m.group(0))]
 636         if not m.group('sign'):
 637             timezone = datetime.timedelta()
 638         else:
 639             sign = 1 if m.group('sign') == '+' else -1
 640             timezone = datetime.timedelta(
 641                 hours=sign * int(m.group('hours')),
 642                 minutes=sign * int(m.group('minutes')))
 643     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 644     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 645     return calendar.timegm(dt.timetuple())
 646
 647
 648 def unified_strdate(date_str, day_first=True):
 649     """Return a string with the date in the format YYYYMMDD"""
 650
 651     if date_str is None:
 652         return None
 653     upload_date = None
 654     # Replace commas
 655     date_str = date_str.replace(',', ' ')
 656     # %z (UTC offset) is only supported in python>=3.2
 657     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 658     # Remove AM/PM + timezone
 659     date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
 660
 661     format_expressions = [
 662         '%d %B %Y',
 663         '%d %b %Y',
 664         '%B %d %Y',
 665         '%b %d %Y',
 666         '%b %dst %Y %I:%M%p',
 667         '%b %dnd %Y %I:%M%p',
 668         '%b %dth %Y %I:%M%p',
 669         '%Y-%m-%d',
 670         '%Y/%m/%d',
 671         '%d.%m.%Y',
 672         '%d/%m/%Y',
 673         '%d/%m/%y',
 674         '%Y/%m/%d %H:%M:%S',
 675         '%Y-%m-%d %H:%M:%S',
 676         '%Y-%m-%d %H:%M:%S.%f',
 677         '%d.%m.%Y %H:%M',
 678         '%d.%m.%Y %H.%M',
 679         '%Y-%m-%dT%H:%M:%SZ',
 680         '%Y-%m-%dT%H:%M:%S.%fZ',
 681         '%Y-%m-%dT%H:%M:%S.%f0Z',
 682         '%Y-%m-%dT%H:%M:%S',
 683         '%Y-%m-%dT%H:%M:%S.%f',
 684         '%Y-%m-%dT%H:%M',
 685     ]
 686     if day_first:
 687         format_expressions.extend([
 688             '%d/%m/%Y %H:%M:%S',
 689         ])
 690     else:
 691         format_expressions.extend([
 692             '%m/%d/%Y %H:%M:%S',
 693         ])
 694     for expression in format_expressions:
 695         try:
 696             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 697         except ValueError:
 698             pass
 699     if upload_date is None:
 700         timetuple = email.utils.parsedate_tz(date_str)
 701         if timetuple:
 702             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 703     return upload_date
 704
 705
 706 def determine_ext(url, default_ext='unknown_video'):
 707     if url is None:
 708         return default_ext
 709     guess = url.partition('?')[0].rpartition('.')[2]
 710     if re.match(r'^[A-Za-z0-9]+$', guess):
 711         return guess
 712     else:
 713         return default_ext
 714
 715
 716 def subtitles_filename(filename, sub_lang, sub_format):
 717     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 718
 719
 720 def date_from_str(date_str):
 721     """
 722     Return a datetime object from a string in the format YYYYMMDD or
 723     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 724     today = datetime.date.today()
 725     if date_str in ('now', 'today'):
 726         return today
 727     if date_str == 'yesterday':
 728         return today - datetime.timedelta(days=1)
 729     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 730     if match is not None:
 731         sign = match.group('sign')
 732         time = int(match.group('time'))
 733         if sign == '-':
 734             time = -time
 735         unit = match.group('unit')
 736         # A bad aproximation?
 737         if unit == 'month':
 738             unit = 'day'
 739             time *= 30
 740         elif unit == 'year':
 741             unit = 'day'
 742             time *= 365
 743         unit += 's'
 744         delta = datetime.timedelta(**{unit: time})
 745         return today + delta
 746     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 747
 748
 749 def hyphenate_date(date_str):
 750     """
 751     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 752     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 753     if match is not None:
 754         return '-'.join(match.groups())
 755     else:
 756         return date_str
 757
 758
 759 class DateRange(object):
 760     """Represents a time interval between two dates"""
 761
 762     def __init__(self, start=None, end=None):
 763         """start and end must be strings in the format accepted by date"""
 764         if start is not None:
 765             self.start = date_from_str(start)
 766         else:
 767             self.start = datetime.datetime.min.date()
 768         if end is not None:
 769             self.end = date_from_str(end)
 770         else:
 771             self.end = datetime.datetime.max.date()
 772         if self.start > self.end:
 773             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 774
 775     @classmethod
 776     def day(cls, day):
 777         """Returns a range that only contains the given day"""
 778         return cls(day, day)
 779
 780     def __contains__(self, date):
 781         """Check if the date is in the range"""
 782         if not isinstance(date, datetime.date):
 783             date = date_from_str(date)
 784         return self.start <= date <= self.end
 785
 786     def __str__(self):
 787         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 788
 789
 790 def platform_name():
 791     """ Returns the platform name as a compat_str """
 792     res = platform.platform()
 793     if isinstance(res, bytes):
 794         res = res.decode(preferredencoding())
 795
 796     assert isinstance(res, compat_str)
 797     return res
 798
 799
 800 def _windows_write_string(s, out):
 801     """ Returns True if the string was written using special methods,
 802     False if it has yet to be written out."""
 803     # Adapted from http://stackoverflow.com/a/3259271/35070
 804
 805     import ctypes
 806     import ctypes.wintypes
 807
 808     WIN_OUTPUT_IDS = {
 809         1: -11,
 810         2: -12,
 811     }
 812
 813     try:
 814         fileno = out.fileno()
 815     except AttributeError:
 816         # If the output stream doesn't have a fileno, it's virtual
 817         return False
 818     if fileno not in WIN_OUTPUT_IDS:
 819         return False
 820
 821     GetStdHandle = compat_WINFUNCTYPE(
 822         ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
 823         ("GetStdHandle", ctypes.windll.kernel32))
 824     h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
 825
 826     WriteConsoleW = compat_WINFUNCTYPE(
 827         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
 828         ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
 829         ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
 830     written = ctypes.wintypes.DWORD(0)
 831
 832     GetFileType = compat_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
 833     FILE_TYPE_CHAR = 0x0002
 834     FILE_TYPE_REMOTE = 0x8000
 835     GetConsoleMode = compat_WINFUNCTYPE(
 836         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
 837         ctypes.POINTER(ctypes.wintypes.DWORD))(
 838         ("GetConsoleMode", ctypes.windll.kernel32))
 839     INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
 840
 841     def not_a_console(handle):
 842         if handle == INVALID_HANDLE_VALUE or handle is None:
 843             return True
 844         return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
 845                 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
 846
 847     if not_a_console(h):
 848         return False
 849
 850     def next_nonbmp_pos(s):
 851         try:
 852             return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
 853         except StopIteration:
 854             return len(s)
 855
 856     while s:
 857         count = min(next_nonbmp_pos(s), 1024)
 858
 859         ret = WriteConsoleW(
 860             h, s, count if count else 2, ctypes.byref(written), None)
 861         if ret == 0:
 862             raise OSError('Failed to write string')
 863         if not count:  # We just wrote a non-BMP character
 864             assert written.value == 2
 865             s = s[1:]
 866         else:
 867             assert written.value > 0
 868             s = s[written.value:]
 869     return True
 870
 871
 872 def write_string(s, out=None, encoding=None):
 873     if out is None:
 874         out = sys.stderr
 875     assert type(s) == compat_str
 876
 877     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 878         if _windows_write_string(s, out):
 879             return
 880
 881     if ('b' in getattr(out, 'mode', '') or
 882             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 883         byt = s.encode(encoding or preferredencoding(), 'ignore')
 884         out.write(byt)
 885     elif hasattr(out, 'buffer'):
 886         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 887         byt = s.encode(enc, 'ignore')
 888         out.buffer.write(byt)
 889     else:
 890         out.write(s)
 891     out.flush()
 892
 893
 894 def bytes_to_intlist(bs):
 895     if not bs:
 896         return []
 897     if isinstance(bs[0], int):  # Python 3
 898         return list(bs)
 899     else:
 900         return [ord(c) for c in bs]
 901
 902
 903 def intlist_to_bytes(xs):
 904     if not xs:
 905         return b''
 906     return struct_pack('%dB' % len(xs), *xs)
 907
 908
# Cross-platform file locking
#
# Both branches define the same pair of helpers:
#   _lock_file(f, exclusive) -- lock the open file object f
#   _unlock_file(f)          -- release a lock taken by _lock_file
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        """ctypes layout of the structure passed as the last argument of
        LockFileEx / UnlockFileEx (see their argtypes below)."""
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    # Declare the prototypes explicitly so ctypes converts the arguments
    # and the BOOL result correctly.
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high DWORDs of the byte count handed to both calls; together
    # with the zero offset set in _lock_file they describe the locked range.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file object f through LockFileEx.

        The OVERLAPPED pointer is stored on f so that _unlock_file can pass
        the very same structure back. Raises OSError when LockFileEx
        reports failure.
        """
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags is 0x2 for an exclusive lock and 0x0 otherwise
        # NOTE(review): 0x2 is presumably LOCKFILE_EXCLUSIVE_LOCK - confirm
        # against the LockFileEx documentation.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file on f.

        Requires the OVERLAPPED pointer that _lock_file stored on f.
        Raises OSError when UnlockFileEx reports failure.
        """
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock f with fcntl.flock: LOCK_EX when exclusive, else LOCK_SH.

        No LOCK_NB flag is passed.
        """
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
 972
 973
 974 class locked_file(object):
 975     def __init__(self, filename, mode, encoding=None):
 976         assert mode in ['r', 'a', 'w']
 977         self.f = io.open(filename, mode, encoding=encoding)
 978         self.mode = mode
 979
 980     def __enter__(self):
 981         exclusive = self.mode != 'r'
 982         try:
 983             _lock_file(self.f, exclusive)
 984         except IOError:
 985             self.f.close()
 986             raise
 987         return self
 988
 989     def __exit__(self, etype, value, traceback):
 990         try:
 991             _unlock_file(self.f)
 992         finally:
 993             self.f.close()
 994
 995     def __iter__(self):
 996         return iter(self.f)
 997
 998     def write(self, *args):
 999         return self.f.write(*args)
1000
1001     def read(self, *args):
1002         return self.f.read(*args)
1003
1004
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1008
1009
def shell_quote(args):
    """Return args joined into one shell-escaped command line string.

    Byte strings (e.g. filenames produced by 'encodeFilename') are decoded
    with the filesystem encoding before quoting.

    Quoting goes through shlex_quote, the helper this module already imports
    and uses in args_to_str, instead of the undocumented pipes.quote.
    """
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
1019
1020
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the element that stops the
        iteration (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1028
1029
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialised to JSON, URL-encoded under the key
    '__youtubedl_smuggle' and appended to url after a '#'.
    """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
1036
1037
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    When smug_url carries no '#__youtubedl_smuggle' marker it is returned
    unchanged together with default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        plain_url, _sep, fragment = smug_url.rpartition('#')
        encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return plain_url, json.loads(encoded)
    return smug_url, default
1045
1046
def format_bytes(bytes):
    """Format a byte count as a human-readable string with a binary suffix.

    bytes may be None (returns 'N/A'), a number, or a string holding a
    number. The result has two decimals followed by one of B, KiB, ... YiB,
    e.g. format_bytes(1536) == '1.50KiB'.

    The unit exponent is clamped to the available suffixes, so values below
    one byte are reported in 'B' and values beyond the YiB range are
    reported in 'YiB' instead of picking a wrong suffix or raising
    IndexError. Negative numbers still raise ValueError (from math.log).
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # A negative exponent would index the suffix list from the end and
        # a too large one would run past it.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1059
1060
def parse_filesize(s):
    """Parse a size such as '5 MiB' or '1,5kB' into a number of bytes.

    The string must start with a decimal number ('.' or ',' as decimal
    separator), optionally followed by whitespace, then a unit. Returns an
    int, or None when s is None or does not match.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        _UNIT_TABLE[prefix + 'iB'] = binary
        _UNIT_TABLE[prefix + 'B'] = decimal
        _UNIT_TABLE[prefix.lower() + 'B'] = binary
        _UNIT_TABLE[prefix + 'b'] = decimal

    units_re = '|'.join(re.escape(unit) for unit in _UNIT_TABLE)
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re
    m = re.match(pattern, s)
    if m is None:
        return None

    number = float(m.group('num').replace(',', '.'))
    return int(number * _UNIT_TABLE[m.group('unit')])
1113
1114
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    A non-empty COLUMNS environment variable takes precedence; otherwise
    the second field printed by `stty size` is used. Any ordinary failure
    of the stty probe (missing binary, unexpected output, ...) results in
    None.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only, but unlike a bare except this lets
        # KeyboardInterrupt and SystemExit propagate.
        pass
    return None
1129
1130
def month_by_name(name):
    """ Return the number (1-12) of a month by its (locale-independent)
    English name, or None if name is not an exact English month name """

    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
1141
1142
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity with '&amp;'.

    Left alone are the five predefined XML entities and numeric character
    references of up to four (hex) digits.
    """
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1149
1150
def setproctitle(title):
    """Try to set the process name to title via prctl() from libc.so.6.

    Does nothing when libc.so.6 cannot be loaded or exposes no prctl
    symbol. title must be a text string; it is passed on UTF-8 encoded.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes object makes ctypes allocate
    # one extra item for the terminating NUL. A buffer of exactly
    # len(title_bytes) items has no terminator, so the C side could read
    # past its end.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # NOTE(review): 15 is PR_SET_NAME in <linux/prctl.h> - confirm.
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1164
1165
def remove_start(s, start):
    """Return s without the prefix start; s unchanged if it has no such prefix."""
    return s[len(start):] if s.startswith(start) else s
1170
1171
def remove_end(s, end):
    """Return s without the suffix end; s unchanged if it has no such suffix.

    An empty end leaves s untouched. It needs the explicit check because
    every string ends with '' and s[:-0] would be the empty string.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1176
1177
def url_basename(url):
    """Return the last path segment of url, ignoring leading/trailing slashes."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1181
1182
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose get_method() always reports "HEAD"."""

    def get_method(self):
        """Return the HTTP method name for this request: always "HEAD"."""
        return "HEAD"
1186
1187
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, scaled as int(v) * invscale // scale.

    Returns default when v is None or the empty string. If get_attr is
    given and v is not None, the attribute of that name is read from v
    first (a missing attribute counts as None).
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1195
1196
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1199
1200
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are removed before the conversion; None is passed through """
    if int_str is not None:
        digits = re.sub(r'[,\.\+]', '', int_str)
        return int(digits)
    return None
1207
1208
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1211
1212
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Accepted forms include '45', '1:30', '1:02:03', '3.5', '1h2m3s',
    '10 seconds', 'N min(s)/minute(s)' and 'N hour(s)', optionally prefixed
    with 'T' (case-insensitive). Returns None for None or unparsable input;
    the result is a float when a fractional part or one of the minutes-only
    / hours-only forms is used, otherwise an int.
    """
    if s is None:
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)T?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None

    # The minutes-only and hours-only forms may carry decimals
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    total = 0
    for field, seconds_per_unit in (
            ('secs', 1), ('mins', 60), ('hours', 60 * 60)):
        value = m.group(field)
        if value:
            total += int(value) * seconds_per_unit
    fraction = m.group('ms')
    if fraction:
        total += float(fraction)
    return total
1247
1248
def prepend_extension(filename, ext):
    """Insert ext in front of filename's real extension.

    For example ('video.mp4', 'temp') gives 'video.temp.mp4'; a filename
    without extension simply gets '.' + ext appended.
    """
    parts = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(parts[0], ext, parts[1])
1252
1253
def check_executable(exe, args=[]):
    """ Checks if the given binary can be started and returns its name, or
    False if launching it raises OSError.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    else:
        return exe
1262
1263
def get_exe_version(exe, args=['--version'],
                    version_re=r'version\s+([-0-9._a-zA-Z]+)',
                    unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.

    The executable is run with args and version_re is searched in the first
    line of its combined stdout/stderr; group 1 of the match is the
    version. If the pattern is not found, unrecognized is returned.

    The hyphen in the default pattern's character class is placed first so
    that it is a literal. Written as '_-a' it forms a range and the class
    no longer covers most lowercase letters, which truncates versions such
    as '2.1.4-git20150101'.
    """
    try:
        out, err = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
    m = re.search(version_re, firstline)
    if m:
        return m.group(1)
    else:
        return unrecognized
1281
1282
class PagedList(object):
    """Base class for paged lists; subclasses must provide getslice()."""

    def __len__(self):
        # This is only useful for tests: it materialises the whole list
        everything = self.getslice()
        return len(everything)
1287
1288
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages lazily, one pagefunc call per page."""

    def __init__(self, pagefunc, pagesize):
        """pagefunc(pagenum) must return an iterable with the entries of
        that page (0-based page number); pagesize is the number of entries
        on a full page."""
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list.

        Pages are requested in order, beginning with the one containing
        start, until end is reached or a page comes back shorter than
        pagesize. end=None means "up to the last entry".
        """
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # Global index of the first entry on this page / on the next one
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of start within this page (0 on every later page)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Exclusive in-page end offset when end falls on this page,
            # None when the page is needed up to its last entry.
            # NOTE(review): for end == firstid (e.g. end == 0) this
            # evaluates to pagesize, i.e. the whole page is kept - confirm
            # this is intended.
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if the current page is not "full",
            # i.e. does not contain pagesize entries, then we can assume
            # that this page is the last one - there are no more ids on
            # further pages - so there is no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1330
1331
class InAdvancePagedList(PagedList):
    """Paged list for sources whose number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        """pagefunc(pagenum) must return an iterable with the entries of
        that page (0-based page number); pagecount is the total number of
        pages and pagesize the number of entries per page."""
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list.

        end=None means "up to the last page". Pages at or beyond pagecount
        are never requested, even when end lies past the last entry.
        """
        res = []
        start_page = start // self._pagesize
        # Clamp to the known page count so that an end past the last entry
        # does not make us call pagefunc for pages that do not exist.
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        # Entries to drop from the front of the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Number of entries still wanted (None: no upper limit)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1359
1360
def uppercase_escape(s):
    """Replace every literal \\UXXXXXXXX escape (8 hex digits) in s by the
    character it denotes; everything else is left untouched."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        decoded, _consumed = decode_escape(match.group(0))
        return decoded

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1367
1368
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 unicode input is UTF-8 encoded before quoting
    is_py2_text = sys.version_info < (3, 0) and isinstance(s, unicode)
    if is_py2_text:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1374
1375
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Only the path, params, query and fragment components are escaped; the
    scheme and network location are kept as parsed.
    """
    parts = compat_urllib_parse_urlparse(url)
    safe_path = escape_rfc3986(parts.path)
    safe_params = escape_rfc3986(parts.params)
    safe_query = escape_rfc3986(parts.query)
    safe_fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=safe_path, params=safe_params,
        query=safe_query, fragment=safe_fragment)
    return escaped.geturl()
1385
# struct_pack / struct_unpack: versions of struct.pack / struct.unpack that
# accept a text format string on every supported interpreter.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """struct.pack() taking a text or bytes format string."""
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        """struct.unpack() taking a text or bytes format string."""
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
1402
1403
def read_batch_urls(batch_fd):
    """Read the URLs from the file-like object batch_fd, one per line.

    Byte lines are decoded as UTF-8, a leading byte order mark is removed,
    surrounding whitespace is stripped, and empty lines as well as lines
    starting with '#', ';' or ']' are dropped. batch_fd is closed before
    the list is returned.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM shows up as '\ufeff' when the line was decoded as
        # UTF-8, and as '\xef\xbb\xbf' when each of its three bytes was
        # decoded as a separate character; strip both forms.
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1418
1419
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode() and return ASCII bytes,
    ready to be used as POST data."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1422
1423
# etree_iter(node): iterate over an element tree; uses Element.iter where it
# exists and falls back to findall('.//*') otherwise.
etree_iter = getattr(xml.etree.ElementTree.Element, 'iter', None)
if etree_iter is None:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
1428
1429
def parse_xml(s):
    """Parse the XML document in the text string s and return its root
    element. Doctype declarations are ignored."""
    class _DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=_DoctypeIgnoringBuilder())
    xml_kwargs = {}
    if sys.version_info >= (2, 7):
        xml_kwargs['parser'] = parser
    root = xml.etree.ElementTree.XML(s.encode('utf-8'), **xml_kwargs)
    # Fix up XML parser in Python 2.x: make every text node a text string
    if sys.version_info < (3, 0):
        for node in etree_iter(root):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return root
1445
1446
# Maps US content-rating labels to a numeric age limit; parse_age_limit
# falls back to this table for strings that are not of the form "NN" / "NN+".
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1454
1455
def parse_age_limit(s):
    """Parse an age limit string into an int.

    Accepts one or two digits with an optional trailing '+' (e.g. '18+');
    anything else is looked up in US_RATINGS. Returns None for None input
    and for unknown strings.
    """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
1461
1462
def strip_jsonp(code):
    """Strip a JSONP wrapper 'callback(...);' (optionally followed by //
    comments) from code and return the payload between the parentheses.
    Input that does not look like JSONP is returned unchanged."""
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_wrapper, r'\1', code)
1466
1467
def js_to_json(code):
    """Convert a JavaScript object literal into a JSON string.

    Bare identifiers and single-quoted strings are turned into
    double-quoted strings (true, false and null are kept), and a trailing
    comma before a closing ']' is removed.
    """
    # How escapes inside a single-quoted string map to a double-quoted one
    requote = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"", lambda esc: requote[esc.group(0)], token[1:-1])
        return '"%s"' % token

    quoted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), quoted)
1491
1492
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its position in quality_ids,
    or to -1 if it is not listed.
    """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
1501
1502
# %-style mapping template with the keys title, id and ext.
# NOTE(review): presumably the default output filename template - confirm
# against its users.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1504
1505
def limit_length(s, length):
    """ Shorten s to at most length characters, ending it in '...' when it
    had to be cut; None is passed through """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    return s[:length - len(ELLIPSES)] + ELLIPSES
1514
1515
def version_tuple(v):
    """Split the version string v at '.' and '-' into a tuple of ints."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
1518
1519
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is older than limit.

    When version is empty or either version string cannot be parsed, the
    answer is 'not assume_new'.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current = version_tuple(version)
        minimum = version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum
1527
1528
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U

    That is the case when this module was loaded by a zipimporter or when
    sys has a 'frozen' attribute.
    """
    from zipimport import zipimporter

    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
1534
1535
def args_to_str(args):
    """Get a short string representation for a subprocess command: every
    argument shell-quoted, joined by single spaces."""
    return ' '.join(map(shlex_quote, args))