_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import gzip
  14 import itertools
  15 import io
  16 import json
  17 import locale
  18 import math
  19 import os
  20 import pipes
  21 import platform
  22 import re
  23 import ssl
  24 import socket
  25 import struct
  26 import subprocess
  27 import sys
  28 import tempfile
  29 import traceback
  30 import xml.etree.ElementTree
  31 import zlib
  32
  33 from .compat import (
  34     compat_chr,
  35     compat_getenv,
  36     compat_html_entities,
  37     compat_parse_qs,
  38     compat_str,
  39     compat_urllib_error,
  40     compat_urllib_parse,
  41     compat_urllib_parse_urlparse,
  42     compat_urllib_request,
  43     compat_urlparse,
  44     shlex_quote,
  45 )
  46
  47
# This is not clearly defined otherwise
# (derive the type of compiled regular expressions from an actual compiled
# pattern so that it can be used in isinstance() checks)
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers. YoutubeDLHandler.http_request adds each of
# these to an outgoing request that does not already carry that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
  58
  59
  60 def preferredencoding():
  61     """Get preferred encoding.
  62
  63     Returns the best encoding scheme for the system, based on
  64     locale.getpreferredencoding() and some further tweaks.
  65     """
  66     try:
  67         pref = locale.getpreferredencoding()
  68         'TEST'.encode(pref)
  69     except:
  70         pref = 'UTF-8'
  71
  72     return pref
  73
  74
  75 def write_json_file(obj, fn):
  76     """ Encode obj as JSON and write it to fn, atomically if possible """
  77
  78     fn = encodeFilename(fn)
  79     if sys.version_info < (3, 0) and sys.platform != 'win32':
  80         encoding = get_filesystem_encoding()
  81         # os.path.basename returns a bytes object, but NamedTemporaryFile
  82         # will fail if the filename contains non ascii characters unless we
  83         # use a unicode object
  84         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  85         # the same for os.path.dirname
  86         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  87     else:
  88         path_basename = os.path.basename
  89         path_dirname = os.path.dirname
  90
  91     args = {
  92         'suffix': '.tmp',
  93         'prefix': path_basename(fn) + '.',
  94         'dir': path_dirname(fn),
  95         'delete': False,
  96     }
  97
  98     # In Python 2.x, json.dump expects a bytestream.
  99     # In Python 3.x, it writes to a character stream
 100     if sys.version_info < (3, 0):
 101         args['mode'] = 'wb'
 102     else:
 103         args.update({
 104             'mode': 'w',
 105             'encoding': 'utf-8',
 106         })
 107
 108     tf = tempfile.NamedTemporaryFile(**args)
 109
 110     try:
 111         with tf:
 112             json.dump(obj, tf)
 113         if sys.platform == 'win32':
 114             # Need to remove existing file on Windows, else os.rename raises
 115             # WindowsError or FileExistsError.
 116             try:
 117                 os.unlink(fn)
 118             except OSError:
 119                 pass
 120         os.rename(tf.name, fn)
 121     except:
 122         try:
 123             os.remove(tf.name)
 124         except OSError:
 125             pass
 126         raise
 127
 128
 129 if sys.version_info >= (2, 7):
 130     def find_xpath_attr(node, xpath, key, val):
 131         """ Find the xpath xpath[@key=val] """
 132         assert re.match(r'^[a-zA-Z-]+$', key)
 133         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 134         expr = xpath + "[@%s='%s']" % (key, val)
 135         return node.find(expr)
 136 else:
 137     def find_xpath_attr(node, xpath, key, val):
 138         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 139         # .//node does not match if a node is a direct child of . !
 140         if isinstance(xpath, unicode):
 141             xpath = xpath.encode('ascii')
 142
 143         for f in node.findall(xpath):
 144             if f.attrib.get(key) == val:
 145                 return f
 146         return None
 147
 148 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 149 # the namespace parameter
 150
 151
 152 def xpath_with_ns(path, ns_map):
 153     components = [c.split(':') for c in path.split('/')]
 154     replaced = []
 155     for c in components:
 156         if len(c) == 1:
 157             replaced.append(c[0])
 158         else:
 159             ns, tag = c
 160             replaced.append('{%s}%s' % (ns_map[ns], tag))
 161     return '/'.join(replaced)
 162
 163
 164 def xpath_text(node, xpath, name=None, fatal=False):
 165     if sys.version_info < (2, 7):  # Crazy 2.6
 166         xpath = xpath.encode('ascii')
 167
 168     n = node.find(xpath)
 169     if n is None or n.text is None:
 170         if fatal:
 171             name = xpath if name is None else name
 172             raise ExtractorError('Could not find XML element %s' % name)
 173         else:
 174             return None
 175     return n.text
 176
 177
 178 def get_element_by_id(id, html):
 179     """Return the content of the tag with the specified ID in the passed HTML document"""
 180     return get_element_by_attribute("id", id, html)
 181
 182
 183 def get_element_by_attribute(attribute, value, html):
 184     """Return the content of the tag with the specified attribute in the passed HTML document"""
 185
 186     m = re.search(r'''(?xs)
 187         <([a-zA-Z0-9:._-]+)
 188          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 189          \s+%s=['"]?%s['"]?
 190          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 191         \s*>
 192         (?P<content>.*?)
 193         </\1>
 194     ''' % (re.escape(attribute), re.escape(value)), html)
 195
 196     if not m:
 197         return None
 198     res = m.group('content')
 199
 200     if res.startswith('"') or res.startswith("'"):
 201         res = res[1:-1]
 202
 203     return unescapeHTML(res)
 204
 205
 206 def clean_html(html):
 207     """Clean an HTML snippet into a readable string"""
 208     # Newline vs <br />
 209     html = html.replace('\n', ' ')
 210     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 211     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 212     # Strip html tags
 213     html = re.sub('<.*?>', '', html)
 214     # Replace html entities
 215     html = unescapeHTML(html)
 216     return html.strip()
 217
 218
 219 def sanitize_open(filename, open_mode):
 220     """Try to open the given filename, and slightly tweak it if this fails.
 221
 222     Attempts to open the given filename. If this fails, it tries to change
 223     the filename slightly, step by step, until it's either able to open it
 224     or it fails and raises a final exception, like the standard open()
 225     function.
 226
 227     It returns the tuple (stream, definitive_file_name).
 228     """
 229     try:
 230         if filename == '-':
 231             if sys.platform == 'win32':
 232                 import msvcrt
 233                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 234             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 235         stream = open(encodeFilename(filename), open_mode)
 236         return (stream, filename)
 237     except (IOError, OSError) as err:
 238         if err.errno in (errno.EACCES,):
 239             raise
 240
 241         # In case of error, try to remove win32 forbidden chars
 242         alt_filename = os.path.join(
 243             re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
 244             for path_part in os.path.split(filename)
 245         )
 246         if alt_filename == filename:
 247             raise
 248         else:
 249             # An exception here should be caught in the caller
 250             stream = open(encodeFilename(filename), open_mode)
 251             return (stream, alt_filename)
 252
 253
 254 def timeconvert(timestr):
 255     """Convert RFC 2822 defined time string into system timestamp"""
 256     timestamp = None
 257     timetuple = email.utils.parsedate_tz(timestr)
 258     if timetuple is not None:
 259         timestamp = email.utils.mktime_tz(timetuple)
 260     return timestamp
 261
 262
 263 def sanitize_filename(s, restricted=False, is_id=False):
 264     """Sanitizes a string so it could be used as part of a filename.
 265     If restricted is set, use a stricter subset of allowed characters.
 266     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 267     """
 268     def replace_insane(char):
 269         if char == '?' or ord(char) < 32 or ord(char) == 127:
 270             return ''
 271         elif char == '"':
 272             return '' if restricted else '\''
 273         elif char == ':':
 274             return '_-' if restricted else ' -'
 275         elif char in '\\/|*<>':
 276             return '_'
 277         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 278             return '_'
 279         if restricted and ord(char) > 127:
 280             return '_'
 281         return char
 282
 283     result = ''.join(map(replace_insane, s))
 284     if not is_id:
 285         while '__' in result:
 286             result = result.replace('__', '_')
 287         result = result.strip('_')
 288         # Common case of "Foreign band name - English song title"
 289         if restricted and result.startswith('-_'):
 290             result = result[2:]
 291         if not result:
 292             result = '_'
 293     return result
 294
 295
 296 def orderedSet(iterable):
 297     """ Remove all duplicates from the input iterable """
 298     res = []
 299     for el in iterable:
 300         if el not in res:
 301             res.append(el)
 302     return res
 303
 304
 305 def _htmlentity_transform(entity):
 306     """Transforms an HTML entity to a character."""
 307     # Known non-numeric HTML entity
 308     if entity in compat_html_entities.name2codepoint:
 309         return compat_chr(compat_html_entities.name2codepoint[entity])
 310
 311     mobj = re.match(r'#(x?[0-9]+)', entity)
 312     if mobj is not None:
 313         numstr = mobj.group(1)
 314         if numstr.startswith('x'):
 315             base = 16
 316             numstr = '0%s' % numstr
 317         else:
 318             base = 10
 319         return compat_chr(int(numstr, base))
 320
 321     # Unknown entity in name, return its literal representation
 322     return ('&%s;' % entity)
 323
 324
 325 def unescapeHTML(s):
 326     if s is None:
 327         return None
 328     assert type(s) == compat_str
 329
 330     return re.sub(
 331         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 332
 333
 334 def encodeFilename(s, for_subprocess=False):
 335     """
 336     @param s The name of the file
 337     """
 338
 339     assert type(s) == compat_str
 340
 341     # Python 3 has a Unicode API
 342     if sys.version_info >= (3, 0):
 343         return s
 344
 345     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 346         # Pass '' directly to use Unicode APIs on Windows 2000 and up
 347         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 348         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 349         if not for_subprocess:
 350             return s
 351         else:
 352             # For subprocess calls, encode with locale encoding
 353             # Refer to http://stackoverflow.com/a/9951851/35070
 354             encoding = preferredencoding()
 355     else:
 356         encoding = sys.getfilesystemencoding()
 357     if encoding is None:
 358         encoding = 'utf-8'
 359     return s.encode(encoding, 'ignore')
 360
 361
 362 def encodeArgument(s):
 363     if not isinstance(s, compat_str):
 364         # Legacy code that uses byte strings
 365         # Uncomment the following line after fixing all post processors
 366         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 367         s = s.decode('ascii')
 368     return encodeFilename(s, True)
 369
 370
 371 def decodeOption(optval):
 372     if optval is None:
 373         return optval
 374     if isinstance(optval, bytes):
 375         optval = optval.decode(preferredencoding())
 376
 377     assert isinstance(optval, compat_str)
 378     return optval
 379
 380
 381 def formatSeconds(secs):
 382     if secs > 3600:
 383         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 384     elif secs > 60:
 385         return '%d:%02d' % (secs // 60, secs % 60)
 386     else:
 387         return '%d' % secs
 388
 389
 390 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 391     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 392         context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
 393         if opts_no_check_certificate:
 394             context.verify_mode = ssl.CERT_NONE
 395         try:
 396             return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 397         except TypeError:
 398             # Python 2.7.8
 399             # (create_default_context present but HTTPSHandler has no context=)
 400             pass
 401
 402     if sys.version_info < (3, 2):
 403         import httplib
 404
 405         class HTTPSConnectionV3(httplib.HTTPSConnection):
 406             def __init__(self, *args, **kwargs):
 407                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 408
 409             def connect(self):
 410                 sock = socket.create_connection((self.host, self.port), self.timeout)
 411                 if getattr(self, '_tunnel_host', False):
 412                     self.sock = sock
 413                     self._tunnel()
 414                 try:
 415                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
 416                 except ssl.SSLError:
 417                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 418
 419         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 420             def https_open(self, req):
 421                 return self.do_open(HTTPSConnectionV3, req)
 422         return HTTPSHandlerV3(**kwargs)
 423     else:  # Python < 3.4
 424         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 425         context.verify_mode = (ssl.CERT_NONE
 426                                if opts_no_check_certificate
 427                                else ssl.CERT_REQUIRED)
 428         context.set_default_verify_paths()
 429         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 430
 431
 432 class ExtractorError(Exception):
 433     """Error during info extraction."""
 434
 435     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 436         """ tb, if given, is the original traceback (so that it can be printed out).
 437         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 438         """
 439
 440         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 441             expected = True
 442         if video_id is not None:
 443             msg = video_id + ': ' + msg
 444         if cause:
 445             msg += ' (caused by %r)' % cause
 446         if not expected:
 447             if ytdl_is_updateable():
 448                 update_cmd = 'type  youtube-dl -U  to update'
 449             else:
 450                 update_cmd = 'see  https://yt-dl.org/update  on how to update'
 451             msg += '; please report this issue on https://yt-dl.org/bug .'
 452             msg += ' Make sure you are using the latest version; %s.' % update_cmd
 453             msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 454         super(ExtractorError, self).__init__(msg)
 455
 456         self.traceback = tb
 457         self.exc_info = sys.exc_info()  # preserve original exception
 458         self.cause = cause
 459         self.video_id = video_id
 460
 461     def format_traceback(self):
 462         if self.traceback is None:
 463             return None
 464         return ''.join(traceback.format_tb(self.traceback))
 465
 466
 467 class RegexNotFoundError(ExtractorError):
 468     """Error when a regex didn't match"""
 469     pass
 470
 471
 472 class DownloadError(Exception):
 473     """Download Error exception.
 474
 475     This exception may be thrown by FileDownloader objects if they are not
 476     configured to continue on errors. They will contain the appropriate
 477     error message.
 478     """
 479
 480     def __init__(self, msg, exc_info=None):
 481         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 482         super(DownloadError, self).__init__(msg)
 483         self.exc_info = exc_info
 484
 485
 486 class SameFileError(Exception):
 487     """Same File exception.
 488
 489     This exception will be thrown by FileDownloader objects if they detect
 490     multiple files would have to be downloaded to the same file on disk.
 491     """
 492     pass
 493
 494
 495 class PostProcessingError(Exception):
 496     """Post Processing exception.
 497
 498     This exception may be raised by PostProcessor's .run() method to
 499     indicate an error in the postprocessing task.
 500     """
 501
 502     def __init__(self, msg):
 503         self.msg = msg
 504
 505
 506 class MaxDownloadsReached(Exception):
 507     """ --max-downloads limit has been reached. """
 508     pass
 509
 510
 511 class UnavailableVideoError(Exception):
 512     """Unavailable Format exception.
 513
 514     This exception will be thrown when a video is requested
 515     in a format that is not available for that video.
 516     """
 517     pass
 518
 519
 520 class ContentTooShortError(Exception):
 521     """Content Too Short exception.
 522
 523     This exception may be raised by FileDownloader objects when a file they
 524     download is too small for what the server announced first, indicating
 525     the connection was probably interrupted.
 526     """
 527     # Both in bytes
 528     downloaded = None
 529     expected = None
 530
 531     def __init__(self, downloaded, expected):
 532         self.downloaded = downloaded
 533         self.expected = expected
 534
 535
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflated data, with or without a zlib header."""
        try:
            # Negative wbits: raw deflate stream without zlib header/trailer
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to a regular zlib-wrapped stream
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response that carries the status code."""
        # Pass the code to the constructor when addinfourl has getcode();
        # otherwise set the attribute by hand
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to req and process the internal
        Youtubedl-* control headers; returns the modified request."""
        for h, v in std_headers.items():
            if h not in req.headers:
                req.add_header(h, v)
        # Internal marker header: do not ask for a compressed response
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Internal marker header: overrides the User-agent of this request
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            # (strip everything from the '#' on)
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate encoded body replaced by its
        decompressed content; other responses are returned unchanged."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off; if none of
                # the attempts works, raise the original error
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets exactly the same treatment
    https_request = http_request
    https_response = http_response
 621
 622
 623 def parse_iso8601(date_str, delimiter='T'):
 624     """ Return a UNIX timestamp from the given date """
 625
 626     if date_str is None:
 627         return None
 628
 629     m = re.search(
 630         r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 631         date_str)
 632     if not m:
 633         timezone = datetime.timedelta()
 634     else:
 635         date_str = date_str[:-len(m.group(0))]
 636         if not m.group('sign'):
 637             timezone = datetime.timedelta()
 638         else:
 639             sign = 1 if m.group('sign') == '+' else -1
 640             timezone = datetime.timedelta(
 641                 hours=sign * int(m.group('hours')),
 642                 minutes=sign * int(m.group('minutes')))
 643     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 644     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 645     return calendar.timegm(dt.timetuple())
 646
 647
 648 def unified_strdate(date_str, day_first=True):
 649     """Return a string with the date in the format YYYYMMDD"""
 650
 651     if date_str is None:
 652         return None
 653     upload_date = None
 654     # Replace commas
 655     date_str = date_str.replace(',', ' ')
 656     # %z (UTC offset) is only supported in python>=3.2
 657     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 658     # Remove AM/PM + timezone
 659     date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
 660
 661     format_expressions = [
 662         '%d %B %Y',
 663         '%d %b %Y',
 664         '%B %d %Y',
 665         '%b %d %Y',
 666         '%b %dst %Y %I:%M%p',
 667         '%b %dnd %Y %I:%M%p',
 668         '%b %dth %Y %I:%M%p',
 669         '%Y-%m-%d',
 670         '%Y/%m/%d',
 671         '%d.%m.%Y',
 672         '%d/%m/%Y',
 673         '%d/%m/%y',
 674         '%Y/%m/%d %H:%M:%S',
 675         '%Y-%m-%d %H:%M:%S',
 676         '%Y-%m-%d %H:%M:%S.%f',
 677         '%d.%m.%Y %H:%M',
 678         '%d.%m.%Y %H.%M',
 679         '%Y-%m-%dT%H:%M:%SZ',
 680         '%Y-%m-%dT%H:%M:%S.%fZ',
 681         '%Y-%m-%dT%H:%M:%S.%f0Z',
 682         '%Y-%m-%dT%H:%M:%S',
 683         '%Y-%m-%dT%H:%M:%S.%f',
 684         '%Y-%m-%dT%H:%M',
 685     ]
 686     if day_first:
 687         format_expressions.extend([
 688             '%d/%m/%Y %H:%M:%S',
 689         ])
 690     else:
 691         format_expressions.extend([
 692             '%m/%d/%Y %H:%M:%S',
 693         ])
 694     for expression in format_expressions:
 695         try:
 696             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 697         except ValueError:
 698             pass
 699     if upload_date is None:
 700         timetuple = email.utils.parsedate_tz(date_str)
 701         if timetuple:
 702             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 703     return upload_date
 704
 705
 706 def determine_ext(url, default_ext='unknown_video'):
 707     if url is None:
 708         return default_ext
 709     guess = url.partition('?')[0].rpartition('.')[2]
 710     if re.match(r'^[A-Za-z0-9]+$', guess):
 711         return guess
 712     else:
 713         return default_ext
 714
 715
 716 def subtitles_filename(filename, sub_lang, sub_format):
 717     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 718
 719
 720 def date_from_str(date_str):
 721     """
 722     Return a datetime object from a string in the format YYYYMMDD or
 723     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 724     today = datetime.date.today()
 725     if date_str in ('now', 'today'):
 726         return today
 727     if date_str == 'yesterday':
 728         return today - datetime.timedelta(days=1)
 729     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 730     if match is not None:
 731         sign = match.group('sign')
 732         time = int(match.group('time'))
 733         if sign == '-':
 734             time = -time
 735         unit = match.group('unit')
 736         # A bad aproximation?
 737         if unit == 'month':
 738             unit = 'day'
 739             time *= 30
 740         elif unit == 'year':
 741             unit = 'day'
 742             time *= 365
 743         unit += 's'
 744         delta = datetime.timedelta(**{unit: time})
 745         return today + delta
 746     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 747
 748
 749 def hyphenate_date(date_str):
 750     """
 751     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 752     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 753     if match is not None:
 754         return '-'.join(match.groups())
 755     else:
 756         return date_str
 757
 758
 759 class DateRange(object):
 760     """Represents a time interval between two dates"""
 761
 762     def __init__(self, start=None, end=None):
 763         """start and end must be strings in the format accepted by date"""
 764         if start is not None:
 765             self.start = date_from_str(start)
 766         else:
 767             self.start = datetime.datetime.min.date()
 768         if end is not None:
 769             self.end = date_from_str(end)
 770         else:
 771             self.end = datetime.datetime.max.date()
 772         if self.start > self.end:
 773             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 774
 775     @classmethod
 776     def day(cls, day):
 777         """Returns a range that only contains the given day"""
 778         return cls(day, day)
 779
 780     def __contains__(self, date):
 781         """Check if the date is in the range"""
 782         if not isinstance(date, datetime.date):
 783             date = date_from_str(date)
 784         return self.start <= date <= self.end
 785
 786     def __str__(self):
 787         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 788
 789
 790 def platform_name():
 791     """ Returns the platform name as a compat_str """
 792     res = platform.platform()
 793     if isinstance(res, bytes):
 794         res = res.decode(preferredencoding())
 795
 796     assert isinstance(res, compat_str)
 797     return res
 798
 799
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the WriteConsoleW function of kernel32, which is
    only used when out is standard output or standard error (file
    descriptor 1 or 2) and is attached to a console."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A console is a valid handle of character file type (ignoring the
        # remote flag) for which GetConsoleMode succeeds
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane,
        # or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters at a time and stop in front of the
        # next non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written on its own with a length of 2
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after the part that was actually written
            s = s[written.value:]
    return True
 870
 871
 872 def write_string(s, out=None, encoding=None):
 873     if out is None:
 874         out = sys.stderr
 875     assert type(s) == compat_str
 876
 877     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 878         if _windows_write_string(s, out):
 879             return
 880
 881     if ('b' in getattr(out, 'mode', '') or
 882             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 883         byt = s.encode(encoding or preferredencoding(), 'ignore')
 884         out.write(byt)
 885     elif hasattr(out, 'buffer'):
 886         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 887         byt = s.encode(enc, 'ignore')
 888         out.buffer.write(byt)
 889     else:
 890         out.write(s)
 891     out.flush()
 892
 893
 894 def bytes_to_intlist(bs):
 895     if not bs:
 896         return []
 897     if isinstance(bs[0], int):  # Python 3
 898         return list(bs)
 899     else:
 900         return [ord(c) for c in bs]
 901
 902
 903 def intlist_to_bytes(xs):
 904     if not xs:
 905         return b''
 906     return struct_pack('%dB' % len(xs), *xs)
 907
 908
# Cross-platform file locking
# Both branches define the same private pair: _lock_file(f, exclusive) and
# _unlock_file(f). win32 goes through kernel32 LockFileEx/UnlockFileEx via
# ctypes; every other platform uses fcntl.flock.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes layout of the structure passed as the last argument of
    # LockFileEx / UnlockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to both calls; the same
    # values are used to lock and to unlock
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """ Lock the open file f; raises OSError if LockFileEx fails """
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can hand the very same
        # structure to UnlockFileEx
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags: 0x2 when an exclusive lock is requested, 0x0 otherwise
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """ Release a lock taken by _lock_file; raises OSError on failure """
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """ flock f: LOCK_EX when exclusive is true, LOCK_SH otherwise """
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """ Drop the flock held on f """
        fcntl.flock(f, fcntl.LOCK_UN)
 972
 973
 974 class locked_file(object):
 975     def __init__(self, filename, mode, encoding=None):
 976         assert mode in ['r', 'a', 'w']
 977         self.f = io.open(filename, mode, encoding=encoding)
 978         self.mode = mode
 979
 980     def __enter__(self):
 981         exclusive = self.mode != 'r'
 982         try:
 983             _lock_file(self.f, exclusive)
 984         except IOError:
 985             self.f.close()
 986             raise
 987         return self
 988
 989     def __exit__(self, etype, value, traceback):
 990         try:
 991             _unlock_file(self.f)
 992         finally:
 993             self.f.close()
 994
 995     def __iter__(self):
 996         return iter(self.f)
 997
 998     def write(self, *args):
 999         return self.f.write(*args)
1000
1001     def read(self, *args):
1002         return self.f.read(*args)
1003
1004
def get_filesystem_encoding():
    """ Return sys.getfilesystemencoding(), or 'utf-8' when that is None """
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1008
1009
def shell_quote(args):
    """ Quote every argument for a POSIX shell and join them with spaces.

    Byte-string arguments are decoded with the filesystem encoding first.
    Quoting goes through shlex_quote (the helper args_to_str also uses)
    rather than the undocumented pipes.quote.
    """
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
1019
1020
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the last evaluated element
        (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1028
1029
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded and appended to url as a URL-encoded
    '__youtubedl_smuggle' fragment.
    """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
1036
1037
def unsmuggle_url(smug_url, default=None):
    """ Split a URL built by smuggle_url into (url, data).

    URLs without a '#__youtubedl_smuggle' marker are returned unchanged
    together with default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        smuggled = compat_parse_qs(sdata)['__youtubedl_smuggle']
        return url, json.loads(smuggled[0])
    return smug_url, default
1045
1046
def format_bytes(bytes):
    """ Format a byte count as a string with a binary unit suffix.

    Returns 'N/A' for None. Strings are converted with float() first.
    The unit is clamped to the 'B'..'YiB' range, so tiny positive values
    are reported in bytes and huge ones in YiB instead of picking a wrong
    suffix or raising IndexError. Negative values still raise ValueError
    (from math.log).
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the index inside the suffix table: a negative exponent would
        # wrap around to 'YiB', a too large one would raise IndexError
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1059
1060
def parse_filesize(s):
    """ Parse a size such as '1.5 MiB' or '2,5kB' into a number of bytes.

    Returns None for None and for strings that do not start with a number
    followed by a known unit.
    """
    if s is None:
        return None

    # 'XiB' and the lower-case-prefix form 'xB' are powers of 1024,
    # 'XB' and 'Xb' are powers of 1000. The lower-case forms are of course
    # incorrect and unofficial, but we support those too.
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        _UNIT_TABLE[prefix + 'iB'] = binary
        _UNIT_TABLE[prefix + 'B'] = decimal
        _UNIT_TABLE[prefix.lower() + 'B'] = binary
        _UNIT_TABLE[prefix + 'b'] = decimal

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if m is None:
        return None

    # A comma is accepted as decimal separator
    number = float(m.group('num').replace(',', '.'))
    return int(number * _UNIT_TABLE[m.group('unit')])
1113
1114
def get_term_width():
    """ Return the terminal width in columns, or None if it cannot be found.

    The COLUMNS environment variable wins when set; otherwise the second
    field of the output of `stty size` is used.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort: stty may be missing or produce no usable output.
        # Unlike a bare except, this lets KeyboardInterrupt and SystemExit
        # propagate.
        pass
    return None
1129
1130
def month_by_name(name):
    """ Return the number (1-12) of a month given its English name,
        independently of the locale, or None for an unknown name """

    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
1141
1142
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start a known entity by '&amp;'"""
    # The lookahead leaves the five predefined entities and short numeric
    # character references untouched
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
1149
1150
def setproctitle(title):
    """ Set the process name through prctl(2) of libc.so.6, if available.

    Does nothing when libc.so.6 cannot be loaded or exposes no prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes ctypes allocate one extra
    # byte for the terminating NUL. A buffer of exactly len(title_bytes)
    # would hand prctl an unterminated string.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1164
1165
def remove_start(s, start):
    """ Return s without the prefix start; s unchanged if it lacks that prefix """
    return s[len(start):] if s.startswith(start) else s
1170
1171
def remove_end(s, end):
    """ Return s without the suffix end; s unchanged if it lacks that suffix """
    # An empty suffix must be a no-op: s[:-0] is s[:0], i.e. the empty string
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1176
1177
def url_basename(url):
    """ Return the last segment of the path component of url """
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip('/').split('/')
    return segments[-1]
1181
1182
class HEADRequest(compat_urllib_request.Request):
    """ Request whose get_method() always reports "HEAD" """

    def get_method(self):
        return "HEAD"
1186
1187
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to int, scaled as int(v) * invscale // scale.

    If get_attr is given and v is not None, that attribute of v is
    converted instead (a missing attribute counts as None). None and the
    empty string yield default.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1195
1196
def str_or_none(v, default=None):
    """ Return v converted with compat_str, or default when v is None """
    if v is None:
        return default
    return compat_str(v)
1199
1200
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
        are dropped before the conversion """
    if int_str is None:
        return None
    separators = re.compile(r'[,\.\+]')
    return int(separators.sub('', int_str))
1207
1208
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Return float(v) * invscale / scale, or default when v is None """
    if v is None:
        return default
    return float(v) * invscale / scale
1211
1212
def parse_duration(s):
    """ Parse a duration string into a number of seconds.

    Accepts a bare minute or hour count ('3 min', '1.5 hours') as well as
    [[hours:]mins:]secs[.ms] with ':' or unit suffixes as separators, with
    an optional leading 'T'. Returns None for None and for strings that do
    not match.
    """
    if s is None:
        return None

    m = re.match(
        r'''(?ix)T?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if m is None:
        return None

    # The single-unit forms may be fractional and are returned as floats
    only_mins = m.group('only_mins')
    if only_mins:
        return float_or_none(only_mins, invscale=60)
    only_hours = m.group('only_hours')
    if only_hours:
        return float_or_none(only_hours, invscale=60 * 60)

    total = 0
    for group, seconds_per_unit in (
            ('secs', 1), ('mins', 60), ('hours', 60 * 60)):
        value = m.group(group)
        if value:
            total += int(value) * seconds_per_unit
    fraction = m.group('ms')
    if fraction:
        total += float(fraction)
    return total
1247
1248
def prepend_extension(filename, ext):
    """ Insert ext in front of the real extension: 'a.mp4', 'tmp' -> 'a.tmp.mp4' """
    stem, real_ext = os.path.splitext(filename)
    return '%s.%s%s' % (stem, ext, real_ext)
1252
1253
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False when the binary cannot be started. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1262
1263
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.
    stderr is merged into stdout and the combined output is handed to
    detect_exe_version. """
    try:
        process = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = process.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):
        # Non-ASCII bytes are dropped
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1277
1278
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Return the first group of version_re found in output.

    The default pattern looks for 'version <something>'. unrecognized is
    returned when the pattern does not match.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1288
1289
class PagedList(object):
    """ Base for paged lists; __len__ relies on a getslice() method that
        this class does not define itself """

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1294
1295
class OnDemandPagedList(PagedList):
    """ Paged list whose pages are fetched lazily through pagefunc(pagenum).

    pagesize is the number of entries of a full page; a page shorter than
    that is taken to be the last one.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries start..end (end exclusive, None meaning
            "until the last page") as a list """
        res = []
        # Begin with the page that contains index `start`
        for pagenum in itertools.count(start // self._pagesize):
            # Index range [firstid, nextfirstid) covered by this page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset inside the page at which the slice starts (only non-zero
            # on the page containing `start`)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset inside the page at which the slice stops, or None when
            # `end` does not fall on this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            # (A page cut short by endv also ends the loop here.)
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1337
1338
class InAdvancePagedList(PagedList):
    """ Paged list for which the number of pages is known up front.

    pagefunc(pagenum) returns the entries of one page, pagecount is the
    total number of pages and pagesize the number of entries per page.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries start..end (end exclusive, None meaning
            "through the last page") as a list """
        size = self._pagesize
        # Page holding `start` and the offset of `start` inside that page
        first_page, offset = divmod(start, size)
        if end is None:
            stop_page, remaining = self._pagecount, None
        else:
            stop_page, remaining = end // size + 1, end - start

        collected = []
        for pagenum in range(first_page, stop_page):
            page = list(self._pagefunc(pagenum))
            if offset:
                # Only the first fetched page is trimmed at the front
                page = page[offset:]
                offset = 0
            if remaining is not None:
                if len(page) >= remaining:
                    collected.extend(page[:remaining])
                    break
                remaining -= len(page)
            collected.extend(page)
        return collected
1366
1367
def uppercase_escape(s):
    """ Replace every \\UXXXXXXXX escape sequence in s by the character it denotes """
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        decoded, _ = decode_escape(match.group(0))
        return decoded

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1374
1375
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, unicode input is UTF-8 encoded before quoting
    must_encode = sys.version_info < (3, 0) and isinstance(s, unicode)
    if must_encode:
        s = s.encode('utf-8')
    # The second argument lists the characters that are left unquoted
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1381
1382
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Path, params, query and fragment are escaped with escape_rfc3986; the
    other components are left as parsed.
    """
    parts = compat_urllib_parse_urlparse(url)
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1392
# Pick struct_pack / struct_unpack: plain struct functions when struct
# accepts a text format string, wrappers that ASCII-encode the format
# otherwise.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
1409
1410
def read_batch_urls(batch_fd):
    """ Read the URLs from the batch file object batch_fd and close it.

    Each line is decoded as UTF-8 if it is a byte string, freed from a
    leading byte order mark and surrounding whitespace, and dropped if it
    is empty or starts with '#', ';' or ']'. Returns the list of URLs.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # '\ufeff' is a correctly decoded BOM; '\xef\xbb\xbf' is how the
        # UTF-8 BOM bytes look when the file was decoded as latin-1
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1425
1426
def urlencode_postdata(*args, **kargs):
    """ urlencode the arguments and return the result as ASCII bytes """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1429
1430
# etree_iter(node): Element.iter where it exists, otherwise a findall-based
# fallback
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1435
1436
def parse_xml(s):
    """ Parse the XML text s and return its root element; doctype
        declarations are ignored """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    data = s.encode('utf-8')
    if sys.version_info >= (2, 7):
        tree = xml.etree.ElementTree.XML(data, parser=parser)
    else:
        # Older versions are called without the parser argument
        tree = xml.etree.ElementTree.XML(data)

    # Fix up XML parser in Python 2.x: make every text node a text string
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None and not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
    return tree
1452
1453
# US rating label -> number that parse_age_limit returns for that label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1461
1462
def parse_age_limit(s):
    """ Parse an age limit: a 1-2 digit number with optional trailing '+',
        or a label from US_RATINGS. Returns None for None and for anything
        else. """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
1468
1469
def strip_jsonp(code):
    """ Strip a JSONP wrapper 'callback(...);' (optionally followed by
        // comments) and return the wrapped payload; other input is
        returned unchanged """
    jsonp_wrapper = re.compile(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$')
    return jsonp_wrapper.sub(r'\1', code)
1473
1474
def js_to_json(code):
    """ Convert a JavaScript object literal into JSON text.

    Bare identifiers and single-quoted strings become double-quoted
    strings, true/false/null and double-quoted strings are kept, and a
    trailing comma before ']' is removed.
    """
    # Rewrites needed inside a single-quoted string once it is wrapped in
    # double quotes
    single_quote_escapes = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: single_quote_escapes[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), res)
1498
1499
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values:
        the returned function maps an id to its index in quality_ids,
        or to -1 when the id is not listed """
    def q(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q
1508
1509
# %-style template built from the 'title', 'id' and 'ext' fields.
# NOTE(review): presumably the default output filename template - confirm
# against the code that consumes it.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1511
1512
def limit_length(s, length):
    """ Add ellipses to overly long strings: strings longer than length are
        cut so that the result, including '...', is length characters """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    keep = length - len(ELLIPSES)
    return s[:keep] + ELLIPSES
1521
1522
def version_tuple(v):
    """ Split a version string on '-' and '.' into a tuple of ints """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
1525
1526
def is_outdated_version(version, limit, assume_new=True):
    """ Return whether version is older than limit.

    An empty or unparsable version counts as new when assume_new is true
    and as outdated otherwise.
    """
    when_unknown = not assume_new
    if not version:
        return when_unknown
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return when_unknown
1534
1535
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded by a zipimporter or when sys has a
    # 'frozen' attribute
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
1541
1542
def args_to_str(args):
    """ Get a short string representation for a subprocess command:
        every argument is quoted with shlex_quote and joined with spaces """
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)