_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import gzip
  14 import itertools
  15 import io
  16 import json
  17 import locale
  18 import math
  19 import os
  20 import pipes
  21 import platform
  22 import re
  23 import ssl
  24 import socket
  25 import struct
  26 import subprocess
  27 import sys
  28 import tempfile
  29 import traceback
  30 import xml.etree.ElementTree
  31 import zlib
  32
  33 from .compat import (
  34     compat_chr,
  35     compat_getenv,
  36     compat_html_entities,
  37     compat_parse_qs,
  38     compat_str,
  39     compat_urllib_error,
  40     compat_urllib_parse,
  41     compat_urllib_parse_urlparse,
  42     compat_urllib_request,
  43     compat_urlparse,
  44     shlex_quote,
  45 )
  46
  47
# The type of compiled regular expressions is not clearly defined otherwise,
# so it is derived here from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers. YoutubeDLHandler.http_request adds each entry
# to an outgoing request that does not already carry a header of that name.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
  58
  59
  60 def preferredencoding():
  61     """Get preferred encoding.
  62
  63     Returns the best encoding scheme for the system, based on
  64     locale.getpreferredencoding() and some further tweaks.
  65     """
  66     try:
  67         pref = locale.getpreferredencoding()
  68         'TEST'.encode(pref)
  69     except:
  70         pref = 'UTF-8'
  71
  72     return pref
  73
  74
  75 def write_json_file(obj, fn):
  76     """ Encode obj as JSON and write it to fn, atomically if possible """
  77
  78     fn = encodeFilename(fn)
  79     if sys.version_info < (3, 0) and sys.platform != 'win32':
  80         encoding = get_filesystem_encoding()
  81         # os.path.basename returns a bytes object, but NamedTemporaryFile
  82         # will fail if the filename contains non ascii characters unless we
  83         # use a unicode object
  84         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  85         # the same for os.path.dirname
  86         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  87     else:
  88         path_basename = os.path.basename
  89         path_dirname = os.path.dirname
  90
  91     args = {
  92         'suffix': '.tmp',
  93         'prefix': path_basename(fn) + '.',
  94         'dir': path_dirname(fn),
  95         'delete': False,
  96     }
  97
  98     # In Python 2.x, json.dump expects a bytestream.
  99     # In Python 3.x, it writes to a character stream
 100     if sys.version_info < (3, 0):
 101         args['mode'] = 'wb'
 102     else:
 103         args.update({
 104             'mode': 'w',
 105             'encoding': 'utf-8',
 106         })
 107
 108     tf = tempfile.NamedTemporaryFile(**args)
 109
 110     try:
 111         with tf:
 112             json.dump(obj, tf)
 113         if sys.platform == 'win32':
 114             # Need to remove existing file on Windows, else os.rename raises
 115             # WindowsError or FileExistsError.
 116             try:
 117                 os.unlink(fn)
 118             except OSError:
 119                 pass
 120         os.rename(tf.name, fn)
 121     except:
 122         try:
 123             os.remove(tf.name)
 124         except OSError:
 125             pass
 126         raise
 127
 128
 129 if sys.version_info >= (2, 7):
 130     def find_xpath_attr(node, xpath, key, val):
 131         """ Find the xpath xpath[@key=val] """
 132         assert re.match(r'^[a-zA-Z-]+$', key)
 133         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 134         expr = xpath + "[@%s='%s']" % (key, val)
 135         return node.find(expr)
 136 else:
 137     def find_xpath_attr(node, xpath, key, val):
 138         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 139         # .//node does not match if a node is a direct child of . !
 140         if isinstance(xpath, unicode):
 141             xpath = xpath.encode('ascii')
 142
 143         for f in node.findall(xpath):
 144             if f.attrib.get(key) == val:
 145                 return f
 146         return None
 147
 148 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 149 # the namespace parameter
 150
 151
 152 def xpath_with_ns(path, ns_map):
 153     components = [c.split(':') for c in path.split('/')]
 154     replaced = []
 155     for c in components:
 156         if len(c) == 1:
 157             replaced.append(c[0])
 158         else:
 159             ns, tag = c
 160             replaced.append('{%s}%s' % (ns_map[ns], tag))
 161     return '/'.join(replaced)
 162
 163
 164 def xpath_text(node, xpath, name=None, fatal=False):
 165     if sys.version_info < (2, 7):  # Crazy 2.6
 166         xpath = xpath.encode('ascii')
 167
 168     n = node.find(xpath)
 169     if n is None or n.text is None:
 170         if fatal:
 171             name = xpath if name is None else name
 172             raise ExtractorError('Could not find XML element %s' % name)
 173         else:
 174             return None
 175     return n.text
 176
 177
 178 def get_element_by_id(id, html):
 179     """Return the content of the tag with the specified ID in the passed HTML document"""
 180     return get_element_by_attribute("id", id, html)
 181
 182
 183 def get_element_by_attribute(attribute, value, html):
 184     """Return the content of the tag with the specified attribute in the passed HTML document"""
 185
 186     m = re.search(r'''(?xs)
 187         <([a-zA-Z0-9:._-]+)
 188          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 189          \s+%s=['"]?%s['"]?
 190          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 191         \s*>
 192         (?P<content>.*?)
 193         </\1>
 194     ''' % (re.escape(attribute), re.escape(value)), html)
 195
 196     if not m:
 197         return None
 198     res = m.group('content')
 199
 200     if res.startswith('"') or res.startswith("'"):
 201         res = res[1:-1]
 202
 203     return unescapeHTML(res)
 204
 205
 206 def clean_html(html):
 207     """Clean an HTML snippet into a readable string"""
 208     # Newline vs <br />
 209     html = html.replace('\n', ' ')
 210     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 211     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 212     # Strip html tags
 213     html = re.sub('<.*?>', '', html)
 214     # Replace html entities
 215     html = unescapeHTML(html)
 216     return html.strip()
 217
 218
 219 def sanitize_open(filename, open_mode):
 220     """Try to open the given filename, and slightly tweak it if this fails.
 221
 222     Attempts to open the given filename. If this fails, it tries to change
 223     the filename slightly, step by step, until it's either able to open it
 224     or it fails and raises a final exception, like the standard open()
 225     function.
 226
 227     It returns the tuple (stream, definitive_file_name).
 228     """
 229     try:
 230         if filename == '-':
 231             if sys.platform == 'win32':
 232                 import msvcrt
 233                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 234             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 235         stream = open(encodeFilename(filename), open_mode)
 236         return (stream, filename)
 237     except (IOError, OSError) as err:
 238         if err.errno in (errno.EACCES,):
 239             raise
 240
 241         # In case of error, try to remove win32 forbidden chars
 242         alt_filename = os.path.join(
 243             re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
 244             for path_part in os.path.split(filename)
 245         )
 246         if alt_filename == filename:
 247             raise
 248         else:
 249             # An exception here should be caught in the caller
 250             stream = open(encodeFilename(filename), open_mode)
 251             return (stream, alt_filename)
 252
 253
 254 def timeconvert(timestr):
 255     """Convert RFC 2822 defined time string into system timestamp"""
 256     timestamp = None
 257     timetuple = email.utils.parsedate_tz(timestr)
 258     if timetuple is not None:
 259         timestamp = email.utils.mktime_tz(timetuple)
 260     return timestamp
 261
 262
 263 def sanitize_filename(s, restricted=False, is_id=False):
 264     """Sanitizes a string so it could be used as part of a filename.
 265     If restricted is set, use a stricter subset of allowed characters.
 266     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 267     """
 268     def replace_insane(char):
 269         if char == '?' or ord(char) < 32 or ord(char) == 127:
 270             return ''
 271         elif char == '"':
 272             return '' if restricted else '\''
 273         elif char == ':':
 274             return '_-' if restricted else ' -'
 275         elif char in '\\/|*<>':
 276             return '_'
 277         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 278             return '_'
 279         if restricted and ord(char) > 127:
 280             return '_'
 281         return char
 282
 283     result = ''.join(map(replace_insane, s))
 284     if not is_id:
 285         while '__' in result:
 286             result = result.replace('__', '_')
 287         result = result.strip('_')
 288         # Common case of "Foreign band name - English song title"
 289         if restricted and result.startswith('-_'):
 290             result = result[2:]
 291         if not result:
 292             result = '_'
 293     return result
 294
 295
 296 def orderedSet(iterable):
 297     """ Remove all duplicates from the input iterable """
 298     res = []
 299     for el in iterable:
 300         if el not in res:
 301             res.append(el)
 302     return res
 303
 304
 305 def _htmlentity_transform(entity):
 306     """Transforms an HTML entity to a character."""
 307     # Known non-numeric HTML entity
 308     if entity in compat_html_entities.name2codepoint:
 309         return compat_chr(compat_html_entities.name2codepoint[entity])
 310
 311     mobj = re.match(r'#(x?[0-9]+)', entity)
 312     if mobj is not None:
 313         numstr = mobj.group(1)
 314         if numstr.startswith('x'):
 315             base = 16
 316             numstr = '0%s' % numstr
 317         else:
 318             base = 10
 319         return compat_chr(int(numstr, base))
 320
 321     # Unknown entity in name, return its literal representation
 322     return ('&%s;' % entity)
 323
 324
 325 def unescapeHTML(s):
 326     if s is None:
 327         return None
 328     assert type(s) == compat_str
 329
 330     return re.sub(
 331         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 332
 333
 334 def encodeFilename(s, for_subprocess=False):
 335     """
 336     @param s The name of the file
 337     """
 338
 339     assert type(s) == compat_str
 340
 341     # Python 3 has a Unicode API
 342     if sys.version_info >= (3, 0):
 343         return s
 344
 345     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 346         # Pass '' directly to use Unicode APIs on Windows 2000 and up
 347         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 348         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 349         if not for_subprocess:
 350             return s
 351         else:
 352             # For subprocess calls, encode with locale encoding
 353             # Refer to http://stackoverflow.com/a/9951851/35070
 354             encoding = preferredencoding()
 355     else:
 356         encoding = sys.getfilesystemencoding()
 357     if encoding is None:
 358         encoding = 'utf-8'
 359     return s.encode(encoding, 'ignore')
 360
 361
 362 def encodeArgument(s):
 363     if not isinstance(s, compat_str):
 364         # Legacy code that uses byte strings
 365         # Uncomment the following line after fixing all post processors
 366         #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 367         s = s.decode('ascii')
 368     return encodeFilename(s, True)
 369
 370
 371 def decodeOption(optval):
 372     if optval is None:
 373         return optval
 374     if isinstance(optval, bytes):
 375         optval = optval.decode(preferredencoding())
 376
 377     assert isinstance(optval, compat_str)
 378     return optval
 379
 380
 381 def formatSeconds(secs):
 382     if secs > 3600:
 383         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 384     elif secs > 60:
 385         return '%d:%02d' % (secs // 60, secs % 60)
 386     else:
 387         return '%d' % secs
 388
 389
 390 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 391     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 392         context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
 393         context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
 394         if opts_no_check_certificate:
 395             context.verify_mode = ssl.CERT_NONE
 396         try:
 397             return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 398         except TypeError:
 399             # Python 2.7.8
 400             # (create_default_context present but HTTPSHandler has no context=)
 401             pass
 402
 403     if sys.version_info < (3, 2):
 404         import httplib
 405
 406         class HTTPSConnectionV3(httplib.HTTPSConnection):
 407             def __init__(self, *args, **kwargs):
 408                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 409
 410             def connect(self):
 411                 sock = socket.create_connection((self.host, self.port), self.timeout)
 412                 if getattr(self, '_tunnel_host', False):
 413                     self.sock = sock
 414                     self._tunnel()
 415                 try:
 416                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
 417                 except ssl.SSLError:
 418                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 419
 420         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 421             def https_open(self, req):
 422                 return self.do_open(HTTPSConnectionV3, req)
 423         return HTTPSHandlerV3(**kwargs)
 424     else:  # Python < 3.4
 425         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 426         context.verify_mode = (ssl.CERT_NONE
 427                                if opts_no_check_certificate
 428                                else ssl.CERT_REQUIRED)
 429         context.set_default_verify_paths()
 430         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 431
 432
 433 class ExtractorError(Exception):
 434     """Error during info extraction."""
 435
 436     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 437         """ tb, if given, is the original traceback (so that it can be printed out).
 438         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 439         """
 440
 441         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 442             expected = True
 443         if video_id is not None:
 444             msg = video_id + ': ' + msg
 445         if cause:
 446             msg += ' (caused by %r)' % cause
 447         if not expected:
 448             if ytdl_is_updateable():
 449                 update_cmd = 'type  youtube-dl -U  to update'
 450             else:
 451                 update_cmd = 'see  https://yt-dl.org/update  on how to update'
 452             msg += '; please report this issue on https://yt-dl.org/bug .'
 453             msg += ' Make sure you are using the latest version; %s.' % update_cmd
 454             msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 455         super(ExtractorError, self).__init__(msg)
 456
 457         self.traceback = tb
 458         self.exc_info = sys.exc_info()  # preserve original exception
 459         self.cause = cause
 460         self.video_id = video_id
 461
 462     def format_traceback(self):
 463         if self.traceback is None:
 464             return None
 465         return ''.join(traceback.format_tb(self.traceback))
 466
 467
 468 class RegexNotFoundError(ExtractorError):
 469     """Error when a regex didn't match"""
 470     pass
 471
 472
 473 class DownloadError(Exception):
 474     """Download Error exception.
 475
 476     This exception may be thrown by FileDownloader objects if they are not
 477     configured to continue on errors. They will contain the appropriate
 478     error message.
 479     """
 480
 481     def __init__(self, msg, exc_info=None):
 482         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 483         super(DownloadError, self).__init__(msg)
 484         self.exc_info = exc_info
 485
 486
 487 class SameFileError(Exception):
 488     """Same File exception.
 489
 490     This exception will be thrown by FileDownloader objects if they detect
 491     multiple files would have to be downloaded to the same file on disk.
 492     """
 493     pass
 494
 495
 496 class PostProcessingError(Exception):
 497     """Post Processing exception.
 498
 499     This exception may be raised by PostProcessor's .run() method to
 500     indicate an error in the postprocessing task.
 501     """
 502
 503     def __init__(self, msg):
 504         self.msg = msg
 505
 506
 507 class MaxDownloadsReached(Exception):
 508     """ --max-downloads limit has been reached. """
 509     pass
 510
 511
 512 class UnavailableVideoError(Exception):
 513     """Unavailable Format exception.
 514
 515     This exception will be thrown when a video is requested
 516     in a format that is not available for that video.
 517     """
 518     pass
 519
 520
 521 class ContentTooShortError(Exception):
 522     """Content Too Short exception.
 523
 524     This exception may be raised by FileDownloader objects when a file they
 525     download is too small for what the server announced first, indicating
 526     the connection was probably interrupted.
 527     """
 528     # Both in bytes
 529     downloaded = None
 530     expected = None
 531
 532     def __init__(self, downloaded, expected):
 533         self.downloaded = downloaded
 534         self.expected = expected
 535
 536
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request. Likewise, a
    "Youtubedl-user-agent" header replaces the User-agent header and is
    removed itself.

    The same request and response processing is used for HTTPS.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, with or without the zlib wrapper."""
        try:
            # Negative wbits: raw deflate stream without zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response that carries the status code."""
        # Pass the code to the constructor when addinfourl has getcode;
        # otherwise set it as an attribute afterwards.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers and process the youtube-dl pseudo headers."""
        # Standard headers never override headers set on the request
        for h, v in std_headers.items():
            if h not in req.headers:
                req.add_header(h, v)
        # Pseudo header: drop Accept-encoding, then the pseudo header itself
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Pseudo header: its value becomes the User-agent
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            # Cut the fragment off the URL stored in the request's private
            # attributes.
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate encoded body decompressed."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off; if no attempt
                # succeeds, the first error is raised.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets exactly the same treatment
    https_request = http_request
    https_response = http_response
 622
 623
 624 def parse_iso8601(date_str, delimiter='T'):
 625     """ Return a UNIX timestamp from the given date """
 626
 627     if date_str is None:
 628         return None
 629
 630     m = re.search(
 631         r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 632         date_str)
 633     if not m:
 634         timezone = datetime.timedelta()
 635     else:
 636         date_str = date_str[:-len(m.group(0))]
 637         if not m.group('sign'):
 638             timezone = datetime.timedelta()
 639         else:
 640             sign = 1 if m.group('sign') == '+' else -1
 641             timezone = datetime.timedelta(
 642                 hours=sign * int(m.group('hours')),
 643                 minutes=sign * int(m.group('minutes')))
 644     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 645     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 646     return calendar.timegm(dt.timetuple())
 647
 648
 649 def unified_strdate(date_str, day_first=True):
 650     """Return a string with the date in the format YYYYMMDD"""
 651
 652     if date_str is None:
 653         return None
 654     upload_date = None
 655     # Replace commas
 656     date_str = date_str.replace(',', ' ')
 657     # %z (UTC offset) is only supported in python>=3.2
 658     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 659     # Remove AM/PM + timezone
 660     date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
 661
 662     format_expressions = [
 663         '%d %B %Y',
 664         '%d %b %Y',
 665         '%B %d %Y',
 666         '%b %d %Y',
 667         '%b %dst %Y %I:%M%p',
 668         '%b %dnd %Y %I:%M%p',
 669         '%b %dth %Y %I:%M%p',
 670         '%Y-%m-%d',
 671         '%Y/%m/%d',
 672         '%d.%m.%Y',
 673         '%d/%m/%Y',
 674         '%d/%m/%y',
 675         '%Y/%m/%d %H:%M:%S',
 676         '%Y-%m-%d %H:%M:%S',
 677         '%Y-%m-%d %H:%M:%S.%f',
 678         '%d.%m.%Y %H:%M',
 679         '%d.%m.%Y %H.%M',
 680         '%Y-%m-%dT%H:%M:%SZ',
 681         '%Y-%m-%dT%H:%M:%S.%fZ',
 682         '%Y-%m-%dT%H:%M:%S.%f0Z',
 683         '%Y-%m-%dT%H:%M:%S',
 684         '%Y-%m-%dT%H:%M:%S.%f',
 685         '%Y-%m-%dT%H:%M',
 686     ]
 687     if day_first:
 688         format_expressions.extend([
 689             '%d/%m/%Y %H:%M:%S',
 690         ])
 691     else:
 692         format_expressions.extend([
 693             '%m/%d/%Y %H:%M:%S',
 694         ])
 695     for expression in format_expressions:
 696         try:
 697             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 698         except ValueError:
 699             pass
 700     if upload_date is None:
 701         timetuple = email.utils.parsedate_tz(date_str)
 702         if timetuple:
 703             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 704     return upload_date
 705
 706
 707 def determine_ext(url, default_ext='unknown_video'):
 708     if url is None:
 709         return default_ext
 710     guess = url.partition('?')[0].rpartition('.')[2]
 711     if re.match(r'^[A-Za-z0-9]+$', guess):
 712         return guess
 713     else:
 714         return default_ext
 715
 716
 717 def subtitles_filename(filename, sub_lang, sub_format):
 718     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 719
 720
 721 def date_from_str(date_str):
 722     """
 723     Return a datetime object from a string in the format YYYYMMDD or
 724     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 725     today = datetime.date.today()
 726     if date_str in ('now', 'today'):
 727         return today
 728     if date_str == 'yesterday':
 729         return today - datetime.timedelta(days=1)
 730     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 731     if match is not None:
 732         sign = match.group('sign')
 733         time = int(match.group('time'))
 734         if sign == '-':
 735             time = -time
 736         unit = match.group('unit')
 737         # A bad aproximation?
 738         if unit == 'month':
 739             unit = 'day'
 740             time *= 30
 741         elif unit == 'year':
 742             unit = 'day'
 743             time *= 365
 744         unit += 's'
 745         delta = datetime.timedelta(**{unit: time})
 746         return today + delta
 747     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 748
 749
 750 def hyphenate_date(date_str):
 751     """
 752     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 753     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 754     if match is not None:
 755         return '-'.join(match.groups())
 756     else:
 757         return date_str
 758
 759
 760 class DateRange(object):
 761     """Represents a time interval between two dates"""
 762
 763     def __init__(self, start=None, end=None):
 764         """start and end must be strings in the format accepted by date"""
 765         if start is not None:
 766             self.start = date_from_str(start)
 767         else:
 768             self.start = datetime.datetime.min.date()
 769         if end is not None:
 770             self.end = date_from_str(end)
 771         else:
 772             self.end = datetime.datetime.max.date()
 773         if self.start > self.end:
 774             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 775
 776     @classmethod
 777     def day(cls, day):
 778         """Returns a range that only contains the given day"""
 779         return cls(day, day)
 780
 781     def __contains__(self, date):
 782         """Check if the date is in the range"""
 783         if not isinstance(date, datetime.date):
 784             date = date_from_str(date)
 785         return self.start <= date <= self.end
 786
 787     def __str__(self):
 788         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 789
 790
 791 def platform_name():
 792     """ Returns the platform name as a compat_str """
 793     res = platform.platform()
 794     if isinstance(res, bytes):
 795         res = res.decode(preferredencoding())
 796
 797     assert isinstance(res, compat_str)
 798     return res
 799
 800
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the WriteConsoleW function of kernel32, which is
    only used when out is standard output or standard error and the
    corresponding handle is a console. Raises OSError when WriteConsoleW
    reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (1: standard output, 2: standard error)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A console handle is a character device for which GetConsoleMode
        # succeeds; the FILE_TYPE_REMOTE bit is ignored for the comparison.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters at a time, stopping in front of the
        # next non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written on its own with a length of 2 (presumably its two UTF-16
        # code units -- TODO confirm)
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
 871
 872
 873 def write_string(s, out=None, encoding=None):
 874     if out is None:
 875         out = sys.stderr
 876     assert type(s) == compat_str
 877
 878     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 879         if _windows_write_string(s, out):
 880             return
 881
 882     if ('b' in getattr(out, 'mode', '') or
 883             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 884         byt = s.encode(encoding or preferredencoding(), 'ignore')
 885         out.write(byt)
 886     elif hasattr(out, 'buffer'):
 887         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 888         byt = s.encode(enc, 'ignore')
 889         out.buffer.write(byt)
 890     else:
 891         out.write(s)
 892     out.flush()
 893
 894
 895 def bytes_to_intlist(bs):
 896     if not bs:
 897         return []
 898     if isinstance(bs[0], int):  # Python 3
 899         return list(bs)
 900     else:
 901         return [ord(c) for c in bs]
 902
 903
 904 def intlist_to_bytes(xs):
 905     if not xs:
 906         return b''
 907     return struct_pack('%dB' % len(xs), *xs)
 908
 909
# Cross-platform file locking
#
# Both branches define the same two private helpers:
#   _lock_file(f, exclusive)  - lock the open file object f
#   _unlock_file(f)           - release a lock taken by _lock_file
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        """ctypes layout of the structure passed to LockFileEx/UnlockFileEx."""
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    # Declare argument and return types so ctypes converts values correctly
    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count to lock; together they are meant to
    # span the whole file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f from offset 0; raise OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so _unlock_file can pass the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # Flag 0x2 requests an exclusive lock, 0x0 a shared one
        # (presumably LOCKFILE_EXCLUSIVE_LOCK - confirm against Win32 docs)
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raise OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive (LOCK_EX) or shared (LOCK_SH) flock on f."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Drop the flock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
 973
 974
 975 class locked_file(object):
 976     def __init__(self, filename, mode, encoding=None):
 977         assert mode in ['r', 'a', 'w']
 978         self.f = io.open(filename, mode, encoding=encoding)
 979         self.mode = mode
 980
 981     def __enter__(self):
 982         exclusive = self.mode != 'r'
 983         try:
 984             _lock_file(self.f, exclusive)
 985         except IOError:
 986             self.f.close()
 987             raise
 988         return self
 989
 990     def __exit__(self, etype, value, traceback):
 991         try:
 992             _unlock_file(self.f)
 993         finally:
 994             self.f.close()
 995
 996     def __iter__(self):
 997         return iter(self.f)
 998
 999     def write(self, *args):
1000         return self.f.write(*args)
1001
1002     def read(self, *args):
1003         return self.f.read(*args)
1004
1005
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when it is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1009
1010
def shell_quote(args):
    """Return args joined into a single shell-escaped command string.

    Byte-string arguments (e.g. filenames produced by 'encodeFilename') are
    decoded with the filesystem encoding before quoting.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # shlex_quote is the documented quoting helper already imported by
        # this module; pipes.quote was an undocumented alias and the pipes
        # module no longer exists on recent Python versions.
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
1020
1021
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the first element e for
        which pred(e) is false before stopping """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1029
1030
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded and appended to url as a '#'-separated,
    URL-encoded '__youtubedl_smuggle' parameter. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
1037
1038
def unsmuggle_url(smug_url, default=None):
    """ Split a URL built by smuggle_url into (url, data).

    URLs without smuggled data are returned unchanged, paired with
    default. """
    if '#__youtubedl_smuggle' in smug_url:
        # Everything after the last '#' is the URL-encoded JSON payload
        plain_url, _, encoded = smug_url.rpartition('#')
        payload = compat_parse_qs(encoded)['__youtubedl_smuggle'][0]
        return plain_url, json.loads(payload)
    return smug_url, default
1046
1047
def format_bytes(bytes):
    """Format a byte count as a string with a binary (1024-based) suffix.

    bytes may be a number, a numeric string, or None (rendered as 'N/A').
    The value is shown with two decimals, e.g. 1536 -> '1.50KiB'.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    SUFFIXES = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the available suffixes: tiny fractions would otherwise
        # produce a negative index (wrapping around to 'YiB') and huge
        # values an IndexError.
        exponent = max(0, min(exponent, len(SUFFIXES) - 1))
    suffix = SUFFIXES[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1060
1061
def parse_filesize(s):
    """Parse a size such as '1.5 MiB' at the start of s into a byte count.

    Returns an int, or None when s is None or does not start with a number
    followed by a known unit.  A comma is accepted as decimal separator.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too.  For every prefix letter X the table has
    # XiB and xB as powers of 1024, XB and Xb as powers of 1000.
    unit_table = {'B': 1, 'b': 1}
    for power, letter in enumerate('KMGTPEZY', 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        unit_table[letter + 'iB'] = binary
        unit_table[letter + 'B'] = decimal
        unit_table[letter.lower() + 'B'] = binary
        unit_table[letter + 'b'] = decimal

    unit_alternatives = '|'.join(re.escape(unit) for unit in unit_table)
    pattern = (
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % unit_alternatives)
    found = re.match(pattern, s)
    if not found:
        return None

    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1114
1115
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable is tried first; if it is unset, empty
    or not a number, the width is read from the output of ``stty size``.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Garbage in COLUMNS, fall back to asking stty

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        # stty prints "<rows> <columns>"
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, no tty, unexpected output), but
        # do not swallow KeyboardInterrupt/SystemExit like a bare except.
        return None
1130
1131
def month_by_name(name):
    """ Return the number (1-12) of a month given its full English name,
    independently of the locale; None for any other value """

    ENGLISH_NAMES = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December')
    if name in ENGLISH_NAMES:
        return ENGLISH_NAMES.index(name) + 1
    return None
1142
1143
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start one of the predefined XML
    entities or a short numeric character reference by '&amp;'"""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
1150
1151
def setproctitle(title):
    """Set the process name via prctl on glibc systems.

    Silently does nothing when libc.so.6 cannot be loaded or does not
    provide prctl.  title must be a text string; it is passed UTF-8 encoded.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes ctypes allocate one extra
    # byte for the terminating NUL.  A buffer of exactly len(title_bytes)
    # would be unterminated, letting prctl read past its end for short titles.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1165
1166
def remove_start(s, start):
    """Return s without the prefix start; s itself if it has no such prefix."""
    return s[len(start):] if s.startswith(start) else s
1171
1172
def remove_end(s, end):
    """Return s without the suffix end; s itself if it has no such suffix."""
    if s.endswith(end):
        # Slice by absolute position: s[:-len(end)] would be s[:0] (the
        # empty string) for an empty suffix.
        return s[:len(s) - len(end)]
    return s
1177
1178
def url_basename(url):
    """Return the last '/'-separated segment of the path component of url."""
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip('/').split('/')
    return segments[-1]
1182
1183
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose get_method always reports HEAD."""

    def get_method(self):
        return 'HEAD'
1187
1188
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int scaled by invscale / scale (floor division).

    When get_attr is given, the value is first read from that attribute of
    v.  None and the empty string yield default.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1196
1197
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1200
1201
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are removed before converting; None is passed through """
    if int_str is not None:
        digits = re.sub(r'[,\.\+]', '', int_str)
        return int(digits)
    return None
1208
1209
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float scaled by invscale / scale; default for None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1212
1213
def parse_duration(s):
    """Parse a textual duration into a number of seconds.

    Accepts forms such as '1:23', '01:02:03', '2h 3m 4s', '3.5', '5 mins'
    or '2 hours' (case-insensitive, optional leading 'T').  Returns None
    for None or for text that does not match.
    """
    if s is None:
        return None

    m = re.match(
        r'''(?ix)T?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if not m:
        return None

    # A lone "N minutes" / "N hours" value may be fractional
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    total = 0
    for group_name, seconds_per_unit in (
            ('secs', 1), ('mins', 60), ('hours', 60 * 60)):
        digits = m.group(group_name)
        if digits:
            total += int(digits) * seconds_per_unit
    fraction = m.group('ms')
    if fraction:
        total += float(fraction)
    return total
1248
1249
def prepend_extension(filename, ext):
    """Insert ext in front of the existing extension of filename."""
    stem, original_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, original_ext)
1253
1254
def check_executable(exe, args=[]):
    """ Checks if the given binary can be started, and returns its name
    (False if starting it fails with OSError).
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    else:
        return exe
1263
1264
def get_exe_version(exe, args=['--version'],
                    version_re=r'version\s+([-0-9._a-zA-Z]+)',
                    unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.

    The executable is run with args and version_re is searched in the first
    line of its combined stdout/stderr output; group 1 is the version.  If
    the pattern does not match, unrecognized is returned instead. """
    # Note on the default version_re: the hyphen is listed first in the
    # character class so it is a literal '-' and not part of a range, which
    # lets versions like '1.2-rc3' match in full.
    try:
        out, err = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
    m = re.search(version_re, firstline)
    if m:
        return m.group(1)
    return unrecognized
1282
1283
class PagedList(object):
    """Base class for paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests: it materialises the whole list
        every_item = self.getslice()
        return len(every_item)
1288
1289
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages one by one while a slice is built.

    pagefunc(pagenum) is called with a zero-based page number and must
    return an iterable with the items of that page; pagesize is the number
    of items on a full page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the items from index start up to (not including) end.

        Pages are requested starting with the one containing start, until
        a page comes back short or the page containing end was processed.
        end=None means "until the last page".
        """
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # Global indices covered by this page: [firstid, nextfirstid)
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted item inside this page (non-zero
            # only on the page that contains start)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Exclusive end offset inside this page when end falls on it,
            # otherwise None (take the page up to its last item)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1331
1332
class InAdvancePagedList(PagedList):
    """Paged list whose number of pages is known up front.

    pagefunc(pagenum) is called with a zero-based page number and must
    return an iterable with the items of that page; pagecount is the total
    number of pages and pagesize the number of items per page.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the items from index start up to (not including) end.

        end=None means "until the last page".
        """
        res = []
        start_page = start // self._pagesize
        # Never go past the declared page count, even when end points
        # beyond the last page.
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        # Items to drop from the front of the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Number of items still wanted (None: no upper bound)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the requested range
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1360
1361
def uppercase_escape(s):
    """Replace literal \\UXXXXXXXX escape sequences (8 hex digits) in s by
    the characters they denote."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1368
1369
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 text strings are encoded to UTF-8 before quoting
    is_py2_text = sys.version_info < (3, 0) and isinstance(s, unicode)
    if is_py2_text:
        s = s.encode('utf-8')
    # Reserved and already-escaped characters are left untouched
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1375
1376
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Only the path, params, query and fragment components are escaped."""
    parts = compat_urllib_parse_urlparse(url)
    escaped = dict(
        (field, escape_rfc3986(getattr(parts, field)))
        for field in ('path', 'params', 'query', 'fragment'))
    return parts._replace(**escaped).geturl()
1386
# struct_pack / struct_unpack: struct.pack / struct.unpack that also accept
# text format strings on interpreters where struct insists on bytes.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def _ascii_spec(spec):
        return spec.encode('ascii') if isinstance(spec, compat_str) else spec

    def struct_pack(spec, *args):
        return struct.pack(_ascii_spec(spec), *args)

    def struct_unpack(spec, *args):
        return struct.unpack(_ascii_spec(spec), *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
1403
1404
def read_batch_urls(batch_fd):
    """Read URLs from the file-like object batch_fd, one per line.

    Byte lines are decoded as UTF-8, a leading byte order mark and
    surrounding whitespace are stripped, and empty lines as well as lines
    starting with '#', ';' or ']' are skipped.  batch_fd is closed before
    returning.
    """
    # A UTF-8 BOM shows up as U+FEFF once properly decoded, or as the three
    # characters below when the bytes were decoded with a single-byte codec.
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1419
1420
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode, returning ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1423
1424
# etree_iter(n): iterate over the elements of an ElementTree node
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1429
1430
def parse_xml(s):
    """Parse the XML text s and return its root element.

    DOCTYPE declarations are ignored.  On Python 2, byte-string element
    texts are decoded to text strings.
    """
    etree = xml.etree.ElementTree

    class TreeBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    custom_parser = etree.XMLParser(target=TreeBuilder())
    extra = {}
    if sys.version_info >= (2, 7):
        extra['parser'] = custom_parser
    root = etree.XML(s.encode('utf-8'), **extra)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(root):
            text = node.text
            if text is not None and not isinstance(text, compat_str):
                node.text = text.decode('utf-8')
    return root
1446
1447
# Age limit (in years) associated with each US rating label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Return the age limit described by s as an int, or None.

    s may be one or two digits with an optional trailing '+' (e.g. '18+')
    or one of the labels in US_RATINGS.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
1462
1463
def strip_jsonp(code):
    """Strip a JSONP wrapper ``callback(...);`` (optionally followed by //
    comments) from code, leaving the argument text; other input is returned
    unchanged."""
    jsonp_wrapper = re.compile(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$')
    return jsonp_wrapper.sub(r'\1', code)
1467
1468
def js_to_json(code):
    """Convert a JavaScript object literal to JSON text.

    Single-quoted strings and bare identifiers are turned into
    double-quoted strings (true, false and null are kept as they are), and
    a trailing comma before a closing ']' is removed.
    """
    # How escapes inside a single-quoted string map into a double-quoted one
    REQUOTE = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"", lambda esc: REQUOTE[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    quoted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', r'\1', quoted)
1492
1493
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its index in quality_ids,
    or to -1 when the id is not listed. """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1502
1503
# %-style mapping template built from the 'title', 'id' and 'ext' keys
# (presumably the default output filename template - confirm at call sites)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1505
1506
def limit_length(s, length):
    """ Shorten strings longer than length, ending them with an ellipsis;
    None is passed through """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    keep = length - len(ELLIPSES)
    return s[:keep] + ELLIPSES
1515
1516
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
1519
1520
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is older than limit.

    When version is empty or either version cannot be parsed, the answer
    is ``not assume_new``.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
    return outdated
1528
1529
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U

    That is the case when this module was loaded by a zipimporter or when
    sys has a 'frozen' attribute. """
    from zipimport import zipimporter

    module_loader = globals().get('__loader__')
    if isinstance(module_loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
1535
1536
def args_to_str(args):
    """Return the command line args as one shell-quoted string."""
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)