_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import gzip
  14 import itertools
  15 import io
  16 import json
  17 import locale
  18 import math
  19 import os
  20 import pipes
  21 import platform
  22 import re
  23 import ssl
  24 import socket
  25 import struct
  26 import subprocess
  27 import sys
  28 import tempfile
  29 import traceback
  30 import xml.etree.ElementTree
  31 import zlib
  32
  33 from .compat import (
  34     compat_chr,
  35     compat_getenv,
  36     compat_html_entities,
  37     compat_parse_qs,
  38     compat_str,
  39     compat_urllib_error,
  40     compat_urllib_parse,
  41     compat_urllib_parse_urlparse,
  42     compat_urllib_request,
  43     compat_urlparse,
  44     shlex_quote,
  45 )
  46
  47
# The re module does not expose a public name for the compiled-pattern type
# on every supported Python version, so derive it from a real pattern object.
compiled_regex_type = type(re.compile(''))

# Default HTTP headers. YoutubeDLHandler.http_request() adds each of these
# to an outgoing request (see that method for the exact override rules).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
  58
  59
  60 def preferredencoding():
  61     """Get preferred encoding.
  62
  63     Returns the best encoding scheme for the system, based on
  64     locale.getpreferredencoding() and some further tweaks.
  65     """
  66     try:
  67         pref = locale.getpreferredencoding()
  68         'TEST'.encode(pref)
  69     except:
  70         pref = 'UTF-8'
  71
  72     return pref
  73
  74
  75 def write_json_file(obj, fn):
  76     """ Encode obj as JSON and write it to fn, atomically if possible """
  77
  78     fn = encodeFilename(fn)
  79     if sys.version_info < (3, 0) and sys.platform != 'win32':
  80         encoding = get_filesystem_encoding()
  81         # os.path.basename returns a bytes object, but NamedTemporaryFile
  82         # will fail if the filename contains non ascii characters unless we
  83         # use a unicode object
  84         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  85         # the same for os.path.dirname
  86         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  87     else:
  88         path_basename = os.path.basename
  89         path_dirname = os.path.dirname
  90
  91     args = {
  92         'suffix': '.tmp',
  93         'prefix': path_basename(fn) + '.',
  94         'dir': path_dirname(fn),
  95         'delete': False,
  96     }
  97
  98     # In Python 2.x, json.dump expects a bytestream.
  99     # In Python 3.x, it writes to a character stream
 100     if sys.version_info < (3, 0):
 101         args['mode'] = 'wb'
 102     else:
 103         args.update({
 104             'mode': 'w',
 105             'encoding': 'utf-8',
 106         })
 107
 108     tf = tempfile.NamedTemporaryFile(**args)
 109
 110     try:
 111         with tf:
 112             json.dump(obj, tf)
 113         if sys.platform == 'win32':
 114             # Need to remove existing file on Windows, else os.rename raises
 115             # WindowsError or FileExistsError.
 116             try:
 117                 os.unlink(fn)
 118             except OSError:
 119                 pass
 120         os.rename(tf.name, fn)
 121     except:
 122         try:
 123             os.remove(tf.name)
 124         except OSError:
 125             pass
 126         raise
 127
 128
 129 if sys.version_info >= (2, 7):
 130     def find_xpath_attr(node, xpath, key, val):
 131         """ Find the xpath xpath[@key=val] """
 132         assert re.match(r'^[a-zA-Z-]+$', key)
 133         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 134         expr = xpath + "[@%s='%s']" % (key, val)
 135         return node.find(expr)
 136 else:
 137     def find_xpath_attr(node, xpath, key, val):
 138         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 139         # .//node does not match if a node is a direct child of . !
 140         if isinstance(xpath, unicode):
 141             xpath = xpath.encode('ascii')
 142
 143         for f in node.findall(xpath):
 144             if f.attrib.get(key) == val:
 145                 return f
 146         return None
 147
# On Python 2.6, the xml.etree.ElementTree.Element methods don't support
# the namespaces parameter
 150
 151
 152 def xpath_with_ns(path, ns_map):
 153     components = [c.split(':') for c in path.split('/')]
 154     replaced = []
 155     for c in components:
 156         if len(c) == 1:
 157             replaced.append(c[0])
 158         else:
 159             ns, tag = c
 160             replaced.append('{%s}%s' % (ns_map[ns], tag))
 161     return '/'.join(replaced)
 162
 163
 164 def xpath_text(node, xpath, name=None, fatal=False):
 165     if sys.version_info < (2, 7):  # Crazy 2.6
 166         xpath = xpath.encode('ascii')
 167
 168     n = node.find(xpath)
 169     if n is None:
 170         if fatal:
 171             name = xpath if name is None else name
 172             raise ExtractorError('Could not find XML element %s' % name)
 173         else:
 174             return None
 175     return n.text
 176
 177
 178 def get_element_by_id(id, html):
 179     """Return the content of the tag with the specified ID in the passed HTML document"""
 180     return get_element_by_attribute("id", id, html)
 181
 182
 183 def get_element_by_attribute(attribute, value, html):
 184     """Return the content of the tag with the specified attribute in the passed HTML document"""
 185
 186     m = re.search(r'''(?xs)
 187         <([a-zA-Z0-9:._-]+)
 188          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 189          \s+%s=['"]?%s['"]?
 190          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 191         \s*>
 192         (?P<content>.*?)
 193         </\1>
 194     ''' % (re.escape(attribute), re.escape(value)), html)
 195
 196     if not m:
 197         return None
 198     res = m.group('content')
 199
 200     if res.startswith('"') or res.startswith("'"):
 201         res = res[1:-1]
 202
 203     return unescapeHTML(res)
 204
 205
 206 def clean_html(html):
 207     """Clean an HTML snippet into a readable string"""
 208     # Newline vs <br />
 209     html = html.replace('\n', ' ')
 210     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 211     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 212     # Strip html tags
 213     html = re.sub('<.*?>', '', html)
 214     # Replace html entities
 215     html = unescapeHTML(html)
 216     return html.strip()
 217
 218
 219 def sanitize_open(filename, open_mode):
 220     """Try to open the given filename, and slightly tweak it if this fails.
 221
 222     Attempts to open the given filename. If this fails, it tries to change
 223     the filename slightly, step by step, until it's either able to open it
 224     or it fails and raises a final exception, like the standard open()
 225     function.
 226
 227     It returns the tuple (stream, definitive_file_name).
 228     """
 229     try:
 230         if filename == '-':
 231             if sys.platform == 'win32':
 232                 import msvcrt
 233                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 234             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 235         stream = open(encodeFilename(filename), open_mode)
 236         return (stream, filename)
 237     except (IOError, OSError) as err:
 238         if err.errno in (errno.EACCES,):
 239             raise
 240
 241         # In case of error, try to remove win32 forbidden chars
 242         alt_filename = os.path.join(
 243             re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
 244             for path_part in os.path.split(filename)
 245         )
 246         if alt_filename == filename:
 247             raise
 248         else:
 249             # An exception here should be caught in the caller
 250             stream = open(encodeFilename(filename), open_mode)
 251             return (stream, alt_filename)
 252
 253
 254 def timeconvert(timestr):
 255     """Convert RFC 2822 defined time string into system timestamp"""
 256     timestamp = None
 257     timetuple = email.utils.parsedate_tz(timestr)
 258     if timetuple is not None:
 259         timestamp = email.utils.mktime_tz(timetuple)
 260     return timestamp
 261
 262
 263 def sanitize_filename(s, restricted=False, is_id=False):
 264     """Sanitizes a string so it could be used as part of a filename.
 265     If restricted is set, use a stricter subset of allowed characters.
 266     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 267     """
 268     def replace_insane(char):
 269         if char == '?' or ord(char) < 32 or ord(char) == 127:
 270             return ''
 271         elif char == '"':
 272             return '' if restricted else '\''
 273         elif char == ':':
 274             return '_-' if restricted else ' -'
 275         elif char in '\\/|*<>':
 276             return '_'
 277         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 278             return '_'
 279         if restricted and ord(char) > 127:
 280             return '_'
 281         return char
 282
 283     result = ''.join(map(replace_insane, s))
 284     if not is_id:
 285         while '__' in result:
 286             result = result.replace('__', '_')
 287         result = result.strip('_')
 288         # Common case of "Foreign band name - English song title"
 289         if restricted and result.startswith('-_'):
 290             result = result[2:]
 291         if not result:
 292             result = '_'
 293     return result
 294
 295
 296 def orderedSet(iterable):
 297     """ Remove all duplicates from the input iterable """
 298     res = []
 299     for el in iterable:
 300         if el not in res:
 301             res.append(el)
 302     return res
 303
 304
 305 def _htmlentity_transform(entity):
 306     """Transforms an HTML entity to a character."""
 307     # Known non-numeric HTML entity
 308     if entity in compat_html_entities.name2codepoint:
 309         return compat_chr(compat_html_entities.name2codepoint[entity])
 310
 311     mobj = re.match(r'#(x?[0-9]+)', entity)
 312     if mobj is not None:
 313         numstr = mobj.group(1)
 314         if numstr.startswith('x'):
 315             base = 16
 316             numstr = '0%s' % numstr
 317         else:
 318             base = 10
 319         return compat_chr(int(numstr, base))
 320
 321     # Unknown entity in name, return its literal representation
 322     return ('&%s;' % entity)
 323
 324
 325 def unescapeHTML(s):
 326     if s is None:
 327         return None
 328     assert type(s) == compat_str
 329
 330     return re.sub(
 331         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 332
 333
 334 def encodeFilename(s, for_subprocess=False):
 335     """
 336     @param s The name of the file
 337     """
 338
 339     assert type(s) == compat_str
 340
 341     # Python 3 has a Unicode API
 342     if sys.version_info >= (3, 0):
 343         return s
 344
 345     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 346         # Pass '' directly to use Unicode APIs on Windows 2000 and up
 347         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 348         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 349         if not for_subprocess:
 350             return s
 351         else:
 352             # For subprocess calls, encode with locale encoding
 353             # Refer to http://stackoverflow.com/a/9951851/35070
 354             encoding = preferredencoding()
 355     else:
 356         encoding = sys.getfilesystemencoding()
 357     if encoding is None:
 358         encoding = 'utf-8'
 359     return s.encode(encoding, 'ignore')
 360
 361
 362 def encodeArgument(s):
 363     if not isinstance(s, compat_str):
 364         # Legacy code that uses byte strings
 365         # Uncomment the following line after fixing all post processors
 366         #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 367         s = s.decode('ascii')
 368     return encodeFilename(s, True)
 369
 370
 371 def decodeOption(optval):
 372     if optval is None:
 373         return optval
 374     if isinstance(optval, bytes):
 375         optval = optval.decode(preferredencoding())
 376
 377     assert isinstance(optval, compat_str)
 378     return optval
 379
 380
 381 def formatSeconds(secs):
 382     if secs > 3600:
 383         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 384     elif secs > 60:
 385         return '%d:%02d' % (secs // 60, secs % 60)
 386     else:
 387         return '%d' % secs
 388
 389
 390 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 391     if sys.version_info < (3, 2):
 392         import httplib
 393
 394         class HTTPSConnectionV3(httplib.HTTPSConnection):
 395             def __init__(self, *args, **kwargs):
 396                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 397
 398             def connect(self):
 399                 sock = socket.create_connection((self.host, self.port), self.timeout)
 400                 if getattr(self, '_tunnel_host', False):
 401                     self.sock = sock
 402                     self._tunnel()
 403                 try:
 404                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
 405                 except ssl.SSLError:
 406                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 407
 408         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 409             def https_open(self, req):
 410                 return self.do_open(HTTPSConnectionV3, req)
 411         return HTTPSHandlerV3(**kwargs)
 412     elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
 413         context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
 414         context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
 415         if opts_no_check_certificate:
 416             context.verify_mode = ssl.CERT_NONE
 417         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 418     else:  # Python < 3.4
 419         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 420         context.verify_mode = (ssl.CERT_NONE
 421                                if opts_no_check_certificate
 422                                else ssl.CERT_REQUIRED)
 423         context.set_default_verify_paths()
 424         try:
 425             context.load_default_certs()
 426         except AttributeError:
 427             pass  # Python < 3.4
 428         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 429
 430
 431 class ExtractorError(Exception):
 432     """Error during info extraction."""
 433
 434     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 435         """ tb, if given, is the original traceback (so that it can be printed out).
 436         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 437         """
 438
 439         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 440             expected = True
 441         if video_id is not None:
 442             msg = video_id + ': ' + msg
 443         if cause:
 444             msg += ' (caused by %r)' % cause
 445         if not expected:
 446             if ytdl_is_updateable():
 447                 update_cmd = 'type  youtube-dl -U  to update'
 448             else:
 449                 update_cmd = 'see  https://yt-dl.org/update  on how to update'
 450             msg += '; please report this issue on https://yt-dl.org/bug .'
 451             msg += ' Make sure you are using the latest version; %s.' % update_cmd
 452             msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 453         super(ExtractorError, self).__init__(msg)
 454
 455         self.traceback = tb
 456         self.exc_info = sys.exc_info()  # preserve original exception
 457         self.cause = cause
 458         self.video_id = video_id
 459
 460     def format_traceback(self):
 461         if self.traceback is None:
 462             return None
 463         return ''.join(traceback.format_tb(self.traceback))
 464
 465
 466 class RegexNotFoundError(ExtractorError):
 467     """Error when a regex didn't match"""
 468     pass
 469
 470
 471 class DownloadError(Exception):
 472     """Download Error exception.
 473
 474     This exception may be thrown by FileDownloader objects if they are not
 475     configured to continue on errors. They will contain the appropriate
 476     error message.
 477     """
 478
 479     def __init__(self, msg, exc_info=None):
 480         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 481         super(DownloadError, self).__init__(msg)
 482         self.exc_info = exc_info
 483
 484
 485 class SameFileError(Exception):
 486     """Same File exception.
 487
 488     This exception will be thrown by FileDownloader objects if they detect
 489     multiple files would have to be downloaded to the same file on disk.
 490     """
 491     pass
 492
 493
 494 class PostProcessingError(Exception):
 495     """Post Processing exception.
 496
 497     This exception may be raised by PostProcessor's .run() method to
 498     indicate an error in the postprocessing task.
 499     """
 500
 501     def __init__(self, msg):
 502         self.msg = msg
 503
 504
 505 class MaxDownloadsReached(Exception):
 506     """ --max-downloads limit has been reached. """
 507     pass
 508
 509
 510 class UnavailableVideoError(Exception):
 511     """Unavailable Format exception.
 512
 513     This exception will be thrown when a video is requested
 514     in a format that is not available for that video.
 515     """
 516     pass
 517
 518
 519 class ContentTooShortError(Exception):
 520     """Content Too Short exception.
 521
 522     This exception may be raised by FileDownloader objects when a file they
 523     download is too small for what the server announced first, indicating
 524     the connection was probably interrupted.
 525     """
 526     # Both in bytes
 527     downloaded = None
 528     expected = None
 529
 530     def __init__(self, downloaded, expected):
 531         self.downloaded = downloaded
 532         self.expected = expected
 533
 534
 535 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 536     """Handler for HTTP requests and responses.
 537
 538     This class, when installed with an OpenerDirector, automatically adds
 539     the standard headers to every HTTP request and handles gzipped and
 540     deflated responses from web servers. If compression is to be avoided in
 541     a particular request, the original request in the program code only has
 542     to include the HTTP header "Youtubedl-No-Compression", which will be
 543     removed before making the real request.
 544
 545     Part of this code was copied from:
 546
 547     http://techknack.net/python-urllib2-handlers/
 548
 549     Andrew Rowls, the author of that code, agreed to release it to the
 550     public domain.
 551     """
 552
 553     @staticmethod
 554     def deflate(data):
 555         try:
 556             return zlib.decompress(data, -zlib.MAX_WBITS)
 557         except zlib.error:
 558             return zlib.decompress(data)
 559
 560     @staticmethod
 561     def addinfourl_wrapper(stream, headers, url, code):
 562         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 563             return compat_urllib_request.addinfourl(stream, headers, url, code)
 564         ret = compat_urllib_request.addinfourl(stream, headers, url)
 565         ret.code = code
 566         return ret
 567
 568     def http_request(self, req):
 569         for h, v in std_headers.items():
 570             if h not in req.headers:
 571                 req.add_header(h, v)
 572         if 'Youtubedl-no-compression' in req.headers:
 573             if 'Accept-encoding' in req.headers:
 574                 del req.headers['Accept-encoding']
 575             del req.headers['Youtubedl-no-compression']
 576         if 'Youtubedl-user-agent' in req.headers:
 577             if 'User-agent' in req.headers:
 578                 del req.headers['User-agent']
 579             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 580             del req.headers['Youtubedl-user-agent']
 581
 582         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 583             # Python 2.6 is brain-dead when it comes to fragments
 584             req._Request__original = req._Request__original.partition('#')[0]
 585             req._Request__r_type = req._Request__r_type.partition('#')[0]
 586
 587         return req
 588
 589     def http_response(self, req, resp):
 590         old_resp = resp
 591         # gzip
 592         if resp.headers.get('Content-encoding', '') == 'gzip':
 593             content = resp.read()
 594             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 595             try:
 596                 uncompressed = io.BytesIO(gz.read())
 597             except IOError as original_ioerror:
 598                 # There may be junk add the end of the file
 599                 # See http://stackoverflow.com/q/4928560/35070 for details
 600                 for i in range(1, 1024):
 601                     try:
 602                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 603                         uncompressed = io.BytesIO(gz.read())
 604                     except IOError:
 605                         continue
 606                     break
 607                 else:
 608                     raise original_ioerror
 609             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 610             resp.msg = old_resp.msg
 611         # deflate
 612         if resp.headers.get('Content-encoding', '') == 'deflate':
 613             gz = io.BytesIO(self.deflate(resp.read()))
 614             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 615             resp.msg = old_resp.msg
 616         return resp
 617
 618     https_request = http_request
 619     https_response = http_response
 620
 621
 622 def parse_iso8601(date_str, delimiter='T'):
 623     """ Return a UNIX timestamp from the given date """
 624
 625     if date_str is None:
 626         return None
 627
 628     m = re.search(
 629         r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 630         date_str)
 631     if not m:
 632         timezone = datetime.timedelta()
 633     else:
 634         date_str = date_str[:-len(m.group(0))]
 635         if not m.group('sign'):
 636             timezone = datetime.timedelta()
 637         else:
 638             sign = 1 if m.group('sign') == '+' else -1
 639             timezone = datetime.timedelta(
 640                 hours=sign * int(m.group('hours')),
 641                 minutes=sign * int(m.group('minutes')))
 642     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 643     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 644     return calendar.timegm(dt.timetuple())
 645
 646
 647 def unified_strdate(date_str):
 648     """Return a string with the date in the format YYYYMMDD"""
 649
 650     if date_str is None:
 651         return None
 652
 653     upload_date = None
 654     # Replace commas
 655     date_str = date_str.replace(',', ' ')
 656     # %z (UTC offset) is only supported in python>=3.2
 657     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 658     format_expressions = [
 659         '%d %B %Y',
 660         '%d %b %Y',
 661         '%B %d %Y',
 662         '%b %d %Y',
 663         '%b %dst %Y %I:%M%p',
 664         '%b %dnd %Y %I:%M%p',
 665         '%b %dth %Y %I:%M%p',
 666         '%Y-%m-%d',
 667         '%Y/%m/%d',
 668         '%d.%m.%Y',
 669         '%d/%m/%Y',
 670         '%d/%m/%y',
 671         '%Y/%m/%d %H:%M:%S',
 672         '%d/%m/%Y %H:%M:%S',
 673         '%Y-%m-%d %H:%M:%S',
 674         '%Y-%m-%d %H:%M:%S.%f',
 675         '%d.%m.%Y %H:%M',
 676         '%d.%m.%Y %H.%M',
 677         '%Y-%m-%dT%H:%M:%SZ',
 678         '%Y-%m-%dT%H:%M:%S.%fZ',
 679         '%Y-%m-%dT%H:%M:%S.%f0Z',
 680         '%Y-%m-%dT%H:%M:%S',
 681         '%Y-%m-%dT%H:%M:%S.%f',
 682         '%Y-%m-%dT%H:%M',
 683     ]
 684     for expression in format_expressions:
 685         try:
 686             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 687         except ValueError:
 688             pass
 689     if upload_date is None:
 690         timetuple = email.utils.parsedate_tz(date_str)
 691         if timetuple:
 692             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 693     return upload_date
 694
 695
 696 def determine_ext(url, default_ext='unknown_video'):
 697     if url is None:
 698         return default_ext
 699     guess = url.partition('?')[0].rpartition('.')[2]
 700     if re.match(r'^[A-Za-z0-9]+$', guess):
 701         return guess
 702     else:
 703         return default_ext
 704
 705
 706 def subtitles_filename(filename, sub_lang, sub_format):
 707     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 708
 709
 710 def date_from_str(date_str):
 711     """
 712     Return a datetime object from a string in the format YYYYMMDD or
 713     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 714     today = datetime.date.today()
 715     if date_str in ('now', 'today'):
 716         return today
 717     if date_str == 'yesterday':
 718         return today - datetime.timedelta(days=1)
 719     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 720     if match is not None:
 721         sign = match.group('sign')
 722         time = int(match.group('time'))
 723         if sign == '-':
 724             time = -time
 725         unit = match.group('unit')
 726         # A bad aproximation?
 727         if unit == 'month':
 728             unit = 'day'
 729             time *= 30
 730         elif unit == 'year':
 731             unit = 'day'
 732             time *= 365
 733         unit += 's'
 734         delta = datetime.timedelta(**{unit: time})
 735         return today + delta
 736     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 737
 738
 739 def hyphenate_date(date_str):
 740     """
 741     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 742     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 743     if match is not None:
 744         return '-'.join(match.groups())
 745     else:
 746         return date_str
 747
 748
 749 class DateRange(object):
 750     """Represents a time interval between two dates"""
 751
 752     def __init__(self, start=None, end=None):
 753         """start and end must be strings in the format accepted by date"""
 754         if start is not None:
 755             self.start = date_from_str(start)
 756         else:
 757             self.start = datetime.datetime.min.date()
 758         if end is not None:
 759             self.end = date_from_str(end)
 760         else:
 761             self.end = datetime.datetime.max.date()
 762         if self.start > self.end:
 763             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 764
 765     @classmethod
 766     def day(cls, day):
 767         """Returns a range that only contains the given day"""
 768         return cls(day, day)
 769
 770     def __contains__(self, date):
 771         """Check if the date is in the range"""
 772         if not isinstance(date, datetime.date):
 773             date = date_from_str(date)
 774         return self.start <= date <= self.end
 775
 776     def __str__(self):
 777         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 778
 779
 780 def platform_name():
 781     """ Returns the platform name as a compat_str """
 782     res = platform.platform()
 783     if isinstance(res, bytes):
 784         res = res.decode(preferredencoding())
 785
 786     assert isinstance(res, compat_str)
 787     return res
 788
 789
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s is the text to write and out the target stream. The special method is
    the WriteConsoleW function of kernel32; it is only used when out is
    backed by file descriptor 1 or 2 and the matching handle turns out to
    be a console. In every other case nothing is written and False is
    returned. Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps a file descriptor to the id passed to GetStdHandle
    # (presumably STD_OUTPUT_HANDLE and STD_ERROR_HANDLE - confirm against
    # the Windows API documentation).
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Receives the count reported back by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is invalid, is not a character device
        # (ignoring the remote flag), or GetConsoleMode fails on it.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write the string piecewise: at most 1024 characters per call, and
    # never across a non-BMP character, which gets a call of its own.
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; a length of 2
        # is passed for it instead.
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever was actually written
            s = s[written.value:]
    return True
 860
 861
 862 def write_string(s, out=None, encoding=None):
 863     if out is None:
 864         out = sys.stderr
 865     assert type(s) == compat_str
 866
 867     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 868         if _windows_write_string(s, out):
 869             return
 870
 871     if ('b' in getattr(out, 'mode', '') or
 872             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 873         byt = s.encode(encoding or preferredencoding(), 'ignore')
 874         out.write(byt)
 875     elif hasattr(out, 'buffer'):
 876         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 877         byt = s.encode(enc, 'ignore')
 878         out.buffer.write(byt)
 879     else:
 880         out.write(s)
 881     out.flush()
 882
 883
 884 def bytes_to_intlist(bs):
 885     if not bs:
 886         return []
 887     if isinstance(bs[0], int):  # Python 3
 888         return list(bs)
 889     else:
 890         return [ord(c) for c in bs]
 891
 892
 893 def intlist_to_bytes(xs):
 894     if not xs:
 895         return b''
 896     return struct_pack('%dB' % len(xs), *xs)
 897
 898
 899 # Cross-platform file locking
 900 if sys.platform == 'win32':
 901     import ctypes.wintypes
 902     import msvcrt
 903
 904     class OVERLAPPED(ctypes.Structure):
 905         _fields_ = [
 906             ('Internal', ctypes.wintypes.LPVOID),
 907             ('InternalHigh', ctypes.wintypes.LPVOID),
 908             ('Offset', ctypes.wintypes.DWORD),
 909             ('OffsetHigh', ctypes.wintypes.DWORD),
 910             ('hEvent', ctypes.wintypes.HANDLE),
 911         ]
 912
 913     kernel32 = ctypes.windll.kernel32
 914     LockFileEx = kernel32.LockFileEx
 915     LockFileEx.argtypes = [
 916         ctypes.wintypes.HANDLE,     # hFile
 917         ctypes.wintypes.DWORD,      # dwFlags
 918         ctypes.wintypes.DWORD,      # dwReserved
 919         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
 920         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
 921         ctypes.POINTER(OVERLAPPED)  # Overlapped
 922     ]
 923     LockFileEx.restype = ctypes.wintypes.BOOL
 924     UnlockFileEx = kernel32.UnlockFileEx
 925     UnlockFileEx.argtypes = [
 926         ctypes.wintypes.HANDLE,     # hFile
 927         ctypes.wintypes.DWORD,      # dwReserved
 928         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
 929         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
 930         ctypes.POINTER(OVERLAPPED)  # Overlapped
 931     ]
 932     UnlockFileEx.restype = ctypes.wintypes.BOOL
 933     whole_low = 0xffffffff
 934     whole_high = 0x7fffffff
 935
 936     def _lock_file(f, exclusive):
 937         overlapped = OVERLAPPED()
 938         overlapped.Offset = 0
 939         overlapped.OffsetHigh = 0
 940         overlapped.hEvent = 0
 941         f._lock_file_overlapped_p = ctypes.pointer(overlapped)
 942         handle = msvcrt.get_osfhandle(f.fileno())
 943         if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
 944                           whole_low, whole_high, f._lock_file_overlapped_p):
 945             raise OSError('Locking file failed: %r' % ctypes.FormatError())
 946
 947     def _unlock_file(f):
 948         assert f._lock_file_overlapped_p
 949         handle = msvcrt.get_osfhandle(f.fileno())
 950         if not UnlockFileEx(handle, 0,
 951                             whole_low, whole_high, f._lock_file_overlapped_p):
 952             raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
 953
 954 else:
 955     import fcntl
 956
 957     def _lock_file(f, exclusive):
 958         fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
 959
 960     def _unlock_file(f):
 961         fcntl.flock(f, fcntl.LOCK_UN)
 962
 963
 964 class locked_file(object):
 965     def __init__(self, filename, mode, encoding=None):
 966         assert mode in ['r', 'a', 'w']
 967         self.f = io.open(filename, mode, encoding=encoding)
 968         self.mode = mode
 969
 970     def __enter__(self):
 971         exclusive = self.mode != 'r'
 972         try:
 973             _lock_file(self.f, exclusive)
 974         except IOError:
 975             self.f.close()
 976             raise
 977         return self
 978
 979     def __exit__(self, etype, value, traceback):
 980         try:
 981             _unlock_file(self.f)
 982         finally:
 983             self.f.close()
 984
 985     def __iter__(self):
 986         return iter(self.f)
 987
 988     def write(self, *args):
 989         return self.f.write(*args)
 990
 991     def read(self, *args):
 992         return self.f.read(*args)
 993
 994
 995 def get_filesystem_encoding():
 996     encoding = sys.getfilesystemencoding()
 997     return encoding if encoding is not None else 'utf-8'
 998
 999
def shell_quote(args):
    """Return args as one space-separated string with each item shell-quoted.

    Byte-string items are decoded with the filesystem encoding first.
    """
    encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join([pipes.quote(as_text(a)) for a in args])
1009
1010
def takewhile_inclusive(pred, seq):
    """Yield items of seq up to and including the first one failing pred.

    Like itertools.takewhile, except that the first element e for which
    pred(e) is false is still yielded before iteration stops.
    """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1018
1019
def smuggle_url(url, data):
    """Pass additional data in a URL for internal use.

    data is JSON-encoded, url-encoded under the '__youtubedl_smuggle' key
    and appended to url after a '#'.
    """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
1026
1027
def unsmuggle_url(smug_url, default=None):
    """Split a URL carrying smuggled data into (url, data).

    When smug_url contains no '#__youtubedl_smuggle' marker it is returned
    unchanged together with default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        encoded = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(encoded)
    return smug_url, default
1035
1036
def format_bytes(bytes):
    """Format a byte count as a string with a binary unit suffix.

    Returns 'N/A' for None.  A str argument is converted with float()
    first.  The result has two decimals followed by one of B, KiB, ... YiB.
    Negative values raise ValueError (from math.log).
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the available suffixes: tiny fractions would otherwise
        # index from the end of the list ('YiB') and values of 1024 ** 9 or
        # more would raise IndexError.
        exponent = min(max(exponent, 0), len(suffixes) - 1)
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1049
1050
def parse_filesize(s):
    """Parse a size such as '1.5 MiB' at the start of s into a byte count.

    The number may use ',' or '.' as decimal separator.  Returns an int, or
    None when s is None or does not start with a number followed by a
    known unit.
    """
    if s is None:
        return None

    # 'XiB' and the lower-case-prefix form 'xB' are binary multiples; 'XB'
    # and 'Xb' are decimal ones.  The lower-case forms are of course
    # incorrect and unofficial, but we support those too.
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        _UNIT_TABLE[prefix + 'iB'] = binary
        _UNIT_TABLE[prefix + 'B'] = decimal
        _UNIT_TABLE[prefix.lower() + 'B'] = binary
        _UNIT_TABLE[prefix + 'b'] = decimal

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re
    m = re.match(pattern, s)
    if not m:
        return None

    number = float(m.group('num').replace(',', '.'))
    return int(number * _UNIT_TABLE[m.group('unit')])
1103
1104
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    The COLUMNS value obtained through compat_getenv is used when it is a
    valid integer; otherwise the second field printed by `stty size` is
    used.  Any failure to run or parse stty yields None.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            # Non-numeric COLUMNS: fall through and ask the terminal
            pass

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except (EnvironmentError, ValueError, IndexError):
        # stty missing/failing or unexpected output; deliberately not a
        # bare except so KeyboardInterrupt and SystemExit still propagate
        return None
1119
1120
def month_by_name(name):
    """Return the 1-based month number for an English month name.

    The lookup is locale-independent and exact (case-sensitive); None is
    returned for any other value.
    """
    ENGLISH_NAMES = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December')
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
1131
1132
def fix_xml_ampersands(xml_str):
    """Replace every '&' in xml_str that does not start an entity by '&amp;'.

    The five predefined XML entities and numeric character references of
    up to four digits are left untouched.
    """
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
1139
1140
def setproctitle(title):
    """Set the process name to title through libc's prctl(), best effort.

    title must be a text string.  The call silently does nothing when
    libc.so.6 cannot be loaded or does not provide prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one item larger than
    # the data, so it is NUL-terminated.  Sizing it to len(title_bytes) and
    # assigning .value left no room for the terminator prctl relies on.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1154
1155
def remove_start(s, start):
    """Return s without the prefix start; s is returned as is otherwise."""
    return s[len(start):] if s.startswith(start) else s
1160
1161
def remove_end(s, end):
    """Return s without the suffix end; s is returned as is otherwise.

    An empty end leaves s unchanged.
    """
    # The emptiness check matters: every string ends with '', and
    # s[:-0] is s[:0], which would wipe the whole string.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1166
1167
def url_basename(url):
    """Return the last '/'-separated segment of the path component of url.

    Leading and trailing slashes of the path are ignored.
    """
    trimmed = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed.rpartition('/')[2]
1171
1172
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose get_method() always reports 'HEAD'."""

    def get_method(self):
        return 'HEAD'
1176
1177
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Return int(v) * invscale // scale, or default for a missing value.

    When get_attr is given and v is not None, the attribute of that name
    is read from v first (None if absent).  None and '' count as missing.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1185
1186
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1189
1190
def str_to_int(int_str):
    """Convert int_str to int after dropping ',', '.' and '+' characters.

    A more relaxed version of int_or_none; None is passed through.
    """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1197
1198
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1201
1202
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Accepts forms such as '1:02:03', '3h 5m 10s', '12.5', '5 min' or
    '2 hours' (case-insensitive, optional leading 'T').  Returns None when
    s is None or does not match; otherwise an int, or a float when a
    fractional part or the minutes-only/hours-only forms are used.
    """
    if s is None:
        return None

    m = re.match(
        r'''(?ix)T?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if not m:
        return None

    # Stand-alone "N minutes" / "N hours" forms are scaled directly
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    total = 0
    for group, seconds_per_unit in (
            ('secs', 1), ('mins', 60), ('hours', 60 * 60)):
        value = m.group(group)
        if value:
            total += int(value) * seconds_per_unit
    fraction = m.group('ms')
    if fraction:
        total += float(fraction)
    return total
1237
1238
def prepend_extension(filename, ext):
    """Insert ext (without dot) in front of filename's real extension.

    E.g. ('video.mp4', 'temp') gives 'video.temp.mp4'.
    """
    stem, real_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, real_ext)
1242
1243
def check_executable(exe, args=[]):
    """Return exe if it can be started as a subprocess, False otherwise.

    args can be a list of arguments for a short output (like -version).
    The process is run to completion and its output is discarded.
    """
    command = [exe] + args
    try:
        proc = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
1252
1253
def get_exe_version(exe, args=['--version'],
                    version_re=r'version\s+([-0-9._a-zA-Z]+)',
                    unrecognized='present'):
    """Return the version reported by an executable.

    exe is run with args (the default list is never modified) and
    version_re is searched in the first line of its combined stdout and
    stderr.  Returns group 1 of the match, unrecognized when nothing
    matches, or False if the executable is not present.
    """
    # NOTE: the hyphen comes first in the default character class so it is
    # a literal.  Written as '_-a-z' it formed the range '_'..'a' plus the
    # two literals '-' and 'z', which excluded the letters b-y and cut
    # versions such as '2.4.3-tessus' short.
    try:
        out, err = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
    m = re.search(version_re, firstline)
    if m:
        return m.group(1)
    return unrecognized
1271
1272
class PagedList(object):
    """Base class for result lists that are fetched page by page.

    Subclasses provide getslice(start=0, end=None) returning a list.
    """

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())


class OnDemandPagedList(PagedList):
    """Paged list with an unknown page count.

    pagefunc(pagenum) returns an iterable with the entries of one page
    (page numbers start at 0); a page holding fewer than pagesize entries
    is taken to be the last one.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list."""
        # An empty or inverted range needs no page at all.  Without this
        # guard such a range starting on a page boundary (e.g. 0..0) missed
        # every stop condition below and returned whole pages.
        if end is not None and end <= start:
            return []

        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry inside this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry, if the range ends here
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res


class InAdvancePagedList(PagedList):
    """Paged list whose number of pages (pagecount) is known up front.

    pagefunc(pagenum) returns an iterable with the entries of one page
    (page numbers start at 0).
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list."""
        # Empty or inverted range: nothing to fetch (a negative remaining
        # count would otherwise be used as a slice bound below)
        if end is not None and end <= start:
            return []

        res = []
        start_page = start // self._pagesize
        # Never ask for pages past the known page count, even when end lies
        # beyond the last entry
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page has leading entries to drop
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1349
1350
def uppercase_escape(s):
    """Replace every literal \\UXXXXXXXX escape in s by its character."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1357
1358
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986.

    On Python 2 unicode input is encoded to UTF-8 before quoting.  The
    characters in "%/;:@&=+$,!~*'()?#[]" are left unquoted.
    """
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, unicode)
    text = s.encode('utf-8') if needs_encoding else s
    return compat_urllib_parse.quote(text, b"%/;:@&=+$,!~*'()?#[]")
1364
1365
def escape_url(url):
    """Escape URL as suggested by RFC 3986.

    The path, params, query and fragment components are passed through
    escape_rfc3986(); the remaining components are kept as parsed.
    """
    parts = compat_urllib_parse_urlparse(url)
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1375
# Pick struct_pack / struct_unpack: plain struct.pack / struct.unpack when
# struct accepts a text format string, thin wrappers otherwise.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
1392
1393
def read_batch_urls(batch_fd):
    """Read URLs, one per line, from the file-like object batch_fd.

    batch_fd is closed afterwards.  Lines may be text or UTF-8 bytes.
    A leading UTF-8 byte order mark and surrounding whitespace are
    removed; empty lines and lines starting with '#', ';' or ']' are
    skipped.  Returns the list of remaining URLs.
    """
    # A UTF-8 BOM shows up as U+FEFF when the data was decoded as UTF-8
    # (including the decode below), and as the three characters
    # '\xef\xbb\xbf' when the file was read through a single-byte codec.
    # Only the latter form used to be recognised.
    BOMS = ('\ufeff', '\xef\xbb\xbf')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1408
1409
def urlencode_postdata(*args, **kargs):
    """Url-encode the arguments and return the result as ASCII bytes.

    All arguments are forwarded to compat_urllib_parse.urlencode().
    """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1412
1413
# etree_iter(node): iterate over the elements of an ElementTree node, using
# Element.iter where it exists and a findall() fallback otherwise.
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1418
1419
def parse_xml(s):
    """Parse the XML text s and return its root element.

    s is encoded as UTF-8 before parsing and doctype declarations are
    ignored.  On Python 2, byte-string element texts are decoded to
    unicode.
    """
    class DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=DoctypeIgnoringBuilder())
    # The custom parser is only passed on Python 2.7 and newer
    if sys.version_info >= (2, 7):
        tree = xml.etree.ElementTree.XML(s.encode('utf-8'), parser=parser)
    else:
        tree = xml.etree.ElementTree.XML(s.encode('utf-8'))
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None and not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
    return tree
1435
1436
# Age limit associated with each US rating label (see parse_age_limit)
US_RATINGS = {
    'G': 0, 'PG': 10, 'PG-13': 13,
    'R': 16, 'NC': 18,
}
1444
1445
def parse_age_limit(s):
    """Convert an age-limit string to an int.

    One- or two-digit numbers with an optional trailing '+' are returned
    as int; anything else is looked up in US_RATINGS (None if unknown).
    None is passed through.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
1451
1452
def strip_jsonp(code):
    """Strip a JSONP wrapper such as 'callback(...);' and return the payload.

    Trailing '//' comments after the call are dropped as well.  Input that
    does not look like a JSONP call is returned unchanged.
    """
    jsonp_call = re.compile(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$')
    return jsonp_call.sub(r'\1', code)
1456
1457
def js_to_json(code):
    """Convert a JavaScript object literal to JSON text.

    Single-quoted strings are turned into double-quoted ones, bare
    identifiers other than true/false/null are quoted, and a comma directly
    before a closing ']' is removed.
    """
    # How escapes inside a single-quoted string map into a double-quoted one
    requoted = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: requoted[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    quoted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), quoted)
1481
1482
def qualities(quality_ids):
    """Return a function ranking a quality id by its position in quality_ids.

    The returned function gives the index of its argument in quality_ids,
    or -1 when the argument is not in it.
    """
    def q(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            return -1
        else:
            return rank
    return q
1491
1492
# Default output template: a %-format string with the named fields
# title, id and ext
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1494
1495
def limit_length(s, length):
    """Shorten s to at most length characters, marking the cut with '...'.

    Strings that already fit are returned unchanged and None is passed
    through.  When length is too small to hold the ellipsis itself, s is
    simply cut to length characters.
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length < len(ELLIPSES):
        # length - len(ELLIPSES) would be a negative slice bound here and
        # produce a result longer than requested
        return s[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
1504
1505
def version_tuple(v):
    """Split the version string v on '-' and '.' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
1508
1509
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    When version is empty or either string cannot be parsed by
    version_tuple(), the answer is 'not assume_new'.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
    return outdated
1517
1518
def ytdl_is_updateable():
    """Returns if youtube-dl can be updated with -U.

    That is the case when this module was loaded by a zipimporter or when
    sys has a 'frozen' attribute.
    """
    from zipimport import zipimporter

    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return loaded_from_zip or hasattr(sys, 'frozen')
1524
1525
def args_to_str(args):
    """Get a short string representation for a subprocess command.

    Every argument is shell-quoted and the results are joined by spaces.
    """
    return ' '.join(map(shlex_quote, args))