git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import functools
  14 import gzip
  15 import itertools
  16 import io
  17 import json
  18 import locale
  19 import math
  20 import operator
  21 import os
  22 import pipes
  23 import platform
  24 import re
  25 import ssl
  26 import socket
  27 import struct
  28 import subprocess
  29 import sys
  30 import tempfile
  31 import traceback
  32 import xml.etree.ElementTree
  33 import zlib
  34
  35 from .compat import (
  36     compat_basestring,
  37     compat_chr,
  38     compat_html_entities,
  39     compat_http_client,
  40     compat_kwargs,
  41     compat_parse_qs,
  42     compat_socket_create_connection,
  43     compat_str,
  44     compat_urllib_error,
  45     compat_urllib_parse,
  46     compat_urllib_parse_urlparse,
  47     compat_urllib_request,
  48     compat_urlparse,
  49     shlex_quote,
  50 )
  51
  52
# Type of a compiled regular expression object, obtained by compiling an
# empty pattern (this is not clearly defined otherwise).
compiled_regex_type = type(re.compile(''))

# Default HTTP headers. YoutubeDLHandler.http_request adds each of these to an
# outgoing request unless the request already carries that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# English month names in calendar order (index 0 is January).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
  68
  69
  70 def preferredencoding():
  71     """Get preferred encoding.
  72
  73     Returns the best encoding scheme for the system, based on
  74     locale.getpreferredencoding() and some further tweaks.
  75     """
  76     try:
  77         pref = locale.getpreferredencoding()
  78         'TEST'.encode(pref)
  79     except Exception:
  80         pref = 'UTF-8'
  81
  82     return pref
  83
  84
  85 def write_json_file(obj, fn):
  86     """ Encode obj as JSON and write it to fn, atomically if possible """
  87
  88     fn = encodeFilename(fn)
  89     if sys.version_info < (3, 0) and sys.platform != 'win32':
  90         encoding = get_filesystem_encoding()
  91         # os.path.basename returns a bytes object, but NamedTemporaryFile
  92         # will fail if the filename contains non ascii characters unless we
  93         # use a unicode object
  94         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  95         # the same for os.path.dirname
  96         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  97     else:
  98         path_basename = os.path.basename
  99         path_dirname = os.path.dirname
 100
 101     args = {
 102         'suffix': '.tmp',
 103         'prefix': path_basename(fn) + '.',
 104         'dir': path_dirname(fn),
 105         'delete': False,
 106     }
 107
 108     # In Python 2.x, json.dump expects a bytestream.
 109     # In Python 3.x, it writes to a character stream
 110     if sys.version_info < (3, 0):
 111         args['mode'] = 'wb'
 112     else:
 113         args.update({
 114             'mode': 'w',
 115             'encoding': 'utf-8',
 116         })
 117
 118     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 119
 120     try:
 121         with tf:
 122             json.dump(obj, tf)
 123         if sys.platform == 'win32':
 124             # Need to remove existing file on Windows, else os.rename raises
 125             # WindowsError or FileExistsError.
 126             try:
 127                 os.unlink(fn)
 128             except OSError:
 129                 pass
 130         os.rename(tf.name, fn)
 131     except Exception:
 132         try:
 133             os.remove(tf.name)
 134         except OSError:
 135             pass
 136         raise
 137
 138
 139 if sys.version_info >= (2, 7):
 140     def find_xpath_attr(node, xpath, key, val):
 141         """ Find the xpath xpath[@key=val] """
 142         assert re.match(r'^[a-zA-Z-]+$', key)
 143         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 144         expr = xpath + "[@%s='%s']" % (key, val)
 145         return node.find(expr)
 146 else:
 147     def find_xpath_attr(node, xpath, key, val):
 148         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 149         # .//node does not match if a node is a direct child of . !
 150         if isinstance(xpath, compat_str):
 151             xpath = xpath.encode('ascii')
 152
 153         for f in node.findall(xpath):
 154             if f.attrib.get(key) == val:
 155                 return f
 156         return None
 157
 158 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 159 # the namespace parameter
 160
 161
 162 def xpath_with_ns(path, ns_map):
 163     components = [c.split(':') for c in path.split('/')]
 164     replaced = []
 165     for c in components:
 166         if len(c) == 1:
 167             replaced.append(c[0])
 168         else:
 169             ns, tag = c
 170             replaced.append('{%s}%s' % (ns_map[ns], tag))
 171     return '/'.join(replaced)
 172
 173
 174 def xpath_text(node, xpath, name=None, fatal=False):
 175     if sys.version_info < (2, 7):  # Crazy 2.6
 176         xpath = xpath.encode('ascii')
 177
 178     n = node.find(xpath)
 179     if n is None or n.text is None:
 180         if fatal:
 181             name = xpath if name is None else name
 182             raise ExtractorError('Could not find XML element %s' % name)
 183         else:
 184             return None
 185     return n.text
 186
 187
 188 def get_element_by_id(id, html):
 189     """Return the content of the tag with the specified ID in the passed HTML document"""
 190     return get_element_by_attribute("id", id, html)
 191
 192
 193 def get_element_by_attribute(attribute, value, html):
 194     """Return the content of the tag with the specified attribute in the passed HTML document"""
 195
 196     m = re.search(r'''(?xs)
 197         <([a-zA-Z0-9:._-]+)
 198          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 199          \s+%s=['"]?%s['"]?
 200          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 201         \s*>
 202         (?P<content>.*?)
 203         </\1>
 204     ''' % (re.escape(attribute), re.escape(value)), html)
 205
 206     if not m:
 207         return None
 208     res = m.group('content')
 209
 210     if res.startswith('"') or res.startswith("'"):
 211         res = res[1:-1]
 212
 213     return unescapeHTML(res)
 214
 215
 216 def clean_html(html):
 217     """Clean an HTML snippet into a readable string"""
 218
 219     if html is None:  # Convenience for sanitizing descriptions etc.
 220         return html
 221
 222     # Newline vs <br />
 223     html = html.replace('\n', ' ')
 224     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 225     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 226     # Strip html tags
 227     html = re.sub('<.*?>', '', html)
 228     # Replace html entities
 229     html = unescapeHTML(html)
 230     return html.strip()
 231
 232
 233 def sanitize_open(filename, open_mode):
 234     """Try to open the given filename, and slightly tweak it if this fails.
 235
 236     Attempts to open the given filename. If this fails, it tries to change
 237     the filename slightly, step by step, until it's either able to open it
 238     or it fails and raises a final exception, like the standard open()
 239     function.
 240
 241     It returns the tuple (stream, definitive_file_name).
 242     """
 243     try:
 244         if filename == '-':
 245             if sys.platform == 'win32':
 246                 import msvcrt
 247                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 248             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 249         stream = open(encodeFilename(filename), open_mode)
 250         return (stream, filename)
 251     except (IOError, OSError) as err:
 252         if err.errno in (errno.EACCES,):
 253             raise
 254
 255         # In case of error, try to remove win32 forbidden chars
 256         alt_filename = sanitize_path(filename)
 257         if alt_filename == filename:
 258             raise
 259         else:
 260             # An exception here should be caught in the caller
 261             stream = open(encodeFilename(alt_filename), open_mode)
 262             return (stream, alt_filename)
 263
 264
 265 def timeconvert(timestr):
 266     """Convert RFC 2822 defined time string into system timestamp"""
 267     timestamp = None
 268     timetuple = email.utils.parsedate_tz(timestr)
 269     if timetuple is not None:
 270         timestamp = email.utils.mktime_tz(timetuple)
 271     return timestamp
 272
 273
 274 def sanitize_filename(s, restricted=False, is_id=False):
 275     """Sanitizes a string so it could be used as part of a filename.
 276     If restricted is set, use a stricter subset of allowed characters.
 277     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 278     """
 279     def replace_insane(char):
 280         if char == '?' or ord(char) < 32 or ord(char) == 127:
 281             return ''
 282         elif char == '"':
 283             return '' if restricted else '\''
 284         elif char == ':':
 285             return '_-' if restricted else ' -'
 286         elif char in '\\/|*<>':
 287             return '_'
 288         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 289             return '_'
 290         if restricted and ord(char) > 127:
 291             return '_'
 292         return char
 293
 294     # Handle timestamps
 295     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 296     result = ''.join(map(replace_insane, s))
 297     if not is_id:
 298         while '__' in result:
 299             result = result.replace('__', '_')
 300         result = result.strip('_')
 301         # Common case of "Foreign band name - English song title"
 302         if restricted and result.startswith('-_'):
 303             result = result[2:]
 304         if result.startswith('-'):
 305             result = '_' + result[len('-'):]
 306         result = result.lstrip('.')
 307         if not result:
 308             result = '_'
 309     return result
 310
 311
 312 def sanitize_path(s):
 313     """Sanitizes and normalizes path on Windows"""
 314     if sys.platform != 'win32':
 315         return s
 316     drive_or_unc, _ = os.path.splitdrive(s)
 317     if sys.version_info < (2, 7) and not drive_or_unc:
 318         drive_or_unc, _ = os.path.splitunc(s)
 319     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 320     if drive_or_unc:
 321         norm_path.pop(0)
 322     sanitized_path = [
 323         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
 324         for path_part in norm_path]
 325     if drive_or_unc:
 326         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 327     return os.path.join(*sanitized_path)
 328
 329
 330 def orderedSet(iterable):
 331     """ Remove all duplicates from the input iterable """
 332     res = []
 333     for el in iterable:
 334         if el not in res:
 335             res.append(el)
 336     return res
 337
 338
 339 def _htmlentity_transform(entity):
 340     """Transforms an HTML entity to a character."""
 341     # Known non-numeric HTML entity
 342     if entity in compat_html_entities.name2codepoint:
 343         return compat_chr(compat_html_entities.name2codepoint[entity])
 344
 345     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 346     if mobj is not None:
 347         numstr = mobj.group(1)
 348         if numstr.startswith('x'):
 349             base = 16
 350             numstr = '0%s' % numstr
 351         else:
 352             base = 10
 353         return compat_chr(int(numstr, base))
 354
 355     # Unknown entity in name, return its literal representation
 356     return ('&%s;' % entity)
 357
 358
 359 def unescapeHTML(s):
 360     if s is None:
 361         return None
 362     assert type(s) == compat_str
 363
 364     return re.sub(
 365         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 366
 367
 368 def get_subprocess_encoding():
 369     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 370         # For subprocess calls, encode with locale encoding
 371         # Refer to http://stackoverflow.com/a/9951851/35070
 372         encoding = preferredencoding()
 373     else:
 374         encoding = sys.getfilesystemencoding()
 375     if encoding is None:
 376         encoding = 'utf-8'
 377     return encoding
 378
 379
 380 def encodeFilename(s, for_subprocess=False):
 381     """
 382     @param s The name of the file
 383     """
 384
 385     assert type(s) == compat_str
 386
 387     # Python 3 has a Unicode API
 388     if sys.version_info >= (3, 0):
 389         return s
 390
 391     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 392     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 393     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 394     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 395         return s
 396
 397     return s.encode(get_subprocess_encoding(), 'ignore')
 398
 399
 400 def decodeFilename(b, for_subprocess=False):
 401
 402     if sys.version_info >= (3, 0):
 403         return b
 404
 405     if not isinstance(b, bytes):
 406         return b
 407
 408     return b.decode(get_subprocess_encoding(), 'ignore')
 409
 410
 411 def encodeArgument(s):
 412     if not isinstance(s, compat_str):
 413         # Legacy code that uses byte strings
 414         # Uncomment the following line after fixing all post processors
 415         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 416         s = s.decode('ascii')
 417     return encodeFilename(s, True)
 418
 419
 420 def decodeArgument(b):
 421     return decodeFilename(b, True)
 422
 423
 424 def decodeOption(optval):
 425     if optval is None:
 426         return optval
 427     if isinstance(optval, bytes):
 428         optval = optval.decode(preferredencoding())
 429
 430     assert isinstance(optval, compat_str)
 431     return optval
 432
 433
 434 def formatSeconds(secs):
 435     if secs > 3600:
 436         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 437     elif secs > 60:
 438         return '%d:%02d' % (secs // 60, secs % 60)
 439     else:
 440         return '%d' % secs
 441
 442
 443 def make_HTTPS_handler(params, **kwargs):
 444     opts_no_check_certificate = params.get('nocheckcertificate', False)
 445     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 446         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 447         if opts_no_check_certificate:
 448             context.check_hostname = False
 449             context.verify_mode = ssl.CERT_NONE
 450         try:
 451             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 452         except TypeError:
 453             # Python 2.7.8
 454             # (create_default_context present but HTTPSHandler has no context=)
 455             pass
 456
 457     if sys.version_info < (3, 2):
 458         return YoutubeDLHTTPSHandler(params, **kwargs)
 459     else:  # Python < 3.4
 460         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 461         context.verify_mode = (ssl.CERT_NONE
 462                                if opts_no_check_certificate
 463                                else ssl.CERT_REQUIRED)
 464         context.set_default_verify_paths()
 465         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 466
 467
 468 def bug_reports_message():
 469     if ytdl_is_updateable():
 470         update_cmd = 'type  youtube-dl -U  to update'
 471     else:
 472         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 473     msg = '; please report this issue on https://yt-dl.org/bug .'
 474     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 475     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 476     return msg
 477
 478
 479 class ExtractorError(Exception):
 480     """Error during info extraction."""
 481
 482     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 483         """ tb, if given, is the original traceback (so that it can be printed out).
 484         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 485         """
 486
 487         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 488             expected = True
 489         if video_id is not None:
 490             msg = video_id + ': ' + msg
 491         if cause:
 492             msg += ' (caused by %r)' % cause
 493         if not expected:
 494             msg += bug_reports_message()
 495         super(ExtractorError, self).__init__(msg)
 496
 497         self.traceback = tb
 498         self.exc_info = sys.exc_info()  # preserve original exception
 499         self.cause = cause
 500         self.video_id = video_id
 501
 502     def format_traceback(self):
 503         if self.traceback is None:
 504             return None
 505         return ''.join(traceback.format_tb(self.traceback))
 506
 507
 508 class UnsupportedError(ExtractorError):
 509     def __init__(self, url):
 510         super(UnsupportedError, self).__init__(
 511             'Unsupported URL: %s' % url, expected=True)
 512         self.url = url
 513
 514
 515 class RegexNotFoundError(ExtractorError):
 516     """Error when a regex didn't match"""
 517     pass
 518
 519
 520 class DownloadError(Exception):
 521     """Download Error exception.
 522
 523     This exception may be thrown by FileDownloader objects if they are not
 524     configured to continue on errors. They will contain the appropriate
 525     error message.
 526     """
 527
 528     def __init__(self, msg, exc_info=None):
 529         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 530         super(DownloadError, self).__init__(msg)
 531         self.exc_info = exc_info
 532
 533
 534 class SameFileError(Exception):
 535     """Same File exception.
 536
 537     This exception will be thrown by FileDownloader objects if they detect
 538     multiple files would have to be downloaded to the same file on disk.
 539     """
 540     pass
 541
 542
 543 class PostProcessingError(Exception):
 544     """Post Processing exception.
 545
 546     This exception may be raised by PostProcessor's .run() method to
 547     indicate an error in the postprocessing task.
 548     """
 549
 550     def __init__(self, msg):
 551         self.msg = msg
 552
 553
 554 class MaxDownloadsReached(Exception):
 555     """ --max-downloads limit has been reached. """
 556     pass
 557
 558
 559 class UnavailableVideoError(Exception):
 560     """Unavailable Format exception.
 561
 562     This exception will be thrown when a video is requested
 563     in a format that is not available for that video.
 564     """
 565     pass
 566
 567
 568 class ContentTooShortError(Exception):
 569     """Content Too Short exception.
 570
 571     This exception may be raised by FileDownloader objects when a file they
 572     download is too small for what the server announced first, indicating
 573     the connection was probably interrupted.
 574     """
 575     # Both in bytes
 576     downloaded = None
 577     expected = None
 578
 579     def __init__(self, downloaded, expected):
 580         self.downloaded = downloaded
 581         self.expected = expected
 582
 583
 584 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 585     hc = http_class(*args, **kwargs)
 586     source_address = ydl_handler._params.get('source_address')
 587     if source_address is not None:
 588         sa = (source_address, 0)
 589         if hasattr(hc, 'source_address'):  # Python 2.7+
 590             hc.source_address = sa
 591         else:  # Python 2.6
 592             def _hc_connect(self, *args, **kwargs):
 593                 sock = compat_socket_create_connection(
 594                     (self.host, self.port), self.timeout, sa)
 595                 if is_https:
 596                     self.sock = ssl.wrap_socket(
 597                         sock, self.key_file, self.cert_file,
 598                         ssl_version=ssl.PROTOCOL_TLSv1)
 599                 else:
 600                     self.sock = sock
 601             hc.connect = functools.partial(_hc_connect, hc)
 602
 603     return hc
 604
 605
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Store params (the options dict read when connections are created);
        all other arguments go to the base HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req over plain HTTP with connections built by
        _create_http_connection."""
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate-encoded data."""
        try:
            # Negative window bits: try a raw deflate stream first
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to the default (zlib-wrapped) format
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the given status code."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        # This addinfourl does not take the code as an argument; set it afterwards
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to req and honour the no-compression marker."""
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            # The marker header is internal: drop it together with Accept-Encoding
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate encoded body decompressed."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes removed
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No amount of trimming helped: report the first error
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS requests and responses get the same treatment
    https_request = http_request
    https_response = http_response
 697
 698
 699 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 700     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 701         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 702         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 703         self._params = params
 704
 705     def https_open(self, req):
 706         kwargs = {}
 707         if hasattr(self, '_context'):  # python > 2.6
 708             kwargs['context'] = self._context
 709         if hasattr(self, '_check_hostname'):  # python 3.x
 710             kwargs['check_hostname'] = self._check_hostname
 711         return self.do_open(functools.partial(
 712             _create_http_connection, self, self._https_conn_class, True),
 713             req, **kwargs)
 714
 715
 716 def parse_iso8601(date_str, delimiter='T', timezone=None):
 717     """ Return a UNIX timestamp from the given date """
 718
 719     if date_str is None:
 720         return None
 721
 722     if timezone is None:
 723         m = re.search(
 724             r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 725             date_str)
 726         if not m:
 727             timezone = datetime.timedelta()
 728         else:
 729             date_str = date_str[:-len(m.group(0))]
 730             if not m.group('sign'):
 731                 timezone = datetime.timedelta()
 732             else:
 733                 sign = 1 if m.group('sign') == '+' else -1
 734                 timezone = datetime.timedelta(
 735                     hours=sign * int(m.group('hours')),
 736                     minutes=sign * int(m.group('minutes')))
 737     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 738     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 739     return calendar.timegm(dt.timetuple())
 740
 741
 742 def unified_strdate(date_str, day_first=True):
 743     """Return a string with the date in the format YYYYMMDD"""
 744
 745     if date_str is None:
 746         return None
 747     upload_date = None
 748     # Replace commas
 749     date_str = date_str.replace(',', ' ')
 750     # %z (UTC offset) is only supported in python>=3.2
 751     if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
 752         date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 753     # Remove AM/PM + timezone
 754     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
 755
 756     format_expressions = [
 757         '%d %B %Y',
 758         '%d %b %Y',
 759         '%B %d %Y',
 760         '%b %d %Y',
 761         '%b %dst %Y %I:%M%p',
 762         '%b %dnd %Y %I:%M%p',
 763         '%b %dth %Y %I:%M%p',
 764         '%Y %m %d',
 765         '%Y-%m-%d',
 766         '%Y/%m/%d',
 767         '%Y/%m/%d %H:%M:%S',
 768         '%Y-%m-%d %H:%M:%S',
 769         '%Y-%m-%d %H:%M:%S.%f',
 770         '%d.%m.%Y %H:%M',
 771         '%d.%m.%Y %H.%M',
 772         '%Y-%m-%dT%H:%M:%SZ',
 773         '%Y-%m-%dT%H:%M:%S.%fZ',
 774         '%Y-%m-%dT%H:%M:%S.%f0Z',
 775         '%Y-%m-%dT%H:%M:%S',
 776         '%Y-%m-%dT%H:%M:%S.%f',
 777         '%Y-%m-%dT%H:%M',
 778     ]
 779     if day_first:
 780         format_expressions.extend([
 781             '%d-%m-%Y',
 782             '%d.%m.%Y',
 783             '%d/%m/%Y',
 784             '%d/%m/%y',
 785             '%d/%m/%Y %H:%M:%S',
 786         ])
 787     else:
 788         format_expressions.extend([
 789             '%m-%d-%Y',
 790             '%m.%d.%Y',
 791             '%m/%d/%Y',
 792             '%m/%d/%y',
 793             '%m/%d/%Y %H:%M:%S',
 794         ])
 795     for expression in format_expressions:
 796         try:
 797             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 798         except ValueError:
 799             pass
 800     if upload_date is None:
 801         timetuple = email.utils.parsedate_tz(date_str)
 802         if timetuple:
 803             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 804     return upload_date
 805
 806
 807 def determine_ext(url, default_ext='unknown_video'):
 808     if url is None:
 809         return default_ext
 810     guess = url.partition('?')[0].rpartition('.')[2]
 811     if re.match(r'^[A-Za-z0-9]+$', guess):
 812         return guess
 813     else:
 814         return default_ext
 815
 816
 817 def subtitles_filename(filename, sub_lang, sub_format):
 818     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 819
 820
 821 def date_from_str(date_str):
 822     """
 823     Return a datetime object from a string in the format YYYYMMDD or
 824     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 825     today = datetime.date.today()
 826     if date_str in ('now', 'today'):
 827         return today
 828     if date_str == 'yesterday':
 829         return today - datetime.timedelta(days=1)
 830     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 831     if match is not None:
 832         sign = match.group('sign')
 833         time = int(match.group('time'))
 834         if sign == '-':
 835             time = -time
 836         unit = match.group('unit')
 837         # A bad aproximation?
 838         if unit == 'month':
 839             unit = 'day'
 840             time *= 30
 841         elif unit == 'year':
 842             unit = 'day'
 843             time *= 365
 844         unit += 's'
 845         delta = datetime.timedelta(**{unit: time})
 846         return today + delta
 847     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 848
 849
 850 def hyphenate_date(date_str):
 851     """
 852     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 853     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 854     if match is not None:
 855         return '-'.join(match.groups())
 856     else:
 857         return date_str
 858
 859
 860 class DateRange(object):
 861     """Represents a time interval between two dates"""
 862
 863     def __init__(self, start=None, end=None):
 864         """start and end must be strings in the format accepted by date"""
 865         if start is not None:
 866             self.start = date_from_str(start)
 867         else:
 868             self.start = datetime.datetime.min.date()
 869         if end is not None:
 870             self.end = date_from_str(end)
 871         else:
 872             self.end = datetime.datetime.max.date()
 873         if self.start > self.end:
 874             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 875
 876     @classmethod
 877     def day(cls, day):
 878         """Returns a range that only contains the given day"""
 879         return cls(day, day)
 880
 881     def __contains__(self, date):
 882         """Check if the date is in the range"""
 883         if not isinstance(date, datetime.date):
 884             date = date_from_str(date)
 885         return self.start <= date <= self.end
 886
 887     def __str__(self):
 888         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 889
 890
 891 def platform_name():
 892     """ Returns the platform name as a compat_str """
 893     res = platform.platform()
 894     if isinstance(res, bytes):
 895         res = res.decode(preferredencoding())
 896
 897     assert isinstance(res, compat_str)
 898     return res
 899
 900
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the WriteConsoleW function of kernel32, used only
    when out has file descriptor 1 or 2 and the matching standard handle
    looks like a console. Raises OSError if WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument passed to GetStdHandle
    # (presumably the Win32 STD_OUTPUT_HANDLE / STD_ERROR_HANDLE values)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Prototype-based bindings for the kernel32 functions used below
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Output parameter of WriteConsoleW, read back after every call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for an invalid/missing handle, for a handle whose file type
        # (ignoring the FILE_TYPE_REMOTE bit) is not FILE_TYPE_CHAR, or when
        # GetConsoleMode fails on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters that stop right before the
    # next non-BMP character; such a character is then written on its own
    # with a length of 2
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Only drop what was reported as written; the rest is retried
            s = s[written.value:]
    return True
 974
 975
 976 def write_string(s, out=None, encoding=None):
 977     if out is None:
 978         out = sys.stderr
 979     assert type(s) == compat_str
 980
 981     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 982         if _windows_write_string(s, out):
 983             return
 984
 985     if ('b' in getattr(out, 'mode', '') or
 986             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 987         byt = s.encode(encoding or preferredencoding(), 'ignore')
 988         out.write(byt)
 989     elif hasattr(out, 'buffer'):
 990         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 991         byt = s.encode(enc, 'ignore')
 992         out.buffer.write(byt)
 993     else:
 994         out.write(s)
 995     out.flush()
 996
 997
 998 def bytes_to_intlist(bs):
 999     if not bs:
1000         return []
1001     if isinstance(bs[0], int):  # Python 3
1002         return list(bs)
1003     else:
1004         return [ord(c) for c in bs]
1005
1006
def intlist_to_bytes(xs):
    """ Convert a list of byte values to a byte string """
    if xs:
        # One unsigned char ('B') per value
        return struct_pack('%dB' % len(xs), *xs)
    return b''
1011
1012
# Cross-platform file locking
# Both branches define _lock_file(f, exclusive) and _unlock_file(f)
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes layout of the structure LockFileEx/UnlockFileEx take a pointer to
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """ Lock the file object f starting at offset 0; raises OSError if
        LockFileEx fails """
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object: _unlock_file passes the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags is 0x2 for an exclusive lock (presumably
        # LOCKFILE_EXCLUSIVE_LOCK) and 0x0 otherwise
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """ Release the lock taken by _lock_file on f; raises OSError if
        UnlockFileEx fails """
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """ flock() the file object f: LOCK_EX if exclusive else LOCK_SH """
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """ Remove the flock() lock held on f """
        fcntl.flock(f, fcntl.LOCK_UN)
1076
1077
class locked_file(object):
    """ File opened with io.open that is locked for the duration of a `with`
    block: the lock is exclusive unless the mode is 'r'. Iteration, read()
    and write() are forwarded to the underlying file. """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is not an IOError
            # on Python 2; do not leak the open file in either case
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Always close the file, even if unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1107
1108
def get_filesystem_encoding():
    """ Return sys.getfilesystemencoding(), or 'utf-8' when that is None """
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1112
1113
def shell_quote(args):
    """ Return the arguments in args quoted for a POSIX shell and joined by
    spaces. Byte strings are decoded with the filesystem encoding first. """
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # shlex_quote (from .compat) instead of pipes.quote: the pipes
        # module is deprecated and no longer exists in recent Python versions
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
1123
1124
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use: data is serialized
    as JSON and appended to url as a '#__youtubedl_smuggle=...' fragment. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return '#'.join((url, sdata))
1131
1132
def unsmuggle_url(smug_url, default=None):
    """ Split a URL built by smuggle_url into (url, data). A URL that
    carries no smuggled data is returned as (smug_url, default). """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(jsond)
    return smug_url, default
1140
1141
def format_bytes(bytes):
    """ Format a number of bytes as a string with two decimals and a binary
    suffix (B, KiB, ..., YiB), e.g. '1.50KiB'.

    bytes may be a number or a numeric string; None gives 'N/A'. """
    if bytes is None:
        return 'N/A'
    if not isinstance(bytes, (int, float)):
        # Numeric strings (of any string type) are accepted as well
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Stay inside the suffix table: values below 1 would give a negative
        # index, values of 1024 YiB and more an index past the end
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1154
1155
def parse_filesize(s):
    """ Parse a file size such as '5 MiB' or '1,5 kB' at the start of s into
    an integer number of bytes. Returns None if s is None or does not start
    with a number followed by a known unit. """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1}
    for power, letter in enumerate('KMGTPEZY', 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        unit_table[letter + 'iB'] = binary
        unit_table[letter + 'B'] = decimal
        unit_table[letter.lower() + 'B'] = binary
        unit_table[letter + 'b'] = decimal

    units_re = '|'.join(re.escape(unit) for unit in unit_table)
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re
    m = re.match(pattern, s)
    if m is None:
        return None

    # A comma is accepted as decimal separator
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
1208
1209
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name,
    or None if name is not in ENGLISH_MONTH_NAMES """

    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1217
1218
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations (the first three letters of the month name), or None
        if there is no such month """

    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
1227
1228
def fix_xml_ampersands(xml_str):
    """Replace by '&amp;' every '&' in XML that does not already start one of
    the recognized entities or a numeric character reference"""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
1235
1236
def setproctitle(title):
    """ Best-effort attempt to set the process title through prctl() of
    libc.so.6; does nothing if that library cannot be loaded or has no
    prctl function. title must be a compat_str. """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initializing the buffer from the bytes makes it one item larger than
    # the title, so that it is always NUL-terminated; a buffer of exactly
    # len(title_bytes) items has no room for the terminator
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15: presumably PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1249         return  # Strange libc, just skip this
1250
1251
def remove_start(s, start):
    """ Return s without the prefix start (s itself if it has no such
    prefix) """
    return s[len(start):] if s.startswith(start) else s
1256
1257
def remove_end(s, end):
    """ Return s without the suffix end; s is returned unchanged if it does
    not end with end or if end is empty """
    # An empty suffix must be skipped: s[:-0] is s[:0], i.e. the empty string
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1262
1263
def url_basename(url):
    """ Return the last '/'-separated segment of the path of url, ignoring
    leading and trailing slashes of the path """
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
1267
1268
class HEADRequest(compat_urllib_request.Request):
    """ Request whose get_method() always reports the HEAD method """

    def get_method(self):
        return 'HEAD'
1272
1273
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Return int(v) * invscale // scale, or default if v is None or ''.
    If get_attr is given, the attribute of that name is read from v first
    (None if v has no such attribute). """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1281
1282
def str_or_none(v, default=None):
    """ Return v converted to compat_str, or default if v is None """
    if v is None:
        return default
    return compat_str(v)
1285
1286
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are removed before the conversion; None is passed through """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1293
1294
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Return float(v) * invscale / scale, or default if v is None """
    if v is None:
        return default
    return float(v) * invscale / scale
1297
1298
def parse_duration(s):
    """ Parse a duration string into a number of seconds.

    Returns None if s is not a string or does not match any of the
    supported notations. The result is a float for the minutes-only and
    hours-only notations and when fractional seconds are present. """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None

    # These two notations allow a decimal amount and stand on their own
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    # Number of seconds in one unit of each integer-valued group
    unit_seconds = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    res = 0
    for group_name, factor in unit_seconds:
        amount = m.group(group_name)
        if amount:
            res += int(amount) * factor
    # The 'ms' group includes the leading dot, e.g. '.5'
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
1343
1344
def prepend_extension(filename, ext, expected_real_ext=None):
    """ Insert ext in front of the real extension of filename
    ('a.mp4' -> 'a.<ext>.mp4'). If expected_real_ext is given and is not
    the real extension, ext is appended to the whole filename instead. """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1351
1352
def replace_extension(filename, ext, expected_real_ext=None):
    """ Replace the real extension of filename by ext. If expected_real_ext
    is given and is not the real extension, ext is appended to the whole
    filename instead. """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        base = filename
    else:
        base = name
    return '{0}.{1}'.format(base, ext)
1358
1359
def check_executable(exe, args=[]):
    """ Checks if the given binary can be started (e.g. is installed somewhere
    in PATH) and returns its name, or False if starting it raises OSError.
    args can be a list of arguments for a short output (like -version) """
    try:
        process = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Wait for the process and discard whatever it printed
        process.communicate()
    except OSError:
        return False
    return exe
1368
1369
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.

    The executable is run with args; its combined stdout and stderr is
    handed to detect_exe_version together with version_re/unrecognized. """
    try:
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = process.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):
        # detect_exe_version wants text; non-ASCII bytes are dropped
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1383
1384
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Return the first group of version_re found in output (a compat_str),
    or unrecognized if it is not found. By default the word following
    'version' is looked for. """
    assert isinstance(output, compat_str)
    pattern = version_re
    if pattern is None:
        pattern = r'version\s+([-0-9._a-zA-Z]+)'
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1394
1395
class PagedList(object):
    """ Base class of the paged lists; subclasses provide getslice() """

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1400
1401
class OnDemandPagedList(PagedList):
    """ Paged list fetching its pages on demand: pagefunc(pagenum) returns
    the entries of one page, pagesize is the number of entries of a full
    page """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries from index start up to (excluding) end; pages
        are requested until a non-full page or the page holding end """
        size = self._pagesize
        collected = []
        for pagenum in itertools.count(start // size):
            page_first = pagenum * size
            page_limit = page_first + size
            if start >= page_limit:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry inside this page
            if page_first <= start < page_limit:
                lo = start % size
            else:
                lo = 0

            # Offset just past the last wanted entry, if end is in this page
            if end is not None and page_first <= end <= page_limit:
                hi = ((end - 1) % size) + 1
            else:
                hi = None

            if lo != 0 or hi is not None:
                entries = entries[lo:hi]
            collected.extend(entries)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(entries) + lo < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == page_limit:
                break
        return collected
1443
1444
class InAdvancePagedList(PagedList):
    """ Paged list whose number of pages is known in advance:
    pagefunc(pagenum) returns the entries of one page """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries from index start up to (excluding) end """
        size = self._pagesize
        first_page = start // size
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // size + 1
            remaining = end - start
        # Entries to drop at the beginning of the first page
        to_skip = start - first_page * size

        collected = []
        for pagenum in range(first_page, stop_page):
            entries = list(self._pagefunc(pagenum))
            if to_skip:
                entries = entries[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(entries) >= remaining:
                    # This page completes the requested slice
                    collected.extend(entries[:remaining])
                    break
                remaining -= len(entries)
            collected.extend(entries)
        return collected
1472
1473
def uppercase_escape(s):
    """ Replace every \\UXXXXXXXX escape sequence (8 hex digits) found in s
    by the character it stands for """
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1480
1481
def lowercase_escape(s):
    """ Replace every \\uXXXX escape sequence (4 hex digits) found in s by
    the character it stands for """
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
1488
1489
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    needs_bytes = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_bytes:
        # On Python 2, text is UTF-8 encoded before being quoted
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1495
1496
def escape_url(url):
    """Escape URL as suggested by RFC 3986: the path, params, query and
    fragment components are passed through escape_rfc3986"""
    parts = compat_urllib_parse_urlparse(url)
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1506
# struct_pack/struct_unpack behave like struct.pack/struct.unpack but always
# accept a text format specification
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def _ascii_spec(spec):
        return spec.encode('ascii') if isinstance(spec, compat_str) else spec

    def struct_pack(spec, *args):
        return struct.pack(_ascii_spec(spec), *args)

    def struct_unpack(spec, *args):
        return struct.unpack(_ascii_spec(spec), *args)
else:
    # struct copes with text specifications: use it directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1523
1524
def read_batch_urls(batch_fd):
    """ Read the URLs listed in the file object batch_fd (one per line) and
    close it.

    Byte lines are decoded as UTF-8, a leading byte order mark and the
    surrounding whitespace are removed; empty lines and lines starting with
    '#', ';' or ']' are skipped. """
    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM is U+FEFF once properly decoded, or the first three
        # characters below if its bytes were decoded as Latin-1
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1539
1540
def urlencode_postdata(*args, **kargs):
    """ URL-encode the arguments like compat_urllib_parse.urlencode does and
    return the result as ASCII bytes """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1543
1544
# etree_iter(n) iterates over the elements of the tree rooted at n
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1549
1550
def parse_xml(s):
    """ Parse the XML document in the text s and return its root element.
    Doctype declarations are ignored. """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    doctype_ignoring_parser = xml.etree.ElementTree.XMLParser(
        target=TreeBuilder())
    if sys.version_info >= (2, 7):
        tree = xml.etree.ElementTree.XML(
            s.encode('utf-8'), parser=doctype_ignoring_parser)
    else:
        tree = xml.etree.ElementTree.XML(s.encode('utf-8'))
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None and not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
    return tree
1566
1567
# Age limit associated with each US rating by parse_age_limit
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """ Parse an age limit given either as one or two digits with an
    optional trailing '+' (e.g. '18+') or as one of the US_RATINGS keys.
    Returns the age as an int, or None if s is None or not recognized. """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
1582
1583
def strip_jsonp(code):
    """ Remove the 'callback( ... );' wrapper (and trailing // comments) of
    a JSONP response; code that has no such wrapper is returned unchanged """
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_wrapper, r'\1', code)
1587
1588
def js_to_json(code):
    """ Convert JavaScript object notation into JSON: single-quoted strings
    and bare identifiers are turned into double-quoted strings and commas
    directly in front of a closing ']' or '}' are removed """
    # Rewrites needed inside a string when switching to double quotes
    single_quoted_fixes = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        v = m.group(0)
        # JSON literals and double-quoted strings are already fine
        if v in ('true', 'false', 'null') or v.startswith('"'):
            return v
        if v.startswith("'"):
            v = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: single_quoted_fixes[esc.group(0)],
                v[1:-1])
        return '"%s"' % v

    token_re = r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        '''
    res = re.sub(token_re, fix_kv, code)
    # Drop trailing commas
    return re.sub(r',(\s*[\]}])', r'\1', res)
1612
1613
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values: the
    returned function maps an id to its position in quality_ids, or to -1
    if it is not listed """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1622
1623
# %-style template with the keys `title`, `id` and `ext`; presumably the
# default output file name template (going by the name) - it is not used
# in this part of the file
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1625
1626
def limit_length(s, length):
    """ Add ellipses to overly long strings: a string longer than length is
    cut so that it is length characters long including the ellipses.
    None is passed through. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
1635
1636
def version_tuple(v):
    """ Split a version string at '.' and '-' into a tuple of ints """
    return tuple(map(int, re.split(r'[-.]', v)))
1639
1640
def is_outdated_version(version, limit, assume_new=True):
    """ Tell whether version is older than limit (both are compared as
    version tuples). A missing or unparsable version counts as outdated
    only if assume_new is false. """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum
1648
1649
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U, i.e. if this module
    was loaded by a zipimporter or if sys has a 'frozen' attribute """
    from zipimport import zipimporter

    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
1655
1656
def args_to_str(args):
    """ Get a short string representation for a subprocess command: the
    arguments quoted with shlex_quote and joined by spaces """
    return ' '.join(map(shlex_quote, args))
1660
1661
def mimetype2ext(mt):
    """ Guess a file extension from a MIME type: the subtype (the part after
    the last '/'), translated for a few known special cases.

    Parameters such as '; charset=utf-8' are ignored. Returns None if mt is
    None (e.g. a missing Content-Type header). """
    if mt is None:
        return None

    # Drop the parameters before looking at the subtype
    _, _, res = mt.split(';')[0].strip().rpartition('/')

    return {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
        'ttml+xml': 'ttml',
    }.get(res, res)
1670
1671
def urlhandle_detect_ext(url_handle):
    """ Guess the file extension of a response: taken from the file name of
    an 'attachment' Content-Disposition header if there is one, else derived
    from the Content-Type header """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(h):
            return url_handle.headers[h]

    cd = getheader('Content-Disposition')
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd) if cd else None
    if m:
        e = determine_ext(m.group('filename'), default_ext=None)
        if e:
            return e

    return mimetype2ext(getheader('Content-Type'))
1688
1689
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked, i.e. iff both limits
    are set and age_limit is below content_limit """

    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
1698
1699
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    The bytes are decoded according to their byte order mark (UTF-8 if there
    is none); the result is the match of optional whitespace followed by
    '<' at the start of the text, or None. """

    # UTF-32 comes before UTF-16 since their little-endian BOMs start alike
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    encoding, skip = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, skip = enc, len(bom)
            break
    s = first_bytes[skip:].decode(encoding, 'replace')

    return re.match(r'^\s*<', s)
1718
1719
def determine_protocol(info_dict):
    """ Return the protocol of info_dict: its 'protocol' entry if set, else
    a guess based on the start of its 'url', then on the extension of the
    URL, and finally the URL scheme """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    return compat_urllib_parse_urlparse(url).scheme
1740
1741
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values: every column but
    the last is left-aligned and padded to the width of its longest value
    plus one, columns are separated by a space and rows by a newline """
    table = [header_row] + data
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*table)]
    padded = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded) + '%s'
    return '\n'.join(row_format % tuple(row) for row in table)
1748
1749
def _match_one(filter_part, dct):
    """ Evaluate one filter expression against the dictionary dct.

    Two forms are understood:
    * `key OP value` with OP one of <, <=, >, >=, = and !=. The value is a
      number (optionally followed by a size suffix) or, for = and != only,
      an alphanumeric string. If dct has no (or a None) value for key, the
      result is the text of the optional '?' placed after the operator, or
      None if there is none.
    * `key` / `!key`: whether the value for key is not None / is None.

    Raises ValueError for a malformed expression, an ordering operator used
    with a string or a number that cannot be parsed. """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    # The operator alternation is built from the keys of the dictionary above
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            # Plain integer first, then a file size, then a file size with
            # 'B' appended (for values such as '10M')
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # Truthy (the matched '?' text) only if the '?' marker was given
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    # Not a comparison: try the presence / absence tests
    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
1807
1808
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false

    filter_str is split on "&"; every resulting part is evaluated with
    _match_one and all of them have to pass.
    """
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            # Stop at the first failing part; later parts are not evaluated
            return False
    return True
1814
1815
def match_filter_func(filter_str):
    """ Build a match filter callable from the filter string filter_str.

    The returned function takes an info dict and returns None if the dict
    passes the filter (as decided by match_str), or a message explaining
    that the video is skipped otherwise.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            # Prefer the title for the message, then the id, then a placeholder
            fallback_name = info_dict.get('id', 'video')
            video_title = info_dict.get('title', fallback_name)
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
1824
1825
def parse_dfxp_time_expr(time_expr):
    """ Convert a DFXP/TTML time expression to a number of seconds.

    Supported forms are a plain offset in seconds with an optional "s"
    suffix (e.g. "12.5s") and a clock time "H:MM:SS" with optional
    fractional seconds.  An empty or missing expression yields 0.0;
    anything else that is not recognized yields None.
    """
    if not time_expr:
        return 0.0

    offset_match = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset_match is not None:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if clock_match is None:
        return None

    hours, minutes, seconds = clock_match.groups()
    return 3600 * int(hours) + 60 * int(minutes) + float(seconds)
1837
1838
def srt_subtitles_timecode(seconds):
    """ Format a time offset given in seconds as an SRT timecode (HH:MM:SS,mmm).

    The offset is rounded to the nearest millisecond.
    """
    # Convert to whole milliseconds once and split from there.  Truncating
    # (seconds % 1) * 1000 loses a millisecond whenever the float is a hair
    # below the intended value (e.g. 1.001 would be rendered as ",000").
    total_msec = int(round(seconds * 1000))
    total_secs, msec = divmod(total_msec, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, mins = divmod(total_mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, msec)
1841
1842
def dfxp2srt(dfxp_data):
    """ Convert a DFXP/TTML subtitle document to SRT.

    dfxp_data is the document as text (it is encoded as UTF-8 before being
    parsed).  Every <p> element, either in the TTML namespace or without a
    namespace, becomes one SRT cue.  A cue ends at its "end" attribute or,
    failing that, at "begin" plus "dur".  Paragraphs whose timing cannot be
    determined are skipped; emitted cues are numbered sequentially from 1.

    Returns the SRT document as a string.
    Raises ValueError if the document contains no <p> elements.
    """
    _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})
    str_or_empty = functools.partial(str_or_none, default='')

    br_tags = (_x('ttml:br'), 'br')
    span_tags = (_x('ttml:span'), 'span')

    def parse_node(node):
        # Flatten the node into plain text: line breaks become newlines,
        # spans are descended into, anything else is kept as serialized XML
        out = str_or_empty(node.text)

        for child in node:
            if child.tag in br_tags:
                out += '\n' + str_or_empty(child.tail)
            elif child.tag in span_tags:
                # Text following the closing </span> lives in child.tail and
                # must be appended as well, otherwise it is lost
                out += parse_node(child) + str_or_empty(child.tail)
            else:
                # NOTE(review): tostring() returns bytes on Python 3 -
                # confirm str_or_none produces the intended text here
                out += str_or_empty(xml.etree.ElementTree.tostring(child))

        return out

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    out = []
    for para in paras:
        # parse_dfxp_time_expr maps a missing attribute to 0.0 and an
        # unparsable one to None
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        if begin_time is None:
            continue
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        if not end_time:
            duration = parse_dfxp_time_expr(para.attrib.get('dur'))
            if not duration:
                # Neither a usable end nor a usable duration: no cue
                continue
            end_time = begin_time + duration
        out.append('%d\n%s --> %s\n%s\n\n' % (
            len(out) + 1,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
1880
1881
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ Proxy handler that lets a single request choose its own proxy.

    A request carrying a 'Ytdl-request-proxy' header is sent through the
    proxy named in that header; the header is removed before the request is
    passed on.  The special proxy value '__noproxy__' means that no proxy is
    used.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        # The values are bound as lambda defaults so that each handler keeps
        # its own scheme.  The base class __init__ is called afterwards with
        # the given proxies.
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """ Open req through proxy unless the request or the default says otherwise.

        Returns None when no proxy is to be used, otherwise whatever the
        base class proxy_open returns.
        """
        headers = req.headers
        requested_proxy = headers.get('Ytdl-request-proxy')
        if requested_proxy is not None:
            # The per-request header overrides the handler's proxy and is
            # removed from the request headers
            del headers['Ytdl-request-proxy']
            proxy = requested_proxy

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)