git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import calendar
   8 import codecs
   9 import contextlib
  10 import ctypes
  11 import datetime
  12 import email.utils
  13 import errno
  14 import functools
  15 import gzip
  16 import itertools
  17 import io
  18 import json
  19 import locale
  20 import math
  21 import operator
  22 import os
  23 import pipes
  24 import platform
  25 import re
  26 import ssl
  27 import socket
  28 import struct
  29 import subprocess
  30 import sys
  31 import tempfile
  32 import traceback
  33 import xml.etree.ElementTree
  34 import zlib
  35
  36 from .compat import (
  37     compat_basestring,
  38     compat_chr,
  39     compat_etree_fromstring,
  40     compat_html_entities,
  41     compat_http_client,
  42     compat_kwargs,
  43     compat_parse_qs,
  44     compat_socket_create_connection,
  45     compat_str,
  46     compat_urllib_error,
  47     compat_urllib_parse,
  48     compat_urllib_parse_urlparse,
  49     compat_urllib_request,
  50     compat_urlparse,
  51     shlex_quote,
  52 )
  53
  54
  55 # This is not clearly defined otherwise
  56 compiled_regex_type = type(re.compile(''))
  57
  58 std_headers = {
  59     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
  60     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  61     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  62     'Accept-Encoding': 'gzip, deflate',
  63     'Accept-Language': 'en-us,en;q=0.5',
  64 }
  65
  66
  67 NO_DEFAULT = object()
  68
  69 ENGLISH_MONTH_NAMES = [
  70     'January', 'February', 'March', 'April', 'May', 'June',
  71     'July', 'August', 'September', 'October', 'November', 'December']
  72
  73 KNOWN_EXTENSIONS = (
  74     'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
  75     'flv', 'f4v', 'f4a', 'f4b',
  76     'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
  77     'mkv', 'mka', 'mk3d',
  78     'avi', 'divx',
  79     'mov',
  80     'asf', 'wmv', 'wma',
  81     '3gp', '3g2',
  82     'mp3',
  83     'flac',
  84     'ape',
  85     'wav',
  86     'f4f', 'f4m', 'm3u8', 'smil')
  87
  88
  89 def preferredencoding():
  90     """Get preferred encoding.
  91
  92     Returns the best encoding scheme for the system, based on
  93     locale.getpreferredencoding() and some further tweaks.
  94     """
  95     try:
  96         pref = locale.getpreferredencoding()
  97         'TEST'.encode(pref)
  98     except Exception:
  99         pref = 'UTF-8'
 100
 101     return pref
 102
 103
 104 def write_json_file(obj, fn):
 105     """ Encode obj as JSON and write it to fn, atomically if possible """
 106
 107     fn = encodeFilename(fn)
 108     if sys.version_info < (3, 0) and sys.platform != 'win32':
 109         encoding = get_filesystem_encoding()
 110         # os.path.basename returns a bytes object, but NamedTemporaryFile
 111         # will fail if the filename contains non ascii characters unless we
 112         # use a unicode object
 113         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 114         # the same for os.path.dirname
 115         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 116     else:
 117         path_basename = os.path.basename
 118         path_dirname = os.path.dirname
 119
 120     args = {
 121         'suffix': '.tmp',
 122         'prefix': path_basename(fn) + '.',
 123         'dir': path_dirname(fn),
 124         'delete': False,
 125     }
 126
 127     # In Python 2.x, json.dump expects a bytestream.
 128     # In Python 3.x, it writes to a character stream
 129     if sys.version_info < (3, 0):
 130         args['mode'] = 'wb'
 131     else:
 132         args.update({
 133             'mode': 'w',
 134             'encoding': 'utf-8',
 135         })
 136
 137     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 138
 139     try:
 140         with tf:
 141             json.dump(obj, tf)
 142         if sys.platform == 'win32':
 143             # Need to remove existing file on Windows, else os.rename raises
 144             # WindowsError or FileExistsError.
 145             try:
 146                 os.unlink(fn)
 147             except OSError:
 148                 pass
 149         os.rename(tf.name, fn)
 150     except Exception:
 151         try:
 152             os.remove(tf.name)
 153         except OSError:
 154             pass
 155         raise
 156
 157
 158 if sys.version_info >= (2, 7):
 159     def find_xpath_attr(node, xpath, key, val=None):
 160         """ Find the xpath xpath[@key=val] """
 161         assert re.match(r'^[a-zA-Z_-]+$', key)
 162         if val:
 163             assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 164         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 165         return node.find(expr)
 166 else:
 167     def find_xpath_attr(node, xpath, key, val=None):
 168         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 169         # .//node does not match if a node is a direct child of . !
 170         if isinstance(xpath, compat_str):
 171             xpath = xpath.encode('ascii')
 172
 173         for f in node.findall(xpath):
 174             if key not in f.attrib:
 175                 continue
 176             if val is None or f.attrib.get(key) == val:
 177                 return f
 178         return None
 179
 180 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 181 # the namespace parameter
 182
 183
 184 def xpath_with_ns(path, ns_map):
 185     components = [c.split(':') for c in path.split('/')]
 186     replaced = []
 187     for c in components:
 188         if len(c) == 1:
 189             replaced.append(c[0])
 190         else:
 191             ns, tag = c
 192             replaced.append('{%s}%s' % (ns_map[ns], tag))
 193     return '/'.join(replaced)
 194
 195
 196 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 197     def _find_xpath(xpath):
 198         if sys.version_info < (2, 7):  # Crazy 2.6
 199             xpath = xpath.encode('ascii')
 200         return node.find(xpath)
 201
 202     if isinstance(xpath, (str, compat_str)):
 203         n = _find_xpath(xpath)
 204     else:
 205         for xp in xpath:
 206             n = _find_xpath(xp)
 207             if n is not None:
 208                 break
 209
 210     if n is None:
 211         if default is not NO_DEFAULT:
 212             return default
 213         elif fatal:
 214             name = xpath if name is None else name
 215             raise ExtractorError('Could not find XML element %s' % name)
 216         else:
 217             return None
 218     return n
 219
 220
 221 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 222     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 223     if n is None or n == default:
 224         return n
 225     if n.text is None:
 226         if default is not NO_DEFAULT:
 227             return default
 228         elif fatal:
 229             name = xpath if name is None else name
 230             raise ExtractorError('Could not find XML element\'s text %s' % name)
 231         else:
 232             return None
 233     return n.text
 234
 235
 236 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 237     n = find_xpath_attr(node, xpath, key)
 238     if n is None:
 239         if default is not NO_DEFAULT:
 240             return default
 241         elif fatal:
 242             name = '%s[@%s]' % (xpath, key) if name is None else name
 243             raise ExtractorError('Could not find XML attribute %s' % name)
 244         else:
 245             return None
 246     return n.attrib[key]
 247
 248
 249 def get_element_by_id(id, html):
 250     """Return the content of the tag with the specified ID in the passed HTML document"""
 251     return get_element_by_attribute("id", id, html)
 252
 253
 254 def get_element_by_attribute(attribute, value, html):
 255     """Return the content of the tag with the specified attribute in the passed HTML document"""
 256
 257     m = re.search(r'''(?xs)
 258         <([a-zA-Z0-9:._-]+)
 259          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 260          \s+%s=['"]?%s['"]?
 261          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 262         \s*>
 263         (?P<content>.*?)
 264         </\1>
 265     ''' % (re.escape(attribute), re.escape(value)), html)
 266
 267     if not m:
 268         return None
 269     res = m.group('content')
 270
 271     if res.startswith('"') or res.startswith("'"):
 272         res = res[1:-1]
 273
 274     return unescapeHTML(res)
 275
 276
 277 def clean_html(html):
 278     """Clean an HTML snippet into a readable string"""
 279
 280     if html is None:  # Convenience for sanitizing descriptions etc.
 281         return html
 282
 283     # Newline vs <br />
 284     html = html.replace('\n', ' ')
 285     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 286     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 287     # Strip html tags
 288     html = re.sub('<.*?>', '', html)
 289     # Replace html entities
 290     html = unescapeHTML(html)
 291     return html.strip()
 292
 293
 294 def sanitize_open(filename, open_mode):
 295     """Try to open the given filename, and slightly tweak it if this fails.
 296
 297     Attempts to open the given filename. If this fails, it tries to change
 298     the filename slightly, step by step, until it's either able to open it
 299     or it fails and raises a final exception, like the standard open()
 300     function.
 301
 302     It returns the tuple (stream, definitive_file_name).
 303     """
 304     try:
 305         if filename == '-':
 306             if sys.platform == 'win32':
 307                 import msvcrt
 308                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 309             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 310         stream = open(encodeFilename(filename), open_mode)
 311         return (stream, filename)
 312     except (IOError, OSError) as err:
 313         if err.errno in (errno.EACCES,):
 314             raise
 315
 316         # In case of error, try to remove win32 forbidden chars
 317         alt_filename = sanitize_path(filename)
 318         if alt_filename == filename:
 319             raise
 320         else:
 321             # An exception here should be caught in the caller
 322             stream = open(encodeFilename(alt_filename), open_mode)
 323             return (stream, alt_filename)
 324
 325
 326 def timeconvert(timestr):
 327     """Convert RFC 2822 defined time string into system timestamp"""
 328     timestamp = None
 329     timetuple = email.utils.parsedate_tz(timestr)
 330     if timetuple is not None:
 331         timestamp = email.utils.mktime_tz(timetuple)
 332     return timestamp
 333
 334
 335 def sanitize_filename(s, restricted=False, is_id=False):
 336     """Sanitizes a string so it could be used as part of a filename.
 337     If restricted is set, use a stricter subset of allowed characters.
 338     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 339     """
 340     def replace_insane(char):
 341         if char == '?' or ord(char) < 32 or ord(char) == 127:
 342             return ''
 343         elif char == '"':
 344             return '' if restricted else '\''
 345         elif char == ':':
 346             return '_-' if restricted else ' -'
 347         elif char in '\\/|*<>':
 348             return '_'
 349         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 350             return '_'
 351         if restricted and ord(char) > 127:
 352             return '_'
 353         return char
 354
 355     # Handle timestamps
 356     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 357     result = ''.join(map(replace_insane, s))
 358     if not is_id:
 359         while '__' in result:
 360             result = result.replace('__', '_')
 361         result = result.strip('_')
 362         # Common case of "Foreign band name - English song title"
 363         if restricted and result.startswith('-_'):
 364             result = result[2:]
 365         if result.startswith('-'):
 366             result = '_' + result[len('-'):]
 367         result = result.lstrip('.')
 368         if not result:
 369             result = '_'
 370     return result
 371
 372
 373 def sanitize_path(s):
 374     """Sanitizes and normalizes path on Windows"""
 375     if sys.platform != 'win32':
 376         return s
 377     drive_or_unc, _ = os.path.splitdrive(s)
 378     if sys.version_info < (2, 7) and not drive_or_unc:
 379         drive_or_unc, _ = os.path.splitunc(s)
 380     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 381     if drive_or_unc:
 382         norm_path.pop(0)
 383     sanitized_path = [
 384         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
 385         for path_part in norm_path]
 386     if drive_or_unc:
 387         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 388     return os.path.join(*sanitized_path)
 389
 390
 391 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
 392 # unwanted failures due to missing protocol
 393 def sanitized_Request(url, *args, **kwargs):
 394     return compat_urllib_request.Request(
 395         'http:%s' % url if url.startswith('//') else url, *args, **kwargs)
 396
 397
 398 def orderedSet(iterable):
 399     """ Remove all duplicates from the input iterable """
 400     res = []
 401     for el in iterable:
 402         if el not in res:
 403             res.append(el)
 404     return res
 405
 406
 407 def _htmlentity_transform(entity):
 408     """Transforms an HTML entity to a character."""
 409     # Known non-numeric HTML entity
 410     if entity in compat_html_entities.name2codepoint:
 411         return compat_chr(compat_html_entities.name2codepoint[entity])
 412
 413     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 414     if mobj is not None:
 415         numstr = mobj.group(1)
 416         if numstr.startswith('x'):
 417             base = 16
 418             numstr = '0%s' % numstr
 419         else:
 420             base = 10
 421         # See https://github.com/rg3/youtube-dl/issues/7518
 422         try:
 423             return compat_chr(int(numstr, base))
 424         except ValueError:
 425             pass
 426
 427     # Unknown entity in name, return its literal representation
 428     return '&%s;' % entity
 429
 430
 431 def unescapeHTML(s):
 432     if s is None:
 433         return None
 434     assert type(s) == compat_str
 435
 436     return re.sub(
 437         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 438
 439
 440 def get_subprocess_encoding():
 441     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 442         # For subprocess calls, encode with locale encoding
 443         # Refer to http://stackoverflow.com/a/9951851/35070
 444         encoding = preferredencoding()
 445     else:
 446         encoding = sys.getfilesystemencoding()
 447     if encoding is None:
 448         encoding = 'utf-8'
 449     return encoding
 450
 451
 452 def encodeFilename(s, for_subprocess=False):
 453     """
 454     @param s The name of the file
 455     """
 456
 457     assert type(s) == compat_str
 458
 459     # Python 3 has a Unicode API
 460     if sys.version_info >= (3, 0):
 461         return s
 462
 463     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 464     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 465     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 466     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 467         return s
 468
 469     return s.encode(get_subprocess_encoding(), 'ignore')
 470
 471
 472 def decodeFilename(b, for_subprocess=False):
 473
 474     if sys.version_info >= (3, 0):
 475         return b
 476
 477     if not isinstance(b, bytes):
 478         return b
 479
 480     return b.decode(get_subprocess_encoding(), 'ignore')
 481
 482
 483 def encodeArgument(s):
 484     if not isinstance(s, compat_str):
 485         # Legacy code that uses byte strings
 486         # Uncomment the following line after fixing all post processors
 487         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 488         s = s.decode('ascii')
 489     return encodeFilename(s, True)
 490
 491
 492 def decodeArgument(b):
 493     return decodeFilename(b, True)
 494
 495
 496 def decodeOption(optval):
 497     if optval is None:
 498         return optval
 499     if isinstance(optval, bytes):
 500         optval = optval.decode(preferredencoding())
 501
 502     assert isinstance(optval, compat_str)
 503     return optval
 504
 505
 506 def formatSeconds(secs):
 507     if secs > 3600:
 508         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 509     elif secs > 60:
 510         return '%d:%02d' % (secs // 60, secs % 60)
 511     else:
 512         return '%d' % secs
 513
 514
 515 def make_HTTPS_handler(params, **kwargs):
 516     opts_no_check_certificate = params.get('nocheckcertificate', False)
 517     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 518         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 519         if opts_no_check_certificate:
 520             context.check_hostname = False
 521             context.verify_mode = ssl.CERT_NONE
 522         try:
 523             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 524         except TypeError:
 525             # Python 2.7.8
 526             # (create_default_context present but HTTPSHandler has no context=)
 527             pass
 528
 529     if sys.version_info < (3, 2):
 530         return YoutubeDLHTTPSHandler(params, **kwargs)
 531     else:  # Python < 3.4
 532         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 533         context.verify_mode = (ssl.CERT_NONE
 534                                if opts_no_check_certificate
 535                                else ssl.CERT_REQUIRED)
 536         context.set_default_verify_paths()
 537         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 538
 539
 540 def bug_reports_message():
 541     if ytdl_is_updateable():
 542         update_cmd = 'type  youtube-dl -U  to update'
 543     else:
 544         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 545     msg = '; please report this issue on https://yt-dl.org/bug .'
 546     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 547     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 548     return msg
 549
 550
 551 class ExtractorError(Exception):
 552     """Error during info extraction."""
 553
 554     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 555         """ tb, if given, is the original traceback (so that it can be printed out).
 556         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 557         """
 558
 559         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 560             expected = True
 561         if video_id is not None:
 562             msg = video_id + ': ' + msg
 563         if cause:
 564             msg += ' (caused by %r)' % cause
 565         if not expected:
 566             msg += bug_reports_message()
 567         super(ExtractorError, self).__init__(msg)
 568
 569         self.traceback = tb
 570         self.exc_info = sys.exc_info()  # preserve original exception
 571         self.cause = cause
 572         self.video_id = video_id
 573
 574     def format_traceback(self):
 575         if self.traceback is None:
 576             return None
 577         return ''.join(traceback.format_tb(self.traceback))
 578
 579
 580 class UnsupportedError(ExtractorError):
 581     def __init__(self, url):
 582         super(UnsupportedError, self).__init__(
 583             'Unsupported URL: %s' % url, expected=True)
 584         self.url = url
 585
 586
 587 class RegexNotFoundError(ExtractorError):
 588     """Error when a regex didn't match"""
 589     pass
 590
 591
 592 class DownloadError(Exception):
 593     """Download Error exception.
 594
 595     This exception may be thrown by FileDownloader objects if they are not
 596     configured to continue on errors. They will contain the appropriate
 597     error message.
 598     """
 599
 600     def __init__(self, msg, exc_info=None):
 601         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 602         super(DownloadError, self).__init__(msg)
 603         self.exc_info = exc_info
 604
 605
 606 class SameFileError(Exception):
 607     """Same File exception.
 608
 609     This exception will be thrown by FileDownloader objects if they detect
 610     multiple files would have to be downloaded to the same file on disk.
 611     """
 612     pass
 613
 614
 615 class PostProcessingError(Exception):
 616     """Post Processing exception.
 617
 618     This exception may be raised by PostProcessor's .run() method to
 619     indicate an error in the postprocessing task.
 620     """
 621
 622     def __init__(self, msg):
 623         self.msg = msg
 624
 625
 626 class MaxDownloadsReached(Exception):
 627     """ --max-downloads limit has been reached. """
 628     pass
 629
 630
 631 class UnavailableVideoError(Exception):
 632     """Unavailable Format exception.
 633
 634     This exception will be thrown when a video is requested
 635     in a format that is not available for that video.
 636     """
 637     pass
 638
 639
 640 class ContentTooShortError(Exception):
 641     """Content Too Short exception.
 642
 643     This exception may be raised by FileDownloader objects when a file they
 644     download is too small for what the server announced first, indicating
 645     the connection was probably interrupted.
 646     """
 647
 648     def __init__(self, downloaded, expected):
 649         # Both in bytes
 650         self.downloaded = downloaded
 651         self.expected = expected
 652
 653
 654 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 655     # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
 656     # expected HTTP responses to meet HTTP/1.0 or later (see also
 657     # https://github.com/rg3/youtube-dl/issues/6727)
 658     if sys.version_info < (3, 0):
 659         kwargs[b'strict'] = True
 660     hc = http_class(*args, **kwargs)
 661     source_address = ydl_handler._params.get('source_address')
 662     if source_address is not None:
 663         sa = (source_address, 0)
 664         if hasattr(hc, 'source_address'):  # Python 2.7+
 665             hc.source_address = sa
 666         else:  # Python 2.6
 667             def _hc_connect(self, *args, **kwargs):
 668                 sock = compat_socket_create_connection(
 669                     (self.host, self.port), self.timeout, sa)
 670                 if is_https:
 671                     self.sock = ssl.wrap_socket(
 672                         sock, self.key_file, self.cert_file,
 673                         ssl_version=ssl.PROTOCOL_TLSv1)
 674                 else:
 675                     self.sock = sock
 676             hc.connect = functools.partial(_hc_connect, hc)
 677
 678     return hc
 679
 680
 681 def handle_youtubedl_headers(headers):
 682     filtered_headers = headers
 683
 684     if 'Youtubedl-no-compression' in filtered_headers:
 685         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 686         del filtered_headers['Youtubedl-no-compression']
 687
 688     return filtered_headers
 689
 690
 691 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 692     """Handler for HTTP requests and responses.
 693
 694     This class, when installed with an OpenerDirector, automatically adds
 695     the standard headers to every HTTP request and handles gzipped and
 696     deflated responses from web servers. If compression is to be avoided in
 697     a particular request, the original request in the program code only has
 698     to include the HTTP header "Youtubedl-no-compression", which will be
 699     removed before making the real request.
 700
 701     Part of this code was copied from:
 702
 703     http://techknack.net/python-urllib2-handlers/
 704
 705     Andrew Rowls, the author of that code, agreed to release it to the
 706     public domain.
 707     """
 708
 709     def __init__(self, params, *args, **kwargs):
 710         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
 711         self._params = params
 712
 713     def http_open(self, req):
 714         return self.do_open(functools.partial(
 715             _create_http_connection, self, compat_http_client.HTTPConnection, False),
 716             req)
 717
 718     @staticmethod
 719     def deflate(data):
 720         try:
 721             return zlib.decompress(data, -zlib.MAX_WBITS)
 722         except zlib.error:
 723             return zlib.decompress(data)
 724
 725     @staticmethod
 726     def addinfourl_wrapper(stream, headers, url, code):
 727         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 728             return compat_urllib_request.addinfourl(stream, headers, url, code)
 729         ret = compat_urllib_request.addinfourl(stream, headers, url)
 730         ret.code = code
 731         return ret
 732
 733     def http_request(self, req):
 734         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
 735         # always respected by websites, some tend to give out URLs with non percent-encoded
 736         # non-ASCII characters (see telemb.py, ard.py [#3412])
 737         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
 738         # To work around aforementioned issue we will replace request's original URL with
 739         # percent-encoded one
 740         # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
 741         # the code of this workaround has been moved here from YoutubeDL.urlopen()
 742         url = req.get_full_url()
 743         url_escaped = escape_url(url)
 744
 745         # Substitute URL if any change after escaping
 746         if url != url_escaped:
 747             req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
 748             new_req = req_type(
 749                 url_escaped, data=req.data, headers=req.headers,
 750                 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
 751             new_req.timeout = req.timeout
 752             req = new_req
 753
 754         for h, v in std_headers.items():
 755             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
 756             # The dict keys are capitalized because of this bug by urllib
 757             if h.capitalize() not in req.headers:
 758                 req.add_header(h, v)
 759
 760         req.headers = handle_youtubedl_headers(req.headers)
 761
 762         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 763             # Python 2.6 is brain-dead when it comes to fragments
 764             req._Request__original = req._Request__original.partition('#')[0]
 765             req._Request__r_type = req._Request__r_type.partition('#')[0]
 766
 767         return req
 768
 769     def http_response(self, req, resp):
 770         old_resp = resp
 771         # gzip
 772         if resp.headers.get('Content-encoding', '') == 'gzip':
 773             content = resp.read()
 774             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 775             try:
 776                 uncompressed = io.BytesIO(gz.read())
 777             except IOError as original_ioerror:
 778                 # There may be junk add the end of the file
 779                 # See http://stackoverflow.com/q/4928560/35070 for details
 780                 for i in range(1, 1024):
 781                     try:
 782                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 783                         uncompressed = io.BytesIO(gz.read())
 784                     except IOError:
 785                         continue
 786                     break
 787                 else:
 788                     raise original_ioerror
 789             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 790             resp.msg = old_resp.msg
 791             del resp.headers['Content-encoding']
 792         # deflate
 793         if resp.headers.get('Content-encoding', '') == 'deflate':
 794             gz = io.BytesIO(self.deflate(resp.read()))
 795             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 796             resp.msg = old_resp.msg
 797             del resp.headers['Content-encoding']
 798         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
 799         # https://github.com/rg3/youtube-dl/issues/6457).
 800         if 300 <= resp.code < 400:
 801             location = resp.headers.get('Location')
 802             if location:
 803                 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
 804                 if sys.version_info >= (3, 0):
 805                     location = location.encode('iso-8859-1').decode('utf-8')
 806                 location_escaped = escape_url(location)
 807                 if location != location_escaped:
 808                     del resp.headers['Location']
 809                     resp.headers['Location'] = location_escaped
 810         return resp
 811
 812     https_request = http_request
 813     https_response = http_response
 814
 815
 816 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 817     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 818         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 819         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 820         self._params = params
 821
 822     def https_open(self, req):
 823         kwargs = {}
 824         if hasattr(self, '_context'):  # python > 2.6
 825             kwargs['context'] = self._context
 826         if hasattr(self, '_check_hostname'):  # python 3.x
 827             kwargs['check_hostname'] = self._check_hostname
 828         return self.do_open(functools.partial(
 829             _create_http_connection, self, self._https_conn_class, True),
 830             req, **kwargs)
 831
 832
 833 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
 834     def __init__(self, cookiejar=None):
 835         compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
 836
 837     def http_response(self, request, response):
 838         # Python 2 will choke on next HTTP request in row if there are non-ASCII
 839         # characters in Set-Cookie HTTP header of last response (see
 840         # https://github.com/rg3/youtube-dl/issues/6769).
 841         # In order to at least prevent crashing we will percent encode Set-Cookie
 842         # header before HTTPCookieProcessor starts processing it.
 843         # if sys.version_info < (3, 0) and response.headers:
 844         #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
 845         #         set_cookie = response.headers.get(set_cookie_header)
 846         #         if set_cookie:
 847         #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
 848         #             if set_cookie != set_cookie_escaped:
 849         #                 del response.headers[set_cookie_header]
 850         #                 response.headers[set_cookie_header] = set_cookie_escaped
 851         return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
 852
 853     https_request = compat_urllib_request.HTTPCookieProcessor.http_request
 854     https_response = http_response
 855
 856
 857 def parse_iso8601(date_str, delimiter='T', timezone=None):
 858     """ Return a UNIX timestamp from the given date """
 859
 860     if date_str is None:
 861         return None
 862
 863     date_str = re.sub(r'\.[0-9]+', '', date_str)
 864
 865     if timezone is None:
 866         m = re.search(
 867             r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 868             date_str)
 869         if not m:
 870             timezone = datetime.timedelta()
 871         else:
 872             date_str = date_str[:-len(m.group(0))]
 873             if not m.group('sign'):
 874                 timezone = datetime.timedelta()
 875             else:
 876                 sign = 1 if m.group('sign') == '+' else -1
 877                 timezone = datetime.timedelta(
 878                     hours=sign * int(m.group('hours')),
 879                     minutes=sign * int(m.group('minutes')))
 880     try:
 881         date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 882         dt = datetime.datetime.strptime(date_str, date_format) - timezone
 883         return calendar.timegm(dt.timetuple())
 884     except ValueError:
 885         pass
 886
 887
 888 def unified_strdate(date_str, day_first=True):
 889     """Return a string with the date in the format YYYYMMDD"""
 890
 891     if date_str is None:
 892         return None
 893     upload_date = None
 894     # Replace commas
 895     date_str = date_str.replace(',', ' ')
 896     # %z (UTC offset) is only supported in python>=3.2
 897     if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
 898         date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 899     # Remove AM/PM + timezone
 900     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
 901
 902     format_expressions = [
 903         '%d %B %Y',
 904         '%d %b %Y',
 905         '%B %d %Y',
 906         '%b %d %Y',
 907         '%b %dst %Y %I:%M%p',
 908         '%b %dnd %Y %I:%M%p',
 909         '%b %dth %Y %I:%M%p',
 910         '%Y %m %d',
 911         '%Y-%m-%d',
 912         '%Y/%m/%d',
 913         '%Y/%m/%d %H:%M:%S',
 914         '%Y-%m-%d %H:%M:%S',
 915         '%Y-%m-%d %H:%M:%S.%f',
 916         '%d.%m.%Y %H:%M',
 917         '%d.%m.%Y %H.%M',
 918         '%Y-%m-%dT%H:%M:%SZ',
 919         '%Y-%m-%dT%H:%M:%S.%fZ',
 920         '%Y-%m-%dT%H:%M:%S.%f0Z',
 921         '%Y-%m-%dT%H:%M:%S',
 922         '%Y-%m-%dT%H:%M:%S.%f',
 923         '%Y-%m-%dT%H:%M',
 924     ]
 925     if day_first:
 926         format_expressions.extend([
 927             '%d-%m-%Y',
 928             '%d.%m.%Y',
 929             '%d/%m/%Y',
 930             '%d/%m/%y',
 931             '%d/%m/%Y %H:%M:%S',
 932         ])
 933     else:
 934         format_expressions.extend([
 935             '%m-%d-%Y',
 936             '%m.%d.%Y',
 937             '%m/%d/%Y',
 938             '%m/%d/%y',
 939             '%m/%d/%Y %H:%M:%S',
 940         ])
 941     for expression in format_expressions:
 942         try:
 943             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 944         except ValueError:
 945             pass
 946     if upload_date is None:
 947         timetuple = email.utils.parsedate_tz(date_str)
 948         if timetuple:
 949             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 950     if upload_date is not None:
 951         return compat_str(upload_date)
 952
 953
 954 def determine_ext(url, default_ext='unknown_video'):
 955     if url is None:
 956         return default_ext
 957     guess = url.partition('?')[0].rpartition('.')[2]
 958     if re.match(r'^[A-Za-z0-9]+$', guess):
 959         return guess
 960     # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
 961     elif guess.rstrip('/') in KNOWN_EXTENSIONS:
 962         return guess.rstrip('/')
 963     else:
 964         return default_ext
 965
 966
 967 def subtitles_filename(filename, sub_lang, sub_format):
 968     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 969
 970
 971 def date_from_str(date_str):
 972     """
 973     Return a datetime object from a string in the format YYYYMMDD or
 974     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 975     today = datetime.date.today()
 976     if date_str in ('now', 'today'):
 977         return today
 978     if date_str == 'yesterday':
 979         return today - datetime.timedelta(days=1)
 980     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 981     if match is not None:
 982         sign = match.group('sign')
 983         time = int(match.group('time'))
 984         if sign == '-':
 985             time = -time
 986         unit = match.group('unit')
 987         # A bad approximation?
 988         if unit == 'month':
 989             unit = 'day'
 990             time *= 30
 991         elif unit == 'year':
 992             unit = 'day'
 993             time *= 365
 994         unit += 's'
 995         delta = datetime.timedelta(**{unit: time})
 996         return today + delta
 997     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 998
 999
1000 def hyphenate_date(date_str):
1001     """
1002     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1003     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1004     if match is not None:
1005         return '-'.join(match.groups())
1006     else:
1007         return date_str
1008
1009
1010 class DateRange(object):
1011     """Represents a time interval between two dates"""
1012
1013     def __init__(self, start=None, end=None):
1014         """start and end must be strings in the format accepted by date"""
1015         if start is not None:
1016             self.start = date_from_str(start)
1017         else:
1018             self.start = datetime.datetime.min.date()
1019         if end is not None:
1020             self.end = date_from_str(end)
1021         else:
1022             self.end = datetime.datetime.max.date()
1023         if self.start > self.end:
1024             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1025
1026     @classmethod
1027     def day(cls, day):
1028         """Returns a range that only contains the given day"""
1029         return cls(day, day)
1030
1031     def __contains__(self, date):
1032         """Check if the date is in the range"""
1033         if not isinstance(date, datetime.date):
1034             date = date_from_str(date)
1035         return self.start <= date <= self.end
1036
1037     def __str__(self):
1038         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1039
1040
1041 def platform_name():
1042     """ Returns the platform name as a compat_str """
1043     res = platform.platform()
1044     if isinstance(res, bytes):
1045         res = res.decode(preferredencoding())
1046
1047     assert isinstance(res, compat_str)
1048     return res
1049
1050
1051 def _windows_write_string(s, out):
1052     """ Returns True if the string was written using special methods,
1053     False if it has yet to be written out."""
1054     # Adapted from http://stackoverflow.com/a/3259271/35070
1055
1056     import ctypes
1057     import ctypes.wintypes
1058
1059     WIN_OUTPUT_IDS = {
1060         1: -11,
1061         2: -12,
1062     }
1063
1064     try:
1065         fileno = out.fileno()
1066     except AttributeError:
1067         # If the output stream doesn't have a fileno, it's virtual
1068         return False
1069     except io.UnsupportedOperation:
1070         # Some strange Windows pseudo files?
1071         return False
1072     if fileno not in WIN_OUTPUT_IDS:
1073         return False
1074
1075     GetStdHandle = ctypes.WINFUNCTYPE(
1076         ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1077         (b"GetStdHandle", ctypes.windll.kernel32))
1078     h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1079
1080     WriteConsoleW = ctypes.WINFUNCTYPE(
1081         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1082         ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1083         ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
1084     written = ctypes.wintypes.DWORD(0)
1085
1086     GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
1087     FILE_TYPE_CHAR = 0x0002
1088     FILE_TYPE_REMOTE = 0x8000
1089     GetConsoleMode = ctypes.WINFUNCTYPE(
1090         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1091         ctypes.POINTER(ctypes.wintypes.DWORD))(
1092         (b"GetConsoleMode", ctypes.windll.kernel32))
1093     INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1094
1095     def not_a_console(handle):
1096         if handle == INVALID_HANDLE_VALUE or handle is None:
1097             return True
1098         return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1099                 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1100
1101     if not_a_console(h):
1102         return False
1103
1104     def next_nonbmp_pos(s):
1105         try:
1106             return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1107         except StopIteration:
1108             return len(s)
1109
1110     while s:
1111         count = min(next_nonbmp_pos(s), 1024)
1112
1113         ret = WriteConsoleW(
1114             h, s, count if count else 2, ctypes.byref(written), None)
1115         if ret == 0:
1116             raise OSError('Failed to write string')
1117         if not count:  # We just wrote a non-BMP character
1118             assert written.value == 2
1119             s = s[1:]
1120         else:
1121             assert written.value > 0
1122             s = s[written.value:]
1123     return True
1124
1125
1126 def write_string(s, out=None, encoding=None):
1127     if out is None:
1128         out = sys.stderr
1129     assert type(s) == compat_str
1130
1131     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1132         if _windows_write_string(s, out):
1133             return
1134
1135     if ('b' in getattr(out, 'mode', '') or
1136             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
1137         byt = s.encode(encoding or preferredencoding(), 'ignore')
1138         out.write(byt)
1139     elif hasattr(out, 'buffer'):
1140         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1141         byt = s.encode(enc, 'ignore')
1142         out.buffer.write(byt)
1143     else:
1144         out.write(s)
1145     out.flush()
1146
1147
1148 def bytes_to_intlist(bs):
1149     if not bs:
1150         return []
1151     if isinstance(bs[0], int):  # Python 3
1152         return list(bs)
1153     else:
1154         return [ord(c) for c in bs]
1155
1156
1157 def intlist_to_bytes(xs):
1158     if not xs:
1159         return b''
1160     return struct_pack('%dB' % len(xs), *xs)
1161
1162
1163 # Cross-platform file locking
1164 if sys.platform == 'win32':
1165     import ctypes.wintypes
1166     import msvcrt
1167
1168     class OVERLAPPED(ctypes.Structure):
1169         _fields_ = [
1170             ('Internal', ctypes.wintypes.LPVOID),
1171             ('InternalHigh', ctypes.wintypes.LPVOID),
1172             ('Offset', ctypes.wintypes.DWORD),
1173             ('OffsetHigh', ctypes.wintypes.DWORD),
1174             ('hEvent', ctypes.wintypes.HANDLE),
1175         ]
1176
1177     kernel32 = ctypes.windll.kernel32
1178     LockFileEx = kernel32.LockFileEx
1179     LockFileEx.argtypes = [
1180         ctypes.wintypes.HANDLE,     # hFile
1181         ctypes.wintypes.DWORD,      # dwFlags
1182         ctypes.wintypes.DWORD,      # dwReserved
1183         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
1184         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
1185         ctypes.POINTER(OVERLAPPED)  # Overlapped
1186     ]
1187     LockFileEx.restype = ctypes.wintypes.BOOL
1188     UnlockFileEx = kernel32.UnlockFileEx
1189     UnlockFileEx.argtypes = [
1190         ctypes.wintypes.HANDLE,     # hFile
1191         ctypes.wintypes.DWORD,      # dwReserved
1192         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
1193         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
1194         ctypes.POINTER(OVERLAPPED)  # Overlapped
1195     ]
1196     UnlockFileEx.restype = ctypes.wintypes.BOOL
1197     whole_low = 0xffffffff
1198     whole_high = 0x7fffffff
1199
1200     def _lock_file(f, exclusive):
1201         overlapped = OVERLAPPED()
1202         overlapped.Offset = 0
1203         overlapped.OffsetHigh = 0
1204         overlapped.hEvent = 0
1205         f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1206         handle = msvcrt.get_osfhandle(f.fileno())
1207         if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1208                           whole_low, whole_high, f._lock_file_overlapped_p):
1209             raise OSError('Locking file failed: %r' % ctypes.FormatError())
1210
1211     def _unlock_file(f):
1212         assert f._lock_file_overlapped_p
1213         handle = msvcrt.get_osfhandle(f.fileno())
1214         if not UnlockFileEx(handle, 0,
1215                             whole_low, whole_high, f._lock_file_overlapped_p):
1216             raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1217
1218 else:
1219     import fcntl
1220
1221     def _lock_file(f, exclusive):
1222         fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1223
1224     def _unlock_file(f):
1225         fcntl.flock(f, fcntl.LOCK_UN)
1226
1227
1228 class locked_file(object):
1229     def __init__(self, filename, mode, encoding=None):
1230         assert mode in ['r', 'a', 'w']
1231         self.f = io.open(filename, mode, encoding=encoding)
1232         self.mode = mode
1233
1234     def __enter__(self):
1235         exclusive = self.mode != 'r'
1236         try:
1237             _lock_file(self.f, exclusive)
1238         except IOError:
1239             self.f.close()
1240             raise
1241         return self
1242
1243     def __exit__(self, etype, value, traceback):
1244         try:
1245             _unlock_file(self.f)
1246         finally:
1247             self.f.close()
1248
1249     def __iter__(self):
1250         return iter(self.f)
1251
1252     def write(self, *args):
1253         return self.f.write(*args)
1254
1255     def read(self, *args):
1256         return self.f.read(*args)
1257
1258
1259 def get_filesystem_encoding():
1260     encoding = sys.getfilesystemencoding()
1261     return encoding if encoding is not None else 'utf-8'
1262
1263
1264 def shell_quote(args):
1265     quoted_args = []
1266     encoding = get_filesystem_encoding()
1267     for a in args:
1268         if isinstance(a, bytes):
1269             # We may get a filename encoded with 'encodeFilename'
1270             a = a.decode(encoding)
1271         quoted_args.append(pipes.quote(a))
1272     return ' '.join(quoted_args)
1273
1274
1275 def smuggle_url(url, data):
1276     """ Pass additional data in a URL for internal use. """
1277
1278     sdata = compat_urllib_parse.urlencode(
1279         {'__youtubedl_smuggle': json.dumps(data)})
1280     return url + '#' + sdata
1281
1282
1283 def unsmuggle_url(smug_url, default=None):
1284     if '#__youtubedl_smuggle' not in smug_url:
1285         return smug_url, default
1286     url, _, sdata = smug_url.rpartition('#')
1287     jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1288     data = json.loads(jsond)
1289     return url, data
1290
1291
1292 def format_bytes(bytes):
1293     if bytes is None:
1294         return 'N/A'
1295     if type(bytes) is str:
1296         bytes = float(bytes)
1297     if bytes == 0.0:
1298         exponent = 0
1299     else:
1300         exponent = int(math.log(bytes, 1024.0))
1301     suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1302     converted = float(bytes) / float(1024 ** exponent)
1303     return '%.2f%s' % (converted, suffix)
1304
1305
1306 def parse_filesize(s):
1307     if s is None:
1308         return None
1309
1310     # The lower-case forms are of course incorrect and unofficial,
1311     # but we support those too
1312     _UNIT_TABLE = {
1313         'B': 1,
1314         'b': 1,
1315         'KiB': 1024,
1316         'KB': 1000,
1317         'kB': 1024,
1318         'Kb': 1000,
1319         'MiB': 1024 ** 2,
1320         'MB': 1000 ** 2,
1321         'mB': 1024 ** 2,
1322         'Mb': 1000 ** 2,
1323         'GiB': 1024 ** 3,
1324         'GB': 1000 ** 3,
1325         'gB': 1024 ** 3,
1326         'Gb': 1000 ** 3,
1327         'TiB': 1024 ** 4,
1328         'TB': 1000 ** 4,
1329         'tB': 1024 ** 4,
1330         'Tb': 1000 ** 4,
1331         'PiB': 1024 ** 5,
1332         'PB': 1000 ** 5,
1333         'pB': 1024 ** 5,
1334         'Pb': 1000 ** 5,
1335         'EiB': 1024 ** 6,
1336         'EB': 1000 ** 6,
1337         'eB': 1024 ** 6,
1338         'Eb': 1000 ** 6,
1339         'ZiB': 1024 ** 7,
1340         'ZB': 1000 ** 7,
1341         'zB': 1024 ** 7,
1342         'Zb': 1000 ** 7,
1343         'YiB': 1024 ** 8,
1344         'YB': 1000 ** 8,
1345         'yB': 1024 ** 8,
1346         'Yb': 1000 ** 8,
1347     }
1348
1349     units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1350     m = re.match(
1351         r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1352     if not m:
1353         return None
1354
1355     num_str = m.group('num').replace(',', '.')
1356     mult = _UNIT_TABLE[m.group('unit')]
1357     return int(float(num_str) * mult)
1358
1359
1360 def month_by_name(name):
1361     """ Return the number of a month by (locale-independently) English name """
1362
1363     try:
1364         return ENGLISH_MONTH_NAMES.index(name) + 1
1365     except ValueError:
1366         return None
1367
1368
1369 def month_by_abbreviation(abbrev):
1370     """ Return the number of a month by (locale-independently) English
1371         abbreviations """
1372
1373     try:
1374         return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1375     except ValueError:
1376         return None
1377
1378
1379 def fix_xml_ampersands(xml_str):
1380     """Replace all the '&' by '&amp;' in XML"""
1381     return re.sub(
1382         r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1383         '&amp;',
1384         xml_str)
1385
1386
1387 def setproctitle(title):
1388     assert isinstance(title, compat_str)
1389     try:
1390         libc = ctypes.cdll.LoadLibrary("libc.so.6")
1391     except OSError:
1392         return
1393     title_bytes = title.encode('utf-8')
1394     buf = ctypes.create_string_buffer(len(title_bytes))
1395     buf.value = title_bytes
1396     try:
1397         libc.prctl(15, buf, 0, 0, 0)
1398     except AttributeError:
1399         return  # Strange libc, just skip this
1400
1401
1402 def remove_start(s, start):
1403     if s.startswith(start):
1404         return s[len(start):]
1405     return s
1406
1407
1408 def remove_end(s, end):
1409     if s.endswith(end):
1410         return s[:-len(end)]
1411     return s
1412
1413
1414 def remove_quotes(s):
1415     if s is None or len(s) < 2:
1416         return s
1417     for quote in ('"', "'", ):
1418         if s[0] == quote and s[-1] == quote:
1419             return s[1:-1]
1420     return s
1421
1422
1423 def url_basename(url):
1424     path = compat_urlparse.urlparse(url).path
1425     return path.strip('/').split('/')[-1]
1426
1427
1428 class HEADRequest(compat_urllib_request.Request):
1429     def get_method(self):
1430         return "HEAD"
1431
1432
1433 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1434     if get_attr:
1435         if v is not None:
1436             v = getattr(v, get_attr, None)
1437     if v == '':
1438         v = None
1439     if v is None:
1440         return default
1441     try:
1442         return int(v) * invscale // scale
1443     except ValueError:
1444         return default
1445
1446
1447 def str_or_none(v, default=None):
1448     return default if v is None else compat_str(v)
1449
1450
1451 def str_to_int(int_str):
1452     """ A more relaxed version of int_or_none """
1453     if int_str is None:
1454         return None
1455     int_str = re.sub(r'[,\.\+]', '', int_str)
1456     return int(int_str)
1457
1458
1459 def float_or_none(v, scale=1, invscale=1, default=None):
1460     if v is None:
1461         return default
1462     try:
1463         return float(v) * invscale / scale
1464     except ValueError:
1465         return default
1466
1467
1468 def parse_duration(s):
1469     if not isinstance(s, compat_basestring):
1470         return None
1471
1472     s = s.strip()
1473
1474     m = re.match(
1475         r'''(?ix)(?:P?T)?
1476         (?:
1477             (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
1478             (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1479
1480             \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
1481             (?:
1482                 (?:
1483                     (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1484                     (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1485                 )?
1486                 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1487             )?
1488             (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1489         )$''', s)
1490     if not m:
1491         return None
1492     res = 0
1493     if m.group('only_mins'):
1494         return float_or_none(m.group('only_mins'), invscale=60)
1495     if m.group('only_hours'):
1496         return float_or_none(m.group('only_hours'), invscale=60 * 60)
1497     if m.group('secs'):
1498         res += int(m.group('secs'))
1499     if m.group('mins_reversed'):
1500         res += int(m.group('mins_reversed')) * 60
1501     if m.group('mins'):
1502         res += int(m.group('mins')) * 60
1503     if m.group('hours'):
1504         res += int(m.group('hours')) * 60 * 60
1505     if m.group('hours_reversed'):
1506         res += int(m.group('hours_reversed')) * 60 * 60
1507     if m.group('days'):
1508         res += int(m.group('days')) * 24 * 60 * 60
1509     if m.group('ms'):
1510         res += float(m.group('ms'))
1511     return res
1512
1513
1514 def prepend_extension(filename, ext, expected_real_ext=None):
1515     name, real_ext = os.path.splitext(filename)
1516     return (
1517         '{0}.{1}{2}'.format(name, ext, real_ext)
1518         if not expected_real_ext or real_ext[1:] == expected_real_ext
1519         else '{0}.{1}'.format(filename, ext))
1520
1521
1522 def replace_extension(filename, ext, expected_real_ext=None):
1523     name, real_ext = os.path.splitext(filename)
1524     return '{0}.{1}'.format(
1525         name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1526         ext)
1527
1528
1529 def check_executable(exe, args=[]):
1530     """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1531     args can be a list of arguments for a short output (like -version) """
1532     try:
1533         subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1534     except OSError:
1535         return False
1536     return exe
1537
1538
1539 def get_exe_version(exe, args=['--version'],
1540                     version_re=None, unrecognized='present'):
1541     """ Returns the version of the specified executable,
1542     or False if the executable is not present """
1543     try:
1544         out, _ = subprocess.Popen(
1545             [encodeArgument(exe)] + args,
1546             stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1547     except OSError:
1548         return False
1549     if isinstance(out, bytes):  # Python 2.x
1550         out = out.decode('ascii', 'ignore')
1551     return detect_exe_version(out, version_re, unrecognized)
1552
1553
1554 def detect_exe_version(output, version_re=None, unrecognized='present'):
1555     assert isinstance(output, compat_str)
1556     if version_re is None:
1557         version_re = r'version\s+([-0-9._a-zA-Z]+)'
1558     m = re.search(version_re, output)
1559     if m:
1560         return m.group(1)
1561     else:
1562         return unrecognized
1563
1564
1565 class PagedList(object):
1566     def __len__(self):
1567         # This is only useful for tests
1568         return len(self.getslice())
1569
1570
1571 class OnDemandPagedList(PagedList):
1572     def __init__(self, pagefunc, pagesize):
1573         self._pagefunc = pagefunc
1574         self._pagesize = pagesize
1575
1576     def getslice(self, start=0, end=None):
1577         res = []
1578         for pagenum in itertools.count(start // self._pagesize):
1579             firstid = pagenum * self._pagesize
1580             nextfirstid = pagenum * self._pagesize + self._pagesize
1581             if start >= nextfirstid:
1582                 continue
1583
1584             page_results = list(self._pagefunc(pagenum))
1585
1586             startv = (
1587                 start % self._pagesize
1588                 if firstid <= start < nextfirstid
1589                 else 0)
1590
1591             endv = (
1592                 ((end - 1) % self._pagesize) + 1
1593                 if (end is not None and firstid <= end <= nextfirstid)
1594                 else None)
1595
1596             if startv != 0 or endv is not None:
1597                 page_results = page_results[startv:endv]
1598             res.extend(page_results)
1599
1600             # A little optimization - if current page is not "full", ie. does
1601             # not contain page_size videos then we can assume that this page
1602             # is the last one - there are no more ids on further pages -
1603             # i.e. no need to query again.
1604             if len(page_results) + startv < self._pagesize:
1605                 break
1606
1607             # If we got the whole page, but the next page is not interesting,
1608             # break out early as well
1609             if end == nextfirstid:
1610                 break
1611         return res
1612
1613
1614 class InAdvancePagedList(PagedList):
1615     def __init__(self, pagefunc, pagecount, pagesize):
1616         self._pagefunc = pagefunc
1617         self._pagecount = pagecount
1618         self._pagesize = pagesize
1619
1620     def getslice(self, start=0, end=None):
1621         res = []
1622         start_page = start // self._pagesize
1623         end_page = (
1624             self._pagecount if end is None else (end // self._pagesize + 1))
1625         skip_elems = start - start_page * self._pagesize
1626         only_more = None if end is None else end - start
1627         for pagenum in range(start_page, end_page):
1628             page = list(self._pagefunc(pagenum))
1629             if skip_elems:
1630                 page = page[skip_elems:]
1631                 skip_elems = None
1632             if only_more is not None:
1633                 if len(page) < only_more:
1634                     only_more -= len(page)
1635                 else:
1636                     page = page[:only_more]
1637                     res.extend(page)
1638                     break
1639             res.extend(page)
1640         return res
1641
1642
1643 def uppercase_escape(s):
1644     unicode_escape = codecs.getdecoder('unicode_escape')
1645     return re.sub(
1646         r'\\U[0-9a-fA-F]{8}',
1647         lambda m: unicode_escape(m.group(0))[0],
1648         s)
1649
1650
1651 def lowercase_escape(s):
1652     unicode_escape = codecs.getdecoder('unicode_escape')
1653     return re.sub(
1654         r'\\u[0-9a-fA-F]{4}',
1655         lambda m: unicode_escape(m.group(0))[0],
1656         s)
1657
1658
1659 def escape_rfc3986(s):
1660     """Escape non-ASCII characters as suggested by RFC 3986"""
1661     if sys.version_info < (3, 0) and isinstance(s, compat_str):
1662         s = s.encode('utf-8')
1663     return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1664
1665
1666 def escape_url(url):
1667     """Escape URL as suggested by RFC 3986"""
1668     url_parsed = compat_urllib_parse_urlparse(url)
1669     return url_parsed._replace(
1670         path=escape_rfc3986(url_parsed.path),
1671         params=escape_rfc3986(url_parsed.params),
1672         query=escape_rfc3986(url_parsed.query),
1673         fragment=escape_rfc3986(url_parsed.fragment)
1674     ).geturl()
1675
1676 try:
1677     struct.pack('!I', 0)
1678 except TypeError:
1679     # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1680     def struct_pack(spec, *args):
1681         if isinstance(spec, compat_str):
1682             spec = spec.encode('ascii')
1683         return struct.pack(spec, *args)
1684
1685     def struct_unpack(spec, *args):
1686         if isinstance(spec, compat_str):
1687             spec = spec.encode('ascii')
1688         return struct.unpack(spec, *args)
1689 else:
1690     struct_pack = struct.pack
1691     struct_unpack = struct.unpack
1692
1693
1694 def read_batch_urls(batch_fd):
1695     def fixup(url):
1696         if not isinstance(url, compat_str):
1697             url = url.decode('utf-8', 'replace')
1698         BOM_UTF8 = '\xef\xbb\xbf'
1699         if url.startswith(BOM_UTF8):
1700             url = url[len(BOM_UTF8):]
1701         url = url.strip()
1702         if url.startswith(('#', ';', ']')):
1703             return False
1704         return url
1705
1706     with contextlib.closing(batch_fd) as fd:
1707         return [url for url in map(fixup, fd) if url]
1708
1709
1710 def urlencode_postdata(*args, **kargs):
1711     return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
1712
1713
1714 def encode_dict(d, encoding='utf-8'):
1715     def encode(v):
1716         return v.encode(encoding) if isinstance(v, compat_basestring) else v
1717     return dict((encode(k), encode(v)) for k, v in d.items())
1718
1719
1720 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
1721     return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
1722
1723
1724 US_RATINGS = {
1725     'G': 0,
1726     'PG': 10,
1727     'PG-13': 13,
1728     'R': 16,
1729     'NC': 18,
1730 }
1731
1732
1733 def parse_age_limit(s):
1734     if s is None:
1735         return None
1736     m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1737     return int(m.group('age')) if m else US_RATINGS.get(s, None)
1738
1739
1740 def strip_jsonp(code):
1741     return re.sub(
1742         r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1743
1744
1745 def js_to_json(code):
1746     def fix_kv(m):
1747         v = m.group(0)
1748         if v in ('true', 'false', 'null'):
1749             return v
1750         if v.startswith('"'):
1751             v = re.sub(r"\\'", "'", v[1:-1])
1752         elif v.startswith("'"):
1753             v = v[1:-1]
1754             v = re.sub(r"\\\\|\\'|\"", lambda m: {
1755                 '\\\\': '\\\\',
1756                 "\\'": "'",
1757                 '"': '\\"',
1758             }[m.group(0)], v)
1759         return '"%s"' % v
1760
1761     res = re.sub(r'''(?x)
1762         "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
1763         '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
1764         [a-zA-Z_][.a-zA-Z_0-9]*
1765         ''', fix_kv, code)
1766     res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
1767     return res
1768
1769
1770 def qualities(quality_ids):
1771     """ Get a numeric quality value out of a list of possible values """
1772     def q(qid):
1773         try:
1774             return quality_ids.index(qid)
1775         except ValueError:
1776             return -1
1777     return q
1778
1779
1780 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1781
1782
1783 def limit_length(s, length):
1784     """ Add ellipses to overly long strings """
1785     if s is None:
1786         return None
1787     ELLIPSES = '...'
1788     if len(s) > length:
1789         return s[:length - len(ELLIPSES)] + ELLIPSES
1790     return s
1791
1792
1793 def version_tuple(v):
1794     return tuple(int(e) for e in re.split(r'[-.]', v))
1795
1796
1797 def is_outdated_version(version, limit, assume_new=True):
1798     if not version:
1799         return not assume_new
1800     try:
1801         return version_tuple(version) < version_tuple(limit)
1802     except ValueError:
1803         return not assume_new
1804
1805
1806 def ytdl_is_updateable():
1807     """ Returns if youtube-dl can be updated with -U """
1808     from zipimport import zipimporter
1809
1810     return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
1811
1812
1813 def args_to_str(args):
1814     # Get a short string representation for a subprocess command
1815     return ' '.join(shlex_quote(a) for a in args)
1816
1817
1818 def error_to_compat_str(err):
1819     err_str = str(err)
1820     # On python 2 error byte string must be decoded with proper
1821     # encoding rather than ascii
1822     if sys.version_info[0] < 3:
1823         err_str = err_str.decode(preferredencoding())
1824     return err_str
1825
1826
1827 def mimetype2ext(mt):
1828     _, _, res = mt.rpartition('/')
1829
1830     return {
1831         '3gpp': '3gp',
1832         'ttml+xml': 'ttml',
1833         'x-flv': 'flv',
1834         'x-mp4-fragmented': 'mp4',
1835         'x-ms-wmv': 'wmv',
1836     }.get(res, res)
1837
1838
1839 def urlhandle_detect_ext(url_handle):
1840     try:
1841         url_handle.headers
1842         getheader = lambda h: url_handle.headers[h]
1843     except AttributeError:  # Python < 3
1844         getheader = url_handle.info().getheader
1845
1846     cd = getheader('Content-Disposition')
1847     if cd:
1848         m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1849         if m:
1850             e = determine_ext(m.group('filename'), default_ext=None)
1851             if e:
1852                 return e
1853
1854     return mimetype2ext(getheader('Content-Type'))
1855
1856
1857 def encode_data_uri(data, mime_type):
1858     return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
1859
1860
1861 def age_restricted(content_limit, age_limit):
1862     """ Returns True iff the content should be blocked """
1863
1864     if age_limit is None:  # No limit set
1865         return False
1866     if content_limit is None:
1867         return False  # Content available for everyone
1868     return age_limit < content_limit
1869
1870
1871 def is_html(first_bytes):
1872     """ Detect whether a file contains HTML by examining its first bytes. """
1873
1874     BOMS = [
1875         (b'\xef\xbb\xbf', 'utf-8'),
1876         (b'\x00\x00\xfe\xff', 'utf-32-be'),
1877         (b'\xff\xfe\x00\x00', 'utf-32-le'),
1878         (b'\xff\xfe', 'utf-16-le'),
1879         (b'\xfe\xff', 'utf-16-be'),
1880     ]
1881     for bom, enc in BOMS:
1882         if first_bytes.startswith(bom):
1883             s = first_bytes[len(bom):].decode(enc, 'replace')
1884             break
1885     else:
1886         s = first_bytes.decode('utf-8', 'replace')
1887
1888     return re.match(r'^\s*<', s)
1889
1890
1891 def determine_protocol(info_dict):
1892     protocol = info_dict.get('protocol')
1893     if protocol is not None:
1894         return protocol
1895
1896     url = info_dict['url']
1897     if url.startswith('rtmp'):
1898         return 'rtmp'
1899     elif url.startswith('mms'):
1900         return 'mms'
1901     elif url.startswith('rtsp'):
1902         return 'rtsp'
1903
1904     ext = determine_ext(url)
1905     if ext == 'm3u8':
1906         return 'm3u8'
1907     elif ext == 'f4m':
1908         return 'f4m'
1909
1910     return compat_urllib_parse_urlparse(url).scheme
1911
1912
1913 def render_table(header_row, data):
1914     """ Render a list of rows, each as a list of values """
1915     table = [header_row] + data
1916     max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
1917     format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
1918     return '\n'.join(format_str % tuple(row) for row in table)
1919
1920
1921 def _match_one(filter_part, dct):
1922     COMPARISON_OPERATORS = {
1923         '<': operator.lt,
1924         '<=': operator.le,
1925         '>': operator.gt,
1926         '>=': operator.ge,
1927         '=': operator.eq,
1928         '!=': operator.ne,
1929     }
1930     operator_rex = re.compile(r'''(?x)\s*
1931         (?P<key>[a-z_]+)
1932         \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1933         (?:
1934             (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1935             (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1936         )
1937         \s*$
1938         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1939     m = operator_rex.search(filter_part)
1940     if m:
1941         op = COMPARISON_OPERATORS[m.group('op')]
1942         if m.group('strval') is not None:
1943             if m.group('op') not in ('=', '!='):
1944                 raise ValueError(
1945                     'Operator %s does not support string values!' % m.group('op'))
1946             comparison_value = m.group('strval')
1947         else:
1948             try:
1949                 comparison_value = int(m.group('intval'))
1950             except ValueError:
1951                 comparison_value = parse_filesize(m.group('intval'))
1952                 if comparison_value is None:
1953                     comparison_value = parse_filesize(m.group('intval') + 'B')
1954                 if comparison_value is None:
1955                     raise ValueError(
1956                         'Invalid integer value %r in filter part %r' % (
1957                             m.group('intval'), filter_part))
1958         actual_value = dct.get(m.group('key'))
1959         if actual_value is None:
1960             return m.group('none_inclusive')
1961         return op(actual_value, comparison_value)
1962
1963     UNARY_OPERATORS = {
1964         '': lambda v: v is not None,
1965         '!': lambda v: v is None,
1966     }
1967     operator_rex = re.compile(r'''(?x)\s*
1968         (?P<op>%s)\s*(?P<key>[a-z_]+)
1969         \s*$
1970         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1971     m = operator_rex.search(filter_part)
1972     if m:
1973         op = UNARY_OPERATORS[m.group('op')]
1974         actual_value = dct.get(m.group('key'))
1975         return op(actual_value)
1976
1977     raise ValueError('Invalid filter part %r' % filter_part)
1978
1979
1980 def match_str(filter_str, dct):
1981     """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
1982
1983     return all(
1984         _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
1985
1986
1987 def match_filter_func(filter_str):
1988     def _match_func(info_dict):
1989         if match_str(filter_str, info_dict):
1990             return None
1991         else:
1992             video_title = info_dict.get('title', info_dict.get('id', 'video'))
1993             return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
1994     return _match_func
1995
1996
1997 def parse_dfxp_time_expr(time_expr):
1998     if not time_expr:
1999         return
2000
2001     mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2002     if mobj:
2003         return float(mobj.group('time_offset'))
2004
2005     mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
2006     if mobj:
2007         return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
2008
2009
2010 def srt_subtitles_timecode(seconds):
2011     return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
2012
2013
2014 def dfxp2srt(dfxp_data):
2015     _x = functools.partial(xpath_with_ns, ns_map={
2016         'ttml': 'http://www.w3.org/ns/ttml',
2017         'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
2018     })
2019
2020     def parse_node(node):
2021         str_or_empty = functools.partial(str_or_none, default='')
2022
2023         out = str_or_empty(node.text)
2024
2025         for child in node:
2026             if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2027                 out += '\n' + str_or_empty(child.tail)
2028             elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
2029                 out += str_or_empty(parse_node(child))
2030             else:
2031                 out += str_or_empty(xml.etree.ElementTree.tostring(child))
2032
2033         return out
2034
2035     dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
2036     out = []
2037     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
2038
2039     if not paras:
2040         raise ValueError('Invalid dfxp/TTML subtitle')
2041
2042     for para, index in zip(paras, itertools.count(1)):
2043         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
2044         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
2045         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2046         if begin_time is None:
2047             continue
2048         if not end_time:
2049             if not dur:
2050                 continue
2051             end_time = begin_time + dur
2052         out.append('%d\n%s --> %s\n%s\n\n' % (
2053             index,
2054             srt_subtitles_timecode(begin_time),
2055             srt_subtitles_timecode(end_time),
2056             parse_node(para)))
2057
2058     return ''.join(out)
2059
2060
2061 def cli_option(params, command_option, param):
2062     param = params.get(param)
2063     return [command_option, param] if param is not None else []
2064
2065
2066 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2067     param = params.get(param)
2068     assert isinstance(param, bool)
2069     if separator:
2070         return [command_option + separator + (true_value if param else false_value)]
2071     return [command_option, true_value if param else false_value]
2072
2073
2074 def cli_valueless_option(params, command_option, param, expected_value=True):
2075     param = params.get(param)
2076     return [command_option] if param == expected_value else []
2077
2078
2079 def cli_configuration_args(params, param, default=[]):
2080     ex_args = params.get(param)
2081     if ex_args is None:
2082         return default
2083     assert isinstance(ex_args, list)
2084     return ex_args
2085
2086
2087 class ISO639Utils(object):
2088     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2089     _lang_map = {
2090         'aa': 'aar',
2091         'ab': 'abk',
2092         'ae': 'ave',
2093         'af': 'afr',
2094         'ak': 'aka',
2095         'am': 'amh',
2096         'an': 'arg',
2097         'ar': 'ara',
2098         'as': 'asm',
2099         'av': 'ava',
2100         'ay': 'aym',
2101         'az': 'aze',
2102         'ba': 'bak',
2103         'be': 'bel',
2104         'bg': 'bul',
2105         'bh': 'bih',
2106         'bi': 'bis',
2107         'bm': 'bam',
2108         'bn': 'ben',
2109         'bo': 'bod',
2110         'br': 'bre',
2111         'bs': 'bos',
2112         'ca': 'cat',
2113         'ce': 'che',
2114         'ch': 'cha',
2115         'co': 'cos',
2116         'cr': 'cre',
2117         'cs': 'ces',
2118         'cu': 'chu',
2119         'cv': 'chv',
2120         'cy': 'cym',
2121         'da': 'dan',
2122         'de': 'deu',
2123         'dv': 'div',
2124         'dz': 'dzo',
2125         'ee': 'ewe',
2126         'el': 'ell',
2127         'en': 'eng',
2128         'eo': 'epo',
2129         'es': 'spa',
2130         'et': 'est',
2131         'eu': 'eus',
2132         'fa': 'fas',
2133         'ff': 'ful',
2134         'fi': 'fin',
2135         'fj': 'fij',
2136         'fo': 'fao',
2137         'fr': 'fra',
2138         'fy': 'fry',
2139         'ga': 'gle',
2140         'gd': 'gla',
2141         'gl': 'glg',
2142         'gn': 'grn',
2143         'gu': 'guj',
2144         'gv': 'glv',
2145         'ha': 'hau',
2146         'he': 'heb',
2147         'hi': 'hin',
2148         'ho': 'hmo',
2149         'hr': 'hrv',
2150         'ht': 'hat',
2151         'hu': 'hun',
2152         'hy': 'hye',
2153         'hz': 'her',
2154         'ia': 'ina',
2155         'id': 'ind',
2156         'ie': 'ile',
2157         'ig': 'ibo',
2158         'ii': 'iii',
2159         'ik': 'ipk',
2160         'io': 'ido',
2161         'is': 'isl',
2162         'it': 'ita',
2163         'iu': 'iku',
2164         'ja': 'jpn',
2165         'jv': 'jav',
2166         'ka': 'kat',
2167         'kg': 'kon',
2168         'ki': 'kik',
2169         'kj': 'kua',
2170         'kk': 'kaz',
2171         'kl': 'kal',
2172         'km': 'khm',
2173         'kn': 'kan',
2174         'ko': 'kor',
2175         'kr': 'kau',
2176         'ks': 'kas',
2177         'ku': 'kur',
2178         'kv': 'kom',
2179         'kw': 'cor',
2180         'ky': 'kir',
2181         'la': 'lat',
2182         'lb': 'ltz',
2183         'lg': 'lug',
2184         'li': 'lim',
2185         'ln': 'lin',
2186         'lo': 'lao',
2187         'lt': 'lit',
2188         'lu': 'lub',
2189         'lv': 'lav',
2190         'mg': 'mlg',
2191         'mh': 'mah',
2192         'mi': 'mri',
2193         'mk': 'mkd',
2194         'ml': 'mal',
2195         'mn': 'mon',
2196         'mr': 'mar',
2197         'ms': 'msa',
2198         'mt': 'mlt',
2199         'my': 'mya',
2200         'na': 'nau',
2201         'nb': 'nob',
2202         'nd': 'nde',
2203         'ne': 'nep',
2204         'ng': 'ndo',
2205         'nl': 'nld',
2206         'nn': 'nno',
2207         'no': 'nor',
2208         'nr': 'nbl',
2209         'nv': 'nav',
2210         'ny': 'nya',
2211         'oc': 'oci',
2212         'oj': 'oji',
2213         'om': 'orm',
2214         'or': 'ori',
2215         'os': 'oss',
2216         'pa': 'pan',
2217         'pi': 'pli',
2218         'pl': 'pol',
2219         'ps': 'pus',
2220         'pt': 'por',
2221         'qu': 'que',
2222         'rm': 'roh',
2223         'rn': 'run',
2224         'ro': 'ron',
2225         'ru': 'rus',
2226         'rw': 'kin',
2227         'sa': 'san',
2228         'sc': 'srd',
2229         'sd': 'snd',
2230         'se': 'sme',
2231         'sg': 'sag',
2232         'si': 'sin',
2233         'sk': 'slk',
2234         'sl': 'slv',
2235         'sm': 'smo',
2236         'sn': 'sna',
2237         'so': 'som',
2238         'sq': 'sqi',
2239         'sr': 'srp',
2240         'ss': 'ssw',
2241         'st': 'sot',
2242         'su': 'sun',
2243         'sv': 'swe',
2244         'sw': 'swa',
2245         'ta': 'tam',
2246         'te': 'tel',
2247         'tg': 'tgk',
2248         'th': 'tha',
2249         'ti': 'tir',
2250         'tk': 'tuk',
2251         'tl': 'tgl',
2252         'tn': 'tsn',
2253         'to': 'ton',
2254         'tr': 'tur',
2255         'ts': 'tso',
2256         'tt': 'tat',
2257         'tw': 'twi',
2258         'ty': 'tah',
2259         'ug': 'uig',
2260         'uk': 'ukr',
2261         'ur': 'urd',
2262         'uz': 'uzb',
2263         've': 'ven',
2264         'vi': 'vie',
2265         'vo': 'vol',
2266         'wa': 'wln',
2267         'wo': 'wol',
2268         'xh': 'xho',
2269         'yi': 'yid',
2270         'yo': 'yor',
2271         'za': 'zha',
2272         'zh': 'zho',
2273         'zu': 'zul',
2274     }
2275
2276     @classmethod
2277     def short2long(cls, code):
2278         """Convert language code from ISO 639-1 to ISO 639-2/T"""
2279         return cls._lang_map.get(code[:2])
2280
2281     @classmethod
2282     def long2short(cls, code):
2283         """Convert language code from ISO 639-2/T to ISO 639-1"""
2284         for short_name, long_name in cls._lang_map.items():
2285             if long_name == code:
2286                 return short_name
2287
2288
2289 class ISO3166Utils(object):
2290     # From http://data.okfn.org/data/core/country-list
2291     _country_map = {
2292         'AF': 'Afghanistan',
2293         'AX': 'Åland Islands',
2294         'AL': 'Albania',
2295         'DZ': 'Algeria',
2296         'AS': 'American Samoa',
2297         'AD': 'Andorra',
2298         'AO': 'Angola',
2299         'AI': 'Anguilla',
2300         'AQ': 'Antarctica',
2301         'AG': 'Antigua and Barbuda',
2302         'AR': 'Argentina',
2303         'AM': 'Armenia',
2304         'AW': 'Aruba',
2305         'AU': 'Australia',
2306         'AT': 'Austria',
2307         'AZ': 'Azerbaijan',
2308         'BS': 'Bahamas',
2309         'BH': 'Bahrain',
2310         'BD': 'Bangladesh',
2311         'BB': 'Barbados',
2312         'BY': 'Belarus',
2313         'BE': 'Belgium',
2314         'BZ': 'Belize',
2315         'BJ': 'Benin',
2316         'BM': 'Bermuda',
2317         'BT': 'Bhutan',
2318         'BO': 'Bolivia, Plurinational State of',
2319         'BQ': 'Bonaire, Sint Eustatius and Saba',
2320         'BA': 'Bosnia and Herzegovina',
2321         'BW': 'Botswana',
2322         'BV': 'Bouvet Island',
2323         'BR': 'Brazil',
2324         'IO': 'British Indian Ocean Territory',
2325         'BN': 'Brunei Darussalam',
2326         'BG': 'Bulgaria',
2327         'BF': 'Burkina Faso',
2328         'BI': 'Burundi',
2329         'KH': 'Cambodia',
2330         'CM': 'Cameroon',
2331         'CA': 'Canada',
2332         'CV': 'Cape Verde',
2333         'KY': 'Cayman Islands',
2334         'CF': 'Central African Republic',
2335         'TD': 'Chad',
2336         'CL': 'Chile',
2337         'CN': 'China',
2338         'CX': 'Christmas Island',
2339         'CC': 'Cocos (Keeling) Islands',
2340         'CO': 'Colombia',
2341         'KM': 'Comoros',
2342         'CG': 'Congo',
2343         'CD': 'Congo, the Democratic Republic of the',
2344         'CK': 'Cook Islands',
2345         'CR': 'Costa Rica',
2346         'CI': 'Côte d\'Ivoire',
2347         'HR': 'Croatia',
2348         'CU': 'Cuba',
2349         'CW': 'Curaçao',
2350         'CY': 'Cyprus',
2351         'CZ': 'Czech Republic',
2352         'DK': 'Denmark',
2353         'DJ': 'Djibouti',
2354         'DM': 'Dominica',
2355         'DO': 'Dominican Republic',
2356         'EC': 'Ecuador',
2357         'EG': 'Egypt',
2358         'SV': 'El Salvador',
2359         'GQ': 'Equatorial Guinea',
2360         'ER': 'Eritrea',
2361         'EE': 'Estonia',
2362         'ET': 'Ethiopia',
2363         'FK': 'Falkland Islands (Malvinas)',
2364         'FO': 'Faroe Islands',
2365         'FJ': 'Fiji',
2366         'FI': 'Finland',
2367         'FR': 'France',
2368         'GF': 'French Guiana',
2369         'PF': 'French Polynesia',
2370         'TF': 'French Southern Territories',
2371         'GA': 'Gabon',
2372         'GM': 'Gambia',
2373         'GE': 'Georgia',
2374         'DE': 'Germany',
2375         'GH': 'Ghana',
2376         'GI': 'Gibraltar',
2377         'GR': 'Greece',
2378         'GL': 'Greenland',
2379         'GD': 'Grenada',
2380         'GP': 'Guadeloupe',
2381         'GU': 'Guam',
2382         'GT': 'Guatemala',
2383         'GG': 'Guernsey',
2384         'GN': 'Guinea',
2385         'GW': 'Guinea-Bissau',
2386         'GY': 'Guyana',
2387         'HT': 'Haiti',
2388         'HM': 'Heard Island and McDonald Islands',
2389         'VA': 'Holy See (Vatican City State)',
2390         'HN': 'Honduras',
2391         'HK': 'Hong Kong',
2392         'HU': 'Hungary',
2393         'IS': 'Iceland',
2394         'IN': 'India',
2395         'ID': 'Indonesia',
2396         'IR': 'Iran, Islamic Republic of',
2397         'IQ': 'Iraq',
2398         'IE': 'Ireland',
2399         'IM': 'Isle of Man',
2400         'IL': 'Israel',
2401         'IT': 'Italy',
2402         'JM': 'Jamaica',
2403         'JP': 'Japan',
2404         'JE': 'Jersey',
2405         'JO': 'Jordan',
2406         'KZ': 'Kazakhstan',
2407         'KE': 'Kenya',
2408         'KI': 'Kiribati',
2409         'KP': 'Korea, Democratic People\'s Republic of',
2410         'KR': 'Korea, Republic of',
2411         'KW': 'Kuwait',
2412         'KG': 'Kyrgyzstan',
2413         'LA': 'Lao People\'s Democratic Republic',
2414         'LV': 'Latvia',
2415         'LB': 'Lebanon',
2416         'LS': 'Lesotho',
2417         'LR': 'Liberia',
2418         'LY': 'Libya',
2419         'LI': 'Liechtenstein',
2420         'LT': 'Lithuania',
2421         'LU': 'Luxembourg',
2422         'MO': 'Macao',
2423         'MK': 'Macedonia, the Former Yugoslav Republic of',
2424         'MG': 'Madagascar',
2425         'MW': 'Malawi',
2426         'MY': 'Malaysia',
2427         'MV': 'Maldives',
2428         'ML': 'Mali',
2429         'MT': 'Malta',
2430         'MH': 'Marshall Islands',
2431         'MQ': 'Martinique',
2432         'MR': 'Mauritania',
2433         'MU': 'Mauritius',
2434         'YT': 'Mayotte',
2435         'MX': 'Mexico',
2436         'FM': 'Micronesia, Federated States of',
2437         'MD': 'Moldova, Republic of',
2438         'MC': 'Monaco',
2439         'MN': 'Mongolia',
2440         'ME': 'Montenegro',
2441         'MS': 'Montserrat',
2442         'MA': 'Morocco',
2443         'MZ': 'Mozambique',
2444         'MM': 'Myanmar',
2445         'NA': 'Namibia',
2446         'NR': 'Nauru',
2447         'NP': 'Nepal',
2448         'NL': 'Netherlands',
2449         'NC': 'New Caledonia',
2450         'NZ': 'New Zealand',
2451         'NI': 'Nicaragua',
2452         'NE': 'Niger',
2453         'NG': 'Nigeria',
2454         'NU': 'Niue',
2455         'NF': 'Norfolk Island',
2456         'MP': 'Northern Mariana Islands',
2457         'NO': 'Norway',
2458         'OM': 'Oman',
2459         'PK': 'Pakistan',
2460         'PW': 'Palau',
2461         'PS': 'Palestine, State of',
2462         'PA': 'Panama',
2463         'PG': 'Papua New Guinea',
2464         'PY': 'Paraguay',
2465         'PE': 'Peru',
2466         'PH': 'Philippines',
2467         'PN': 'Pitcairn',
2468         'PL': 'Poland',
2469         'PT': 'Portugal',
2470         'PR': 'Puerto Rico',
2471         'QA': 'Qatar',
2472         'RE': 'Réunion',
2473         'RO': 'Romania',
2474         'RU': 'Russian Federation',
2475         'RW': 'Rwanda',
2476         'BL': 'Saint Barthélemy',
2477         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2478         'KN': 'Saint Kitts and Nevis',
2479         'LC': 'Saint Lucia',
2480         'MF': 'Saint Martin (French part)',
2481         'PM': 'Saint Pierre and Miquelon',
2482         'VC': 'Saint Vincent and the Grenadines',
2483         'WS': 'Samoa',
2484         'SM': 'San Marino',
2485         'ST': 'Sao Tome and Principe',
2486         'SA': 'Saudi Arabia',
2487         'SN': 'Senegal',
2488         'RS': 'Serbia',
2489         'SC': 'Seychelles',
2490         'SL': 'Sierra Leone',
2491         'SG': 'Singapore',
2492         'SX': 'Sint Maarten (Dutch part)',
2493         'SK': 'Slovakia',
2494         'SI': 'Slovenia',
2495         'SB': 'Solomon Islands',
2496         'SO': 'Somalia',
2497         'ZA': 'South Africa',
2498         'GS': 'South Georgia and the South Sandwich Islands',
2499         'SS': 'South Sudan',
2500         'ES': 'Spain',
2501         'LK': 'Sri Lanka',
2502         'SD': 'Sudan',
2503         'SR': 'Suriname',
2504         'SJ': 'Svalbard and Jan Mayen',
2505         'SZ': 'Swaziland',
2506         'SE': 'Sweden',
2507         'CH': 'Switzerland',
2508         'SY': 'Syrian Arab Republic',
2509         'TW': 'Taiwan, Province of China',
2510         'TJ': 'Tajikistan',
2511         'TZ': 'Tanzania, United Republic of',
2512         'TH': 'Thailand',
2513         'TL': 'Timor-Leste',
2514         'TG': 'Togo',
2515         'TK': 'Tokelau',
2516         'TO': 'Tonga',
2517         'TT': 'Trinidad and Tobago',
2518         'TN': 'Tunisia',
2519         'TR': 'Turkey',
2520         'TM': 'Turkmenistan',
2521         'TC': 'Turks and Caicos Islands',
2522         'TV': 'Tuvalu',
2523         'UG': 'Uganda',
2524         'UA': 'Ukraine',
2525         'AE': 'United Arab Emirates',
2526         'GB': 'United Kingdom',
2527         'US': 'United States',
2528         'UM': 'United States Minor Outlying Islands',
2529         'UY': 'Uruguay',
2530         'UZ': 'Uzbekistan',
2531         'VU': 'Vanuatu',
2532         'VE': 'Venezuela, Bolivarian Republic of',
2533         'VN': 'Viet Nam',
2534         'VG': 'Virgin Islands, British',
2535         'VI': 'Virgin Islands, U.S.',
2536         'WF': 'Wallis and Futuna',
2537         'EH': 'Western Sahara',
2538         'YE': 'Yemen',
2539         'ZM': 'Zambia',
2540         'ZW': 'Zimbabwe',
2541     }
2542
2543     @classmethod
2544     def short2full(cls, code):
2545         """Convert an ISO 3166-2 country code to the corresponding full name"""
2546         return cls._country_map.get(code.upper())
2547
2548
2549 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2550     def __init__(self, proxies=None):
2551         # Set default handlers
2552         for type in ('http', 'https'):
2553             setattr(self, '%s_open' % type,
2554                     lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2555                         meth(r, proxy, type))
2556         return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2557
2558     def proxy_open(self, req, proxy, type):
2559         req_proxy = req.headers.get('Ytdl-request-proxy')
2560         if req_proxy is not None:
2561             proxy = req_proxy
2562             del req.headers['Ytdl-request-proxy']
2563
2564         if proxy == '__noproxy__':
2565             return None  # No Proxy
2566         return compat_urllib_request.ProxyHandler.proxy_open(
2567             self, req, proxy, type)