_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import calendar
   8 import codecs
   9 import contextlib
  10 import ctypes
  11 import datetime
  12 import email.utils
  13 import errno
  14 import functools
  15 import gzip
  16 import itertools
  17 import io
  18 import json
  19 import locale
  20 import math
  21 import operator
  22 import os
  23 import pipes
  24 import platform
  25 import re
  26 import ssl
  27 import socket
  28 import struct
  29 import subprocess
  30 import sys
  31 import tempfile
  32 import traceback
  33 import xml.etree.ElementTree
  34 import zlib
  35
  36 from .compat import (
  37     compat_basestring,
  38     compat_chr,
  39     compat_etree_fromstring,
  40     compat_html_entities,
  41     compat_http_client,
  42     compat_kwargs,
  43     compat_parse_qs,
  44     compat_socket_create_connection,
  45     compat_str,
  46     compat_urllib_error,
  47     compat_urllib_parse,
  48     compat_urllib_parse_urlparse,
  49     compat_urllib_request,
  50     compat_urlparse,
  51     shlex_quote,
  52 )
  53
  54
  55 # This is not clearly defined otherwise
  56 compiled_regex_type = type(re.compile(''))
  57
  58 std_headers = {
  59     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
  60     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  61     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  62     'Accept-Encoding': 'gzip, deflate',
  63     'Accept-Language': 'en-us,en;q=0.5',
  64 }
  65
  66
  67 NO_DEFAULT = object()
  68
  69 ENGLISH_MONTH_NAMES = [
  70     'January', 'February', 'March', 'April', 'May', 'June',
  71     'July', 'August', 'September', 'October', 'November', 'December']
  72
  73
  74 def preferredencoding():
  75     """Get preferred encoding.
  76
  77     Returns the best encoding scheme for the system, based on
  78     locale.getpreferredencoding() and some further tweaks.
  79     """
  80     try:
  81         pref = locale.getpreferredencoding()
  82         'TEST'.encode(pref)
  83     except Exception:
  84         pref = 'UTF-8'
  85
  86     return pref
  87
  88
  89 def write_json_file(obj, fn):
  90     """ Encode obj as JSON and write it to fn, atomically if possible """
  91
  92     fn = encodeFilename(fn)
  93     if sys.version_info < (3, 0) and sys.platform != 'win32':
  94         encoding = get_filesystem_encoding()
  95         # os.path.basename returns a bytes object, but NamedTemporaryFile
  96         # will fail if the filename contains non ascii characters unless we
  97         # use a unicode object
  98         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  99         # the same for os.path.dirname
 100         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 101     else:
 102         path_basename = os.path.basename
 103         path_dirname = os.path.dirname
 104
 105     args = {
 106         'suffix': '.tmp',
 107         'prefix': path_basename(fn) + '.',
 108         'dir': path_dirname(fn),
 109         'delete': False,
 110     }
 111
 112     # In Python 2.x, json.dump expects a bytestream.
 113     # In Python 3.x, it writes to a character stream
 114     if sys.version_info < (3, 0):
 115         args['mode'] = 'wb'
 116     else:
 117         args.update({
 118             'mode': 'w',
 119             'encoding': 'utf-8',
 120         })
 121
 122     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 123
 124     try:
 125         with tf:
 126             json.dump(obj, tf)
 127         if sys.platform == 'win32':
 128             # Need to remove existing file on Windows, else os.rename raises
 129             # WindowsError or FileExistsError.
 130             try:
 131                 os.unlink(fn)
 132             except OSError:
 133                 pass
 134         os.rename(tf.name, fn)
 135     except Exception:
 136         try:
 137             os.remove(tf.name)
 138         except OSError:
 139             pass
 140         raise
 141
 142
 143 if sys.version_info >= (2, 7):
 144     def find_xpath_attr(node, xpath, key, val=None):
 145         """ Find the xpath xpath[@key=val] """
 146         assert re.match(r'^[a-zA-Z_-]+$', key)
 147         if val:
 148             assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 149         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 150         return node.find(expr)
 151 else:
 152     def find_xpath_attr(node, xpath, key, val=None):
 153         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 154         # .//node does not match if a node is a direct child of . !
 155         if isinstance(xpath, compat_str):
 156             xpath = xpath.encode('ascii')
 157
 158         for f in node.findall(xpath):
 159             if key not in f.attrib:
 160                 continue
 161             if val is None or f.attrib.get(key) == val:
 162                 return f
 163         return None
 164
 165 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 166 # the namespace parameter
 167
 168
 169 def xpath_with_ns(path, ns_map):
 170     components = [c.split(':') for c in path.split('/')]
 171     replaced = []
 172     for c in components:
 173         if len(c) == 1:
 174             replaced.append(c[0])
 175         else:
 176             ns, tag = c
 177             replaced.append('{%s}%s' % (ns_map[ns], tag))
 178     return '/'.join(replaced)
 179
 180
 181 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 182     def _find_xpath(xpath):
 183         if sys.version_info < (2, 7):  # Crazy 2.6
 184             xpath = xpath.encode('ascii')
 185         return node.find(xpath)
 186
 187     if isinstance(xpath, (str, compat_str)):
 188         n = _find_xpath(xpath)
 189     else:
 190         for xp in xpath:
 191             n = _find_xpath(xp)
 192             if n is not None:
 193                 break
 194
 195     if n is None:
 196         if default is not NO_DEFAULT:
 197             return default
 198         elif fatal:
 199             name = xpath if name is None else name
 200             raise ExtractorError('Could not find XML element %s' % name)
 201         else:
 202             return None
 203     return n
 204
 205
 206 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 207     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 208     if n is None or n == default:
 209         return n
 210     if n.text is None:
 211         if default is not NO_DEFAULT:
 212             return default
 213         elif fatal:
 214             name = xpath if name is None else name
 215             raise ExtractorError('Could not find XML element\'s text %s' % name)
 216         else:
 217             return None
 218     return n.text
 219
 220
 221 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 222     n = find_xpath_attr(node, xpath, key)
 223     if n is None:
 224         if default is not NO_DEFAULT:
 225             return default
 226         elif fatal:
 227             name = '%s[@%s]' % (xpath, key) if name is None else name
 228             raise ExtractorError('Could not find XML attribute %s' % name)
 229         else:
 230             return None
 231     return n.attrib[key]
 232
 233
 234 def get_element_by_id(id, html):
 235     """Return the content of the tag with the specified ID in the passed HTML document"""
 236     return get_element_by_attribute("id", id, html)
 237
 238
 239 def get_element_by_attribute(attribute, value, html):
 240     """Return the content of the tag with the specified attribute in the passed HTML document"""
 241
 242     m = re.search(r'''(?xs)
 243         <([a-zA-Z0-9:._-]+)
 244          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 245          \s+%s=['"]?%s['"]?
 246          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 247         \s*>
 248         (?P<content>.*?)
 249         </\1>
 250     ''' % (re.escape(attribute), re.escape(value)), html)
 251
 252     if not m:
 253         return None
 254     res = m.group('content')
 255
 256     if res.startswith('"') or res.startswith("'"):
 257         res = res[1:-1]
 258
 259     return unescapeHTML(res)
 260
 261
 262 def clean_html(html):
 263     """Clean an HTML snippet into a readable string"""
 264
 265     if html is None:  # Convenience for sanitizing descriptions etc.
 266         return html
 267
 268     # Newline vs <br />
 269     html = html.replace('\n', ' ')
 270     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 271     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 272     # Strip html tags
 273     html = re.sub('<.*?>', '', html)
 274     # Replace html entities
 275     html = unescapeHTML(html)
 276     return html.strip()
 277
 278
 279 def sanitize_open(filename, open_mode):
 280     """Try to open the given filename, and slightly tweak it if this fails.
 281
 282     Attempts to open the given filename. If this fails, it tries to change
 283     the filename slightly, step by step, until it's either able to open it
 284     or it fails and raises a final exception, like the standard open()
 285     function.
 286
 287     It returns the tuple (stream, definitive_file_name).
 288     """
 289     try:
 290         if filename == '-':
 291             if sys.platform == 'win32':
 292                 import msvcrt
 293                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 294             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 295         stream = open(encodeFilename(filename), open_mode)
 296         return (stream, filename)
 297     except (IOError, OSError) as err:
 298         if err.errno in (errno.EACCES,):
 299             raise
 300
 301         # In case of error, try to remove win32 forbidden chars
 302         alt_filename = sanitize_path(filename)
 303         if alt_filename == filename:
 304             raise
 305         else:
 306             # An exception here should be caught in the caller
 307             stream = open(encodeFilename(alt_filename), open_mode)
 308             return (stream, alt_filename)
 309
 310
 311 def timeconvert(timestr):
 312     """Convert RFC 2822 defined time string into system timestamp"""
 313     timestamp = None
 314     timetuple = email.utils.parsedate_tz(timestr)
 315     if timetuple is not None:
 316         timestamp = email.utils.mktime_tz(timetuple)
 317     return timestamp
 318
 319
 320 def sanitize_filename(s, restricted=False, is_id=False):
 321     """Sanitizes a string so it could be used as part of a filename.
 322     If restricted is set, use a stricter subset of allowed characters.
 323     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 324     """
 325     def replace_insane(char):
 326         if char == '?' or ord(char) < 32 or ord(char) == 127:
 327             return ''
 328         elif char == '"':
 329             return '' if restricted else '\''
 330         elif char == ':':
 331             return '_-' if restricted else ' -'
 332         elif char in '\\/|*<>':
 333             return '_'
 334         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 335             return '_'
 336         if restricted and ord(char) > 127:
 337             return '_'
 338         return char
 339
 340     # Handle timestamps
 341     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 342     result = ''.join(map(replace_insane, s))
 343     if not is_id:
 344         while '__' in result:
 345             result = result.replace('__', '_')
 346         result = result.strip('_')
 347         # Common case of "Foreign band name - English song title"
 348         if restricted and result.startswith('-_'):
 349             result = result[2:]
 350         if result.startswith('-'):
 351             result = '_' + result[len('-'):]
 352         result = result.lstrip('.')
 353         if not result:
 354             result = '_'
 355     return result
 356
 357
 358 def sanitize_path(s):
 359     """Sanitizes and normalizes path on Windows"""
 360     if sys.platform != 'win32':
 361         return s
 362     drive_or_unc, _ = os.path.splitdrive(s)
 363     if sys.version_info < (2, 7) and not drive_or_unc:
 364         drive_or_unc, _ = os.path.splitunc(s)
 365     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 366     if drive_or_unc:
 367         norm_path.pop(0)
 368     sanitized_path = [
 369         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
 370         for path_part in norm_path]
 371     if drive_or_unc:
 372         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 373     return os.path.join(*sanitized_path)
 374
 375
 376 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
 377 # unwanted failures due to missing protocol
 378 def sanitized_Request(url, *args, **kwargs):
 379     return compat_urllib_request.Request(
 380         'http:%s' % url if url.startswith('//') else url, *args, **kwargs)
 381
 382
 383 def orderedSet(iterable):
 384     """ Remove all duplicates from the input iterable """
 385     res = []
 386     for el in iterable:
 387         if el not in res:
 388             res.append(el)
 389     return res
 390
 391
 392 def _htmlentity_transform(entity):
 393     """Transforms an HTML entity to a character."""
 394     # Known non-numeric HTML entity
 395     if entity in compat_html_entities.name2codepoint:
 396         return compat_chr(compat_html_entities.name2codepoint[entity])
 397
 398     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 399     if mobj is not None:
 400         numstr = mobj.group(1)
 401         if numstr.startswith('x'):
 402             base = 16
 403             numstr = '0%s' % numstr
 404         else:
 405             base = 10
 406         # See https://github.com/rg3/youtube-dl/issues/7518
 407         try:
 408             return compat_chr(int(numstr, base))
 409         except ValueError:
 410             pass
 411
 412     # Unknown entity in name, return its literal representation
 413     return '&%s;' % entity
 414
 415
 416 def unescapeHTML(s):
 417     if s is None:
 418         return None
 419     assert type(s) == compat_str
 420
 421     return re.sub(
 422         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 423
 424
 425 def get_subprocess_encoding():
 426     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 427         # For subprocess calls, encode with locale encoding
 428         # Refer to http://stackoverflow.com/a/9951851/35070
 429         encoding = preferredencoding()
 430     else:
 431         encoding = sys.getfilesystemencoding()
 432     if encoding is None:
 433         encoding = 'utf-8'
 434     return encoding
 435
 436
 437 def encodeFilename(s, for_subprocess=False):
 438     """
 439     @param s The name of the file
 440     """
 441
 442     assert type(s) == compat_str
 443
 444     # Python 3 has a Unicode API
 445     if sys.version_info >= (3, 0):
 446         return s
 447
 448     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 449     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 450     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 451     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 452         return s
 453
 454     return s.encode(get_subprocess_encoding(), 'ignore')
 455
 456
 457 def decodeFilename(b, for_subprocess=False):
 458
 459     if sys.version_info >= (3, 0):
 460         return b
 461
 462     if not isinstance(b, bytes):
 463         return b
 464
 465     return b.decode(get_subprocess_encoding(), 'ignore')
 466
 467
 468 def encodeArgument(s):
 469     if not isinstance(s, compat_str):
 470         # Legacy code that uses byte strings
 471         # Uncomment the following line after fixing all post processors
 472         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 473         s = s.decode('ascii')
 474     return encodeFilename(s, True)
 475
 476
 477 def decodeArgument(b):
 478     return decodeFilename(b, True)
 479
 480
 481 def decodeOption(optval):
 482     if optval is None:
 483         return optval
 484     if isinstance(optval, bytes):
 485         optval = optval.decode(preferredencoding())
 486
 487     assert isinstance(optval, compat_str)
 488     return optval
 489
 490
 491 def formatSeconds(secs):
 492     if secs > 3600:
 493         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 494     elif secs > 60:
 495         return '%d:%02d' % (secs // 60, secs % 60)
 496     else:
 497         return '%d' % secs
 498
 499
 500 def make_HTTPS_handler(params, **kwargs):
 501     opts_no_check_certificate = params.get('nocheckcertificate', False)
 502     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 503         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 504         if opts_no_check_certificate:
 505             context.check_hostname = False
 506             context.verify_mode = ssl.CERT_NONE
 507         try:
 508             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 509         except TypeError:
 510             # Python 2.7.8
 511             # (create_default_context present but HTTPSHandler has no context=)
 512             pass
 513
 514     if sys.version_info < (3, 2):
 515         return YoutubeDLHTTPSHandler(params, **kwargs)
 516     else:  # Python < 3.4
 517         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 518         context.verify_mode = (ssl.CERT_NONE
 519                                if opts_no_check_certificate
 520                                else ssl.CERT_REQUIRED)
 521         context.set_default_verify_paths()
 522         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 523
 524
 525 def bug_reports_message():
 526     if ytdl_is_updateable():
 527         update_cmd = 'type  youtube-dl -U  to update'
 528     else:
 529         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 530     msg = '; please report this issue on https://yt-dl.org/bug .'
 531     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 532     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 533     return msg
 534
 535
 536 class ExtractorError(Exception):
 537     """Error during info extraction."""
 538
 539     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 540         """ tb, if given, is the original traceback (so that it can be printed out).
 541         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 542         """
 543
 544         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 545             expected = True
 546         if video_id is not None:
 547             msg = video_id + ': ' + msg
 548         if cause:
 549             msg += ' (caused by %r)' % cause
 550         if not expected:
 551             msg += bug_reports_message()
 552         super(ExtractorError, self).__init__(msg)
 553
 554         self.traceback = tb
 555         self.exc_info = sys.exc_info()  # preserve original exception
 556         self.cause = cause
 557         self.video_id = video_id
 558
 559     def format_traceback(self):
 560         if self.traceback is None:
 561             return None
 562         return ''.join(traceback.format_tb(self.traceback))
 563
 564
 565 class UnsupportedError(ExtractorError):
 566     def __init__(self, url):
 567         super(UnsupportedError, self).__init__(
 568             'Unsupported URL: %s' % url, expected=True)
 569         self.url = url
 570
 571
 572 class RegexNotFoundError(ExtractorError):
 573     """Error when a regex didn't match"""
 574     pass
 575
 576
 577 class DownloadError(Exception):
 578     """Download Error exception.
 579
 580     This exception may be thrown by FileDownloader objects if they are not
 581     configured to continue on errors. They will contain the appropriate
 582     error message.
 583     """
 584
 585     def __init__(self, msg, exc_info=None):
 586         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 587         super(DownloadError, self).__init__(msg)
 588         self.exc_info = exc_info
 589
 590
 591 class SameFileError(Exception):
 592     """Same File exception.
 593
 594     This exception will be thrown by FileDownloader objects if they detect
 595     multiple files would have to be downloaded to the same file on disk.
 596     """
 597     pass
 598
 599
 600 class PostProcessingError(Exception):
 601     """Post Processing exception.
 602
 603     This exception may be raised by PostProcessor's .run() method to
 604     indicate an error in the postprocessing task.
 605     """
 606
 607     def __init__(self, msg):
 608         self.msg = msg
 609
 610
 611 class MaxDownloadsReached(Exception):
 612     """ --max-downloads limit has been reached. """
 613     pass
 614
 615
 616 class UnavailableVideoError(Exception):
 617     """Unavailable Format exception.
 618
 619     This exception will be thrown when a video is requested
 620     in a format that is not available for that video.
 621     """
 622     pass
 623
 624
 625 class ContentTooShortError(Exception):
 626     """Content Too Short exception.
 627
 628     This exception may be raised by FileDownloader objects when a file they
 629     download is too small for what the server announced first, indicating
 630     the connection was probably interrupted.
 631     """
 632
 633     def __init__(self, downloaded, expected):
 634         # Both in bytes
 635         self.downloaded = downloaded
 636         self.expected = expected
 637
 638
 639 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 640     # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
 641     # expected HTTP responses to meet HTTP/1.0 or later (see also
 642     # https://github.com/rg3/youtube-dl/issues/6727)
 643     if sys.version_info < (3, 0):
 644         kwargs[b'strict'] = True
 645     hc = http_class(*args, **kwargs)
 646     source_address = ydl_handler._params.get('source_address')
 647     if source_address is not None:
 648         sa = (source_address, 0)
 649         if hasattr(hc, 'source_address'):  # Python 2.7+
 650             hc.source_address = sa
 651         else:  # Python 2.6
 652             def _hc_connect(self, *args, **kwargs):
 653                 sock = compat_socket_create_connection(
 654                     (self.host, self.port), self.timeout, sa)
 655                 if is_https:
 656                     self.sock = ssl.wrap_socket(
 657                         sock, self.key_file, self.cert_file,
 658                         ssl_version=ssl.PROTOCOL_TLSv1)
 659                 else:
 660                     self.sock = sock
 661             hc.connect = functools.partial(_hc_connect, hc)
 662
 663     return hc
 664
 665
 666 def handle_youtubedl_headers(headers):
 667     filtered_headers = headers
 668
 669     if 'Youtubedl-no-compression' in filtered_headers:
 670         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 671         del filtered_headers['Youtubedl-no-compression']
 672
 673     return filtered_headers
 674
 675
 676 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 677     """Handler for HTTP requests and responses.
 678
 679     This class, when installed with an OpenerDirector, automatically adds
 680     the standard headers to every HTTP request and handles gzipped and
 681     deflated responses from web servers. If compression is to be avoided in
 682     a particular request, the original request in the program code only has
 683     to include the HTTP header "Youtubedl-no-compression", which will be
 684     removed before making the real request.
 685
 686     Part of this code was copied from:
 687
 688     http://techknack.net/python-urllib2-handlers/
 689
 690     Andrew Rowls, the author of that code, agreed to release it to the
 691     public domain.
 692     """
 693
 694     def __init__(self, params, *args, **kwargs):
 695         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
 696         self._params = params
 697
 698     def http_open(self, req):
 699         return self.do_open(functools.partial(
 700             _create_http_connection, self, compat_http_client.HTTPConnection, False),
 701             req)
 702
 703     @staticmethod
 704     def deflate(data):
 705         try:
 706             return zlib.decompress(data, -zlib.MAX_WBITS)
 707         except zlib.error:
 708             return zlib.decompress(data)
 709
 710     @staticmethod
 711     def addinfourl_wrapper(stream, headers, url, code):
 712         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 713             return compat_urllib_request.addinfourl(stream, headers, url, code)
 714         ret = compat_urllib_request.addinfourl(stream, headers, url)
 715         ret.code = code
 716         return ret
 717
 718     def http_request(self, req):
 719         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
 720         # always respected by websites, some tend to give out URLs with non percent-encoded
 721         # non-ASCII characters (see telemb.py, ard.py [#3412])
 722         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
 723         # To work around aforementioned issue we will replace request's original URL with
 724         # percent-encoded one
 725         # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
 726         # the code of this workaround has been moved here from YoutubeDL.urlopen()
 727         url = req.get_full_url()
 728         url_escaped = escape_url(url)
 729
 730         # Substitute URL if any change after escaping
 731         if url != url_escaped:
 732             req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
 733             new_req = req_type(
 734                 url_escaped, data=req.data, headers=req.headers,
 735                 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
 736             new_req.timeout = req.timeout
 737             req = new_req
 738
 739         for h, v in std_headers.items():
 740             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
 741             # The dict keys are capitalized because of this bug by urllib
 742             if h.capitalize() not in req.headers:
 743                 req.add_header(h, v)
 744
 745         req.headers = handle_youtubedl_headers(req.headers)
 746
 747         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 748             # Python 2.6 is brain-dead when it comes to fragments
 749             req._Request__original = req._Request__original.partition('#')[0]
 750             req._Request__r_type = req._Request__r_type.partition('#')[0]
 751
 752         return req
 753
 754     def http_response(self, req, resp):
 755         old_resp = resp
 756         # gzip
 757         if resp.headers.get('Content-encoding', '') == 'gzip':
 758             content = resp.read()
 759             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 760             try:
 761                 uncompressed = io.BytesIO(gz.read())
 762             except IOError as original_ioerror:
 763                 # There may be junk add the end of the file
 764                 # See http://stackoverflow.com/q/4928560/35070 for details
 765                 for i in range(1, 1024):
 766                     try:
 767                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 768                         uncompressed = io.BytesIO(gz.read())
 769                     except IOError:
 770                         continue
 771                     break
 772                 else:
 773                     raise original_ioerror
 774             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 775             resp.msg = old_resp.msg
 776             del resp.headers['Content-encoding']
 777         # deflate
 778         if resp.headers.get('Content-encoding', '') == 'deflate':
 779             gz = io.BytesIO(self.deflate(resp.read()))
 780             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 781             resp.msg = old_resp.msg
 782             del resp.headers['Content-encoding']
 783         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
 784         # https://github.com/rg3/youtube-dl/issues/6457).
 785         if 300 <= resp.code < 400:
 786             location = resp.headers.get('Location')
 787             if location:
 788                 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
 789                 if sys.version_info >= (3, 0):
 790                     location = location.encode('iso-8859-1').decode('utf-8')
 791                 location_escaped = escape_url(location)
 792                 if location != location_escaped:
 793                     del resp.headers['Location']
 794                     resp.headers['Location'] = location_escaped
 795         return resp
 796
 797     https_request = http_request
 798     https_response = http_response
 799
 800
 801 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 802     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 803         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 804         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 805         self._params = params
 806
 807     def https_open(self, req):
 808         kwargs = {}
 809         if hasattr(self, '_context'):  # python > 2.6
 810             kwargs['context'] = self._context
 811         if hasattr(self, '_check_hostname'):  # python 3.x
 812             kwargs['check_hostname'] = self._check_hostname
 813         return self.do_open(functools.partial(
 814             _create_http_connection, self, self._https_conn_class, True),
 815             req, **kwargs)
 816
 817
 818 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
 819     def __init__(self, cookiejar=None):
 820         compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
 821
 822     def http_response(self, request, response):
 823         # Python 2 will choke on next HTTP request in row if there are non-ASCII
 824         # characters in Set-Cookie HTTP header of last response (see
 825         # https://github.com/rg3/youtube-dl/issues/6769).
 826         # In order to at least prevent crashing we will percent encode Set-Cookie
 827         # header before HTTPCookieProcessor starts processing it.
 828         # if sys.version_info < (3, 0) and response.headers:
 829         #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
 830         #         set_cookie = response.headers.get(set_cookie_header)
 831         #         if set_cookie:
 832         #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
 833         #             if set_cookie != set_cookie_escaped:
 834         #                 del response.headers[set_cookie_header]
 835         #                 response.headers[set_cookie_header] = set_cookie_escaped
 836         return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
 837
 838     https_request = compat_urllib_request.HTTPCookieProcessor.http_request
 839     https_response = http_response
 840
 841
 842 def parse_iso8601(date_str, delimiter='T', timezone=None):
 843     """ Return a UNIX timestamp from the given date """
 844
 845     if date_str is None:
 846         return None
 847
 848     date_str = re.sub(r'\.[0-9]+', '', date_str)
 849
 850     if timezone is None:
 851         m = re.search(
 852             r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 853             date_str)
 854         if not m:
 855             timezone = datetime.timedelta()
 856         else:
 857             date_str = date_str[:-len(m.group(0))]
 858             if not m.group('sign'):
 859                 timezone = datetime.timedelta()
 860             else:
 861                 sign = 1 if m.group('sign') == '+' else -1
 862                 timezone = datetime.timedelta(
 863                     hours=sign * int(m.group('hours')),
 864                     minutes=sign * int(m.group('minutes')))
 865     try:
 866         date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 867         dt = datetime.datetime.strptime(date_str, date_format) - timezone
 868         return calendar.timegm(dt.timetuple())
 869     except ValueError:
 870         pass
 871
 872
 873 def unified_strdate(date_str, day_first=True):
 874     """Return a string with the date in the format YYYYMMDD"""
 875
 876     if date_str is None:
 877         return None
 878     upload_date = None
 879     # Replace commas
 880     date_str = date_str.replace(',', ' ')
 881     # %z (UTC offset) is only supported in python>=3.2
 882     if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
 883         date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 884     # Remove AM/PM + timezone
 885     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
 886
 887     format_expressions = [
 888         '%d %B %Y',
 889         '%d %b %Y',
 890         '%B %d %Y',
 891         '%b %d %Y',
 892         '%b %dst %Y %I:%M%p',
 893         '%b %dnd %Y %I:%M%p',
 894         '%b %dth %Y %I:%M%p',
 895         '%Y %m %d',
 896         '%Y-%m-%d',
 897         '%Y/%m/%d',
 898         '%Y/%m/%d %H:%M:%S',
 899         '%Y-%m-%d %H:%M:%S',
 900         '%Y-%m-%d %H:%M:%S.%f',
 901         '%d.%m.%Y %H:%M',
 902         '%d.%m.%Y %H.%M',
 903         '%Y-%m-%dT%H:%M:%SZ',
 904         '%Y-%m-%dT%H:%M:%S.%fZ',
 905         '%Y-%m-%dT%H:%M:%S.%f0Z',
 906         '%Y-%m-%dT%H:%M:%S',
 907         '%Y-%m-%dT%H:%M:%S.%f',
 908         '%Y-%m-%dT%H:%M',
 909     ]
 910     if day_first:
 911         format_expressions.extend([
 912             '%d-%m-%Y',
 913             '%d.%m.%Y',
 914             '%d/%m/%Y',
 915             '%d/%m/%y',
 916             '%d/%m/%Y %H:%M:%S',
 917         ])
 918     else:
 919         format_expressions.extend([
 920             '%m-%d-%Y',
 921             '%m.%d.%Y',
 922             '%m/%d/%Y',
 923             '%m/%d/%y',
 924             '%m/%d/%Y %H:%M:%S',
 925         ])
 926     for expression in format_expressions:
 927         try:
 928             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 929         except ValueError:
 930             pass
 931     if upload_date is None:
 932         timetuple = email.utils.parsedate_tz(date_str)
 933         if timetuple:
 934             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 935     if upload_date is not None:
 936         return compat_str(upload_date)
 937
 938
 939 def determine_ext(url, default_ext='unknown_video'):
 940     if url is None:
 941         return default_ext
 942     guess = url.partition('?')[0].rpartition('.')[2]
 943     if re.match(r'^[A-Za-z0-9]+$', guess):
 944         return guess
 945     elif guess.rstrip('/') in (
 946             'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
 947             'flv', 'f4v', 'f4a', 'f4b',
 948             'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
 949             'mkv', 'mka', 'mk3d',
 950             'avi', 'divx',
 951             'mov',
 952             'asf', 'wmv', 'wma',
 953             '3gp', '3g2',
 954             'mp3',
 955             'flac',
 956             'ape',
 957             'wav',
 958             'f4f', 'f4m', 'm3u8', 'smil'):
 959         return guess.rstrip('/')
 960     else:
 961         return default_ext
 962
 963
 964 def subtitles_filename(filename, sub_lang, sub_format):
 965     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 966
 967
 968 def date_from_str(date_str):
 969     """
 970     Return a datetime object from a string in the format YYYYMMDD or
 971     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 972     today = datetime.date.today()
 973     if date_str in ('now', 'today'):
 974         return today
 975     if date_str == 'yesterday':
 976         return today - datetime.timedelta(days=1)
 977     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 978     if match is not None:
 979         sign = match.group('sign')
 980         time = int(match.group('time'))
 981         if sign == '-':
 982             time = -time
 983         unit = match.group('unit')
 984         # A bad aproximation?
 985         if unit == 'month':
 986             unit = 'day'
 987             time *= 30
 988         elif unit == 'year':
 989             unit = 'day'
 990             time *= 365
 991         unit += 's'
 992         delta = datetime.timedelta(**{unit: time})
 993         return today + delta
 994     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 995
 996
 997 def hyphenate_date(date_str):
 998     """
 999     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1000     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1001     if match is not None:
1002         return '-'.join(match.groups())
1003     else:
1004         return date_str
1005
1006
1007 class DateRange(object):
1008     """Represents a time interval between two dates"""
1009
1010     def __init__(self, start=None, end=None):
1011         """start and end must be strings in the format accepted by date"""
1012         if start is not None:
1013             self.start = date_from_str(start)
1014         else:
1015             self.start = datetime.datetime.min.date()
1016         if end is not None:
1017             self.end = date_from_str(end)
1018         else:
1019             self.end = datetime.datetime.max.date()
1020         if self.start > self.end:
1021             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1022
1023     @classmethod
1024     def day(cls, day):
1025         """Returns a range that only contains the given day"""
1026         return cls(day, day)
1027
1028     def __contains__(self, date):
1029         """Check if the date is in the range"""
1030         if not isinstance(date, datetime.date):
1031             date = date_from_str(date)
1032         return self.start <= date <= self.end
1033
1034     def __str__(self):
1035         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1036
1037
1038 def platform_name():
1039     """ Returns the platform name as a compat_str """
1040     res = platform.platform()
1041     if isinstance(res, bytes):
1042         res = res.decode(preferredencoding())
1043
1044     assert isinstance(res, compat_str)
1045     return res
1046
1047
1048 def _windows_write_string(s, out):
1049     """ Returns True if the string was written using special methods,
1050     False if it has yet to be written out."""
1051     # Adapted from http://stackoverflow.com/a/3259271/35070
1052
1053     import ctypes
1054     import ctypes.wintypes
1055
1056     WIN_OUTPUT_IDS = {
1057         1: -11,
1058         2: -12,
1059     }
1060
1061     try:
1062         fileno = out.fileno()
1063     except AttributeError:
1064         # If the output stream doesn't have a fileno, it's virtual
1065         return False
1066     except io.UnsupportedOperation:
1067         # Some strange Windows pseudo files?
1068         return False
1069     if fileno not in WIN_OUTPUT_IDS:
1070         return False
1071
1072     GetStdHandle = ctypes.WINFUNCTYPE(
1073         ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1074         (b"GetStdHandle", ctypes.windll.kernel32))
1075     h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1076
1077     WriteConsoleW = ctypes.WINFUNCTYPE(
1078         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1079         ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1080         ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
1081     written = ctypes.wintypes.DWORD(0)
1082
1083     GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
1084     FILE_TYPE_CHAR = 0x0002
1085     FILE_TYPE_REMOTE = 0x8000
1086     GetConsoleMode = ctypes.WINFUNCTYPE(
1087         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1088         ctypes.POINTER(ctypes.wintypes.DWORD))(
1089         (b"GetConsoleMode", ctypes.windll.kernel32))
1090     INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1091
1092     def not_a_console(handle):
1093         if handle == INVALID_HANDLE_VALUE or handle is None:
1094             return True
1095         return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1096                 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1097
1098     if not_a_console(h):
1099         return False
1100
1101     def next_nonbmp_pos(s):
1102         try:
1103             return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1104         except StopIteration:
1105             return len(s)
1106
1107     while s:
1108         count = min(next_nonbmp_pos(s), 1024)
1109
1110         ret = WriteConsoleW(
1111             h, s, count if count else 2, ctypes.byref(written), None)
1112         if ret == 0:
1113             raise OSError('Failed to write string')
1114         if not count:  # We just wrote a non-BMP character
1115             assert written.value == 2
1116             s = s[1:]
1117         else:
1118             assert written.value > 0
1119             s = s[written.value:]
1120     return True
1121
1122
1123 def write_string(s, out=None, encoding=None):
1124     if out is None:
1125         out = sys.stderr
1126     assert type(s) == compat_str
1127
1128     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1129         if _windows_write_string(s, out):
1130             return
1131
1132     if ('b' in getattr(out, 'mode', '') or
1133             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
1134         byt = s.encode(encoding or preferredencoding(), 'ignore')
1135         out.write(byt)
1136     elif hasattr(out, 'buffer'):
1137         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1138         byt = s.encode(enc, 'ignore')
1139         out.buffer.write(byt)
1140     else:
1141         out.write(s)
1142     out.flush()
1143
1144
1145 def bytes_to_intlist(bs):
1146     if not bs:
1147         return []
1148     if isinstance(bs[0], int):  # Python 3
1149         return list(bs)
1150     else:
1151         return [ord(c) for c in bs]
1152
1153
1154 def intlist_to_bytes(xs):
1155     if not xs:
1156         return b''
1157     return struct_pack('%dB' % len(xs), *xs)
1158
1159
1160 # Cross-platform file locking
1161 if sys.platform == 'win32':
1162     import ctypes.wintypes
1163     import msvcrt
1164
1165     class OVERLAPPED(ctypes.Structure):
1166         _fields_ = [
1167             ('Internal', ctypes.wintypes.LPVOID),
1168             ('InternalHigh', ctypes.wintypes.LPVOID),
1169             ('Offset', ctypes.wintypes.DWORD),
1170             ('OffsetHigh', ctypes.wintypes.DWORD),
1171             ('hEvent', ctypes.wintypes.HANDLE),
1172         ]
1173
1174     kernel32 = ctypes.windll.kernel32
1175     LockFileEx = kernel32.LockFileEx
1176     LockFileEx.argtypes = [
1177         ctypes.wintypes.HANDLE,     # hFile
1178         ctypes.wintypes.DWORD,      # dwFlags
1179         ctypes.wintypes.DWORD,      # dwReserved
1180         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
1181         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
1182         ctypes.POINTER(OVERLAPPED)  # Overlapped
1183     ]
1184     LockFileEx.restype = ctypes.wintypes.BOOL
1185     UnlockFileEx = kernel32.UnlockFileEx
1186     UnlockFileEx.argtypes = [
1187         ctypes.wintypes.HANDLE,     # hFile
1188         ctypes.wintypes.DWORD,      # dwReserved
1189         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
1190         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
1191         ctypes.POINTER(OVERLAPPED)  # Overlapped
1192     ]
1193     UnlockFileEx.restype = ctypes.wintypes.BOOL
1194     whole_low = 0xffffffff
1195     whole_high = 0x7fffffff
1196
1197     def _lock_file(f, exclusive):
1198         overlapped = OVERLAPPED()
1199         overlapped.Offset = 0
1200         overlapped.OffsetHigh = 0
1201         overlapped.hEvent = 0
1202         f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1203         handle = msvcrt.get_osfhandle(f.fileno())
1204         if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1205                           whole_low, whole_high, f._lock_file_overlapped_p):
1206             raise OSError('Locking file failed: %r' % ctypes.FormatError())
1207
1208     def _unlock_file(f):
1209         assert f._lock_file_overlapped_p
1210         handle = msvcrt.get_osfhandle(f.fileno())
1211         if not UnlockFileEx(handle, 0,
1212                             whole_low, whole_high, f._lock_file_overlapped_p):
1213             raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1214
1215 else:
1216     import fcntl
1217
1218     def _lock_file(f, exclusive):
1219         fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1220
1221     def _unlock_file(f):
1222         fcntl.flock(f, fcntl.LOCK_UN)
1223
1224
1225 class locked_file(object):
1226     def __init__(self, filename, mode, encoding=None):
1227         assert mode in ['r', 'a', 'w']
1228         self.f = io.open(filename, mode, encoding=encoding)
1229         self.mode = mode
1230
1231     def __enter__(self):
1232         exclusive = self.mode != 'r'
1233         try:
1234             _lock_file(self.f, exclusive)
1235         except IOError:
1236             self.f.close()
1237             raise
1238         return self
1239
1240     def __exit__(self, etype, value, traceback):
1241         try:
1242             _unlock_file(self.f)
1243         finally:
1244             self.f.close()
1245
1246     def __iter__(self):
1247         return iter(self.f)
1248
1249     def write(self, *args):
1250         return self.f.write(*args)
1251
1252     def read(self, *args):
1253         return self.f.read(*args)
1254
1255
1256 def get_filesystem_encoding():
1257     encoding = sys.getfilesystemencoding()
1258     return encoding if encoding is not None else 'utf-8'
1259
1260
1261 def shell_quote(args):
1262     quoted_args = []
1263     encoding = get_filesystem_encoding()
1264     for a in args:
1265         if isinstance(a, bytes):
1266             # We may get a filename encoded with 'encodeFilename'
1267             a = a.decode(encoding)
1268         quoted_args.append(pipes.quote(a))
1269     return ' '.join(quoted_args)
1270
1271
1272 def smuggle_url(url, data):
1273     """ Pass additional data in a URL for internal use. """
1274
1275     sdata = compat_urllib_parse.urlencode(
1276         {'__youtubedl_smuggle': json.dumps(data)})
1277     return url + '#' + sdata
1278
1279
1280 def unsmuggle_url(smug_url, default=None):
1281     if '#__youtubedl_smuggle' not in smug_url:
1282         return smug_url, default
1283     url, _, sdata = smug_url.rpartition('#')
1284     jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1285     data = json.loads(jsond)
1286     return url, data
1287
1288
1289 def format_bytes(bytes):
1290     if bytes is None:
1291         return 'N/A'
1292     if type(bytes) is str:
1293         bytes = float(bytes)
1294     if bytes == 0.0:
1295         exponent = 0
1296     else:
1297         exponent = int(math.log(bytes, 1024.0))
1298     suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1299     converted = float(bytes) / float(1024 ** exponent)
1300     return '%.2f%s' % (converted, suffix)
1301
1302
1303 def parse_filesize(s):
1304     if s is None:
1305         return None
1306
1307     # The lower-case forms are of course incorrect and inofficial,
1308     # but we support those too
1309     _UNIT_TABLE = {
1310         'B': 1,
1311         'b': 1,
1312         'KiB': 1024,
1313         'KB': 1000,
1314         'kB': 1024,
1315         'Kb': 1000,
1316         'MiB': 1024 ** 2,
1317         'MB': 1000 ** 2,
1318         'mB': 1024 ** 2,
1319         'Mb': 1000 ** 2,
1320         'GiB': 1024 ** 3,
1321         'GB': 1000 ** 3,
1322         'gB': 1024 ** 3,
1323         'Gb': 1000 ** 3,
1324         'TiB': 1024 ** 4,
1325         'TB': 1000 ** 4,
1326         'tB': 1024 ** 4,
1327         'Tb': 1000 ** 4,
1328         'PiB': 1024 ** 5,
1329         'PB': 1000 ** 5,
1330         'pB': 1024 ** 5,
1331         'Pb': 1000 ** 5,
1332         'EiB': 1024 ** 6,
1333         'EB': 1000 ** 6,
1334         'eB': 1024 ** 6,
1335         'Eb': 1000 ** 6,
1336         'ZiB': 1024 ** 7,
1337         'ZB': 1000 ** 7,
1338         'zB': 1024 ** 7,
1339         'Zb': 1000 ** 7,
1340         'YiB': 1024 ** 8,
1341         'YB': 1000 ** 8,
1342         'yB': 1024 ** 8,
1343         'Yb': 1000 ** 8,
1344     }
1345
1346     units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1347     m = re.match(
1348         r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1349     if not m:
1350         return None
1351
1352     num_str = m.group('num').replace(',', '.')
1353     mult = _UNIT_TABLE[m.group('unit')]
1354     return int(float(num_str) * mult)
1355
1356
1357 def month_by_name(name):
1358     """ Return the number of a month by (locale-independently) English name """
1359
1360     try:
1361         return ENGLISH_MONTH_NAMES.index(name) + 1
1362     except ValueError:
1363         return None
1364
1365
1366 def month_by_abbreviation(abbrev):
1367     """ Return the number of a month by (locale-independently) English
1368         abbreviations """
1369
1370     try:
1371         return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1372     except ValueError:
1373         return None
1374
1375
1376 def fix_xml_ampersands(xml_str):
1377     """Replace all the '&' by '&amp;' in XML"""
1378     return re.sub(
1379         r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1380         '&amp;',
1381         xml_str)
1382
1383
1384 def setproctitle(title):
1385     assert isinstance(title, compat_str)
1386     try:
1387         libc = ctypes.cdll.LoadLibrary("libc.so.6")
1388     except OSError:
1389         return
1390     title_bytes = title.encode('utf-8')
1391     buf = ctypes.create_string_buffer(len(title_bytes))
1392     buf.value = title_bytes
1393     try:
1394         libc.prctl(15, buf, 0, 0, 0)
1395     except AttributeError:
1396         return  # Strange libc, just skip this
1397
1398
1399 def remove_start(s, start):
1400     if s.startswith(start):
1401         return s[len(start):]
1402     return s
1403
1404
1405 def remove_end(s, end):
1406     if s.endswith(end):
1407         return s[:-len(end)]
1408     return s
1409
1410
1411 def remove_quotes(s):
1412     if s is None or len(s) < 2:
1413         return s
1414     for quote in ('"', "'", ):
1415         if s[0] == quote and s[-1] == quote:
1416             return s[1:-1]
1417     return s
1418
1419
1420 def url_basename(url):
1421     path = compat_urlparse.urlparse(url).path
1422     return path.strip('/').split('/')[-1]
1423
1424
1425 class HEADRequest(compat_urllib_request.Request):
1426     def get_method(self):
1427         return "HEAD"
1428
1429
1430 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1431     if get_attr:
1432         if v is not None:
1433             v = getattr(v, get_attr, None)
1434     if v == '':
1435         v = None
1436     if v is None:
1437         return default
1438     try:
1439         return int(v) * invscale // scale
1440     except ValueError:
1441         return default
1442
1443
1444 def str_or_none(v, default=None):
1445     return default if v is None else compat_str(v)
1446
1447
1448 def str_to_int(int_str):
1449     """ A more relaxed version of int_or_none """
1450     if int_str is None:
1451         return None
1452     int_str = re.sub(r'[,\.\+]', '', int_str)
1453     return int(int_str)
1454
1455
1456 def float_or_none(v, scale=1, invscale=1, default=None):
1457     if v is None:
1458         return default
1459     try:
1460         return float(v) * invscale / scale
1461     except ValueError:
1462         return default
1463
1464
1465 def parse_duration(s):
1466     if not isinstance(s, compat_basestring):
1467         return None
1468
1469     s = s.strip()
1470
1471     m = re.match(
1472         r'''(?ix)(?:P?T)?
1473         (?:
1474             (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
1475             (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1476
1477             \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
1478             (?:
1479                 (?:
1480                     (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1481                     (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1482                 )?
1483                 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1484             )?
1485             (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1486         )$''', s)
1487     if not m:
1488         return None
1489     res = 0
1490     if m.group('only_mins'):
1491         return float_or_none(m.group('only_mins'), invscale=60)
1492     if m.group('only_hours'):
1493         return float_or_none(m.group('only_hours'), invscale=60 * 60)
1494     if m.group('secs'):
1495         res += int(m.group('secs'))
1496     if m.group('mins_reversed'):
1497         res += int(m.group('mins_reversed')) * 60
1498     if m.group('mins'):
1499         res += int(m.group('mins')) * 60
1500     if m.group('hours'):
1501         res += int(m.group('hours')) * 60 * 60
1502     if m.group('hours_reversed'):
1503         res += int(m.group('hours_reversed')) * 60 * 60
1504     if m.group('days'):
1505         res += int(m.group('days')) * 24 * 60 * 60
1506     if m.group('ms'):
1507         res += float(m.group('ms'))
1508     return res
1509
1510
1511 def prepend_extension(filename, ext, expected_real_ext=None):
1512     name, real_ext = os.path.splitext(filename)
1513     return (
1514         '{0}.{1}{2}'.format(name, ext, real_ext)
1515         if not expected_real_ext or real_ext[1:] == expected_real_ext
1516         else '{0}.{1}'.format(filename, ext))
1517
1518
1519 def replace_extension(filename, ext, expected_real_ext=None):
1520     name, real_ext = os.path.splitext(filename)
1521     return '{0}.{1}'.format(
1522         name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1523         ext)
1524
1525
1526 def check_executable(exe, args=[]):
1527     """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1528     args can be a list of arguments for a short output (like -version) """
1529     try:
1530         subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1531     except OSError:
1532         return False
1533     return exe
1534
1535
1536 def get_exe_version(exe, args=['--version'],
1537                     version_re=None, unrecognized='present'):
1538     """ Returns the version of the specified executable,
1539     or False if the executable is not present """
1540     try:
1541         out, _ = subprocess.Popen(
1542             [encodeArgument(exe)] + args,
1543             stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1544     except OSError:
1545         return False
1546     if isinstance(out, bytes):  # Python 2.x
1547         out = out.decode('ascii', 'ignore')
1548     return detect_exe_version(out, version_re, unrecognized)
1549
1550
1551 def detect_exe_version(output, version_re=None, unrecognized='present'):
1552     assert isinstance(output, compat_str)
1553     if version_re is None:
1554         version_re = r'version\s+([-0-9._a-zA-Z]+)'
1555     m = re.search(version_re, output)
1556     if m:
1557         return m.group(1)
1558     else:
1559         return unrecognized
1560
1561
1562 class PagedList(object):
1563     def __len__(self):
1564         # This is only useful for tests
1565         return len(self.getslice())
1566
1567
1568 class OnDemandPagedList(PagedList):
1569     def __init__(self, pagefunc, pagesize):
1570         self._pagefunc = pagefunc
1571         self._pagesize = pagesize
1572
1573     def getslice(self, start=0, end=None):
1574         res = []
1575         for pagenum in itertools.count(start // self._pagesize):
1576             firstid = pagenum * self._pagesize
1577             nextfirstid = pagenum * self._pagesize + self._pagesize
1578             if start >= nextfirstid:
1579                 continue
1580
1581             page_results = list(self._pagefunc(pagenum))
1582
1583             startv = (
1584                 start % self._pagesize
1585                 if firstid <= start < nextfirstid
1586                 else 0)
1587
1588             endv = (
1589                 ((end - 1) % self._pagesize) + 1
1590                 if (end is not None and firstid <= end <= nextfirstid)
1591                 else None)
1592
1593             if startv != 0 or endv is not None:
1594                 page_results = page_results[startv:endv]
1595             res.extend(page_results)
1596
1597             # A little optimization - if current page is not "full", ie. does
1598             # not contain page_size videos then we can assume that this page
1599             # is the last one - there are no more ids on further pages -
1600             # i.e. no need to query again.
1601             if len(page_results) + startv < self._pagesize:
1602                 break
1603
1604             # If we got the whole page, but the next page is not interesting,
1605             # break out early as well
1606             if end == nextfirstid:
1607                 break
1608         return res
1609
1610
1611 class InAdvancePagedList(PagedList):
1612     def __init__(self, pagefunc, pagecount, pagesize):
1613         self._pagefunc = pagefunc
1614         self._pagecount = pagecount
1615         self._pagesize = pagesize
1616
1617     def getslice(self, start=0, end=None):
1618         res = []
1619         start_page = start // self._pagesize
1620         end_page = (
1621             self._pagecount if end is None else (end // self._pagesize + 1))
1622         skip_elems = start - start_page * self._pagesize
1623         only_more = None if end is None else end - start
1624         for pagenum in range(start_page, end_page):
1625             page = list(self._pagefunc(pagenum))
1626             if skip_elems:
1627                 page = page[skip_elems:]
1628                 skip_elems = None
1629             if only_more is not None:
1630                 if len(page) < only_more:
1631                     only_more -= len(page)
1632                 else:
1633                     page = page[:only_more]
1634                     res.extend(page)
1635                     break
1636             res.extend(page)
1637         return res
1638
1639
1640 def uppercase_escape(s):
1641     unicode_escape = codecs.getdecoder('unicode_escape')
1642     return re.sub(
1643         r'\\U[0-9a-fA-F]{8}',
1644         lambda m: unicode_escape(m.group(0))[0],
1645         s)
1646
1647
1648 def lowercase_escape(s):
1649     unicode_escape = codecs.getdecoder('unicode_escape')
1650     return re.sub(
1651         r'\\u[0-9a-fA-F]{4}',
1652         lambda m: unicode_escape(m.group(0))[0],
1653         s)
1654
1655
1656 def escape_rfc3986(s):
1657     """Escape non-ASCII characters as suggested by RFC 3986"""
1658     if sys.version_info < (3, 0) and isinstance(s, compat_str):
1659         s = s.encode('utf-8')
1660     return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1661
1662
1663 def escape_url(url):
1664     """Escape URL as suggested by RFC 3986"""
1665     url_parsed = compat_urllib_parse_urlparse(url)
1666     return url_parsed._replace(
1667         path=escape_rfc3986(url_parsed.path),
1668         params=escape_rfc3986(url_parsed.params),
1669         query=escape_rfc3986(url_parsed.query),
1670         fragment=escape_rfc3986(url_parsed.fragment)
1671     ).geturl()
1672
1673 try:
1674     struct.pack('!I', 0)
1675 except TypeError:
1676     # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1677     def struct_pack(spec, *args):
1678         if isinstance(spec, compat_str):
1679             spec = spec.encode('ascii')
1680         return struct.pack(spec, *args)
1681
1682     def struct_unpack(spec, *args):
1683         if isinstance(spec, compat_str):
1684             spec = spec.encode('ascii')
1685         return struct.unpack(spec, *args)
1686 else:
1687     struct_pack = struct.pack
1688     struct_unpack = struct.unpack
1689
1690
1691 def read_batch_urls(batch_fd):
1692     def fixup(url):
1693         if not isinstance(url, compat_str):
1694             url = url.decode('utf-8', 'replace')
1695         BOM_UTF8 = '\xef\xbb\xbf'
1696         if url.startswith(BOM_UTF8):
1697             url = url[len(BOM_UTF8):]
1698         url = url.strip()
1699         if url.startswith(('#', ';', ']')):
1700             return False
1701         return url
1702
1703     with contextlib.closing(batch_fd) as fd:
1704         return [url for url in map(fixup, fd) if url]
1705
1706
1707 def urlencode_postdata(*args, **kargs):
1708     return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
1709
1710
1711 def encode_dict(d, encoding='utf-8'):
1712     def encode(v):
1713         return v.encode(encoding) if isinstance(v, compat_basestring) else v
1714     return dict((encode(k), encode(v)) for k, v in d.items())
1715
1716
1717 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
1718     return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
1719
1720
1721 US_RATINGS = {
1722     'G': 0,
1723     'PG': 10,
1724     'PG-13': 13,
1725     'R': 16,
1726     'NC': 18,
1727 }
1728
1729
1730 def parse_age_limit(s):
1731     if s is None:
1732         return None
1733     m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1734     return int(m.group('age')) if m else US_RATINGS.get(s, None)
1735
1736
1737 def strip_jsonp(code):
1738     return re.sub(
1739         r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1740
1741
1742 def js_to_json(code):
1743     def fix_kv(m):
1744         v = m.group(0)
1745         if v in ('true', 'false', 'null'):
1746             return v
1747         if v.startswith('"'):
1748             v = re.sub(r"\\'", "'", v[1:-1])
1749         elif v.startswith("'"):
1750             v = v[1:-1]
1751             v = re.sub(r"\\\\|\\'|\"", lambda m: {
1752                 '\\\\': '\\\\',
1753                 "\\'": "'",
1754                 '"': '\\"',
1755             }[m.group(0)], v)
1756         return '"%s"' % v
1757
1758     res = re.sub(r'''(?x)
1759         "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
1760         '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
1761         [a-zA-Z_][.a-zA-Z_0-9]*
1762         ''', fix_kv, code)
1763     res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
1764     return res
1765
1766
1767 def qualities(quality_ids):
1768     """ Get a numeric quality value out of a list of possible values """
1769     def q(qid):
1770         try:
1771             return quality_ids.index(qid)
1772         except ValueError:
1773             return -1
1774     return q
1775
1776
1777 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1778
1779
1780 def limit_length(s, length):
1781     """ Add ellipses to overly long strings """
1782     if s is None:
1783         return None
1784     ELLIPSES = '...'
1785     if len(s) > length:
1786         return s[:length - len(ELLIPSES)] + ELLIPSES
1787     return s
1788
1789
1790 def version_tuple(v):
1791     return tuple(int(e) for e in re.split(r'[-.]', v))
1792
1793
1794 def is_outdated_version(version, limit, assume_new=True):
1795     if not version:
1796         return not assume_new
1797     try:
1798         return version_tuple(version) < version_tuple(limit)
1799     except ValueError:
1800         return not assume_new
1801
1802
1803 def ytdl_is_updateable():
1804     """ Returns if youtube-dl can be updated with -U """
1805     from zipimport import zipimporter
1806
1807     return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
1808
1809
1810 def args_to_str(args):
1811     # Get a short string representation for a subprocess command
1812     return ' '.join(shlex_quote(a) for a in args)
1813
1814
1815 def error_to_compat_str(err):
1816     err_str = str(err)
1817     # On python 2 error byte string must be decoded with proper
1818     # encoding rather than ascii
1819     if sys.version_info[0] < 3:
1820         err_str = err_str.decode(preferredencoding())
1821     return err_str
1822
1823
1824 def mimetype2ext(mt):
1825     _, _, res = mt.rpartition('/')
1826
1827     return {
1828         'x-ms-wmv': 'wmv',
1829         'x-mp4-fragmented': 'mp4',
1830         'ttml+xml': 'ttml',
1831     }.get(res, res)
1832
1833
1834 def urlhandle_detect_ext(url_handle):
1835     try:
1836         url_handle.headers
1837         getheader = lambda h: url_handle.headers[h]
1838     except AttributeError:  # Python < 3
1839         getheader = url_handle.info().getheader
1840
1841     cd = getheader('Content-Disposition')
1842     if cd:
1843         m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1844         if m:
1845             e = determine_ext(m.group('filename'), default_ext=None)
1846             if e:
1847                 return e
1848
1849     return mimetype2ext(getheader('Content-Type'))
1850
1851
1852 def encode_data_uri(data, mime_type):
1853     return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
1854
1855
1856 def age_restricted(content_limit, age_limit):
1857     """ Returns True iff the content should be blocked """
1858
1859     if age_limit is None:  # No limit set
1860         return False
1861     if content_limit is None:
1862         return False  # Content available for everyone
1863     return age_limit < content_limit
1864
1865
1866 def is_html(first_bytes):
1867     """ Detect whether a file contains HTML by examining its first bytes. """
1868
1869     BOMS = [
1870         (b'\xef\xbb\xbf', 'utf-8'),
1871         (b'\x00\x00\xfe\xff', 'utf-32-be'),
1872         (b'\xff\xfe\x00\x00', 'utf-32-le'),
1873         (b'\xff\xfe', 'utf-16-le'),
1874         (b'\xfe\xff', 'utf-16-be'),
1875     ]
1876     for bom, enc in BOMS:
1877         if first_bytes.startswith(bom):
1878             s = first_bytes[len(bom):].decode(enc, 'replace')
1879             break
1880     else:
1881         s = first_bytes.decode('utf-8', 'replace')
1882
1883     return re.match(r'^\s*<', s)
1884
1885
1886 def determine_protocol(info_dict):
1887     protocol = info_dict.get('protocol')
1888     if protocol is not None:
1889         return protocol
1890
1891     url = info_dict['url']
1892     if url.startswith('rtmp'):
1893         return 'rtmp'
1894     elif url.startswith('mms'):
1895         return 'mms'
1896     elif url.startswith('rtsp'):
1897         return 'rtsp'
1898
1899     ext = determine_ext(url)
1900     if ext == 'm3u8':
1901         return 'm3u8'
1902     elif ext == 'f4m':
1903         return 'f4m'
1904
1905     return compat_urllib_parse_urlparse(url).scheme
1906
1907
1908 def render_table(header_row, data):
1909     """ Render a list of rows, each as a list of values """
1910     table = [header_row] + data
1911     max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
1912     format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
1913     return '\n'.join(format_str % tuple(row) for row in table)
1914
1915
1916 def _match_one(filter_part, dct):
1917     COMPARISON_OPERATORS = {
1918         '<': operator.lt,
1919         '<=': operator.le,
1920         '>': operator.gt,
1921         '>=': operator.ge,
1922         '=': operator.eq,
1923         '!=': operator.ne,
1924     }
1925     operator_rex = re.compile(r'''(?x)\s*
1926         (?P<key>[a-z_]+)
1927         \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1928         (?:
1929             (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1930             (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1931         )
1932         \s*$
1933         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1934     m = operator_rex.search(filter_part)
1935     if m:
1936         op = COMPARISON_OPERATORS[m.group('op')]
1937         if m.group('strval') is not None:
1938             if m.group('op') not in ('=', '!='):
1939                 raise ValueError(
1940                     'Operator %s does not support string values!' % m.group('op'))
1941             comparison_value = m.group('strval')
1942         else:
1943             try:
1944                 comparison_value = int(m.group('intval'))
1945             except ValueError:
1946                 comparison_value = parse_filesize(m.group('intval'))
1947                 if comparison_value is None:
1948                     comparison_value = parse_filesize(m.group('intval') + 'B')
1949                 if comparison_value is None:
1950                     raise ValueError(
1951                         'Invalid integer value %r in filter part %r' % (
1952                             m.group('intval'), filter_part))
1953         actual_value = dct.get(m.group('key'))
1954         if actual_value is None:
1955             return m.group('none_inclusive')
1956         return op(actual_value, comparison_value)
1957
1958     UNARY_OPERATORS = {
1959         '': lambda v: v is not None,
1960         '!': lambda v: v is None,
1961     }
1962     operator_rex = re.compile(r'''(?x)\s*
1963         (?P<op>%s)\s*(?P<key>[a-z_]+)
1964         \s*$
1965         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1966     m = operator_rex.search(filter_part)
1967     if m:
1968         op = UNARY_OPERATORS[m.group('op')]
1969         actual_value = dct.get(m.group('key'))
1970         return op(actual_value)
1971
1972     raise ValueError('Invalid filter part %r' % filter_part)
1973
1974
1975 def match_str(filter_str, dct):
1976     """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
1977
1978     return all(
1979         _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
1980
1981
1982 def match_filter_func(filter_str):
1983     def _match_func(info_dict):
1984         if match_str(filter_str, info_dict):
1985             return None
1986         else:
1987             video_title = info_dict.get('title', info_dict.get('id', 'video'))
1988             return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
1989     return _match_func
1990
1991
1992 def parse_dfxp_time_expr(time_expr):
1993     if not time_expr:
1994         return
1995
1996     mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
1997     if mobj:
1998         return float(mobj.group('time_offset'))
1999
2000     mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
2001     if mobj:
2002         return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
2003
2004
2005 def srt_subtitles_timecode(seconds):
2006     return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
2007
2008
2009 def dfxp2srt(dfxp_data):
2010     _x = functools.partial(xpath_with_ns, ns_map={
2011         'ttml': 'http://www.w3.org/ns/ttml',
2012         'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
2013     })
2014
2015     def parse_node(node):
2016         str_or_empty = functools.partial(str_or_none, default='')
2017
2018         out = str_or_empty(node.text)
2019
2020         for child in node:
2021             if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2022                 out += '\n' + str_or_empty(child.tail)
2023             elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
2024                 out += str_or_empty(parse_node(child))
2025             else:
2026                 out += str_or_empty(xml.etree.ElementTree.tostring(child))
2027
2028         return out
2029
2030     dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
2031     out = []
2032     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
2033
2034     if not paras:
2035         raise ValueError('Invalid dfxp/TTML subtitle')
2036
2037     for para, index in zip(paras, itertools.count(1)):
2038         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
2039         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
2040         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2041         if begin_time is None:
2042             continue
2043         if not end_time:
2044             if not dur:
2045                 continue
2046             end_time = begin_time + dur
2047         out.append('%d\n%s --> %s\n%s\n\n' % (
2048             index,
2049             srt_subtitles_timecode(begin_time),
2050             srt_subtitles_timecode(end_time),
2051             parse_node(para)))
2052
2053     return ''.join(out)
2054
2055
2056 def cli_option(params, command_option, param):
2057     param = params.get(param)
2058     return [command_option, param] if param is not None else []
2059
2060
2061 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2062     param = params.get(param)
2063     assert isinstance(param, bool)
2064     if separator:
2065         return [command_option + separator + (true_value if param else false_value)]
2066     return [command_option, true_value if param else false_value]
2067
2068
2069 def cli_valueless_option(params, command_option, param, expected_value=True):
2070     param = params.get(param)
2071     return [command_option] if param == expected_value else []
2072
2073
2074 def cli_configuration_args(params, param, default=[]):
2075     ex_args = params.get(param)
2076     if ex_args is None:
2077         return default
2078     assert isinstance(ex_args, list)
2079     return ex_args
2080
2081
2082 class ISO639Utils(object):
2083     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2084     _lang_map = {
2085         'aa': 'aar',
2086         'ab': 'abk',
2087         'ae': 'ave',
2088         'af': 'afr',
2089         'ak': 'aka',
2090         'am': 'amh',
2091         'an': 'arg',
2092         'ar': 'ara',
2093         'as': 'asm',
2094         'av': 'ava',
2095         'ay': 'aym',
2096         'az': 'aze',
2097         'ba': 'bak',
2098         'be': 'bel',
2099         'bg': 'bul',
2100         'bh': 'bih',
2101         'bi': 'bis',
2102         'bm': 'bam',
2103         'bn': 'ben',
2104         'bo': 'bod',
2105         'br': 'bre',
2106         'bs': 'bos',
2107         'ca': 'cat',
2108         'ce': 'che',
2109         'ch': 'cha',
2110         'co': 'cos',
2111         'cr': 'cre',
2112         'cs': 'ces',
2113         'cu': 'chu',
2114         'cv': 'chv',
2115         'cy': 'cym',
2116         'da': 'dan',
2117         'de': 'deu',
2118         'dv': 'div',
2119         'dz': 'dzo',
2120         'ee': 'ewe',
2121         'el': 'ell',
2122         'en': 'eng',
2123         'eo': 'epo',
2124         'es': 'spa',
2125         'et': 'est',
2126         'eu': 'eus',
2127         'fa': 'fas',
2128         'ff': 'ful',
2129         'fi': 'fin',
2130         'fj': 'fij',
2131         'fo': 'fao',
2132         'fr': 'fra',
2133         'fy': 'fry',
2134         'ga': 'gle',
2135         'gd': 'gla',
2136         'gl': 'glg',
2137         'gn': 'grn',
2138         'gu': 'guj',
2139         'gv': 'glv',
2140         'ha': 'hau',
2141         'he': 'heb',
2142         'hi': 'hin',
2143         'ho': 'hmo',
2144         'hr': 'hrv',
2145         'ht': 'hat',
2146         'hu': 'hun',
2147         'hy': 'hye',
2148         'hz': 'her',
2149         'ia': 'ina',
2150         'id': 'ind',
2151         'ie': 'ile',
2152         'ig': 'ibo',
2153         'ii': 'iii',
2154         'ik': 'ipk',
2155         'io': 'ido',
2156         'is': 'isl',
2157         'it': 'ita',
2158         'iu': 'iku',
2159         'ja': 'jpn',
2160         'jv': 'jav',
2161         'ka': 'kat',
2162         'kg': 'kon',
2163         'ki': 'kik',
2164         'kj': 'kua',
2165         'kk': 'kaz',
2166         'kl': 'kal',
2167         'km': 'khm',
2168         'kn': 'kan',
2169         'ko': 'kor',
2170         'kr': 'kau',
2171         'ks': 'kas',
2172         'ku': 'kur',
2173         'kv': 'kom',
2174         'kw': 'cor',
2175         'ky': 'kir',
2176         'la': 'lat',
2177         'lb': 'ltz',
2178         'lg': 'lug',
2179         'li': 'lim',
2180         'ln': 'lin',
2181         'lo': 'lao',
2182         'lt': 'lit',
2183         'lu': 'lub',
2184         'lv': 'lav',
2185         'mg': 'mlg',
2186         'mh': 'mah',
2187         'mi': 'mri',
2188         'mk': 'mkd',
2189         'ml': 'mal',
2190         'mn': 'mon',
2191         'mr': 'mar',
2192         'ms': 'msa',
2193         'mt': 'mlt',
2194         'my': 'mya',
2195         'na': 'nau',
2196         'nb': 'nob',
2197         'nd': 'nde',
2198         'ne': 'nep',
2199         'ng': 'ndo',
2200         'nl': 'nld',
2201         'nn': 'nno',
2202         'no': 'nor',
2203         'nr': 'nbl',
2204         'nv': 'nav',
2205         'ny': 'nya',
2206         'oc': 'oci',
2207         'oj': 'oji',
2208         'om': 'orm',
2209         'or': 'ori',
2210         'os': 'oss',
2211         'pa': 'pan',
2212         'pi': 'pli',
2213         'pl': 'pol',
2214         'ps': 'pus',
2215         'pt': 'por',
2216         'qu': 'que',
2217         'rm': 'roh',
2218         'rn': 'run',
2219         'ro': 'ron',
2220         'ru': 'rus',
2221         'rw': 'kin',
2222         'sa': 'san',
2223         'sc': 'srd',
2224         'sd': 'snd',
2225         'se': 'sme',
2226         'sg': 'sag',
2227         'si': 'sin',
2228         'sk': 'slk',
2229         'sl': 'slv',
2230         'sm': 'smo',
2231         'sn': 'sna',
2232         'so': 'som',
2233         'sq': 'sqi',
2234         'sr': 'srp',
2235         'ss': 'ssw',
2236         'st': 'sot',
2237         'su': 'sun',
2238         'sv': 'swe',
2239         'sw': 'swa',
2240         'ta': 'tam',
2241         'te': 'tel',
2242         'tg': 'tgk',
2243         'th': 'tha',
2244         'ti': 'tir',
2245         'tk': 'tuk',
2246         'tl': 'tgl',
2247         'tn': 'tsn',
2248         'to': 'ton',
2249         'tr': 'tur',
2250         'ts': 'tso',
2251         'tt': 'tat',
2252         'tw': 'twi',
2253         'ty': 'tah',
2254         'ug': 'uig',
2255         'uk': 'ukr',
2256         'ur': 'urd',
2257         'uz': 'uzb',
2258         've': 'ven',
2259         'vi': 'vie',
2260         'vo': 'vol',
2261         'wa': 'wln',
2262         'wo': 'wol',
2263         'xh': 'xho',
2264         'yi': 'yid',
2265         'yo': 'yor',
2266         'za': 'zha',
2267         'zh': 'zho',
2268         'zu': 'zul',
2269     }
2270
2271     @classmethod
2272     def short2long(cls, code):
2273         """Convert language code from ISO 639-1 to ISO 639-2/T"""
2274         return cls._lang_map.get(code[:2])
2275
2276     @classmethod
2277     def long2short(cls, code):
2278         """Convert language code from ISO 639-2/T to ISO 639-1"""
2279         for short_name, long_name in cls._lang_map.items():
2280             if long_name == code:
2281                 return short_name
2282
2283
2284 class ISO3166Utils(object):
2285     # From http://data.okfn.org/data/core/country-list
2286     _country_map = {
2287         'AF': 'Afghanistan',
2288         'AX': 'Åland Islands',
2289         'AL': 'Albania',
2290         'DZ': 'Algeria',
2291         'AS': 'American Samoa',
2292         'AD': 'Andorra',
2293         'AO': 'Angola',
2294         'AI': 'Anguilla',
2295         'AQ': 'Antarctica',
2296         'AG': 'Antigua and Barbuda',
2297         'AR': 'Argentina',
2298         'AM': 'Armenia',
2299         'AW': 'Aruba',
2300         'AU': 'Australia',
2301         'AT': 'Austria',
2302         'AZ': 'Azerbaijan',
2303         'BS': 'Bahamas',
2304         'BH': 'Bahrain',
2305         'BD': 'Bangladesh',
2306         'BB': 'Barbados',
2307         'BY': 'Belarus',
2308         'BE': 'Belgium',
2309         'BZ': 'Belize',
2310         'BJ': 'Benin',
2311         'BM': 'Bermuda',
2312         'BT': 'Bhutan',
2313         'BO': 'Bolivia, Plurinational State of',
2314         'BQ': 'Bonaire, Sint Eustatius and Saba',
2315         'BA': 'Bosnia and Herzegovina',
2316         'BW': 'Botswana',
2317         'BV': 'Bouvet Island',
2318         'BR': 'Brazil',
2319         'IO': 'British Indian Ocean Territory',
2320         'BN': 'Brunei Darussalam',
2321         'BG': 'Bulgaria',
2322         'BF': 'Burkina Faso',
2323         'BI': 'Burundi',
2324         'KH': 'Cambodia',
2325         'CM': 'Cameroon',
2326         'CA': 'Canada',
2327         'CV': 'Cape Verde',
2328         'KY': 'Cayman Islands',
2329         'CF': 'Central African Republic',
2330         'TD': 'Chad',
2331         'CL': 'Chile',
2332         'CN': 'China',
2333         'CX': 'Christmas Island',
2334         'CC': 'Cocos (Keeling) Islands',
2335         'CO': 'Colombia',
2336         'KM': 'Comoros',
2337         'CG': 'Congo',
2338         'CD': 'Congo, the Democratic Republic of the',
2339         'CK': 'Cook Islands',
2340         'CR': 'Costa Rica',
2341         'CI': 'Côte d\'Ivoire',
2342         'HR': 'Croatia',
2343         'CU': 'Cuba',
2344         'CW': 'Curaçao',
2345         'CY': 'Cyprus',
2346         'CZ': 'Czech Republic',
2347         'DK': 'Denmark',
2348         'DJ': 'Djibouti',
2349         'DM': 'Dominica',
2350         'DO': 'Dominican Republic',
2351         'EC': 'Ecuador',
2352         'EG': 'Egypt',
2353         'SV': 'El Salvador',
2354         'GQ': 'Equatorial Guinea',
2355         'ER': 'Eritrea',
2356         'EE': 'Estonia',
2357         'ET': 'Ethiopia',
2358         'FK': 'Falkland Islands (Malvinas)',
2359         'FO': 'Faroe Islands',
2360         'FJ': 'Fiji',
2361         'FI': 'Finland',
2362         'FR': 'France',
2363         'GF': 'French Guiana',
2364         'PF': 'French Polynesia',
2365         'TF': 'French Southern Territories',
2366         'GA': 'Gabon',
2367         'GM': 'Gambia',
2368         'GE': 'Georgia',
2369         'DE': 'Germany',
2370         'GH': 'Ghana',
2371         'GI': 'Gibraltar',
2372         'GR': 'Greece',
2373         'GL': 'Greenland',
2374         'GD': 'Grenada',
2375         'GP': 'Guadeloupe',
2376         'GU': 'Guam',
2377         'GT': 'Guatemala',
2378         'GG': 'Guernsey',
2379         'GN': 'Guinea',
2380         'GW': 'Guinea-Bissau',
2381         'GY': 'Guyana',
2382         'HT': 'Haiti',
2383         'HM': 'Heard Island and McDonald Islands',
2384         'VA': 'Holy See (Vatican City State)',
2385         'HN': 'Honduras',
2386         'HK': 'Hong Kong',
2387         'HU': 'Hungary',
2388         'IS': 'Iceland',
2389         'IN': 'India',
2390         'ID': 'Indonesia',
2391         'IR': 'Iran, Islamic Republic of',
2392         'IQ': 'Iraq',
2393         'IE': 'Ireland',
2394         'IM': 'Isle of Man',
2395         'IL': 'Israel',
2396         'IT': 'Italy',
2397         'JM': 'Jamaica',
2398         'JP': 'Japan',
2399         'JE': 'Jersey',
2400         'JO': 'Jordan',
2401         'KZ': 'Kazakhstan',
2402         'KE': 'Kenya',
2403         'KI': 'Kiribati',
2404         'KP': 'Korea, Democratic People\'s Republic of',
2405         'KR': 'Korea, Republic of',
2406         'KW': 'Kuwait',
2407         'KG': 'Kyrgyzstan',
2408         'LA': 'Lao People\'s Democratic Republic',
2409         'LV': 'Latvia',
2410         'LB': 'Lebanon',
2411         'LS': 'Lesotho',
2412         'LR': 'Liberia',
2413         'LY': 'Libya',
2414         'LI': 'Liechtenstein',
2415         'LT': 'Lithuania',
2416         'LU': 'Luxembourg',
2417         'MO': 'Macao',
2418         'MK': 'Macedonia, the Former Yugoslav Republic of',
2419         'MG': 'Madagascar',
2420         'MW': 'Malawi',
2421         'MY': 'Malaysia',
2422         'MV': 'Maldives',
2423         'ML': 'Mali',
2424         'MT': 'Malta',
2425         'MH': 'Marshall Islands',
2426         'MQ': 'Martinique',
2427         'MR': 'Mauritania',
2428         'MU': 'Mauritius',
2429         'YT': 'Mayotte',
2430         'MX': 'Mexico',
2431         'FM': 'Micronesia, Federated States of',
2432         'MD': 'Moldova, Republic of',
2433         'MC': 'Monaco',
2434         'MN': 'Mongolia',
2435         'ME': 'Montenegro',
2436         'MS': 'Montserrat',
2437         'MA': 'Morocco',
2438         'MZ': 'Mozambique',
2439         'MM': 'Myanmar',
2440         'NA': 'Namibia',
2441         'NR': 'Nauru',
2442         'NP': 'Nepal',
2443         'NL': 'Netherlands',
2444         'NC': 'New Caledonia',
2445         'NZ': 'New Zealand',
2446         'NI': 'Nicaragua',
2447         'NE': 'Niger',
2448         'NG': 'Nigeria',
2449         'NU': 'Niue',
2450         'NF': 'Norfolk Island',
2451         'MP': 'Northern Mariana Islands',
2452         'NO': 'Norway',
2453         'OM': 'Oman',
2454         'PK': 'Pakistan',
2455         'PW': 'Palau',
2456         'PS': 'Palestine, State of',
2457         'PA': 'Panama',
2458         'PG': 'Papua New Guinea',
2459         'PY': 'Paraguay',
2460         'PE': 'Peru',
2461         'PH': 'Philippines',
2462         'PN': 'Pitcairn',
2463         'PL': 'Poland',
2464         'PT': 'Portugal',
2465         'PR': 'Puerto Rico',
2466         'QA': 'Qatar',
2467         'RE': 'Réunion',
2468         'RO': 'Romania',
2469         'RU': 'Russian Federation',
2470         'RW': 'Rwanda',
2471         'BL': 'Saint Barthélemy',
2472         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2473         'KN': 'Saint Kitts and Nevis',
2474         'LC': 'Saint Lucia',
2475         'MF': 'Saint Martin (French part)',
2476         'PM': 'Saint Pierre and Miquelon',
2477         'VC': 'Saint Vincent and the Grenadines',
2478         'WS': 'Samoa',
2479         'SM': 'San Marino',
2480         'ST': 'Sao Tome and Principe',
2481         'SA': 'Saudi Arabia',
2482         'SN': 'Senegal',
2483         'RS': 'Serbia',
2484         'SC': 'Seychelles',
2485         'SL': 'Sierra Leone',
2486         'SG': 'Singapore',
2487         'SX': 'Sint Maarten (Dutch part)',
2488         'SK': 'Slovakia',
2489         'SI': 'Slovenia',
2490         'SB': 'Solomon Islands',
2491         'SO': 'Somalia',
2492         'ZA': 'South Africa',
2493         'GS': 'South Georgia and the South Sandwich Islands',
2494         'SS': 'South Sudan',
2495         'ES': 'Spain',
2496         'LK': 'Sri Lanka',
2497         'SD': 'Sudan',
2498         'SR': 'Suriname',
2499         'SJ': 'Svalbard and Jan Mayen',
2500         'SZ': 'Swaziland',
2501         'SE': 'Sweden',
2502         'CH': 'Switzerland',
2503         'SY': 'Syrian Arab Republic',
2504         'TW': 'Taiwan, Province of China',
2505         'TJ': 'Tajikistan',
2506         'TZ': 'Tanzania, United Republic of',
2507         'TH': 'Thailand',
2508         'TL': 'Timor-Leste',
2509         'TG': 'Togo',
2510         'TK': 'Tokelau',
2511         'TO': 'Tonga',
2512         'TT': 'Trinidad and Tobago',
2513         'TN': 'Tunisia',
2514         'TR': 'Turkey',
2515         'TM': 'Turkmenistan',
2516         'TC': 'Turks and Caicos Islands',
2517         'TV': 'Tuvalu',
2518         'UG': 'Uganda',
2519         'UA': 'Ukraine',
2520         'AE': 'United Arab Emirates',
2521         'GB': 'United Kingdom',
2522         'US': 'United States',
2523         'UM': 'United States Minor Outlying Islands',
2524         'UY': 'Uruguay',
2525         'UZ': 'Uzbekistan',
2526         'VU': 'Vanuatu',
2527         'VE': 'Venezuela, Bolivarian Republic of',
2528         'VN': 'Viet Nam',
2529         'VG': 'Virgin Islands, British',
2530         'VI': 'Virgin Islands, U.S.',
2531         'WF': 'Wallis and Futuna',
2532         'EH': 'Western Sahara',
2533         'YE': 'Yemen',
2534         'ZM': 'Zambia',
2535         'ZW': 'Zimbabwe',
2536     }
2537
2538     @classmethod
2539     def short2full(cls, code):
2540         """Convert an ISO 3166-2 country code to the corresponding full name"""
2541         return cls._country_map.get(code.upper())
2542
2543
2544 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2545     def __init__(self, proxies=None):
2546         # Set default handlers
2547         for type in ('http', 'https'):
2548             setattr(self, '%s_open' % type,
2549                     lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2550                         meth(r, proxy, type))
2551         return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2552
2553     def proxy_open(self, req, proxy, type):
2554         req_proxy = req.headers.get('Ytdl-request-proxy')
2555         if req_proxy is not None:
2556             proxy = req_proxy
2557             del req.headers['Ytdl-request-proxy']
2558
2559         if proxy == '__noproxy__':
2560             return None  # No Proxy
2561         return compat_urllib_request.ProxyHandler.proxy_open(
2562             self, req, proxy, type)