_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import contextlib
  11 import ctypes
  12 import datetime
  13 import email.utils
  14 import errno
  15 import functools
  16 import gzip
  17 import io
  18 import itertools
  19 import json
  20 import locale
  21 import math
  22 import operator
  23 import os
  24 import pipes
  25 import platform
  26 import random
  27 import re
  28 import socket
  29 import ssl
  30 import subprocess
  31 import sys
  32 import tempfile
  33 import traceback
  34 import xml.etree.ElementTree
  35 import zlib
  36
  37 from .compat import (
  38     compat_HTMLParser,
  39     compat_basestring,
  40     compat_chr,
  41     compat_etree_fromstring,
  42     compat_expanduser,
  43     compat_html_entities,
  44     compat_html_entities_html5,
  45     compat_http_client,
  46     compat_kwargs,
  47     compat_os_name,
  48     compat_parse_qs,
  49     compat_shlex_quote,
  50     compat_socket_create_connection,
  51     compat_str,
  52     compat_struct_pack,
  53     compat_struct_unpack,
  54     compat_urllib_error,
  55     compat_urllib_parse,
  56     compat_urllib_parse_urlencode,
  57     compat_urllib_parse_urlparse,
  58     compat_urllib_parse_unquote_plus,
  59     compat_urllib_request,
  60     compat_urlparse,
  61     compat_xpath,
  62 )
  63
  64 from .socks import (
  65     ProxyType,
  66     sockssocket,
  67 )
  68
  69
  70 def register_socks_protocols():
  71     # "Register" SOCKS protocols
  72     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  73     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  74     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  75         if scheme not in compat_urlparse.uses_netloc:
  76             compat_urlparse.uses_netloc.append(scheme)
  77
  78
# This is not clearly defined otherwise
# (the type object is obtained from an actual compiled pattern)
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers imitating a desktop browser.
# NOTE(review): presumably these are the "standard headers" that
# YoutubeDLHandler adds to every request - confirm against http_request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Alternative User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Sentinel for "no default supplied"; lets None be passed as a real default
# (see the default= parameter of the xpath_* helpers)
NO_DEFAULT = object()
  97
# Full English month names, January first
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names per language code, each list ordered January..December
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# Media and manifest file extensions (without the leading dot)
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement; the bracketed items in
# the chain are the multi-character replacements ('AE', 'OE', 'ss', 'ae', 'oe'),
# every other replacement is the single character at the same position.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
 128
# strptime()-style format strings that are unambiguous about day/month order.
# Ordinal day suffixes are only covered for 'st', 'nd' and 'th'.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
)

# The common formats plus numeric formats read as day before month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# The common formats plus numeric formats read as month before day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Captures four groups from text shaped like }('...',N,N,'...'.split('|')
# NOTE(review): this looks like the argument list of packed JavaScript - confirm
# against the code that consumes it.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
 183
 184
 185 def preferredencoding():
 186     """Get preferred encoding.
 187
 188     Returns the best encoding scheme for the system, based on
 189     locale.getpreferredencoding() and some further tweaks.
 190     """
 191     try:
 192         pref = locale.getpreferredencoding()
 193         'TEST'.encode(pref)
 194     except Exception:
 195         pref = 'UTF-8'
 196
 197     return pref
 198
 199
 200 def write_json_file(obj, fn):
 201     """ Encode obj as JSON and write it to fn, atomically if possible """
 202
 203     fn = encodeFilename(fn)
 204     if sys.version_info < (3, 0) and sys.platform != 'win32':
 205         encoding = get_filesystem_encoding()
 206         # os.path.basename returns a bytes object, but NamedTemporaryFile
 207         # will fail if the filename contains non ascii characters unless we
 208         # use a unicode object
 209         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 210         # the same for os.path.dirname
 211         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 212     else:
 213         path_basename = os.path.basename
 214         path_dirname = os.path.dirname
 215
 216     args = {
 217         'suffix': '.tmp',
 218         'prefix': path_basename(fn) + '.',
 219         'dir': path_dirname(fn),
 220         'delete': False,
 221     }
 222
 223     # In Python 2.x, json.dump expects a bytestream.
 224     # In Python 3.x, it writes to a character stream
 225     if sys.version_info < (3, 0):
 226         args['mode'] = 'wb'
 227     else:
 228         args.update({
 229             'mode': 'w',
 230             'encoding': 'utf-8',
 231         })
 232
 233     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 234
 235     try:
 236         with tf:
 237             json.dump(obj, tf)
 238         if sys.platform == 'win32':
 239             # Need to remove existing file on Windows, else os.rename raises
 240             # WindowsError or FileExistsError.
 241             try:
 242                 os.unlink(fn)
 243             except OSError:
 244                 pass
 245         os.rename(tf.name, fn)
 246     except Exception:
 247         try:
 248             os.remove(tf.name)
 249         except OSError:
 250             pass
 251         raise
 252
 253
 254 if sys.version_info >= (2, 7):
 255     def find_xpath_attr(node, xpath, key, val=None):
 256         """ Find the xpath xpath[@key=val] """
 257         assert re.match(r'^[a-zA-Z_-]+$', key)
 258         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 259         return node.find(expr)
 260 else:
 261     def find_xpath_attr(node, xpath, key, val=None):
 262         for f in node.findall(compat_xpath(xpath)):
 263             if key not in f.attrib:
 264                 continue
 265             if val is None or f.attrib.get(key) == val:
 266                 return f
 267         return None
 268
 269 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 270 # the namespace parameter
 271
 272
 273 def xpath_with_ns(path, ns_map):
 274     components = [c.split(':') for c in path.split('/')]
 275     replaced = []
 276     for c in components:
 277         if len(c) == 1:
 278             replaced.append(c[0])
 279         else:
 280             ns, tag = c
 281             replaced.append('{%s}%s' % (ns_map[ns], tag))
 282     return '/'.join(replaced)
 283
 284
 285 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 286     def _find_xpath(xpath):
 287         return node.find(compat_xpath(xpath))
 288
 289     if isinstance(xpath, (str, compat_str)):
 290         n = _find_xpath(xpath)
 291     else:
 292         for xp in xpath:
 293             n = _find_xpath(xp)
 294             if n is not None:
 295                 break
 296
 297     if n is None:
 298         if default is not NO_DEFAULT:
 299             return default
 300         elif fatal:
 301             name = xpath if name is None else name
 302             raise ExtractorError('Could not find XML element %s' % name)
 303         else:
 304             return None
 305     return n
 306
 307
 308 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 309     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 310     if n is None or n == default:
 311         return n
 312     if n.text is None:
 313         if default is not NO_DEFAULT:
 314             return default
 315         elif fatal:
 316             name = xpath if name is None else name
 317             raise ExtractorError('Could not find XML element\'s text %s' % name)
 318         else:
 319             return None
 320     return n.text
 321
 322
 323 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 324     n = find_xpath_attr(node, xpath, key)
 325     if n is None:
 326         if default is not NO_DEFAULT:
 327             return default
 328         elif fatal:
 329             name = '%s[@%s]' % (xpath, key) if name is None else name
 330             raise ExtractorError('Could not find XML attribute %s' % name)
 331         else:
 332             return None
 333     return n.attrib[key]
 334
 335
 336 def get_element_by_id(id, html):
 337     """Return the content of the tag with the specified ID in the passed HTML document"""
 338     return get_element_by_attribute('id', id, html)
 339
 340
 341 def get_element_by_class(class_name, html):
 342     """Return the content of the first tag with the specified class in the passed HTML document"""
 343     retval = get_elements_by_class(class_name, html)
 344     return retval[0] if retval else None
 345
 346
 347 def get_element_by_attribute(attribute, value, html, escape_value=True):
 348     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 349     return retval[0] if retval else None
 350
 351
 352 def get_elements_by_class(class_name, html):
 353     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 354     return get_elements_by_attribute(
 355         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 356         html, escape_value=False)
 357
 358
 359 def get_elements_by_attribute(attribute, value, html, escape_value=True):
 360     """Return the content of the tag with the specified attribute in the passed HTML document"""
 361
 362     value = re.escape(value) if escape_value else value
 363
 364     retlist = []
 365     for m in re.finditer(r'''(?xs)
 366         <([a-zA-Z0-9:._-]+)
 367          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 368          \s+%s=['"]?%s['"]?
 369          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 370         \s*>
 371         (?P<content>.*?)
 372         </\1>
 373     ''' % (re.escape(attribute), value), html):
 374         res = m.group('content')
 375
 376         if res.startswith('"') or res.startswith("'"):
 377             res = res[1:-1]
 378
 379         retlist.append(unescapeHTML(res))
 380
 381     return retlist
 382
 383
 384 class HTMLAttributeParser(compat_HTMLParser):
 385     """Trivial HTML parser to gather the attributes for a single element"""
 386     def __init__(self):
 387         self.attrs = {}
 388         compat_HTMLParser.__init__(self)
 389
 390     def handle_starttag(self, tag, attrs):
 391         self.attrs = dict(attrs)
 392
 393
 394 def extract_attributes(html_element):
 395     """Given a string for an HTML element such as
 396     <el
 397          a="foo" B="bar" c="&98;az" d=boz
 398          empty= noval entity="&amp;"
 399          sq='"' dq="'"
 400     >
 401     Decode and return a dictionary of attributes.
 402     {
 403         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 404         'empty': '', 'noval': None, 'entity': '&',
 405         'sq': '"', 'dq': '\''
 406     }.
 407     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 408     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 409     """
 410     parser = HTMLAttributeParser()
 411     parser.feed(html_element)
 412     parser.close()
 413     return parser.attrs
 414
 415
 416 def clean_html(html):
 417     """Clean an HTML snippet into a readable string"""
 418
 419     if html is None:  # Convenience for sanitizing descriptions etc.
 420         return html
 421
 422     # Newline vs <br />
 423     html = html.replace('\n', ' ')
 424     html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
 425     html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 426     # Strip html tags
 427     html = re.sub('<.*?>', '', html)
 428     # Replace html entities
 429     html = unescapeHTML(html)
 430     return html.strip()
 431
 432
 433 def sanitize_open(filename, open_mode):
 434     """Try to open the given filename, and slightly tweak it if this fails.
 435
 436     Attempts to open the given filename. If this fails, it tries to change
 437     the filename slightly, step by step, until it's either able to open it
 438     or it fails and raises a final exception, like the standard open()
 439     function.
 440
 441     It returns the tuple (stream, definitive_file_name).
 442     """
 443     try:
 444         if filename == '-':
 445             if sys.platform == 'win32':
 446                 import msvcrt
 447                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 448             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 449         stream = open(encodeFilename(filename), open_mode)
 450         return (stream, filename)
 451     except (IOError, OSError) as err:
 452         if err.errno in (errno.EACCES,):
 453             raise
 454
 455         # In case of error, try to remove win32 forbidden chars
 456         alt_filename = sanitize_path(filename)
 457         if alt_filename == filename:
 458             raise
 459         else:
 460             # An exception here should be caught in the caller
 461             stream = open(encodeFilename(alt_filename), open_mode)
 462             return (stream, alt_filename)
 463
 464
 465 def timeconvert(timestr):
 466     """Convert RFC 2822 defined time string into system timestamp"""
 467     timestamp = None
 468     timetuple = email.utils.parsedate_tz(timestr)
 469     if timetuple is not None:
 470         timestamp = email.utils.mktime_tz(timetuple)
 471     return timestamp
 472
 473
 474 def sanitize_filename(s, restricted=False, is_id=False):
 475     """Sanitizes a string so it could be used as part of a filename.
 476     If restricted is set, use a stricter subset of allowed characters.
 477     Set is_id if this is not an arbitrary string, but an ID that should be kept
 478     if possible.
 479     """
 480     def replace_insane(char):
 481         if restricted and char in ACCENT_CHARS:
 482             return ACCENT_CHARS[char]
 483         if char == '?' or ord(char) < 32 or ord(char) == 127:
 484             return ''
 485         elif char == '"':
 486             return '' if restricted else '\''
 487         elif char == ':':
 488             return '_-' if restricted else ' -'
 489         elif char in '\\/|*<>':
 490             return '_'
 491         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 492             return '_'
 493         if restricted and ord(char) > 127:
 494             return '_'
 495         return char
 496
 497     # Handle timestamps
 498     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 499     result = ''.join(map(replace_insane, s))
 500     if not is_id:
 501         while '__' in result:
 502             result = result.replace('__', '_')
 503         result = result.strip('_')
 504         # Common case of "Foreign band name - English song title"
 505         if restricted and result.startswith('-_'):
 506             result = result[2:]
 507         if result.startswith('-'):
 508             result = '_' + result[len('-'):]
 509         result = result.lstrip('.')
 510         if not result:
 511             result = '_'
 512     return result
 513
 514
 515 def sanitize_path(s):
 516     """Sanitizes and normalizes path on Windows"""
 517     if sys.platform != 'win32':
 518         return s
 519     drive_or_unc, _ = os.path.splitdrive(s)
 520     if sys.version_info < (2, 7) and not drive_or_unc:
 521         drive_or_unc, _ = os.path.splitunc(s)
 522     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 523     if drive_or_unc:
 524         norm_path.pop(0)
 525     sanitized_path = [
 526         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 527         for path_part in norm_path]
 528     if drive_or_unc:
 529         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 530     return os.path.join(*sanitized_path)
 531
 532
 533 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
 534 # unwanted failures due to missing protocol
 535 def sanitize_url(url):
 536     return 'http:%s' % url if url.startswith('//') else url
 537
 538
 539 def sanitized_Request(url, *args, **kwargs):
 540     return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
 541
 542
 543 def expand_path(s):
 544     """Expand shell variables and ~"""
 545     return os.path.expandvars(compat_expanduser(s))
 546
 547
 548 def orderedSet(iterable):
 549     """ Remove all duplicates from the input iterable """
 550     res = []
 551     for el in iterable:
 552         if el not in res:
 553             res.append(el)
 554     return res
 555
 556
 557 def _htmlentity_transform(entity_with_semicolon):
 558     """Transforms an HTML entity to a character."""
 559     entity = entity_with_semicolon[:-1]
 560
 561     # Known non-numeric HTML entity
 562     if entity in compat_html_entities.name2codepoint:
 563         return compat_chr(compat_html_entities.name2codepoint[entity])
 564
 565     # TODO: HTML5 allows entities without a semicolon. For example,
 566     # '&Eacuteric' should be decoded as 'Éric'.
 567     if entity_with_semicolon in compat_html_entities_html5:
 568         return compat_html_entities_html5[entity_with_semicolon]
 569
 570     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 571     if mobj is not None:
 572         numstr = mobj.group(1)
 573         if numstr.startswith('x'):
 574             base = 16
 575             numstr = '0%s' % numstr
 576         else:
 577             base = 10
 578         # See https://github.com/rg3/youtube-dl/issues/7518
 579         try:
 580             return compat_chr(int(numstr, base))
 581         except ValueError:
 582             pass
 583
 584     # Unknown entity in name, return its literal representation
 585     return '&%s;' % entity
 586
 587
 588 def unescapeHTML(s):
 589     if s is None:
 590         return None
 591     assert type(s) == compat_str
 592
 593     return re.sub(
 594         r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 595
 596
 597 def get_subprocess_encoding():
 598     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 599         # For subprocess calls, encode with locale encoding
 600         # Refer to http://stackoverflow.com/a/9951851/35070
 601         encoding = preferredencoding()
 602     else:
 603         encoding = sys.getfilesystemencoding()
 604     if encoding is None:
 605         encoding = 'utf-8'
 606     return encoding
 607
 608
 609 def encodeFilename(s, for_subprocess=False):
 610     """
 611     @param s The name of the file
 612     """
 613
 614     assert type(s) == compat_str
 615
 616     # Python 3 has a Unicode API
 617     if sys.version_info >= (3, 0):
 618         return s
 619
 620     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 621     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 622     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 623     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 624         return s
 625
 626     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 627     if sys.platform.startswith('java'):
 628         return s
 629
 630     return s.encode(get_subprocess_encoding(), 'ignore')
 631
 632
 633 def decodeFilename(b, for_subprocess=False):
 634
 635     if sys.version_info >= (3, 0):
 636         return b
 637
 638     if not isinstance(b, bytes):
 639         return b
 640
 641     return b.decode(get_subprocess_encoding(), 'ignore')
 642
 643
 644 def encodeArgument(s):
 645     if not isinstance(s, compat_str):
 646         # Legacy code that uses byte strings
 647         # Uncomment the following line after fixing all post processors
 648         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 649         s = s.decode('ascii')
 650     return encodeFilename(s, True)
 651
 652
 653 def decodeArgument(b):
 654     return decodeFilename(b, True)
 655
 656
 657 def decodeOption(optval):
 658     if optval is None:
 659         return optval
 660     if isinstance(optval, bytes):
 661         optval = optval.decode(preferredencoding())
 662
 663     assert isinstance(optval, compat_str)
 664     return optval
 665
 666
 667 def formatSeconds(secs):
 668     if secs > 3600:
 669         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 670     elif secs > 60:
 671         return '%d:%02d' % (secs // 60, secs % 60)
 672     else:
 673         return '%d' % secs
 674
 675
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler suited to the running Python version.

    params -- options dictionary; 'nocheckcertificate' turns certificate
              verification off
    kwargs -- passed on to YoutubeDLHTTPSHandler
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname is switched off before verify_mode is lowered
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    # Reached when create_default_context is missing or context= was rejected
    if sys.version_info < (3, 2):
        # NOTE(review): no SSL context is passed on this path, so certificate
        # checking is whatever YoutubeDLHTTPSHandler does by default - confirm
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 699
 700
 701 def bug_reports_message():
 702     if ytdl_is_updateable():
 703         update_cmd = 'type  youtube-dl -U  to update'
 704     else:
 705         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 706     msg = '; please report this issue on https://yt-dl.org/bug .'
 707     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 708     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 709     return msg
 710
 711
 712 class YoutubeDLError(Exception):
 713     """Base exception for YoutubeDL errors."""
 714     pass
 715
 716
 717 class ExtractorError(YoutubeDLError):
 718     """Error during info extraction."""
 719
 720     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 721         """ tb, if given, is the original traceback (so that it can be printed out).
 722         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 723         """
 724
 725         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 726             expected = True
 727         if video_id is not None:
 728             msg = video_id + ': ' + msg
 729         if cause:
 730             msg += ' (caused by %r)' % cause
 731         if not expected:
 732             msg += bug_reports_message()
 733         super(ExtractorError, self).__init__(msg)
 734
 735         self.traceback = tb
 736         self.exc_info = sys.exc_info()  # preserve original exception
 737         self.cause = cause
 738         self.video_id = video_id
 739
 740     def format_traceback(self):
 741         if self.traceback is None:
 742             return None
 743         return ''.join(traceback.format_tb(self.traceback))
 744
 745
 746 class UnsupportedError(ExtractorError):
 747     def __init__(self, url):
 748         super(UnsupportedError, self).__init__(
 749             'Unsupported URL: %s' % url, expected=True)
 750         self.url = url
 751
 752
 753 class RegexNotFoundError(ExtractorError):
 754     """Error when a regex didn't match"""
 755     pass
 756
 757
 758 class GeoRestrictedError(ExtractorError):
 759     """Geographic restriction Error exception.
 760
 761     This exception may be thrown when a video is not available from your
 762     geographic location due to geographic restrictions imposed by a website.
 763     """
 764     def __init__(self, msg, countries=None):
 765         super(GeoRestrictedError, self).__init__(msg, expected=True)
 766         self.msg = msg
 767         self.countries = countries
 768
 769
 770 class DownloadError(YoutubeDLError):
 771     """Download Error exception.
 772
 773     This exception may be thrown by FileDownloader objects if they are not
 774     configured to continue on errors. They will contain the appropriate
 775     error message.
 776     """
 777
 778     def __init__(self, msg, exc_info=None):
 779         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 780         super(DownloadError, self).__init__(msg)
 781         self.exc_info = exc_info
 782
 783
 784 class SameFileError(YoutubeDLError):
 785     """Same File exception.
 786
 787     This exception will be thrown by FileDownloader objects if they detect
 788     multiple files would have to be downloaded to the same file on disk.
 789     """
 790     pass
 791
 792
 793 class PostProcessingError(YoutubeDLError):
 794     """Post Processing exception.
 795
 796     This exception may be raised by PostProcessor's .run() method to
 797     indicate an error in the postprocessing task.
 798     """
 799
 800     def __init__(self, msg):
 801         super(PostProcessingError, self).__init__(msg)
 802         self.msg = msg
 803
 804
 805 class MaxDownloadsReached(YoutubeDLError):
 806     """ --max-downloads limit has been reached. """
 807     pass
 808
 809
 810 class UnavailableVideoError(YoutubeDLError):
 811     """Unavailable Format exception.
 812
 813     This exception will be thrown when a video is requested
 814     in a format that is not available for that video.
 815     """
 816     pass
 817
 818
 819 class ContentTooShortError(YoutubeDLError):
 820     """Content Too Short exception.
 821
 822     This exception may be raised by FileDownloader objects when a file they
 823     download is too small for what the server announced first, indicating
 824     the connection was probably interrupted.
 825     """
 826
 827     def __init__(self, downloaded, expected):
 828         super(ContentTooShortError, self).__init__(
 829             'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
 830         )
 831         # Both in bytes
 832         self.downloaded = downloaded
 833         self.expected = expected
 834
 835
 836 class XAttrMetadataError(YoutubeDLError):
 837     def __init__(self, code=None, msg='Unknown error'):
 838         super(XAttrMetadataError, self).__init__(msg)
 839         self.code = code
 840         self.msg = msg
 841
 842         # Parsing code and msg
 843         if (self.code in (errno.ENOSPC, errno.EDQUOT) or
 844                 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
 845             self.reason = 'NO_SPACE'
 846         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
 847             self.reason = 'VALUE_TOO_LONG'
 848         else:
 849             self.reason = 'NOT_SUPPORTED'
 850
 851
 852 class XAttrUnavailableError(YoutubeDLError):
 853     pass
 854
 855
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and bind it to the configured source address.

    ydl_handler -- handler whose _params dictionary may hold 'source_address'
    http_class  -- connection class to instantiate with args/kwargs
    is_https    -- wrap the socket with SSL in the Python 2.6 code path

    Returns the connection object.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        # The key is spelled as a byte string, so it stays a native str on
        # Python 2 despite unicode_literals
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        # Port 0 is paired with the configured address
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # The connection object has no source_address attribute here, so
            # its connect() is replaced by one that passes sa explicitly
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            # Bind hc as self of the replacement connect()
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
 881
 882
 883 def handle_youtubedl_headers(headers):
 884     filtered_headers = headers
 885
 886     if 'Youtubedl-no-compression' in filtered_headers:
 887         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 888         del filtered_headers['Youtubedl-no-compression']
 889
 890     return filtered_headers
 891
 892
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Initialize the base handler and keep *params* as self._params.

        _create_http_connection reads 'source_address' from self._params.
        """
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open *req*, tunnelling through a SOCKS proxy when requested.

        A SOCKS proxy is requested through the internal 'Ytdl-socks-proxy'
        request header, which is removed before the request is sent.
        """
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        # The trailing False is the is_https flag of _create_http_connection
        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate-encoded *data*.

        A raw deflate stream (negative wbits) is tried first; on zlib.error
        the data is decompressed again with zlib's default settings.
        """
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status *code*.

        When addinfourl has no getcode attribute, the object is built
        without the code argument and .code is assigned afterwards.
        """
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Pre-process an outgoing request and return the request to send.

        Percent-encodes the URL if needed, adds the missing std_headers and
        applies the internal youtube-dl marker headers.
        """
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        # Handles "Youtubedl-no-compression": drops it together with Accept-Encoding
        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            # (the fragment is cut off urllib's name-mangled private attributes)
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Post-process a response and return the response to hand on.

        Transparently decompresses gzip and deflate bodies and
        percent-encodes the Location header of 3xx responses.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; if no attempt
                # succeeds, the error of the first attempt is re-raised
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            # Wrap the decompressed body, keeping headers, URL, code and msg
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    # On Python 2 the header value is stored as UTF-8 bytes
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets exactly the same request/response processing
    https_request = http_request
    https_response = http_response
1022
1023
def make_socks_conn_class(base_class, socks_proxy):
    """Create a connection class that connects through a SOCKS proxy.

    base_class must be HTTPConnection, HTTPSConnection or a subclass of
    either. socks_proxy is the proxy URL; its scheme selects the protocol
    ('socks5', 'socks4a', or 'socks'/'socks4' for SOCKS4), the port falls
    back to 1080, and username/password are URL-unquoted when present.

    Returns a subclass of base_class whose connect() goes through the proxy.
    Raises ValueError if the URL scheme is not one of the supported SOCKS
    schemes (previously this surfaced as an UnboundLocalError on socks_type).
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        raise ValueError(
            'Unsupported SOCKS proxy scheme: %r' % url_components.scheme)

    def unquote_if_non_empty(s):
        # Leave None / empty credentials untouched
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            # Only plain numeric timeouts are applied to the socket
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS the TLS layer is added on top of the proxied socket
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1065
1066
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that opens its connections via _create_http_connection.

    params is stored as self._params; https_conn_class optionally replaces
    the default HTTPSConnection class. A SOCKS proxy is requested with the
    internal 'Ytdl-socks-proxy' request header.
    """

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._params = params
        self._https_conn_class = (
            https_conn_class or compat_http_client.HTTPSConnection)

    def https_open(self, req):
        # Pass on only the TLS settings the base handler actually provides:
        # '_context' exists on python > 2.6, '_check_hostname' on python 3.x
        open_kwargs = {}
        for attr_name, kwarg_name in (
                ('_context', 'context'),
                ('_check_hostname', 'check_hostname')):
            if hasattr(self, attr_name):
                open_kwargs[kwarg_name] = getattr(self, attr_name)

        connection_class = self._https_conn_class
        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            connection_class = make_socks_conn_class(connection_class, proxy_url)
            # The marker header is internal and must not be sent out
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, connection_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
1090
1091
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also handles HTTPS requests and responses.

    The Set-Cookie percent-encoding workaround kept below is currently
    disabled, so responses are handed to the base class unchanged.
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Disabled workaround, kept for reference:
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        base_processor = compat_urllib_request.HTTPCookieProcessor
        return base_processor.http_response(self, request, response)

    # HTTPS goes through the same cookie handling as plain HTTP
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1114
1115
def extract_timezone(date_str):
    """Split a trailing UTC offset off a date string.

    Recognized suffixes are "Z" and "+HHMM" / "-HH:MM" style offsets
    (optionally preceded by one space), provided at least 8 characters
    precede them.

    Returns a (timezone, date_str) tuple: the offset as a timedelta (zero
    for "Z" or when nothing was recognized) and the string without the
    recognized suffix.
    """
    tz_match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not tz_match:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(tz_match.group('tz'))]
    sign_char = tz_match.group('sign')
    if not sign_char:
        # Plain "Z": UTC, no offset
        return datetime.timedelta(), remainder

    direction = -1 if sign_char == '-' else 1
    offset = datetime.timedelta(
        hours=direction * int(tz_match.group('hours')),
        minutes=direction * int(tz_match.group('minutes')))
    return offset, remainder
1132
1133
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    date_str is expected as YYYY-MM-DD<delimiter>HH:MM:SS; fractional
    seconds are discarded. timezone is a timedelta that gets subtracted
    from the parsed time; when it is None the offset is taken from the
    string via extract_timezone. Returns None if date_str is None or does
    not match the expected format.
    """
    if date_str is None:
        return None

    # strptime format below has no place for fractional seconds
    cleaned = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, cleaned = extract_timezone(cleaned)

    try:
        moment = datetime.datetime.strptime(
            cleaned, '%Y-%m-%d{0}%H:%M:%S'.format(delimiter))
        return calendar.timegm((moment - timezone).timetuple())
    except ValueError:
        return None
1151
1152
def date_formats(day_first=True):
    """Return the strptime format list for day-first or month-first dates."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1155
1156
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or cannot be parsed. day_first
    selects which format list from date_formats() is tried.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    # Drop a trailing numeric UTC offset; only the date matters here
    cleaned = extract_timezone(cleaned)[1]

    upload_date = None
    # All formats are tried; a later matching format overrides an earlier one
    for fmt in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if upload_date is None:
        return None
    return compat_str(upload_date)
1183
1184
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a loosely formatted date string.

    Returns None when date_str is None or cannot be parsed. day_first
    selects which format list from date_formats() is tried.
    """
    if date_str is None:
        return None

    cleaned = date_str.replace(',', ' ')

    # Any case-insensitive "PM" in the string shifts the result by 12 hours
    pm_hours = 12 if re.search(r'(?i)PM', cleaned) else 0
    timezone, cleaned = extract_timezone(cleaned)

    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    tz_suffix = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', cleaned)
    if tz_suffix:
        cleaned = cleaned[:-len(tz_suffix.group('tz'))]

    # The first format that parses wins
    for fmt in date_formats(day_first):
        try:
            moment = (
                datetime.datetime.strptime(cleaned, fmt)
                - timezone + datetime.timedelta(hours=pm_hours))
            return calendar.timegm(moment.timetuple())
        except ValueError:
            continue

    # Last resort: RFC 2822 style dates
    timetuple = email.utils.parsedate_tz(cleaned)
    if not timetuple:
        return None
    return calendar.timegm(timetuple) + pm_hours * 3600
1211
1212
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL.

    Takes what follows the last dot of the part before the first '?'.
    Returns default_ext when url is None or no plausible extension is found.
    """
    if url is None:
        return default_ext

    before_query = url.partition('?')[0]
    candidate = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    without_slashes = candidate.rstrip('/')
    if without_slashes in KNOWN_EXTENSIONS:
        return without_slashes
    return default_ext
1224
1225
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle filename: the part of filename before its last dot,
    followed by the language code and the subtitle format."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1228
1229
def date_from_str(date_str):
    """
    Return a datetime.date from a string in the format YYYYMMDD,
    'now', 'today', 'yesterday' or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months count as 30 days and years as 365 days.
    Raises ValueError for strings in none of these formats."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    relative = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()

    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount

    unit = relative.group('unit')
    # A bad approximation?
    approx_days = {'month': 30, 'year': 365}
    if unit in approx_days:
        amount *= approx_days[unit]
        unit = 'day'

    # timedelta expects the plural keyword ('days' / 'weeks')
    return today + datetime.timedelta(**{unit + 's': amount})
1257
1258
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings in any other format are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1267
1268
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str

        A missing bound defaults to the earliest / latest representable date.
        Raises ValueError if start lies after end."""
        if start is None:
            self.start = datetime.datetime.min.date()
        else:
            self.start = date_from_str(start)
        if end is None:
            self.end = datetime.datetime.max.date()
        else:
            self.end = date_from_str(end)
        if self.end < self.start:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(start=day, end=day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed as a date string
        if isinstance(date, datetime.date):
            candidate = date
        else:
            candidate = date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return ' - '.join((self.start.isoformat(), self.end.isoformat()))
1298
1299
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # A byte string result is decoded with the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1308
1309
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the Win32 WriteConsoleW call; it is only used
    when out is file descriptor 1 or 2 and the matching standard handle
    refers to a console. Raises OSError if WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 and -12 are STD_OUTPUT_HANDLE and STD_ERROR_HANDLE in the Win32 API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Prototype: HANDLE GetStdHandle(DWORD)
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    # Prototype: BOOL WriteConsoleW(HANDLE, LPWSTR, DWORD, DWORD*, LPVOID)
    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of UTF-16 code units written by each call
    written = ctypes.wintypes.DWORD(0)

    # NOTE(review): the handle argument is declared as DWORD here, while the
    # other prototypes use HANDLE - confirm this is intended
    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A console is a character device for which GetConsoleMode succeeds
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters that stay within the BMP;
    # a non-BMP character at the front is written on its own as 2 code units
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part was actually written
            s = s[written.value:]
    return True
1383
1384
def write_string(s, out=None, encoding=None):
    """Write the text s to the stream out (sys.stderr by default) and flush.

    On win32, when no encoding is given, the console API is tried first.
    Otherwise the text is encoded (dropping unencodable characters) for
    binary streams, on Python 2, and for streams exposing .buffer; any
    other stream receives the text as is.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    use_console_api = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if use_console_api and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream on top of a binary buffer: encode ourselves
        target_encoding = (
            encoding or getattr(out, 'encoding', None) or preferredencoding())
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1405
1406
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values.

    An empty or None input gives an empty list.
    """
    if not bs:
        return []
    # Python 3: indexing bytes already yields ints
    if isinstance(bs[0], int):
        return list(bs)
    # Otherwise the elements are one-character strings
    return list(map(ord, bs))
1414
1415
def intlist_to_bytes(xs):
    """Pack a sequence of integers into a byte string, one unsigned byte
    ('B' struct format) each; an empty input gives b''."""
    if xs:
        byte_format = '%dB' % len(xs)
        return compat_struct_pack(byte_format, *xs)
    return b''
1420
1421
# Cross-platform file locking
#
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx on win32, fcntl.flock elsewhere, and
# stubs raising IOError where fcntl is not available.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes counterpart of the Win32 OVERLAPPED structure
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low / high parts of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raise OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can pass the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 is LOCKFILE_EXCLUSIVE_LOCK in the Win32 API; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Unlock a file locked by _lock_file; raise OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock() lock on f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock() lock held on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            """Stub for platforms without fcntl: always raises IOError."""
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            """Stub for platforms without fcntl: always raises IOError."""
            raise IOError(UNSUPPORTED_MSG)
1495
1496
class locked_file(object):
    """File wrapper that holds a lock while used as a context manager.

    The file is opened right away in mode 'r', 'a' or 'w'. Entering the
    context takes a shared lock for 'r' and an exclusive one otherwise;
    leaving it unlocks and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the handle when the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # The file gets closed even if unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1526
1527
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1531
1532
def shell_quote(args):
    """Return the arguments as one string, each quoted for use in a shell
    and separated by single spaces.

    Byte string arguments are decoded with the filesystem encoding first.

    Quoting goes through compat_shlex_quote instead of pipes.quote: the
    pipes module is deprecated and no longer exists on Python 3.13+.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
1542
1543
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    The data is JSON-encoded into a '__youtubedl_smuggle' URL fragment.
    """
    url, already_smuggled = unsmuggle_url(url, {})
    # NOTE: updates the caller's dict in place; data that was already
    # smuggled in the URL takes precedence over the new values
    data.update(already_smuggled)
    fragment = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '#'.join((url, fragment))
1552
1553
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    URLs without smuggled data are returned unchanged, paired with default.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default

    # Everything after the last '#' is the urlencoded payload
    plain_url, payload = smug_url.rpartition('#')[::2]
    encoded_json = compat_parse_qs(payload)['__youtubedl_smuggle'][0]
    return plain_url, json.loads(encoded_json)
1561
1562
def format_bytes(bytes):
    """Format a byte count as a string with a binary unit suffix.

    bytes may be a number, a numeric string or None. Returns 'N/A' for
    None, otherwise the value with two decimals followed by the largest
    fitting suffix from 'B' to 'YiB' (e.g. '1.50KiB').

    Raises ValueError for negative values or non-numeric strings.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)

    suffixes = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB')
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    # Keep the exponent inside the suffix table: values below one byte would
    # otherwise index from the end of the table, and values of 1024 YiB and
    # above would run past it
    exponent = max(0, min(exponent, len(suffixes) - 1))

    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1575
1576
def lookup_unit_table(unit_table, s):
    """Parse a "<number><unit>" prefix of s using unit_table.

    unit_table maps unit names to multipliers. The number may use ',' or
    '.' as the decimal separator, and whitespace between number and unit
    is allowed. Returns the product as an int, or None if s does not start
    with a number followed by a known unit.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives
    found = re.match(pattern, s)
    if found is None:
        return None
    number = float(found.group('num').replace(',', '.'))
    multiplier = unit_table[found.group('unit')]
    return int(number * multiplier)
1586
1587
def parse_filesize(s):
    """Parse a human-readable file size such as '5 MiB' into a byte count.

    Returns None when s is None or cannot be parsed by lookup_unit_table.
    """
    if s is None:
        return None

    # (letter, decimal unit name, binary unit name), in order of magnitude
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal = 1000 ** power
        binary = 1024 ** power
        lower = letter.lower()
        unit_table[letter + 'iB'] = binary
        unit_table[letter + 'B'] = decimal
        # Lower-case prefix with upper-case B counts as binary
        unit_table[lower + 'B'] = binary
        unit_table[letter + 'b'] = decimal
        unit_table[lower + 'b'] = decimal
        unit_table[decimal_name + 'bytes'] = decimal
        unit_table[binary_name + 'bytes'] = binary

    return lookup_unit_table(unit_table, s)
1657
1658
def parse_count(s):
    """Parse a count such as '1,234' or '1.5M' into an int.

    Returns None when s is None or cannot be parsed.
    """
    if s is None:
        return None

    text = s.strip()

    # Digits with separators only: no unit to interpret
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }
    return lookup_unit_table(multipliers, text)
1678
1679
def month_by_name(name, lang='en'):
    """ Return the number of a month by its (locale-independent) name

    Month names are looked up in MONTH_NAMES for lang, falling back to the
    English list for unknown languages. Returns None for unknown names. """
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name not in month_names:
        return None
    return month_names.index(name) + 1
1689
1690
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, i.e. the first three letters of the English month
        name; None if there is no such month """
    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
1699
1700
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the predefined entities or a
    numeric character reference of up to 4 digits are left alone."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1707
1708
def setproctitle(title):
    """Try to set the process title through libc's prctl (best effort).

    title must be a compat_str. The call silently does nothing on Jython,
    when libc.so.6 cannot be loaded, or when libc offers no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    # Initializing the buffer from the bytes makes it one byte longer than
    # the title, so it is always NUL-terminated. A buffer of exactly
    # len(title_bytes) has no room for the terminator, and prctl would read
    # past its end for short titles.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1733
1734
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged when it
    is None or does not begin with start."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1737
1738
def remove_end(s, end):
    """Return s without the suffix end; s is returned unchanged when it is
    None or does not finish with end.

    The cut position is computed from the front of the string so that an
    empty end leaves s intact (s[:-0] would yield an empty string).
    """
    if s is None or not s.endswith(end):
        return s
    return s[:len(s) - len(end)]
1741
1742
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s.

    None, strings shorter than 2 characters and strings without a matching
    pair of quotes are returned unchanged.
    """
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1750
1751
def url_basename(url):
    """ Return the last segment of the path component of url """
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1755
1756
def base_url(url):
    """ Return the http(s) URL up to and including the last '/' that comes
        before any '?', '#' or '&'. Raises AttributeError if url does not
        match that shape """
    mobj = re.match(r'https?://[^?#&]+/', url)
    return mobj.group()
1759
1760
def urljoin(base, path):
    """
    Join path onto base, tolerating bad input.

    Byte strings are decoded as UTF-8. Returns None when path is empty or
    not a string, or when path is relative and base is not an http(s) or
    protocol-relative URL. A path that is already such a URL is returned
    as is.
    """
    absolute_url_re = r'^(?:https?:)?//'

    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not (isinstance(path, compat_str) and path):
        return None
    if re.match(absolute_url_re, path):
        return path

    if isinstance(base, bytes):
        base = base.decode('utf-8')
    if not (isinstance(base, compat_str) and re.match(absolute_url_re, base)):
        return None
    return compat_urlparse.urljoin(base, path)
1774
1775
class HEADRequest(compat_urllib_request.Request):
    """ Request whose HTTP method is always HEAD """

    def get_method(self):
        return 'HEAD'
1779
1780
class PUTRequest(compat_urllib_request.Request):
    """ Request whose HTTP method is always PUT """

    def get_method(self):
        return 'PUT'
1784
1785
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """
    Convert v to an int, returning default when that is not possible.

    @param v         value to convert (or, with get_attr, the object holding it)
    @param scale     the result is floor-divided by this value
    @param default   returned when v is None, '' or not convertible
    @param get_attr  if set, the attribute of v to convert instead of v itself
    @param invscale  the result is multiplied by this value before scaling
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        # int() raises TypeError for unsupported types (lists, dicts, ...)
        # and OverflowError for infinities; neither is a usable number.
        return default
1798
1799
def str_or_none(v, default=None):
    """ Return v converted to a text string, or default when v is None """
    if v is None:
        return default
    return compat_str(v)
1802
1803
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: ',', '.' and '+' characters
        are dropped before the conversion. None is passed through """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1810
1811
def float_or_none(v, scale=1, invscale=1, default=None):
    """
    Convert v to a float, returning default when that is not possible.

    @param v         value to convert
    @param scale     the result is divided by this value
    @param invscale  the result is multiplied by this value before scaling
    @param default   returned when v is None or not convertible
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # float() raises TypeError for unsupported types (lists, dicts, ...)
        return default
1819
1820
def strip_or_none(v):
    """ Return v.strip(), or None when v is None """
    if v is None:
        return None
    return v.strip()
1823
1824
def parse_duration(s):
    """
    Parse a textual duration into a number of seconds.

    Three notations are tried in order: colon-separated
    ([[[days:]hours:]mins:]secs[.ms]), unit-suffixed ("1d 2h 3m 4.5s", with an
    optional PT/T prefix), and a single fractional "N hours" / "N minutes"
    value. Returns None for non-string input or text matching none of them.
    """
    if not isinstance(s, compat_basestring):
        return None

    text = s.strip()

    # Match objects are always truthy, so `or` falls through to the next
    # notation only when the previous one did not match.
    mobj = (
        re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', text) or
        re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', text) or
        re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', text))
    if mobj is None:
        return None

    captured = mobj.groupdict()

    # Each unit is scaled by its factors one at a time, in this order, so the
    # floating point result matches a plain chain of multiplications.
    unit_factors = (
        ('secs', ()),
        ('mins', (60,)),
        ('hours', (60, 60)),
        ('days', (24, 60, 60)),
        ('ms', ()),
    )
    duration = 0
    for unit, factors in unit_factors:
        raw = captured.get(unit)
        if not raw:
            continue
        amount = float(raw)
        for factor in factors:
            amount *= factor
        duration += amount
    return duration
1871
1872
def prepend_extension(filename, ext, expected_real_ext=None):
    """ Insert ext in front of filename's real extension. If expected_real_ext
        is given and differs from the real extension, ext is appended to the
        whole filename instead """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1879
1880
def replace_extension(filename, ext, expected_real_ext=None):
    """ Replace filename's real extension with ext. If expected_real_ext is
        given and differs from the real extension, ext is appended to the
        whole filename instead """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
1886
1887
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False when the binary cannot be started """
    try:
        process = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1896
1897
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.
    The version is extracted from the combined stdout/stderr output
    by detect_exe_version """
    command = [encodeArgument(exe)] + args
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        process = subprocess.Popen(
            command,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = process.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1915
1916
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Return the first group of version_re found in output, or unrecognized
        when nothing matches. version_re defaults to a pattern matching
        "version <something>" """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    mobj = re.search(pattern, output)
    if not mobj:
        return unrecognized
    return mobj.group(1)
1926
1927
class PagedList(object):
    """ Base class of the paged lists; subclasses provide getslice() """

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1932
1933
class OnDemandPagedList(PagedList):
    """ Paged list that calls pagefunc(pagenum) page by page while a slice is
        being built, optionally remembering the pages already fetched """

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """ Return the entries with indices in [start, end) as a list;
            end=None means up to the last available entry """
        size = self._pagesize
        collected = []
        for page_index in itertools.count(start // size):
            page_begin = page_index * size
            page_end = page_begin + size
            if page_end <= start:
                continue

            if self._use_cache:
                entries = self._cache.get(page_index)
                if entries is None:
                    entries = list(self._pagefunc(page_index))
                self._cache[page_index] = entries
            else:
                entries = list(self._pagefunc(page_index))

            # Entries to drop at the front; only non-zero on the page that
            # contains start
            if page_begin <= start < page_end:
                skip = start % size
            else:
                skip = 0

            # Position within this page where the slice stops, if it stops here
            if end is not None and page_begin <= end <= page_end:
                cutoff = ((end - 1) % size) + 1
            else:
                cutoff = None

            if skip != 0 or cutoff is not None:
                entries = entries[skip:cutoff]
            collected.extend(entries)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(entries) + skip < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == page_end:
                break
        return collected
1984
1985
class InAdvancePagedList(PagedList):
    """ Paged list for which the number of pages is known up front """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with indices in [start, end) as a list;
            end=None means every page up to pagecount """
        size = self._pagesize
        first_page = start // size
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // size + 1
            remaining = end - start
        # Entries to drop from the first fetched page
        to_skip = start - first_page * size

        collected = []
        for page_index in range(first_page, stop_page):
            entries = list(self._pagefunc(page_index))
            if to_skip:
                entries = entries[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(entries) >= remaining:
                    # This page completes the requested slice
                    collected.extend(entries[:remaining])
                    break
                remaining -= len(entries)
            collected.extend(entries)
        return collected
2013
2014
def uppercase_escape(s):
    """ Replace every \\UXXXXXXXX escape sequence in s by the character it
        denotes """
    decode_escape = codecs.getdecoder('unicode_escape')

    def _expand(mobj):
        return decode_escape(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
2021
2022
def lowercase_escape(s):
    """ Replace every \\uXXXX escape sequence in s by the character it
        denotes """
    decode_escape = codecs.getdecoder('unicode_escape')

    def _expand(mobj):
        return decode_escape(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _expand, s)
2029
2030
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    on_python2_text = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if on_python2_text:
        # quote() on Python 2 is fed the UTF-8 encoded bytes
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2036
2037
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host name is IDNA-encoded, every other component percent-escaped
    netloc = parts.netloc.encode('idna').decode('ascii')
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        netloc=netloc, path=path, params=params,
        query=query, fragment=fragment)
    return escaped.geturl()
2048
2049
def read_batch_urls(batch_fd):
    """
    Read the URLs listed in a batch file.

    batch_fd is an iterable of lines (text or UTF-8 bytes); it is closed
    before returning. Returns the list of non-empty lines, stripped, without
    those starting with '#', ';' or ']' and without a leading byte order mark.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM shows up as U+FEFF once the line has been decoded as
        # UTF-8; the three-character form only appears when the raw BOM bytes
        # were decoded one byte per character. Handle both.
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2064
2065
def urlencode_postdata(*args, **kargs):
    """ URL-encode the given arguments and return the result as ASCII bytes """
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2068
2069
def update_url_query(url, query):
    """ Return url with the parameters of the query dict merged into its
        query string; url is returned unchanged when query is empty """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    params.update(query)
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2078
2079
def update_Request(req, url=None, data=None, headers={}, query={}):
    """
    Build a new request from req with some of its parts replaced.

    @param url      replacement URL (defaults to the URL of req)
    @param data     replacement body (defaults to the body of req)
    @param headers  headers added on top of a copy of those of req
    @param query    query parameters merged into the URL
    HEAD and PUT requests keep their method; a timeout attribute on req
    is carried over.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    body = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)

    method = req.get_method()
    if method == 'HEAD':
        request_class = HEADRequest
    elif method == 'PUT':
        request_class = PUTRequest
    else:
        request_class = compat_urllib_request.Request

    new_req = request_class(
        target_url, data=body, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2098
2099
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """ Look up a key in d. When key_or_keys is a list or tuple, return the
        value of the first key that is present and not None (nor falsy, if
        skip_false_values is set), or default when there is none """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
2108
2109
def try_get(src, getter, expected_type=None):
    """ Apply getter (a callable, or a list/tuple of callables tried in
        order) to src and return the first result obtained without an
        AttributeError, KeyError, TypeError or IndexError that is of
        expected_type (if given). Returns None when no getter succeeds """
    getters = getter if isinstance(getter, (list, tuple)) else [getter]
    for candidate in getters:
        try:
            value = candidate(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
    return None
2121
2122
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """ Return string as a text string, decoding it with encoding/errors
        when it is not one already """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2125
2126
# US rating labels and the age limit parse_age_limit() reports for each
US_RATINGS = dict((
    ('G', 0),
    ('PG', 10),
    ('PG-13', 13),
    ('R', 16),
    ('NC', 18),
))
2134
2135
# TV Parental Guidelines labels and the age limit parse_age_limit() reports
# for each
TV_PARENTAL_GUIDELINES = dict((
    ('TV-Y', 0),
    ('TV-Y7', 7),
    ('TV-G', 0),
    ('TV-PG', 0),
    ('TV-14', 14),
    ('TV-MA', 17),
))
2144
2145
def parse_age_limit(s):
    """ Turn an age limit into a number of years. Accepts an int between 0
        and 21, a string such as "18" or "18+", or a label found in
        US_RATINGS or TV_PARENTAL_GUIDELINES. Returns None otherwise """
    if type(s) is int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    for ratings in (US_RATINGS, TV_PARENTAL_GUIDELINES):
        if s in ratings:
            return ratings[s]
    return None
2157
2158
def strip_jsonp(code):
    """ Remove a JSONP wrapper "callback(...);" (optionally followed by //
        comments) and return what is between the parentheses; code that
        has no such wrapper is returned unchanged """
    jsonp_re = re.compile(
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$')
    return jsonp_re.sub(r'\1', code)
2162
2163
def js_to_json(code):
    """ Rewrite a JavaScript object literal as JSON: comments and trailing
        commas are removed, strings and bare identifiers become double-quoted
        strings, hexadecimal and octal integers become decimal """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )
    # Escape sequences that differ between a JavaScript string body and a
    # double-quoted JSON string; anything else is kept as is
    STRING_ESCAPES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }

    def fix_escape(mobj):
        sequence = mobj.group(0)
        return STRING_ESCAPES.get(sequence, sequence)

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null'):
            return token
        if token.startswith('/*') or token.startswith('//') or token == ',':
            # Comments and trailing commas are dropped
            return ""

        if token[0] in ("'", '"'):
            token = re.sub(r'(?s)\\.|"', fix_escape, token[1:-1])

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, token)
            if not im:
                continue
            number = int(im.group(1), base)
            if token.endswith(':'):
                # An integer used as object key becomes a string key
                return '"%d":' % number
            return '%d' % number

        return '"%s"' % token

    token_re = r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)
        '''.format(comment=COMMENT_RE, skip=SKIP_RE)
    return re.sub(token_re, fix_kv, code)
2203
2204
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values: the
        returned function maps an id to its position in quality_ids, or to
        -1 when the id is not listed """
    def q(qid):
        if qid not in quality_ids:
            return -1
        return quality_ids.index(qid)
    return q
2213
2214
# Template with title, id and ext fields in %-style mapping syntax
# (presumably the fallback output filename template; confirm at use sites)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2216
2217
def limit_length(s, length):
    """ Add ellipses to overly long strings: s is cut so that the result,
        '...' included, is length characters long. None is passed through """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
2226
2227
def version_tuple(v):
    """ Split a version string on '-' and '.' and return its components as a
        tuple of ints. Raises ValueError for non-numeric components """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2230
2231
def is_outdated_version(version, limit, assume_new=True):
    """ Return whether version is older than limit. When version is empty or
        either version cannot be parsed, the answer is `not assume_new` """
    unknown_answer = not assume_new
    if not version:
        return unknown_answer
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return unknown_answer
    return current < minimum
2239
2240
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U, i.e. whether this
        module was loaded by a zipimporter or sys has a 'frozen' attribute """
    from zipimport import zipimporter

    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return loaded_from_zip or hasattr(sys, 'frozen')
2246
2247
def args_to_str(args):
    """ Get a short string representation for a subprocess command: the
        shell-quoted arguments joined by spaces """
    quoted_args = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted_args)
2251
2252
def error_to_compat_str(err):
    """ Return the message of err as a text string """
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2260
2261
def mimetype2ext(mt):
    """ Guess a file extension from a MIME type. A few complete types are
        mapped directly; otherwise the lower-cased subtype (parameters
        removed) is translated through a table and returned as is when the
        table has no entry. None is passed through """
    if mt is None:
        return None

    full_type_map = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }
    ext = full_type_map.get(mt)
    if ext is not None:
        return ext

    subtype_map = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
    }
    subtype = mt.rpartition('/')[2]
    subtype = subtype.partition(';')[0].strip().lower()
    return subtype_map.get(subtype, subtype)
2296
2297
def parse_codecs(codecs_str):
    """
    Split a "codecs" parameter value into video and audio codec.

    Returns a dict with 'vcodec' and 'acodec' keys ('none' standing for a
    missing stream), or an empty dict when codecs_str is empty or nothing
    can be deduced from it. Codecs that are not recognised are reported with
    a warning on stderr.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    splited_codecs = [
        codec.strip() for codec in codecs_str.strip().strip(',').split(',')]
    splited_codecs = [codec for codec in splited_codecs if codec]
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    if len(splited_codecs) == 2:
        # Nothing was recognised: assume the usual "video, audio" order.
        # vcodec and acodec are both None here, so the codec names have to
        # come from the split list.
        return {
            'vcodec': splited_codecs[0],
            'acodec': splited_codecs[1],
        }
    # A single unrecognised codec cannot be classified as audio or video
    return {}
2332
2333
def urlhandle_detect_ext(url_handle):
    """ Guess the file extension of a response: from the file name in its
        Content-Disposition header when there is one with an extension,
        from its Content-Type header otherwise """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
2346
2347
def encode_data_uri(data, mime_type):
    """ Return a base64 "data:" URI holding the bytes data, labelled with
        mime_type """
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2350
2351
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked, i.e. both limits are
        set and age_limit is below content_limit """

    # No limit set by the user, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2360
2361
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes:
        after decoding (honouring a byte order mark, UTF-8 otherwise) the
        first non-blank character must be '<'. Returns a match object or
        None """

    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    encoding, bom_length = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, bom_length = enc, len(bom)
            break
    text = first_bytes[bom_length:].decode(encoding, 'replace')

    return re.match(r'^\s*<', text)
2380
2381
def determine_protocol(info_dict):
    """ Return the 'protocol' entry of info_dict when it is set; otherwise
        deduce the protocol from info_dict['url'] (its rtmp/mms/rtsp prefix,
        its m3u8/f4m extension, or its URL scheme) """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    return compat_urllib_parse_urlparse(url).scheme
2402
2403
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values: columns are
        left-aligned and padded to their widest cell, the last column is
        not padded """
    table = [header_row] + data
    max_lens = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*table)]
    padded_cells = ['%-' + compat_str(width + 1) + 's' for width in max_lens[:-1]]
    format_str = ' '.join(padded_cells) + '%s'
    rendered_rows = [format_str % tuple(row) for row in table]
    return '\n'.join(rendered_rows)
2410
2411
2412 def _match_one(filter_part, dct):
2413     COMPARISON_OPERATORS = {
2414         '<': operator.lt,
2415         '<=': operator.le,
2416         '>': operator.gt,
2417         '>=': operator.ge,
2418         '=': operator.eq,
2419         '!=': operator.ne,
2420     }
2421     operator_rex = re.compile(r'''(?x)\s*
2422         (?P<key>[a-z_]+)
2423         \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2424         (?:
2425             (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2426             (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
2427             (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2428         )
2429         \s*$
2430         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2431     m = operator_rex.search(filter_part)
2432     if m:
2433         op = COMPARISON_OPERATORS[m.group('op')]
2434         actual_value = dct.get(m.group('key'))
2435         if (m.group('quotedstrval') is not None or
2436             m.group('strval') is not None or
2437             # If the original field is a string and matching comparisonvalue is
2438             # a number we should respect the origin of the original field
2439             # and process comparison value as a string (see
2440             # https://github.com/rg3/youtube-dl/issues/11082).
2441             actual_value is not None and m.group('intval') is not None and
2442                 isinstance(actual_value, compat_str)):
2443             if m.group('op') not in ('=', '!='):
2444                 raise ValueError(
2445                     'Operator %s does not support string values!' % m.group('op'))
2446             comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
2447             quote = m.group('quote')
2448             if quote is not None:
2449                 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
2450         else:
2451             try:
2452                 comparison_value = int(m.group('intval'))
2453             except ValueError:
2454                 comparison_value = parse_filesize(m.group('intval'))
2455                 if comparison_value is None:
2456                     comparison_value = parse_filesize(m.group('intval') + 'B')
2457                 if comparison_value is None:
2458                     raise ValueError(
2459                         'Invalid integer value %r in filter part %r' % (
2460                             m.group('intval'), filter_part))
2461         if actual_value is None:
2462             return m.group('none_inclusive')
2463         return op(actual_value, comparison_value)
2464
2465     UNARY_OPERATORS = {
2466         '': lambda v: v is not None,
2467         '!': lambda v: v is None,
2468     }
2469     operator_rex = re.compile(r'''(?x)\s*
2470         (?P<op>%s)\s*(?P<key>[a-z_]+)
2471         \s*$
2472         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2473     m = operator_rex.search(filter_part)
2474     if m:
2475         op = UNARY_OPERATORS[m.group('op')]
2476         actual_value = dct.get(m.group('key'))
2477         return op(actual_value)
2478
2479     raise ValueError('Invalid filter part %r' % filter_part)
2480
2481
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax: filter_str is a list
        of '&'-separated conditions that must all hold. Returns True
        (=passes filter) or false """

    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2487
2488
def match_filter_func(filter_str):
    """ Build a function that takes an info dict and returns None when it
        passes filter_str, or a message explaining that it is skipped """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2497
2498
def parse_dfxp_time_expr(time_expr):
    """ Convert a DFXP time expression to seconds. Understands an offset in
        seconds ("12.5" or "12.5s") and a clock time ("HH:MM:SS", with a
        fraction introduced by '.' or ':'). Returns None for empty or
        unrecognised input """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
2510
2511
def srt_subtitles_timecode(seconds):
    """ Format a number of seconds as an SRT timecode, HH:MM:SS,mmm """
    # Work on whole milliseconds, rounded to the nearest one. Formatting the
    # float fraction with %d truncates it, so that e.g. 4.6 (stored as
    # 4.5999...) would come out as 00:00:04,599.
    total_ms = int(round(seconds * 1000))
    total_secs, millis = divmod(total_ms, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, mins = divmod(total_mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millis)
2514
2515
def dfxp2srt(dfxp_data):
    """
    Convert a DFXP/TTML subtitle document to SRT.

    dfxp_data is the document as a text string. Legacy TTML namespaces are
    mapped to the current ones before parsing. Every <p> element that has a
    begin time and either an end time or a duration becomes one SRT cue;
    the supported styling properties are rendered as <b>, <i>, <u> and
    <font> tags.

    Returns the SRT document as a string. Raises ValueError when the
    document contains no <p> element.
    """
    LEGACY_NAMESPACES = (
        ('http://www.w3.org/ns/ttml', [
            'http://www.w3.org/2004/11/ttaf1',
            'http://www.w3.org/2006/04/ttaf1',
            'http://www.w3.org/2006/10/ttaf1',
        ]),
        ('http://www.w3.org/ns/ttml#styling', [
            'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    # Style id -> resolved styling properties, and the properties inherited
    # from <body>/<div>; both are filled in below, before any paragraph is
    # rendered
    styles = {}
    default_style = {}

    class TTMLPElementParser(object):
        """ XMLParser target that renders one <p> element as SRT text """

        def __init__(self):
            # The state is kept per instance: with class-level lists every
            # parser would share the same stacks, and whatever one paragraph
            # leaves behind would change how the next ones are rendered.
            self._out = ''
            # One (tags to close, style pushed?) entry per open element
            self._open_elements = []
            # Effective style of every open element that carries a style
            self._applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
                return

            unclosed_elements = []
            style = {}
            element_style_id = attrib.get('style')
            if default_style:
                style.update(default_style)
            if element_style_id:
                style.update(styles.get(element_style_id, {}))
            for prop in SUPPORTED_STYLING:
                prop_val = attrib.get(_x('tts:' + prop))
                if prop_val:
                    style[prop] = prop_val
            if style:
                font = ''
                for k, v in sorted(style.items()):
                    # Already in effect through an enclosing element
                    if self._applied_styles and self._applied_styles[-1].get(k) == v:
                        continue
                    if k == 'color':
                        font += ' color="%s"' % v
                    elif k == 'fontSize':
                        font += ' size="%s"' % v
                    elif k == 'fontFamily':
                        font += ' face="%s"' % v
                    elif k == 'fontWeight' and v == 'bold':
                        self._out += '<b>'
                        unclosed_elements.append('b')
                    elif k == 'fontStyle' and v == 'italic':
                        self._out += '<i>'
                        unclosed_elements.append('i')
                    elif k == 'textDecoration' and v == 'underline':
                        self._out += '<u>'
                        unclosed_elements.append('u')
                if font:
                    self._out += '<font' + font + '>'
                    unclosed_elements.append('font')
                applied_style = {}
                if self._applied_styles:
                    applied_style.update(self._applied_styles[-1])
                applied_style.update(style)
                self._applied_styles.append(applied_style)
            self._open_elements.append((unclosed_elements, bool(style)))

        def end(self, tag):
            if tag in (_x('ttml:br'), 'br'):
                return
            unclosed_elements, style_pushed = self._open_elements.pop()
            for element in reversed(unclosed_elements):
                self._out += '</%s>' % element
            # Pop exactly what start() pushed for this element, even when the
            # style did not produce any tag
            if style_pushed:
                self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance. A style can only be resolved once its parent
    # is, so the list is walked repeatedly until nothing is left unresolved.
    style_elements = dfxp.findall(_x('.//ttml:style'))
    while True:
        unresolved = False
        resolved_before = len(styles)
        for style in style_elements:
            style_id = style.get('id')
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    unresolved = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        # A pass that resolves nothing new cannot be followed by a more
        # successful one (the parent is missing, has no supported property,
        # or the references are cyclic): stop instead of looping forever.
        if not unresolved or len(styles) == resolved_before:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2671
2672
def cli_option(params, command_option, param):
    """Build the argv fragment for an option that takes a value.

    Looks up ``param`` in ``params``.  Returns an empty list when the value
    is None (or absent); otherwise returns ``[command_option, value]``, where
    a truthy value is converted with compat_str and a falsy non-None value
    (e.g. 0 or '') is passed through unchanged.
    """
    value = params.get(param)
    if value is None:
        return []
    if value:
        value = compat_str(value)
    return [command_option, value]
2678
2679
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the argv fragment for an option that takes a boolean value.

    Looks up ``param`` in ``params`` and renders it as ``true_value`` or
    ``false_value``.

    Returns:
        [] when the option is not set (value is None or the key is absent),
        mirroring how cli_option and cli_valueless_option treat missing
        options;
        [command_option + separator + value] when ``separator`` is truthy;
        [command_option, value] otherwise.

    A value that is set but is not a bool still trips the assertion.
    """
    param = params.get(param)
    if param is None:
        # An unset option must simply not be emitted instead of crashing
        return []
    assert isinstance(param, bool)
    rendered = true_value if param else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
2686
2687
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Build the argv fragment for a flag that takes no value.

    Returns ``[command_option]`` when ``params[param]`` equals
    ``expected_value`` and an empty list otherwise (including when the key
    is absent, unless ``expected_value`` is None).
    """
    if params.get(param) == expected_value:
        return [command_option]
    return []
2691
2692
def cli_configuration_args(params, param, default=None):
    """Return the list of extra command line arguments stored under ``param``.

    Arguments:
        params:  mapping of option names to values
        param:   key whose value is expected to be a list of arguments
        default: value returned when the option is unset; when omitted (or
                 None) a new empty list is returned

    Returns the stored list itself when the option is set, otherwise the
    fallback described above.  A set value that is not a list trips the
    assertion.
    """
    ex_args = params.get(param)
    if ex_args is None:
        # Hand out a fresh list on every call: a shared mutable default
        # would leak one caller's in-place modifications into the next call
        return [] if default is None else default
    assert isinstance(ex_args, list)
    return ex_args
2699
2700
class ISO639Utils(object):
    """Conversions between ISO 639-1 (two-letter) and ISO 639-2/T
    (three-letter) language codes, backed by a static lookup table."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Maps ISO 639-1 codes (lower case) to ISO 639-2/T codes
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are considered, so tags with
        a region suffix such as 'en-US' are accepted.  The lookup is case
        insensitive ('EN' and 'en' are equivalent).  Returns None for
        unknown codes.
        """
        # The table keys are lower case; normalise so that upper or mixed
        # case tags are resolved as well
        return cls._lang_map.get(code[:2].lower())

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        The code must match a table value exactly (lower case).  Returns
        None when there is no two-letter equivalent in the table.
        """
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
        return None
2901
2902
class ISO3166Utils(object):
    """Lookup of full country names by two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    # Keys are upper case two-letter (ISO 3166-1 alpha-2) country codes
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter (ISO 3166-1 alpha-2) country code to the
        corresponding full name

        The lookup is case insensitive.  Returns None for unknown codes.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
3161
3162
class GeoUtils(object):
    """Helpers for picking an IPv4 address that belongs to a given country."""

    # Major IPv4 address blocks per country
    # (upper case two-letter code -> block in CIDR notation)
    _country_ip_map = {
        'AD': '85.94.160.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '159.117.192.0/21',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '84.112.0.0/13',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AZ': '5.191.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '129.45.128.0/17',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '192.131.134.0/24',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '152.240.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '196.32.200.0/21',
        'CG': '197.214.128.0/17',
        'CH': '85.0.0.0/13',
        'CI': '154.232.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '165.210.0.0/15',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '46.198.0.0/15',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '45.208.0.0/14',
        'GI': '85.115.128.0/19',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '126.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.32.32.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '192.147.231.0/24',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '41.86.0.0/19',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '105.234.0.0/16',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '139.26.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '178.220.0.0/14',
        'RU': '5.136.0.0/13',
        'RW': '105.178.0.0/15',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '154.96.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '152.56.0.0/14',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '197.215.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '197.220.64.0/19',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '93.72.0.0/13',
        'UG': '154.224.0.0/13',
        'US': '3.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '82.215.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '24.92.144.0/20',
        'VE': '186.88.0.0/13',
        'VG': '172.103.64.0/18',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '165.56.0.0/13',
        'ZW': '41.85.192.0/19',
    }

    @classmethod
    def random_ipv4(cls, code):
        """Return a random dotted-quad IPv4 address from the block listed
        for the given two-letter country code (case insensitive), or None
        when the code has no entry in the table"""
        cidr = cls._country_ip_map.get(code.upper())
        if not cidr:
            return None
        network, prefix_len = cidr.split('/')
        lowest = compat_struct_unpack('!L', socket.inet_aton(network))[0]
        # Setting every host bit yields the last address of the block
        host_mask = 0xffffffff >> int(prefix_len)
        highest = lowest | host_mask
        chosen = random.randint(lowest, highest)
        return compat_str(socket.inet_ntoa(compat_struct_pack('!L', chosen)))
3415
3416
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets each request choose its own proxy through the
    'Ytdl-request-proxy' header, and that hands SOCKS proxies over to the
    http/https handlers through the 'Ytdl-socks-proxy' header."""

    def __init__(self, proxies=None):
        # Install default <scheme>_open handlers so that proxy_open() is
        # consulted even when no proxy is configured for the scheme; the
        # base class constructor then overrides them for configured proxies
        for scheme in ('http', 'https'):
            default_opener = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, default_opener)
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy header takes precedence over the handler-wide
        # proxy and is removed from the request once it has been read
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            proxy = per_request_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # youtube-dl's http/https handlers take care of wrapping the
            # socket with SOCKS; just tell them which proxy to use
            req.add_header('Ytdl-socks-proxy', proxy)
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
3440
3441
3442 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3443 # released into Public Domain
3444 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3445
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    Zero (and any non-positive value) is encoded as a single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    # Emit the number as 32-bit big-endian words, least significant first
    words = []
    while n > 0:
        words.append(compat_struct_pack('>I', n & 0xffffffff))
        n = n >> 32
    # The most significant word may carry leading zero bytes; drop them, but
    # keep one zero byte when nothing else is left
    s = b''.join(reversed(words)).lstrip(b'\000')
    if not s:
        s = b'\000'
    # Left-pad with zeros up to a multiple of blocksize
    if blocksize > 0:
        overhang = len(s) % blocksize
        if overhang:
            s = (blocksize - overhang) * b'\000' + s
    return s
3474
3475
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().  An empty string
    yields 0.
    """
    # Left-pad with zeros so that the input splits into whole 32-bit words
    overhang = len(s) % 4
    if overhang:
        s = b'\000' * (4 - overhang) + s
    acc = 0
    for offset in range(0, len(s), 4):
        word = compat_struct_unpack('>I', s[offset:offset + 4])[0]
        acc = (acc << 32) + word
    return acc
3491
3492
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The payload is the input read as a little-endian integer, i.e. the
    # reversed bytes interpreted as a hexadecimal number
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    return '%x' % pow(payload, exponent, modulus)
3508
3509
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme (v1.5, block type 2)

    The padded block is laid out as [0, 2] + PS + [0] + data, where PS is a
    run of random non-zero octets that fills the block up to length.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data is longer than length - 11, i.e. does
                               not leave room for at least 8 padding octets
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # Padding octets must be non-zero: the first zero octet after the
    # [0, 2] header marks the start of the data, so a zero inside the
    # padding would make the receiver cut the message at the wrong place
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
3523
3524
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer num as a string in base n

    @param num     non-negative integer to encode
    @param n       base, at least 2 and at most the length of the table
    @param table   optional sequence of digit characters; by default the
                   first n characters of 0-9, a-z, A-Z (so at most base 62)
    @returns       string of digits, most significant digit first
    @raises ValueError   if n is smaller than 2 or exceeds the table length,
                         or if num is negative
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

    # Neither a base below 2 nor a negative number ever reaches zero under
    # repeated floor division, so reject them instead of looping forever
    if n < 2:
        raise ValueError('base %d is not supported, must be at least 2' % n)
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)

    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Digits come out least significant first
    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
3541
3542
def decode_packed_codes(code):
    """Expand code matched by PACKED_CODES_RE: every word in the obfuscated
    payload is replaced by the symbol stored at the index that the word
    denotes in the given base."""
    packed = re.search(PACKED_CODES_RE, code)
    payload, base, count, symbols = packed.groups()
    base = int(base)
    remaining = int(count)
    symbols = symbols.split('|')

    # Map each base-n encoded index to its symbol; an empty symbol means
    # that the word stands for itself
    symbol_table = {}
    while remaining:
        remaining -= 1
        word = encode_base_n(remaining, base)
        symbol_table[word] = symbols[remaining] or word

    def substitute(match):
        return symbol_table[match.group(0)]

    return re.sub(r'\b(\w+)\b', substitute, payload)
3559
3560
def parse_m3u8_attributes(attrib):
    """Parse an m3u8 attribute list (KEY=value,KEY="quoted value",...) into
    a dict; surrounding double quotes are stripped from quoted values and a
    later occurrence of a key overrides an earlier one."""
    pairs = re.findall(
        r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return dict(
        (name, raw[1:-1] if raw.startswith('"') else raw)
        for name, raw in pairs)
3568
3569
def urshift(val, n):
    """Shift val right by n bits without sign extension: a negative val is
    first mapped to its 32-bit two's complement representation"""
    if val < 0:
        val += 0x100000000
    return val >> n
3572
3573
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode the PNG file held in the byte string `png_data`.

    Returns a tuple (width, height, pixels).  `pixels` holds `height` rows,
    each a flat list of `width * 3` integers in 0..255 obtained by undoing
    the per-scanline PNG filter.

    Every scanline is read as 3 bytes per pixel; bit depth, colour type and
    interlacing from the IHDR chunk are not inspected, and chunk CRCs are
    skipped without being verified.

    Raises IOError if the PNG signature or the leading IHDR chunk is missing,
    or if the file contains no IDAT data.
    """
    # Reference: https://www.w3.org/TR/PNG/
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or png_data[12:16] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    struct_format_by_size = {1: '>B', 2: '>H', 4: '>I'}

    def read_uint(raw):
        return compat_struct_unpack(struct_format_by_size[len(raw)], raw)[0]

    # Walk the chunk stream that follows the 8 byte signature.  Each chunk is
    # laid out as: length (4 bytes), type (4 bytes), data, CRC (4 bytes).
    chunks = []
    cursor = 8
    end = len(png_data)
    while cursor < end:
        size = read_uint(png_data[cursor:cursor + 4])
        kind = png_data[cursor + 4:cursor + 8]
        payload = png_data[cursor + 8:cursor + 8 + size]
        chunks.append((kind, payload))
        cursor += 8 + size + 4  # header, data and the unchecked CRC

    # The check above guarantees that the first chunk is IHDR
    ihdr = chunks[0][1]
    width = read_uint(ihdr[:4])
    height = read_uint(ihdr[4:8])

    # The compressed image may be split over several IDAT chunks
    idat = b''.join(payload for kind, payload in chunks if kind == b'IDAT')
    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    for y in range(height):
        # Each scanline is prefixed with one byte naming its filter type
        row_start = y * (1 + stride)
        filter_type = decompressed_data[row_start]

        previous_row = pixels[y - 1] if y > 0 else None
        row = []
        pixels.append(row)

        for x in range(stride):
            value = decompressed_data[row_start + 1 + x]

            # Neighbours are the same channel of the pixel to the left
            # (3 bytes back) and of the pixel above; missing ones count as 0
            left = row[x - 3] if x > 2 else 0
            up = previous_row[x] if y > 0 else 0

            if filter_type == 1:  # Sub
                value = (value + left) & 0xff
            elif filter_type == 2:  # Up
                value = (value + up) & 0xff
            elif filter_type == 3:  # Average
                value = (value + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                upper_left = previous_row[x - 3] if x > 2 and y > 0 else 0

                estimate = left + up - upper_left
                dist_left = abs(estimate - left)
                dist_up = abs(estimate - up)
                dist_upper_left = abs(estimate - upper_left)

                # Pick the neighbour closest to the estimate; ties are
                # resolved in the order left, up, upper left
                if dist_left <= dist_up and dist_left <= dist_upper_left:
                    predictor = left
                elif dist_up <= dist_upper_left:
                    predictor = up
                else:
                    predictor = upper_left
                value = (value + predictor) & 0xff

            row.append(value)

    return width, height, pixels
3679
3680
def write_xattr(path, key, value):
    """Store `value` under the extended attribute `key` of the file `path`.

    Implementations are tried in this order:
      1. the Python module importable as `xattr`: pyxattr (exposes `set`)
         or xattr (exposes `setxattr`);
      2. if that import fails and compat_os_name is 'nt': an NTFS Alternate
         Data Stream named `path:key`;
      3. otherwise the `setfattr` or, failing that, the `xattr` command
         line tool.

    `value` is handled as a byte string: it is written in binary mode to the
    data stream and decoded as UTF-8 before being handed to a command line
    tool.

    Raises XAttrUnavailableError when pyxattr is older than 0.5.0 or when no
    implementation is found, and XAttrMetadataError when the selected
    implementation fails to write the attribute.
    """
    try:
        # Prefer an in-process Python binding when one can be imported
        import xattr

        if not hasattr(xattr, 'set'):  # xattr
            setxattr = xattr.setxattr
        else:  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set

        try:
            setxattr(path, key, value)
        except EnvironmentError as err:
            raise XAttrMetadataError(err.errno, err.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            stream_path = path + ':' + key
            try:
                with open(stream_path, 'wb') as stream:
                    stream.write(value)
            except EnvironmentError as err:
                raise XAttrMetadataError(err.errno, err.strerror)
            return

        have_setfattr = check_executable('setfattr', ['--version'])
        have_xattr = check_executable('xattr', ['-h'])

        if not (have_setfattr or have_xattr):
            # On Unix, and can't find pyxattr, setfattr, or xattr.
            if sys.platform.startswith('linux'):
                hint = (
                    "Couldn't find a tool to set the xattrs. "
                    "Install either the python 'pyxattr' or 'xattr' "
                    "modules, or the GNU 'attr' package "
                    "(which contains the 'setfattr' tool).")
            else:
                hint = (
                    "Couldn't find a tool to set the xattrs. "
                    "Install either the python 'xattr' module, "
                    "or the 'xattr' binary.")
            raise XAttrUnavailableError(hint)

        # The command line tools receive the value as a text argument
        text_value = value.decode('utf-8')
        if have_setfattr:
            tool = 'setfattr'
            tool_args = ['-n', key, '-v', text_value]
        else:
            tool = 'xattr'
            tool_args = ['-w', key, text_value]

        cmd = [encodeFilename(tool, True)]
        cmd.extend(encodeArgument(arg) for arg in tool_args)
        cmd.append(encodeFilename(path, True))

        try:
            proc = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
        except EnvironmentError as err:
            raise XAttrMetadataError(err.errno, err.strerror)

        _, error_output = proc.communicate()
        error_text = error_output.decode('utf-8', 'replace')
        if proc.returncode != 0:
            raise XAttrMetadataError(proc.returncode, error_text)