_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import contextlib
  11 import ctypes
  12 import datetime
  13 import email.utils
  14 import email.header
  15 import errno
  16 import functools
  17 import gzip
  18 import io
  19 import itertools
  20 import json
  21 import locale
  22 import math
  23 import operator
  24 import os
  25 import pipes
  26 import platform
  27 import random
  28 import re
  29 import socket
  30 import ssl
  31 import subprocess
  32 import sys
  33 import tempfile
  34 import traceback
  35 import xml.etree.ElementTree
  36 import zlib
  37
  38 from .compat import (
  39     compat_HTMLParser,
  40     compat_basestring,
  41     compat_chr,
  42     compat_etree_fromstring,
  43     compat_expanduser,
  44     compat_html_entities,
  45     compat_html_entities_html5,
  46     compat_http_client,
  47     compat_kwargs,
  48     compat_os_name,
  49     compat_parse_qs,
  50     compat_shlex_quote,
  51     compat_socket_create_connection,
  52     compat_str,
  53     compat_struct_pack,
  54     compat_struct_unpack,
  55     compat_urllib_error,
  56     compat_urllib_parse,
  57     compat_urllib_parse_urlencode,
  58     compat_urllib_parse_urlparse,
  59     compat_urllib_parse_unquote_plus,
  60     compat_urllib_request,
  61     compat_urlparse,
  62     compat_xpath,
  63 )
  64
  65 from .socks import (
  66     ProxyType,
  67     sockssocket,
  68 )
  69
  70
  71 def register_socks_protocols():
  72     # "Register" SOCKS protocols
  73     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  74     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  75     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  76         if scheme not in compat_urlparse.uses_netloc:
  77             compat_urlparse.uses_netloc.append(scheme)
  78
  79
# The type of a compiled regular expression object, obtained by compiling an
# empty pattern (this is not clearly defined otherwise)
compiled_regex_type = type(re.compile(''))

# HTTP request header names mapped to the values to send: a browser-like
# User-Agent plus the accepted charsets, content types, encodings and languages.
# NOTE(review): presumably the "standard headers" that YoutubeDLHandler adds
# to every request - confirm against the handler code
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Alternative User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Unique sentinel used as the `default` argument of the xpath_* helpers to mean
# "no default supplied", so that None can still be passed as a real default
NO_DEFAULT = object()
  98
# English month names in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names per language code, each list in calendar order
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}
 109
# Recognised media / manifest file extensions (lower case, without the dot),
# grouped one container family per line
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')
 124
# needed for sanitizing filenames in restricted mode
# Maps each accented character of the first string to its ASCII replacement.
# Replacements are paired positionally via zip(); the multi-letter ones
# ('AE', 'OE', 'ss', 'ae', 'oe') are wrapped in one-element lists so that
# itertools.chain yields them whole instead of letter by letter.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
 129
 130 DATE_FORMATS = (
 131     '%d %B %Y',
 132     '%d %b %Y',
 133     '%B %d %Y',
 134     '%B %dst %Y',
 135     '%B %dnd %Y',
 136     '%B %dth %Y',
 137     '%b %d %Y',
 138     '%b %dst %Y',
 139     '%b %dnd %Y',
 140     '%b %dth %Y',
 141     '%b %dst %Y %I:%M',
 142     '%b %dnd %Y %I:%M',
 143     '%b %dth %Y %I:%M',
 144     '%Y %m %d',
 145     '%Y-%m-%d',
 146     '%Y/%m/%d',
 147     '%Y/%m/%d %H:%M',
 148     '%Y/%m/%d %H:%M:%S',
 149     '%Y-%m-%d %H:%M',
 150     '%Y-%m-%d %H:%M:%S',
 151     '%Y-%m-%d %H:%M:%S.%f',
 152     '%d.%m.%Y %H:%M',
 153     '%d.%m.%Y %H.%M',
 154     '%Y-%m-%dT%H:%M:%SZ',
 155     '%Y-%m-%dT%H:%M:%S.%fZ',
 156     '%Y-%m-%dT%H:%M:%S.%f0Z',
 157     '%Y-%m-%dT%H:%M:%S',
 158     '%Y-%m-%dT%H:%M:%S.%f',
 159     '%Y-%m-%dT%H:%M',
 160     '%b %d %Y at %H:%M',
 161     '%b %d %Y at %H:%M:%S',
 162 )
 163
 164 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 165 DATE_FORMATS_DAY_FIRST.extend([
 166     '%d-%m-%Y',
 167     '%d.%m.%Y',
 168     '%d.%m.%y',
 169     '%d/%m/%Y',
 170     '%d/%m/%y',
 171     '%d/%m/%Y %H:%M:%S',
 172 ])
 173
 174 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 175 DATE_FORMATS_MONTH_FIRST.extend([
 176     '%m-%d-%Y',
 177     '%m.%d.%Y',
 178     '%m/%d/%Y',
 179     '%m/%d/%y',
 180     '%m/%d/%Y %H:%M:%S',
 181 ])
 182
# Matches the argument list "}('<payload>',<int>,<int>,'<words>'.split('|')"
# and captures the payload, the two integers and the '|'-joined word list.
# NOTE(review): this looks like the tail of "packed" (p,a,c,k,e,d) JavaScript -
# confirm against the code that uses it
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
 184
 185
 186 def preferredencoding():
 187     """Get preferred encoding.
 188
 189     Returns the best encoding scheme for the system, based on
 190     locale.getpreferredencoding() and some further tweaks.
 191     """
 192     try:
 193         pref = locale.getpreferredencoding()
 194         'TEST'.encode(pref)
 195     except Exception:
 196         pref = 'UTF-8'
 197
 198     return pref
 199
 200
 201 def write_json_file(obj, fn):
 202     """ Encode obj as JSON and write it to fn, atomically if possible """
 203
 204     fn = encodeFilename(fn)
 205     if sys.version_info < (3, 0) and sys.platform != 'win32':
 206         encoding = get_filesystem_encoding()
 207         # os.path.basename returns a bytes object, but NamedTemporaryFile
 208         # will fail if the filename contains non ascii characters unless we
 209         # use a unicode object
 210         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 211         # the same for os.path.dirname
 212         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 213     else:
 214         path_basename = os.path.basename
 215         path_dirname = os.path.dirname
 216
 217     args = {
 218         'suffix': '.tmp',
 219         'prefix': path_basename(fn) + '.',
 220         'dir': path_dirname(fn),
 221         'delete': False,
 222     }
 223
 224     # In Python 2.x, json.dump expects a bytestream.
 225     # In Python 3.x, it writes to a character stream
 226     if sys.version_info < (3, 0):
 227         args['mode'] = 'wb'
 228     else:
 229         args.update({
 230             'mode': 'w',
 231             'encoding': 'utf-8',
 232         })
 233
 234     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 235
 236     try:
 237         with tf:
 238             json.dump(obj, tf)
 239         if sys.platform == 'win32':
 240             # Need to remove existing file on Windows, else os.rename raises
 241             # WindowsError or FileExistsError.
 242             try:
 243                 os.unlink(fn)
 244             except OSError:
 245                 pass
 246         os.rename(tf.name, fn)
 247     except Exception:
 248         try:
 249             os.remove(tf.name)
 250         except OSError:
 251             pass
 252         raise
 253
 254
 255 if sys.version_info >= (2, 7):
 256     def find_xpath_attr(node, xpath, key, val=None):
 257         """ Find the xpath xpath[@key=val] """
 258         assert re.match(r'^[a-zA-Z_-]+$', key)
 259         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 260         return node.find(expr)
 261 else:
 262     def find_xpath_attr(node, xpath, key, val=None):
 263         for f in node.findall(compat_xpath(xpath)):
 264             if key not in f.attrib:
 265                 continue
 266             if val is None or f.attrib.get(key) == val:
 267                 return f
 268         return None
 269
 270 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 271 # the namespace parameter
 272
 273
 274 def xpath_with_ns(path, ns_map):
 275     components = [c.split(':') for c in path.split('/')]
 276     replaced = []
 277     for c in components:
 278         if len(c) == 1:
 279             replaced.append(c[0])
 280         else:
 281             ns, tag = c
 282             replaced.append('{%s}%s' % (ns_map[ns], tag))
 283     return '/'.join(replaced)
 284
 285
 286 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 287     def _find_xpath(xpath):
 288         return node.find(compat_xpath(xpath))
 289
 290     if isinstance(xpath, (str, compat_str)):
 291         n = _find_xpath(xpath)
 292     else:
 293         for xp in xpath:
 294             n = _find_xpath(xp)
 295             if n is not None:
 296                 break
 297
 298     if n is None:
 299         if default is not NO_DEFAULT:
 300             return default
 301         elif fatal:
 302             name = xpath if name is None else name
 303             raise ExtractorError('Could not find XML element %s' % name)
 304         else:
 305             return None
 306     return n
 307
 308
 309 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 310     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 311     if n is None or n == default:
 312         return n
 313     if n.text is None:
 314         if default is not NO_DEFAULT:
 315             return default
 316         elif fatal:
 317             name = xpath if name is None else name
 318             raise ExtractorError('Could not find XML element\'s text %s' % name)
 319         else:
 320             return None
 321     return n.text
 322
 323
 324 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 325     n = find_xpath_attr(node, xpath, key)
 326     if n is None:
 327         if default is not NO_DEFAULT:
 328             return default
 329         elif fatal:
 330             name = '%s[@%s]' % (xpath, key) if name is None else name
 331             raise ExtractorError('Could not find XML attribute %s' % name)
 332         else:
 333             return None
 334     return n.attrib[key]
 335
 336
 337 def get_element_by_id(id, html):
 338     """Return the content of the tag with the specified ID in the passed HTML document"""
 339     return get_element_by_attribute('id', id, html)
 340
 341
 342 def get_element_by_class(class_name, html):
 343     """Return the content of the first tag with the specified class in the passed HTML document"""
 344     retval = get_elements_by_class(class_name, html)
 345     return retval[0] if retval else None
 346
 347
 348 def get_element_by_attribute(attribute, value, html, escape_value=True):
 349     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 350     return retval[0] if retval else None
 351
 352
 353 def get_elements_by_class(class_name, html):
 354     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 355     return get_elements_by_attribute(
 356         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 357         html, escape_value=False)
 358
 359
 360 def get_elements_by_attribute(attribute, value, html, escape_value=True):
 361     """Return the content of the tag with the specified attribute in the passed HTML document"""
 362
 363     value = re.escape(value) if escape_value else value
 364
 365     retlist = []
 366     for m in re.finditer(r'''(?xs)
 367         <([a-zA-Z0-9:._-]+)
 368          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 369          \s+%s=['"]?%s['"]?
 370          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 371         \s*>
 372         (?P<content>.*?)
 373         </\1>
 374     ''' % (re.escape(attribute), value), html):
 375         res = m.group('content')
 376
 377         if res.startswith('"') or res.startswith("'"):
 378             res = res[1:-1]
 379
 380         retlist.append(unescapeHTML(res))
 381
 382     return retlist
 383
 384
 385 class HTMLAttributeParser(compat_HTMLParser):
 386     """Trivial HTML parser to gather the attributes for a single element"""
 387     def __init__(self):
 388         self.attrs = {}
 389         compat_HTMLParser.__init__(self)
 390
 391     def handle_starttag(self, tag, attrs):
 392         self.attrs = dict(attrs)
 393
 394
 395 def extract_attributes(html_element):
 396     """Given a string for an HTML element such as
 397     <el
 398          a="foo" B="bar" c="&98;az" d=boz
 399          empty= noval entity="&amp;"
 400          sq='"' dq="'"
 401     >
 402     Decode and return a dictionary of attributes.
 403     {
 404         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 405         'empty': '', 'noval': None, 'entity': '&',
 406         'sq': '"', 'dq': '\''
 407     }.
 408     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 409     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 410     """
 411     parser = HTMLAttributeParser()
 412     parser.feed(html_element)
 413     parser.close()
 414     return parser.attrs
 415
 416
 417 def clean_html(html):
 418     """Clean an HTML snippet into a readable string"""
 419
 420     if html is None:  # Convenience for sanitizing descriptions etc.
 421         return html
 422
 423     # Newline vs <br />
 424     html = html.replace('\n', ' ')
 425     html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
 426     html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 427     # Strip html tags
 428     html = re.sub('<.*?>', '', html)
 429     # Replace html entities
 430     html = unescapeHTML(html)
 431     return html.strip()
 432
 433
 434 def sanitize_open(filename, open_mode):
 435     """Try to open the given filename, and slightly tweak it if this fails.
 436
 437     Attempts to open the given filename. If this fails, it tries to change
 438     the filename slightly, step by step, until it's either able to open it
 439     or it fails and raises a final exception, like the standard open()
 440     function.
 441
 442     It returns the tuple (stream, definitive_file_name).
 443     """
 444     try:
 445         if filename == '-':
 446             if sys.platform == 'win32':
 447                 import msvcrt
 448                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 449             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 450         stream = open(encodeFilename(filename), open_mode)
 451         return (stream, filename)
 452     except (IOError, OSError) as err:
 453         if err.errno in (errno.EACCES,):
 454             raise
 455
 456         # In case of error, try to remove win32 forbidden chars
 457         alt_filename = sanitize_path(filename)
 458         if alt_filename == filename:
 459             raise
 460         else:
 461             # An exception here should be caught in the caller
 462             stream = open(encodeFilename(alt_filename), open_mode)
 463             return (stream, alt_filename)
 464
 465
 466 def timeconvert(timestr):
 467     """Convert RFC 2822 defined time string into system timestamp"""
 468     timestamp = None
 469     timetuple = email.utils.parsedate_tz(timestr)
 470     if timetuple is not None:
 471         timestamp = email.utils.mktime_tz(timetuple)
 472     return timestamp
 473
 474
 475 def sanitize_filename(s, restricted=False, is_id=False):
 476     """Sanitizes a string so it could be used as part of a filename.
 477     If restricted is set, use a stricter subset of allowed characters.
 478     Set is_id if this is not an arbitrary string, but an ID that should be kept
 479     if possible.
 480     """
 481     def replace_insane(char):
 482         if restricted and char in ACCENT_CHARS:
 483             return ACCENT_CHARS[char]
 484         if char == '?' or ord(char) < 32 or ord(char) == 127:
 485             return ''
 486         elif char == '"':
 487             return '' if restricted else '\''
 488         elif char == ':':
 489             return '_-' if restricted else ' -'
 490         elif char in '\\/|*<>':
 491             return '_'
 492         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 493             return '_'
 494         if restricted and ord(char) > 127:
 495             return '_'
 496         return char
 497
 498     # Handle timestamps
 499     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 500     result = ''.join(map(replace_insane, s))
 501     if not is_id:
 502         while '__' in result:
 503             result = result.replace('__', '_')
 504         result = result.strip('_')
 505         # Common case of "Foreign band name - English song title"
 506         if restricted and result.startswith('-_'):
 507             result = result[2:]
 508         if result.startswith('-'):
 509             result = '_' + result[len('-'):]
 510         result = result.lstrip('.')
 511         if not result:
 512             result = '_'
 513     return result
 514
 515
 516 def sanitize_path(s):
 517     """Sanitizes and normalizes path on Windows"""
 518     if sys.platform != 'win32':
 519         return s
 520     drive_or_unc, _ = os.path.splitdrive(s)
 521     if sys.version_info < (2, 7) and not drive_or_unc:
 522         drive_or_unc, _ = os.path.splitunc(s)
 523     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 524     if drive_or_unc:
 525         norm_path.pop(0)
 526     sanitized_path = [
 527         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 528         for path_part in norm_path]
 529     if drive_or_unc:
 530         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 531     return os.path.join(*sanitized_path)
 532
 533
 534 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
 535 # unwanted failures due to missing protocol
 536 def sanitize_url(url):
 537     return 'http:%s' % url if url.startswith('//') else url
 538
 539
 540 def sanitized_Request(url, *args, **kwargs):
 541     return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
 542
 543
 544 def expand_path(s):
 545     """Expand shell variables and ~"""
 546     return os.path.expandvars(compat_expanduser(s))
 547
 548
 549 def orderedSet(iterable):
 550     """ Remove all duplicates from the input iterable """
 551     res = []
 552     for el in iterable:
 553         if el not in res:
 554             res.append(el)
 555     return res
 556
 557
 558 def _htmlentity_transform(entity_with_semicolon):
 559     """Transforms an HTML entity to a character."""
 560     entity = entity_with_semicolon[:-1]
 561
 562     # Known non-numeric HTML entity
 563     if entity in compat_html_entities.name2codepoint:
 564         return compat_chr(compat_html_entities.name2codepoint[entity])
 565
 566     # TODO: HTML5 allows entities without a semicolon. For example,
 567     # '&Eacuteric' should be decoded as 'Éric'.
 568     if entity_with_semicolon in compat_html_entities_html5:
 569         return compat_html_entities_html5[entity_with_semicolon]
 570
 571     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 572     if mobj is not None:
 573         numstr = mobj.group(1)
 574         if numstr.startswith('x'):
 575             base = 16
 576             numstr = '0%s' % numstr
 577         else:
 578             base = 10
 579         # See https://github.com/rg3/youtube-dl/issues/7518
 580         try:
 581             return compat_chr(int(numstr, base))
 582         except ValueError:
 583             pass
 584
 585     # Unknown entity in name, return its literal representation
 586     return '&%s;' % entity
 587
 588
 589 def unescapeHTML(s):
 590     if s is None:
 591         return None
 592     assert type(s) == compat_str
 593
 594     return re.sub(
 595         r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 596
 597
 598 def get_subprocess_encoding():
 599     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 600         # For subprocess calls, encode with locale encoding
 601         # Refer to http://stackoverflow.com/a/9951851/35070
 602         encoding = preferredencoding()
 603     else:
 604         encoding = sys.getfilesystemencoding()
 605     if encoding is None:
 606         encoding = 'utf-8'
 607     return encoding
 608
 609
 610 def encodeFilename(s, for_subprocess=False):
 611     """
 612     @param s The name of the file
 613     """
 614
 615     assert type(s) == compat_str
 616
 617     # Python 3 has a Unicode API
 618     if sys.version_info >= (3, 0):
 619         return s
 620
 621     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 622     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 623     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 624     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 625         return s
 626
 627     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 628     if sys.platform.startswith('java'):
 629         return s
 630
 631     return s.encode(get_subprocess_encoding(), 'ignore')
 632
 633
 634 def decodeFilename(b, for_subprocess=False):
 635
 636     if sys.version_info >= (3, 0):
 637         return b
 638
 639     if not isinstance(b, bytes):
 640         return b
 641
 642     return b.decode(get_subprocess_encoding(), 'ignore')
 643
 644
 645 def encodeArgument(s):
 646     if not isinstance(s, compat_str):
 647         # Legacy code that uses byte strings
 648         # Uncomment the following line after fixing all post processors
 649         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 650         s = s.decode('ascii')
 651     return encodeFilename(s, True)
 652
 653
 654 def decodeArgument(b):
 655     return decodeFilename(b, True)
 656
 657
 658 def decodeOption(optval):
 659     if optval is None:
 660         return optval
 661     if isinstance(optval, bytes):
 662         optval = optval.decode(preferredencoding())
 663
 664     assert isinstance(optval, compat_str)
 665     return optval
 666
 667
 668 def formatSeconds(secs):
 669     if secs > 3600:
 670         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 671     elif secs > 60:
 672         return '%d:%02d' % (secs // 60, secs % 60)
 673     else:
 674         return '%d' % secs
 675
 676
 677 def make_HTTPS_handler(params, **kwargs):
 678     opts_no_check_certificate = params.get('nocheckcertificate', False)
 679     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 680         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 681         if opts_no_check_certificate:
 682             context.check_hostname = False
 683             context.verify_mode = ssl.CERT_NONE
 684         try:
 685             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 686         except TypeError:
 687             # Python 2.7.8
 688             # (create_default_context present but HTTPSHandler has no context=)
 689             pass
 690
 691     if sys.version_info < (3, 2):
 692         return YoutubeDLHTTPSHandler(params, **kwargs)
 693     else:  # Python < 3.4
 694         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 695         context.verify_mode = (ssl.CERT_NONE
 696                                if opts_no_check_certificate
 697                                else ssl.CERT_REQUIRED)
 698         context.set_default_verify_paths()
 699         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 700
 701
 702 def bug_reports_message():
 703     if ytdl_is_updateable():
 704         update_cmd = 'type  youtube-dl -U  to update'
 705     else:
 706         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 707     msg = '; please report this issue on https://yt-dl.org/bug .'
 708     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 709     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 710     return msg
 711
 712
 713 class YoutubeDLError(Exception):
 714     """Base exception for YoutubeDL errors."""
 715     pass
 716
 717
 718 class ExtractorError(YoutubeDLError):
 719     """Error during info extraction."""
 720
 721     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 722         """ tb, if given, is the original traceback (so that it can be printed out).
 723         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 724         """
 725
 726         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 727             expected = True
 728         if video_id is not None:
 729             msg = video_id + ': ' + msg
 730         if cause:
 731             msg += ' (caused by %r)' % cause
 732         if not expected:
 733             msg += bug_reports_message()
 734         super(ExtractorError, self).__init__(msg)
 735
 736         self.traceback = tb
 737         self.exc_info = sys.exc_info()  # preserve original exception
 738         self.cause = cause
 739         self.video_id = video_id
 740
 741     def format_traceback(self):
 742         if self.traceback is None:
 743             return None
 744         return ''.join(traceback.format_tb(self.traceback))
 745
 746
 747 class UnsupportedError(ExtractorError):
 748     def __init__(self, url):
 749         super(UnsupportedError, self).__init__(
 750             'Unsupported URL: %s' % url, expected=True)
 751         self.url = url
 752
 753
 754 class RegexNotFoundError(ExtractorError):
 755     """Error when a regex didn't match"""
 756     pass
 757
 758
 759 class GeoRestrictedError(ExtractorError):
 760     """Geographic restriction Error exception.
 761
 762     This exception may be thrown when a video is not available from your
 763     geographic location due to geographic restrictions imposed by a website.
 764     """
 765     def __init__(self, msg, countries=None):
 766         super(GeoRestrictedError, self).__init__(msg, expected=True)
 767         self.msg = msg
 768         self.countries = countries
 769
 770
 771 class DownloadError(YoutubeDLError):
 772     """Download Error exception.
 773
 774     This exception may be thrown by FileDownloader objects if they are not
 775     configured to continue on errors. They will contain the appropriate
 776     error message.
 777     """
 778
 779     def __init__(self, msg, exc_info=None):
 780         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 781         super(DownloadError, self).__init__(msg)
 782         self.exc_info = exc_info
 783
 784
 785 class SameFileError(YoutubeDLError):
 786     """Same File exception.
 787
 788     This exception will be thrown by FileDownloader objects if they detect
 789     multiple files would have to be downloaded to the same file on disk.
 790     """
 791     pass
 792
 793
 794 class PostProcessingError(YoutubeDLError):
 795     """Post Processing exception.
 796
 797     This exception may be raised by PostProcessor's .run() method to
 798     indicate an error in the postprocessing task.
 799     """
 800
 801     def __init__(self, msg):
 802         super(PostProcessingError, self).__init__(msg)
 803         self.msg = msg
 804
 805
 806 class MaxDownloadsReached(YoutubeDLError):
 807     """ --max-downloads limit has been reached. """
 808     pass
 809
 810
 811 class UnavailableVideoError(YoutubeDLError):
 812     """Unavailable Format exception.
 813
 814     This exception will be thrown when a video is requested
 815     in a format that is not available for that video.
 816     """
 817     pass
 818
 819
 820 class ContentTooShortError(YoutubeDLError):
 821     """Content Too Short exception.
 822
 823     This exception may be raised by FileDownloader objects when a file they
 824     download is too small for what the server announced first, indicating
 825     the connection was probably interrupted.
 826     """
 827
 828     def __init__(self, downloaded, expected):
 829         super(ContentTooShortError, self).__init__(
 830             'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
 831         )
 832         # Both in bytes
 833         self.downloaded = downloaded
 834         self.expected = expected
 835
 836
 837 class XAttrMetadataError(YoutubeDLError):
 838     def __init__(self, code=None, msg='Unknown error'):
 839         super(XAttrMetadataError, self).__init__(msg)
 840         self.code = code
 841         self.msg = msg
 842
 843         # Parsing code and msg
 844         if (self.code in (errno.ENOSPC, errno.EDQUOT) or
 845                 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
 846             self.reason = 'NO_SPACE'
 847         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
 848             self.reason = 'VALUE_TOO_LONG'
 849         else:
 850             self.reason = 'NOT_SUPPORTED'
 851
 852
 853 class XAttrUnavailableError(YoutubeDLError):
 854     pass
 855
 856
 857 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 858     # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
 859     # expected HTTP responses to meet HTTP/1.0 or later (see also
 860     # https://github.com/rg3/youtube-dl/issues/6727)
 861     if sys.version_info < (3, 0):
 862         kwargs[b'strict'] = True
 863     hc = http_class(*args, **kwargs)
 864     source_address = ydl_handler._params.get('source_address')
 865     if source_address is not None:
 866         sa = (source_address, 0)
 867         if hasattr(hc, 'source_address'):  # Python 2.7+
 868             hc.source_address = sa
 869         else:  # Python 2.6
 870             def _hc_connect(self, *args, **kwargs):
 871                 sock = compat_socket_create_connection(
 872                     (self.host, self.port), self.timeout, sa)
 873                 if is_https:
 874                     self.sock = ssl.wrap_socket(
 875                         sock, self.key_file, self.cert_file,
 876                         ssl_version=ssl.PROTOCOL_TLSv1)
 877                 else:
 878                     self.sock = sock
 879             hc.connect = functools.partial(_hc_connect, hc)
 880
 881     return hc
 882
 883
 884 def handle_youtubedl_headers(headers):
 885     filtered_headers = headers
 886
 887     if 'Youtubedl-no-compression' in filtered_headers:
 888         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 889         del filtered_headers['Youtubedl-no-compression']
 890
 891     return filtered_headers
 892
 893
 894 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 895     """Handler for HTTP requests and responses.
 896
 897     This class, when installed with an OpenerDirector, automatically adds
 898     the standard headers to every HTTP request and handles gzipped and
 899     deflated responses from web servers. If compression is to be avoided in
 900     a particular request, the original request in the program code only has
 901     to include the HTTP header "Youtubedl-no-compression", which will be
 902     removed before making the real request.
 903
 904     Part of this code was copied from:
 905
 906     http://techknack.net/python-urllib2-handlers/
 907
 908     Andrew Rowls, the author of that code, agreed to release it to the
 909     public domain.
 910     """
 911
 912     def __init__(self, params, *args, **kwargs):
 913         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
 914         self._params = params
 915
 916     def http_open(self, req):
 917         conn_class = compat_http_client.HTTPConnection
 918
 919         socks_proxy = req.headers.get('Ytdl-socks-proxy')
 920         if socks_proxy:
 921             conn_class = make_socks_conn_class(conn_class, socks_proxy)
 922             del req.headers['Ytdl-socks-proxy']
 923
 924         return self.do_open(functools.partial(
 925             _create_http_connection, self, conn_class, False),
 926             req)
 927
 928     @staticmethod
 929     def deflate(data):
 930         try:
 931             return zlib.decompress(data, -zlib.MAX_WBITS)
 932         except zlib.error:
 933             return zlib.decompress(data)
 934
 935     @staticmethod
 936     def addinfourl_wrapper(stream, headers, url, code):
 937         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 938             return compat_urllib_request.addinfourl(stream, headers, url, code)
 939         ret = compat_urllib_request.addinfourl(stream, headers, url)
 940         ret.code = code
 941         return ret
 942
 943     def http_request(self, req):
 944         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
 945         # always respected by websites, some tend to give out URLs with non percent-encoded
 946         # non-ASCII characters (see telemb.py, ard.py [#3412])
 947         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
 948         # To work around aforementioned issue we will replace request's original URL with
 949         # percent-encoded one
 950         # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
 951         # the code of this workaround has been moved here from YoutubeDL.urlopen()
 952         url = req.get_full_url()
 953         url_escaped = escape_url(url)
 954
 955         # Substitute URL if any change after escaping
 956         if url != url_escaped:
 957             req = update_Request(req, url=url_escaped)
 958
 959         for h, v in std_headers.items():
 960             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
 961             # The dict keys are capitalized because of this bug by urllib
 962             if h.capitalize() not in req.headers:
 963                 req.add_header(h, v)
 964
 965         req.headers = handle_youtubedl_headers(req.headers)
 966
 967         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 968             # Python 2.6 is brain-dead when it comes to fragments
 969             req._Request__original = req._Request__original.partition('#')[0]
 970             req._Request__r_type = req._Request__r_type.partition('#')[0]
 971
 972         return req
 973
 974     def http_response(self, req, resp):
 975         old_resp = resp
 976         # gzip
 977         if resp.headers.get('Content-encoding', '') == 'gzip':
 978             content = resp.read()
 979             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 980             try:
 981                 uncompressed = io.BytesIO(gz.read())
 982             except IOError as original_ioerror:
 983                 # There may be junk add the end of the file
 984                 # See http://stackoverflow.com/q/4928560/35070 for details
 985                 for i in range(1, 1024):
 986                     try:
 987                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 988                         uncompressed = io.BytesIO(gz.read())
 989                     except IOError:
 990                         continue
 991                     break
 992                 else:
 993                     raise original_ioerror
 994             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 995             resp.msg = old_resp.msg
 996             del resp.headers['Content-encoding']
 997         # deflate
 998         if resp.headers.get('Content-encoding', '') == 'deflate':
 999             gz = io.BytesIO(self.deflate(resp.read()))
1000             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
1001             resp.msg = old_resp.msg
1002             del resp.headers['Content-encoding']
1003         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1004         # https://github.com/rg3/youtube-dl/issues/6457).
1005         if 300 <= resp.code < 400:
1006             location = resp.headers.get('Location')
1007             if location:
1008                 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1009                 if sys.version_info >= (3, 0):
1010                     location = location.encode('iso-8859-1').decode('utf-8')
1011                 else:
1012                     location = location.decode('utf-8')
1013                 location_escaped = escape_url(location)
1014                 if location != location_escaped:
1015                     del resp.headers['Location']
1016                     if sys.version_info < (3, 0):
1017                         location_escaped = location_escaped.encode('utf-8')
1018                     resp.headers['Location'] = location_escaped
1019         return resp
1020
1021     https_request = http_request
1022     https_response = http_response
1023
1024
def make_socks_conn_class(base_class, socks_proxy):
    """Create a connection class that connects through a SOCKS proxy.

    base_class must be HTTPConnection, HTTPSConnection or a subclass of
    either. socks_proxy is the proxy URL; its scheme selects the protocol
    ('socks5', 'socks4a', or 'socks'/'socks4'), the port defaults to 1080 and
    user name / password, when given, are URL-unquoted.

    Returns a subclass of base_class whose connect() goes through the proxy.
    Raises ValueError if the proxy URL uses an unsupported scheme.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Previously socks_type was simply left unbound here, which surfaced
        # as an obscure UnboundLocalError further down
        raise ValueError(
            'Unsupported SOCKS proxy scheme: %s' % url_components.scheme)

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            # Only plain numeric timeouts are forwarded to the socket
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1066
1067
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler with a pluggable connection class and SOCKS support."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req over HTTPS, via a SOCKS proxy if the request asks for one."""
        # Forward the handler's SSL settings when this Python version has them:
        # _context exists on python > 2.6, _check_hostname on python 3.x
        open_kwargs = {}
        for attr_name, kwarg_name in (
                ('_context', 'context'),
                ('_check_hostname', 'check_hostname')):
            if hasattr(self, attr_name):
                open_kwargs[kwarg_name] = getattr(self, attr_name)

        connection_cls = self._https_conn_class
        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            connection_cls = make_socks_conn_class(connection_cls, proxy_url)
            # Internal marker header: drop it before the request is opened
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, connection_cls, True)
        return self.do_open(connection_factory, req, **open_kwargs)
1091
1092
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor whose HTTPS hooks reuse the HTTP ones."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # NOTE: the workaround itself is currently disabled (kept for reference):
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        base_processor = compat_urllib_request.HTTPCookieProcessor
        return base_processor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1115
1116
def extract_timezone(date_str):
    """Split a trailing UTC offset off a date string.

    Recognises a final "Z" or a numeric offset such as "+0100" / " -05:30"
    that follows at least eight other characters.

    Returns a (timedelta, remaining_string) tuple. The timedelta is zero when
    no offset is found or the suffix is "Z"; the string is returned with the
    recognised suffix removed.
    """
    tz_match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not tz_match:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(tz_match.group('tz'))]
    sign_char = tz_match.group('sign')
    if not sign_char:
        # Plain "Z" suffix, i.e. UTC
        return datetime.timedelta(), remainder

    direction = 1 if sign_char == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(tz_match.group('hours')),
        minutes=direction * int(tz_match.group('minutes')))
    return offset, remainder
1133
1134
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    delimiter separates the date part from the time part. timezone is the
    UTC offset (a timedelta) of date_str; when it is None the offset is
    taken from the string itself. Returns None for a None input or when the
    string does not match the expected format.
    """
    if date_str is None:
        return None

    # Fractional seconds are not supported by the format below; drop them
    stripped = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, stripped = extract_timezone(stripped)

    try:
        layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        utc_time = datetime.datetime.strptime(stripped, layout) - timezone
        return calendar.timegm(utc_time.timetuple())
    except ValueError:
        return None
1152
1153
def date_formats(day_first=True):
    """Return the day-first or the month-first list of date formats."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1156
1157
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas, then remove AM/PM + timezone name
    cleaned = re.sub(
        r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str.replace(',', ' '))
    # Only the string without its numeric UTC offset is of interest here
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # Every format is tried; the last one that matches determines the result
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Last resort: the RFC 2822 style parser from the email package
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return compat_str(result)
1184
1185
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    text = date_str.replace(',', ' ')

    # Any occurrence of "PM" shifts the parsed time by twelve hours
    pm_hours = 12 if re.search(r'(?i)PM', text) else 0
    pm_shift = datetime.timedelta(hours=pm_hours)
    utc_offset, text = extract_timezone(text)

    # Remove AM/PM + timezone
    text = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', text)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    trailing_tz = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', text)
    if trailing_tz:
        text = text[:-len(trailing_tz.group('tz'))]

    # The first format that parses wins
    for fmt in date_formats(day_first):
        try:
            moment = datetime.datetime.strptime(text, fmt) - utc_offset + pm_shift
            return calendar.timegm(moment.timetuple())
        except ValueError:
            continue

    # Last resort: the RFC 2822 style parser from the email package
    parsed = email.utils.parsedate_tz(text)
    if parsed:
        return calendar.timegm(parsed) + pm_hours * 3600
    return None
1212
1213
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL.

    The text after the last dot, with any query string removed first, is
    used if it is purely alphanumeric. Failing that, the same text with
    trailing slashes removed is accepted when it is a known extension.
    Otherwise, or when url is None, default_ext is returned.
    """
    if url is None:
        return default_ext

    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1225
1226
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name: the last extension of filename is replaced
    by "<sub_lang>.<sub_format>"."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1229
1230
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    'now', 'today' and 'yesterday' on their own are accepted as well.
    A month counts as 30 days and a year as 365 days."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    relative = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()

    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount
    unit = relative.group('unit')
    # A bad approximation?
    approximate_days = {'month': 30, 'year': 365}
    if unit in approximate_days:
        amount *= approximate_days[unit]
        unit = 'day'
    # timedelta expects plural keyword names ('days', 'weeks')
    return today + datetime.timedelta(**{unit + 's': amount})
1258
1259
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not made of exactly eight digits are returned as is."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1268
1269
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing bound defaults to the smallest / largest
        representable date. Raises ValueError if start lies after end."""
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if isinstance(date, datetime.date):
            candidate = date
        else:
            # Anything else is treated as a date string
            candidate = date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return ' - '.join((self.start.isoformat(), self.end.isoformat()))
1299
1300
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte strings are decoded with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1309
1310
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the WriteConsoleW function of kernel32, which is
    only used when out refers to file descriptor 1 or 2 and the
    corresponding standard handle is attached to a console. Raises OSError
    if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Function prototypes are built by hand: (return type, argument types...)
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for an invalid or missing handle, for a handle whose file type
        # (ignoring the REMOTE flag) is not a character device, and for a
        # handle on which GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane,
        # or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call and stop right before the
        # next non-BMP character; a count of 0 means s starts with one
        count = min(next_nonbmp_pos(s), 1024)

        # A leading non-BMP character is written on its own, as 2 units
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part was actually written
            s = s[written.value:]
    return True
1384
1385
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On Windows, when no explicit encoding is requested, the console API is
    tried first. Otherwise the text is encoded (characters that cannot be
    encoded are dropped) for byte-oriented streams, or written as is.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    use_console_api = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if use_console_api and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream wrapping a binary buffer: encode and bypass the wrapper
        target_encoding = (
            encoding or getattr(out, 'encoding', None) or preferredencoding())
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1406
1407
def bytes_to_intlist(bs):
    """Convert a byte string (or a sequence of ints) to a list of ints."""
    if not bs:
        return []
    # Python 3: indexing bytes already yields ints
    if isinstance(bs[0], int):
        return list(bs)
    return list(map(ord, bs))
1415
1416
def intlist_to_bytes(xs):
    """Pack a list of ints (each used as an unsigned byte) into a byte string."""
    if not xs:
        return b''
    byte_format = '%dB' % len(xs)
    return compat_struct_pack(byte_format, *xs)
1421
1422
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) using LockFileEx on
# Windows and fcntl.flock elsewhere; where fcntl is missing both raise IOError.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        """ctypes structure passed to LockFileEx / UnlockFileEx."""
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low / high halves of the byte count to lock, starting at offset 0
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the file object f; raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file needs the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock on the file object f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock held on the file object f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            """Always raises IOError: locking is unavailable without fcntl."""
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            """Always raises IOError: locking is unavailable without fcntl."""
            raise IOError(UNSUPPORTED_MSG)
1496
1497
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened in the constructor (mode must be 'r', 'a' or 'w').
    Entering the context takes a shared lock for 'r' and an exclusive lock
    otherwise; leaving it releases the lock and closes the file. The object
    can be iterated and offers read() and write(), all forwarded to the
    underlying file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows implementation of _lock_file raises OSError, which
            # on Python 2 is not an IOError; catch both so the file opened
            # in __init__ is never leaked when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1527
1528
def get_filesystem_encoding():
    """Return the file system encoding, falling back to 'utf-8' when Python
    reports none."""
    detected = sys.getfilesystemencoding()
    if detected is None:
        return 'utf-8'
    return detected
1532
1533
def shell_quote(args):
    """Return args as one string of shell-quoted words separated by spaces.

    Byte strings are decoded with the file system encoding first.
    """
    encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join(pipes.quote(as_text(arg)) for arg in args)
1543
1544
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    The data is JSON-encoded into the URL fragment. Data already smuggled
    into url is merged in and takes precedence over equal keys in data.
    Note that the data dict passed in is updated in place by that merge.
    """
    plain_url, existing = unsmuggle_url(url, {})
    data.update(existing)
    fragment = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '#'.join((plain_url, fragment))
1553
1554
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    URLs without smuggled data are returned unchanged together with default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, fragment = smug_url.rpartition('#')
        payload = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return url, json.loads(payload)
    return smug_url, default
1562
1563
def format_bytes(bytes):
    """Format a number of bytes as a string with a binary unit suffix.

    bytes may be a number, a numeric string or None. None yields 'N/A';
    anything else is scaled by the largest fitting power of 1024 and printed
    with two decimals, e.g. '1.46KiB'. Values beyond the largest known unit
    are expressed in that unit ('1024.00YiB'), values between 0 and 1 are
    expressed in bytes, and negative values keep their sign.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)

    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    # The logarithm is only defined for positive numbers, so pick the unit
    # from the absolute value
    magnitude = abs(bytes)
    if magnitude == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(magnitude, 1024.0))
        # Clamp to the available units: a huge value must not run past the
        # end of the list and a fraction must not index it from the back
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1576
1577
def lookup_unit_table(unit_table, s):
    """Parse "<number><unit>" at the start of s, scaled by the unit.

    unit_table maps unit names to multipliers. The number may use either
    ',' or '.' as decimal separator. Returns the scaled value as an int, or
    None if s does not start with a number followed by a known unit.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives, s)
    if found is None:
        return None
    number = found.group('num').replace(',', '.')
    multiplier = unit_table[found.group('unit')]
    return int(float(number) * multiplier)
1587
1588
def parse_filesize(s):
    """Parse a human-readable file size such as '1.5 MiB' into bytes.

    Returns None if s is None or not understood.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    # (symbol, decimal unit name, binary unit name), in ascending order
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )
    for power, (symbol, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal = 1000 ** power
        binary = 1024 ** power
        lower_symbol = symbol.lower()
        _UNIT_TABLE.update({
            symbol + 'iB': binary,
            symbol + 'B': decimal,
            lower_symbol + 'B': binary,
            symbol + 'b': decimal,
            lower_symbol + 'b': decimal,
            decimal_name + 'bytes': decimal,
            binary_name + 'bytes': binary,
        })

    return lookup_unit_table(_UNIT_TABLE, s)
1658
1659
def parse_count(s):
    """Parse a count such as '1,234' or '1.5k' into an int.

    Returns None if s is None or not understood.
    """
    if s is None:
        return None

    text = s.strip()

    # Only digits and separators: no unit to apply
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    thousand = 1000
    million = 1000 ** 2
    _UNIT_TABLE = {
        'k': thousand,
        'K': thousand,
        'm': million,
        'M': million,
        'kk': million,
        'KK': million,
    }

    return lookup_unit_table(_UNIT_TABLE, text)
1679
1680
def month_by_name(name, lang='en'):
    """ Return the number of a month by its full name in the given language
    (English names are used for unknown languages), or None if the name is
    not found """

    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name not in names:
        return None
    return names.index(name) + 1
1690
1691
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, or None if the abbreviation is not known """

    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        # The abbreviation is the first three letters of the month name
        if full_name[:3] == abbrev:
            return number
    return None
1700
1701
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the predefined entities or a
    numeric character reference are left alone."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1708
1709
def setproctitle(title):
    """Try to set the name of the current process to title.

    This is best effort: nothing happens on Jython, when libc.so.6 cannot be
    loaded, or when the loaded libc has no prctl function.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes ctypes allocate one extra
    # byte for a terminating NUL. prctl reads a NUL-terminated string, so a
    # buffer of exactly len(title_bytes) could be read past its end.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1734
1735
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged if it is
    None or does not begin with start."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1738
1739
def remove_end(s, end):
    """Return s without the suffix end; s is returned unchanged if it is
    None or does not finish with end."""
    if s is None or not s.endswith(end):
        return s
    # Slice by explicit length: s[:-len(end)] would wrongly give an empty
    # string for an empty suffix, because s[:-0] is s[:0]
    return s[:len(s) - len(end)]
1742
1743
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s.

    None, strings shorter than two characters and strings that are not
    quoted are returned unchanged."""
    if s is None or len(s) < 2:
        return s
    opening, closing = s[0], s[-1]
    if opening == closing and opening in ('"', "'"):
        return s[1:-1]
    return s
1751
1752
def url_basename(url):
    """ Return the last '/'-separated segment of the URL's path component,
        ignoring leading and trailing slashes """
    url_path = compat_urlparse.urlparse(url).path
    segments = url_path.strip('/').split('/')
    return segments[-1]
1756
1757
def base_url(url):
    """ Return the http(s) URL prefix up to and including the last '/' that
        precedes any '?', '#' or '&'.  An AttributeError is raised when url
        does not match (the match object is None) """
    mobj = re.match(r'https?://[^?#&]+/', url)
    return mobj.group()
1760
1761
def urljoin(base, path):
    """ Join path onto base, tolerating bad input.

    bytes arguments are decoded as UTF-8.  Returns None when path is empty or
    not a string, or when path is relative and base is not an http(s) or
    protocol-relative URL.  A path that already is such a URL is returned
    untouched. """
    URL_START_RE = r'^(?:https?:)?//'

    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not (isinstance(path, compat_str) and path):
        return None
    if re.match(URL_START_RE, path):
        return path

    if isinstance(base, bytes):
        base = base.decode('utf-8')
    if isinstance(base, compat_str) and re.match(URL_START_RE, base):
        return compat_urlparse.urljoin(base, path)
    return None
1775
1776
class HEADRequest(compat_urllib_request.Request):
    """ Request subclass whose get_method() always reports 'HEAD' """

    def get_method(self):
        return 'HEAD'
1780
1781
class PUTRequest(compat_urllib_request.Request):
    """ Request subclass whose get_method() always reports 'PUT' """

    def get_method(self):
        return 'PUT'
1785
1786
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to an int, returning default when that is not possible.

    v         -- value to convert; None and '' yield default
    scale     -- the converted value is floor-divided by this
    default   -- returned when v is missing or cannot be converted
    get_attr  -- if set, the attribute of that name is read from v first
                 (a missing attribute counts as None)
    invscale  -- the converted value is multiplied by this before scaling
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        # TypeError: v is of a type int() rejects (list, dict, ...)
        # OverflowError: v is an infinite float
        return default
1799
1800
def str_or_none(v, default=None):
    """ Return compat_str(v), or default when v is None """
    if v is None:
        return default
    return compat_str(v)
1803
1804
def str_to_int(int_str):
    """ Parse an integer after dropping every ',', '.' and '+' character;
        None is passed through """
    if int_str is None:
        return None
    separators = re.compile(r'[,\.\+]')
    return int(separators.sub('', int_str))
1811
1812
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to a float, multiply by invscale and divide by scale.

    Returns default when v is None or cannot be converted by float(). """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError: v is of a type float() rejects (list, dict, ...)
        return default
1820
1821
def strip_or_none(v):
    """ Return v.strip(), or None when v is None """
    if v is None:
        return None
    return v.strip()
1824
1825
def parse_duration(s):
    """ Parse a textual duration and return it as a number of seconds.

    Three notations are tried in order: colon-separated fields
    ([[[DD:]HH:]MM:]SS[.fraction]), unit-suffixed fields (days / hours /
    minutes / seconds, optionally prefixed by 'PT' or 'T'), and a single
    possibly fractional hour or minute count.  Returns None when s is not a
    string or matches none of them. """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    # Notation 1: colon-separated, e.g. '1:02:03.5'; only seconds are mandatory
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        # Notation 2: unit suffixes, e.g. '1h 2min 3s' or 'PT1H2M3S'.
        # Every component is optional, so this also matches an empty string
        # (the result is then 0).
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # Notation 3: one fractional amount, e.g. '1.5 hours' or '2.5 min'
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    # Sum up whichever components were captured; ms still carries its
    # leading dot, so float(ms) is the fractional part of a second
    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration
1872
1873
def prepend_extension(filename, ext, expected_real_ext=None):
    """ Insert ext in front of filename's real extension.

    When expected_real_ext is given and differs from the real extension,
    ext is appended to the complete filename instead. """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1880
1881
def replace_extension(filename, ext, expected_real_ext=None):
    """ Swap filename's real extension for ext.

    When expected_real_ext is given and differs from the real extension,
    ext is appended to the complete filename instead. """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
1887
1888
def check_executable(exe, args=[]):
    """ Try to run exe with args (a list of arguments that should produce a
    short output, like -version).  Returns exe if the process could be
    started and False if starting it raised OSError. """
    cmd = [exe] + args
    try:
        proc = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
1897
1898
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Run exe with args and extract a version from its combined
    stdout/stderr via detect_exe_version().  Returns False if the
    executable could not be started (OSError). """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        proc = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = proc.communicate()
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1916
1917
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Return the first group of version_re found in output, or
    unrecognized when nothing matches.  Without version_re, the text
    following the word 'version' is looked for. """
    assert isinstance(output, compat_str)
    pattern = version_re
    if pattern is None:
        pattern = r'version\s+([-0-9._a-zA-Z]+)'
    mobj = re.search(pattern, output)
    return mobj.group(1) if mobj else unrecognized
1927
1928
class PagedList(object):
    """ Base class for paged lists; len() relies on a getslice() method
    that this class itself does not define """

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1933
1934
class OnDemandPagedList(PagedList):
    """ Paged list that fetches pages one at a time through
    pagefunc(pagenum) and stops as soon as a page comes back short or the
    requested slice is complete """

    def __init__(self, pagefunc, pagesize, use_cache=False):
        # pagefunc: callable taking a page number and returning an iterable
        # pagesize: number of entries a full page holds
        # use_cache: keep fetched pages in a dict keyed by page number
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """ Return the entries with indices in [start, end) as a list;
        end=None means until the first page that is not full """
        res = []
        # Begin with the page that contains index start
        for pagenum in itertools.count(start // self._pagesize):
            # Global indices of this page's first entry and of the next
            # page's first entry
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset of start within this page (0 unless start lies in it)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Exclusive in-page end offset when end falls into this page
            # (or exactly on its upper boundary), else no upper limit
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1985
1986
class InAdvancePagedList(PagedList):
    """ Paged list for which the total number of pages is known up front """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with indices in [start, end) as a list;
        end=None means up to the last of the pagecount pages """
        first_page = start // self._pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start
        # Entries to drop from the front of the first fetched page
        to_skip = start - first_page * self._pagesize

        collected = []
        for pagenum in range(first_page, stop_page):
            entries = list(self._pagefunc(pagenum))
            if to_skip:
                entries = entries[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(entries) >= remaining:
                    # This page completes the requested slice
                    collected.extend(entries[:remaining])
                    break
                remaining -= len(entries)
            collected.extend(entries)
        return collected
2014
2015
def uppercase_escape(s):
    """ Replace every \\UXXXXXXXX escape sequence (8 hex digits) in s by the
        character it denotes """
    decode = codecs.getdecoder('unicode_escape')

    def _expand(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
2022
2023
def lowercase_escape(s):
    """ Replace every \\uXXXX escape sequence (4 hex digits) in s by the
        character it denotes """
    decode = codecs.getdecoder('unicode_escape')

    def _expand(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _expand, s)
2030
2031
def escape_rfc3986(s):
    """Percent-encode s, leaving the characters %/;:@&=+$,!~*'()?#[]
    untouched (escaping of non-ASCII characters as suggested by RFC 3986)"""
    # On Python 2 a unicode string is encoded to UTF-8 before quoting
    on_python2 = sys.version_info < (3, 0)
    if on_python2 and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2037
2038
def escape_url(url):
    """Escape URL as suggested by RFC 3986: the host part is IDNA-encoded,
    path, params, query and fragment go through escape_rfc3986()"""
    parts = compat_urllib_parse_urlparse(url)
    idna_netloc = parts.netloc.encode('idna').decode('ascii')
    escaped_parts = parts._replace(
        netloc=idna_netloc,
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment),
    )
    return escaped_parts.geturl()
2049
2050
def read_batch_urls(batch_fd):
    """ Read URLs, one per line, from the file-like object batch_fd.

    batch_fd is closed afterwards.  Lines are decoded as UTF-8 when they are
    not text already, a leading byte order mark is removed, surrounding
    whitespace is stripped, and empty lines as well as lines starting with
    '#', ';' or ']' are dropped.  Returns the list of remaining URLs. """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM turns into U+FEFF once the line has been decoded as
        # UTF-8; the three-character form only shows up when the raw bytes
        # were decoded one byte per character (e.g. as Latin-1).  Neither is
        # whitespace, so strip() below would not remove them.
        for bom in ('\ufeff', '\xef\xbb\xbf'):
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2065
2066
def urlencode_postdata(*args, **kargs):
    """ Forward all arguments to compat_urllib_parse_urlencode() and return
        the result encoded to ASCII bytes """
    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2069
2070
def update_url_query(url, query):
    """ Return url with its query string updated by the mapping query;
        existing parameters of the same name are overwritten.  An empty
        query leaves url untouched """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    params.update(query)
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2079
2080
def update_Request(req, url=None, data=None, headers={}, query={}):
    """ Build a new request from req with URL, data, headers and query
    parameters optionally replaced/extended.

    The request class is chosen from req's method (HEAD, PUT or the plain
    Request); origin_req_host, unverifiable and, when present, timeout are
    carried over. """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)

    request_types = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }
    req_type = request_types.get(
        req.get_method(), compat_urllib_request.Request)

    new_req = req_type(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2099
2100
def try_multipart_encode(data, boundary):
    """ Encode the dict data as multipart/form-data using boundary.

    Returns (body_bytes, content_type).  Raises ValueError when the boundary
    occurs inside one of the encoded parts. """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')
    delimiter = b'--' + boundary_bytes + b'\r\n'

    chunks = []
    for k, v in data.items():
        chunks.append(delimiter)
        if isinstance(k, compat_str):
            k = k.encode('utf-8')
        if isinstance(v, compat_str):
            v = v.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="%s"\r\n\r\n' % k + v + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        chunks.append(content)

    chunks.append(b'--' + boundary_bytes + b'--\r\n')

    return b''.join(chunks), content_type
2121
2122
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns the tuple (encoded_body, content_type). A ValueError from a
    caller-supplied boundary that collides with the data is propagated;
    randomly generated boundaries are retried until one fits.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    boundary_is_fixed = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))

        try:
            out, content_type = try_multipart_encode(data, boundary)
        except ValueError:
            if boundary_is_fixed:
                raise
            # Draw a fresh random boundary on the next iteration
            boundary = None
        else:
            return out, content_type
2151
2152
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """ Look up a key, or the first usable one of several keys, in d.

    For a list/tuple of keys, keys that are missing or map to None are
    skipped, as are falsy values unless skip_false_values is False; default
    is returned when no key qualifies.  A single key is a plain
    d.get(key, default). """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
2161
2162
def try_get(src, getter, expected_type=None):
    """ Apply getter (a callable, or a list/tuple of callables tried in
    order) to src and return the first result that is obtained without
    AttributeError, KeyError, TypeError or IndexError and, if expected_type
    is given, is an instance of it.  Returns None otherwise. """
    getters = getter if isinstance(getter, (list, tuple)) else [getter]
    for get in getters:
        try:
            value = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
    return None
2174
2175
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """ Return string unchanged if it already is a compat_str, otherwise
        build a compat_str from it with the given encoding and errors """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2178
2179
# Rating label -> number; parse_age_limit() returns these numbers as the
# age limit for a matching label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# TV rating label -> number; consulted by parse_age_limit() after US_RATINGS
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2197
2198
def parse_age_limit(s):
    """ Turn an age limit given as an int (accepted only within 0..21), a
    string like '18' or '18+', or a label from US_RATINGS /
    TV_PARENTAL_GUIDELINES into a number.  Returns None for anything
    else. """
    if type(s) == int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    try:
        return US_RATINGS[s]
    except KeyError:
        return TV_PARENTAL_GUIDELINES.get(s)
2210
2211
def strip_jsonp(code):
    """ Remove a JSONP wrapper of the form callback(...) - optionally
        followed by ';' and '//' comments - and return its argument text;
        input that does not match is returned unchanged """
    jsonp_wrapper = re.compile(
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$')
    return jsonp_wrapper.sub(r'\1', code)
2215
2216
def js_to_json(code):
    """ Rewrite a JavaScript object literal so that it can be parsed as JSON.

    The code is scanned token by token with one regular expression; every
    matched token is rewritten by fix_kv() below, everything else is left
    as it is. """
    # /* ... */ block comments and // line comments
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    # Optional whitespace around at most one comment
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    # (pattern, base) pairs for hexadecimal and octal integer literals,
    # optionally followed by ':' (i.e. used as an object key)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        # Return the JSON replacement for one matched token
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v.startswith('//') or v == ',':
            # Comments and trailing commas are dropped
            return ""

        if v[0] in ("'", '"'):
            # Quoted string: drop the quotes and normalise the escapes
            # (escape bare double quotes, unescape \', remove escaped
            # newlines, turn \x into \u00); the result is re-quoted with
            # double quotes by the final return
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # Keys become quoted decimal strings, values plain decimals
                return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    # Alternatives, in order: double-quoted string, single-quoted string,
    # comment, comma directly before a closing ] or }, identifier,
    # hex/octal integer (optionally a key), decimal integer used as a key
    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
2256
2257
def qualities(quality_ids):
    """ Build a function that maps a quality id to its position in
        quality_ids, or to -1 for ids that are not listed """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
2266
2267
# %-style template with title, id and ext fields; presumably the default
# output filename template (its consumers are outside this part of the file)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2269
2270
def limit_length(s, length):
    """ Shorten s to at most length characters, ending in '...' when it had
        to be cut; None is passed through """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    keep = length - len(ELLIPSES)
    return s[:keep] + ELLIPSES
2279
2280
def version_tuple(v):
    """ Split a version string at '.' and '-' and return the parts as a
        tuple of ints (ValueError for non-numeric parts) """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2283
2284
def is_outdated_version(version, limit, assume_new=True):
    """ Tell whether version is older than limit.

    When version is empty or one of the two cannot be parsed by
    version_tuple(), the answer is `not assume_new`. """
    if not version:
        return not assume_new
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
    return outdated
2292
2293
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U, i.e. whether this
        module was loaded by a zipimporter or the interpreter is frozen """
    from zipimport import zipimporter

    module_loader = globals().get('__loader__')
    if isinstance(module_loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
2299
2300
def args_to_str(args):
    """ Return the arguments, each passed through compat_shlex_quote(),
        joined by single spaces """
    # Get a short string representation for a subprocess command
    quoted_args = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted_args)
2304
2305
def error_to_compat_str(err):
    """ Return str(err) as text; on Python 2 the byte string is decoded
        with the preferred encoding rather than ascii """
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    return message.decode(preferredencoding())
2313
2314
def mimetype2ext(mt):
    """ Map a MIME type to a file extension.

    A few complete MIME types are mapped directly; otherwise the subtype
    (the text after the last '/', without parameters, lower-cased) is looked
    up in a table and returned as is when it is not listed.  None is passed
    through. """
    if mt is None:
        return None

    FULL_TYPE_EXTS = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }
    SUBTYPE_EXTS = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
    }

    direct = FULL_TYPE_EXTS.get(mt)
    if direct is not None:
        return direct

    subtype = mt.rpartition('/')[2]
    subtype = subtype.split(';')[0].strip().lower()
    return SUBTYPE_EXTS.get(subtype, subtype)
2349
2350
def parse_codecs(codecs_str):
    """ Split a comma-separated codecs string (RFC 6381) into video and audio.

    Returns a dict with the keys 'vcodec' and 'acodec' ('none' marks an
    absent stream), or an empty dict when codecs_str is empty or cannot be
    interpreted.  The first recognised video and audio codec win; a warning
    is written to stderr for every unrecognised codec. """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    splited_codecs = [
        part.strip()
        for part in codecs_str.strip().strip(',').split(',')]
    splited_codecs = [part for part in splited_codecs if part]
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    # Nothing was recognised, so vcodec and acodec are both still None here;
    # fall back to the position of the entries in the original string.
    if len(splited_codecs) == 2:
        return {
            'vcodec': splited_codecs[0],
            'acodec': splited_codecs[1],
        }
    elif len(splited_codecs) == 1:
        # A lone unknown codec is reported as the audio codec
        return {
            'vcodec': 'none',
            'acodec': splited_codecs[0],
        }
    return {}
2385
2386
def urlhandle_detect_ext(url_handle):
    """ Guess a file extension from a response's headers: first from the
        filename of an attachment Content-Disposition, then from the
        Content-Type """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        mobj = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
2399
2400
def encode_data_uri(data, mime_type):
    """ Return a base64 'data:' URI carrying the bytes data with the given
        MIME type """
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2403
2404
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked, i.e. both limits
        are set and age_limit is below content_limit """

    # No limit set by the user, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2413
2414
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    The bytes are decoded according to a leading byte order mark (UTF-8
    otherwise); the result is the match object of optional whitespace
    followed by '<', or None. """

    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    encoding, payload = 'utf-8', first_bytes
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, payload = enc, first_bytes[len(bom):]
            break

    text = payload.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
2433
2434
def determine_protocol(info_dict):
    """ Return the 'protocol' entry of info_dict if it is set; otherwise
    derive a protocol from the 'url' entry: its rtmp/mms/rtsp prefix, its
    m3u8/f4m extension or, failing that, its URL scheme. """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    return compat_urllib_parse_urlparse(url).scheme
2455
2456
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values, as text with
        left-aligned columns; header_row becomes the first line """
    table = [header_row] + data
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*table)]
    # Every column but the last is padded to its widest cell plus one
    padded_specs = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    format_str = ' '.join(padded_specs) + '%s'
    lines = [format_str % tuple(row) for row in table]
    return '\n'.join(lines)
2463
2464
def _match_one(filter_part, dct):
    """ Evaluate a single filter expression against the dict dct.

    filter_part is either a comparison ('key OP value', with an optional '?'
    after the operator) or a presence test ('key' / '!key').  Returns the
    result of the test; raises ValueError for an expression that cannot be
    parsed, for a non-equality operator applied to a string value and for an
    invalid numeric value. """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    # key, operator, optional '?', then a number (with optional size suffix),
    # a quoted string or a bare alphanumeric string
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if (m.group('quotedstrval') is not None or
            m.group('strval') is not None or
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/rg3/youtube-dl/issues/11082).
            actual_value is not None and m.group('intval') is not None and
                isinstance(actual_value, compat_str)):
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
            quote = m.group('quote')
            if quote is not None:
                # Unescape the quote character inside a quoted value
                comparison_value = comparison_value.replace(r'\%s' % quote, quote)
        else:
            # Numeric comparison: plain integer first, then a file size
            # with and without an appended 'B'
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        if actual_value is None:
            # Missing field: the result is the matched '?' text (truthy) if
            # present, otherwise None
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
2533
2534
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax: filter_str is split
    at '&' and every part must hold for dct. Returns True (=passes filter)
    or False """

    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2540
2541
def match_filter_func(filter_str):
    """ Build a function that takes an info dict and returns None when it
        passes filter_str, or a message saying that it is being skipped """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        fallback_title = info_dict.get('id', 'video')
        video_title = info_dict.get('title', fallback_title)
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2550
2551
def parse_dfxp_time_expr(time_expr):
    """ Convert a time expression - either an offset in seconds with an
    optional 's' suffix or a clock time H:MM:SS with an optional fraction
    after '.' or ':' - to seconds as a float.  Returns None for empty or
    unrecognised input. """
    if not time_expr:
        return

    offset_match = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, seconds = clock_match.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
2563
2564
def srt_subtitles_timecode(seconds):
    """ Format a number of seconds as an HH:MM:SS,mmm timecode """
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2567
2568
2569 def dfxp2srt(dfxp_data):
2570     LEGACY_NAMESPACES = (
2571         ('http://www.w3.org/ns/ttml', [
2572             'http://www.w3.org/2004/11/ttaf1',
2573             'http://www.w3.org/2006/04/ttaf1',
2574             'http://www.w3.org/2006/10/ttaf1',
2575         ]),
2576         ('http://www.w3.org/ns/ttml#styling', [
2577             'http://www.w3.org/ns/ttml#style',
2578         ]),
2579     )
2580
2581     SUPPORTED_STYLING = [
2582         'color',
2583         'fontFamily',
2584         'fontSize',
2585         'fontStyle',
2586         'fontWeight',
2587         'textDecoration'
2588     ]
2589
2590     _x = functools.partial(xpath_with_ns, ns_map={
2591         'ttml': 'http://www.w3.org/ns/ttml',
2592         'tts': 'http://www.w3.org/ns/ttml#styling',
2593     })
2594
2595     styles = {}
2596     default_style = {}
2597
2598     class TTMLPElementParser(object):
2599         _out = ''
2600         _unclosed_elements = []
2601         _applied_styles = []
2602
2603         def start(self, tag, attrib):
2604             if tag in (_x('ttml:br'), 'br'):
2605                 self._out += '\n'
2606             else:
2607                 unclosed_elements = []
2608                 style = {}
2609                 element_style_id = attrib.get('style')
2610                 if default_style:
2611                     style.update(default_style)
2612                 if element_style_id:
2613                     style.update(styles.get(element_style_id, {}))
2614                 for prop in SUPPORTED_STYLING:
2615                     prop_val = attrib.get(_x('tts:' + prop))
2616                     if prop_val:
2617                         style[prop] = prop_val
2618                 if style:
2619                     font = ''
2620                     for k, v in sorted(style.items()):
2621                         if self._applied_styles and self._applied_styles[-1].get(k) == v:
2622                             continue
2623                         if k == 'color':
2624                             font += ' color="%s"' % v
2625                         elif k == 'fontSize':
2626                             font += ' size="%s"' % v
2627                         elif k == 'fontFamily':
2628                             font += ' face="%s"' % v
2629                         elif k == 'fontWeight' and v == 'bold':
2630                             self._out += '<b>'
2631                             unclosed_elements.append('b')
2632                         elif k == 'fontStyle' and v == 'italic':
2633                             self._out += '<i>'
2634                             unclosed_elements.append('i')
2635                         elif k == 'textDecoration' and v == 'underline':
2636                             self._out += '<u>'
2637                             unclosed_elements.append('u')
2638                     if font:
2639                         self._out += '<font' + font + '>'
2640                         unclosed_elements.append('font')
2641                     applied_style = {}
2642                     if self._applied_styles:
2643                         applied_style.update(self._applied_styles[-1])
2644                     applied_style.update(style)
2645                     self._applied_styles.append(applied_style)
2646                 self._unclosed_elements.append(unclosed_elements)
2647
2648         def end(self, tag):
2649             if tag not in (_x('ttml:br'), 'br'):
2650                 unclosed_elements = self._unclosed_elements.pop()
2651                 for element in reversed(unclosed_elements):
2652                     self._out += '</%s>' % element
2653                 if unclosed_elements and self._applied_styles:
2654                     self._applied_styles.pop()
2655
2656         def data(self, data):
2657             self._out += data
2658
2659         def close(self):
2660             return self._out.strip()
2661
2662     def parse_node(node):
2663         target = TTMLPElementParser()
2664         parser = xml.etree.ElementTree.XMLParser(target=target)
2665         parser.feed(xml.etree.ElementTree.tostring(node))
2666         return parser.close()
2667
2668     for k, v in LEGACY_NAMESPACES:
2669         for ns in v:
2670             dfxp_data = dfxp_data.replace(ns, k)
2671
2672     dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
2673     out = []
2674     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
2675
2676     if not paras:
2677         raise ValueError('Invalid dfxp/TTML subtitle')
2678
2679     repeat = False
2680     while True:
2681         for style in dfxp.findall(_x('.//ttml:style')):
2682             style_id = style.get('id')
2683             parent_style_id = style.get('style')
2684             if parent_style_id:
2685                 if parent_style_id not in styles:
2686                     repeat = True
2687                     continue
2688                 styles[style_id] = styles[parent_style_id].copy()
2689             for prop in SUPPORTED_STYLING:
2690                 prop_val = style.get(_x('tts:' + prop))
2691                 if prop_val:
2692                     styles.setdefault(style_id, {})[prop] = prop_val
2693         if repeat:
2694             repeat = False
2695         else:
2696             break
2697
2698     for p in ('body', 'div'):
2699         ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
2700         if ele is None:
2701             continue
2702         style = styles.get(ele.get('style'))
2703         if not style:
2704             continue
2705         default_style.update(style)
2706
2707     for para, index in zip(paras, itertools.count(1)):
2708         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
2709         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
2710         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2711         if begin_time is None:
2712             continue
2713         if not end_time:
2714             if not dur:
2715                 continue
2716             end_time = begin_time + dur
2717         out.append('%d\n%s --> %s\n%s\n\n' % (
2718             index,
2719             srt_subtitles_timecode(begin_time),
2720             srt_subtitles_timecode(end_time),
2721             parse_node(para)))
2722
2723     return ''.join(out)
2724
2725
def cli_option(params, command_option, param):
    """Build the argument list for a command-line option that takes a value.

    params is a mapping of option names to values, command_option is the
    switch to emit (e.g. '--proxy') and param is the key looked up in params.

    Returns [command_option, value] with the value converted to a string, or
    an empty list when the key is missing or its value is None.
    """
    value = params.get(param)
    if value is None:
        return []
    # Stringify every non-None value, including falsy ones such as 0 or
    # False, so the resulting list only ever contains strings.
    return [command_option, compat_str(value)]
2731
2732
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the argument list for a command-line option carrying a boolean.

    The boolean stored under params[param] is rendered as true_value or
    false_value. With a separator the switch and the rendered value are
    joined into a single argument (e.g. '--flag=true'); otherwise they are
    returned as two separate arguments.

    Returns an empty list when the option is missing or None, so an unset
    option simply contributes nothing to the command line.
    """
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
2739
2740
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit a bare switch when an option holds the expected value.

    Returns [command_option] if params.get(param) compares equal to
    expected_value, and an empty list otherwise.
    """
    if params.get(param) == expected_value:
        return [command_option]
    return []
2744
2745
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra command-line arguments stored in params.

    params is a mapping of option names to values and param the key to look
    up. When the key is missing or None, default is returned instead; a list
    default is returned as a fresh copy.

    Raises AssertionError if the stored value is not a list.
    """
    ex_args = params.get(param)
    if ex_args is None:
        # The default in the signature is one list object shared by every
        # call; hand out a copy so that a caller extending the result cannot
        # leak arguments into later calls.
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2752
2753
class ISO639Utils(object):
    """Conversions between two-letter and three-letter language codes."""

    # Two-letter code -> three-letter code.
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are used for the lookup;
        None is returned when they are not in the table.
        """
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        Returns None when code is not one of the table's values.
        """
        # Reverse lookup by scanning the table for the first matching value.
        return next(
            (two for two, three in cls._lang_map.items() if three == code),
            None)
2954
2955
class ISO3166Utils(object):
    """Lookup of country names by two-letter country code."""

    # Two-letter (ISO 3166-1 alpha-2) code -> English country name.
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter country code (ISO 3166-1 alpha-2) to the full country name

        The lookup is case-insensitive; None is returned for unknown codes.
        """
        key = code.upper()
        return cls._country_map.get(key)
3214
3215
class GeoUtils(object):
    """Pick random IPv4 addresses from a per-country address block."""

    # Country code -> one CIDR block ('address/prefix length').
    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '85.94.160.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '159.117.192.0/21',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '84.112.0.0/13',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AZ': '5.191.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '129.45.128.0/17',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '192.131.134.0/24',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '152.240.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '196.32.200.0/21',
        'CG': '197.214.128.0/17',
        'CH': '85.0.0.0/13',
        'CI': '154.232.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '165.210.0.0/15',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '46.198.0.0/15',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '45.208.0.0/14',
        'GI': '85.115.128.0/19',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '126.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.32.32.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '192.147.231.0/24',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '41.86.0.0/19',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '105.234.0.0/16',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '139.26.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '178.220.0.0/14',
        'RU': '5.136.0.0/13',
        'RW': '105.178.0.0/15',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '154.96.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '152.56.0.0/14',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '197.215.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '197.220.64.0/19',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '93.72.0.0/13',
        'UG': '154.224.0.0/13',
        'US': '3.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '82.215.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '24.92.144.0/20',
        'VE': '186.88.0.0/13',
        'VG': '172.103.64.0/18',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '165.56.0.0/13',
        'ZW': '41.85.192.0/19',
    }

    @classmethod
    def random_ipv4(cls, code):
        """Return a random dotted-quad address from the block mapped to code

        The country code is looked up case-insensitively; None is returned
        when no block is known for it.
        """
        cidr = cls._country_ip_map.get(code.upper())
        if not cidr:
            return None
        network, prefix_len = cidr.split('/')
        lowest, = compat_struct_unpack('!L', socket.inet_aton(network))
        # Setting every host bit gives the highest address of the block.
        host_mask = 0xffffffff >> int(prefix_len)
        chosen = random.randint(lowest, lowest | host_mask)
        packed = compat_struct_pack('!L', chosen)
        return compat_str(socket.inet_ntoa(packed))
3468
3469
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets each request choose its own proxy.

    A request may carry a 'Ytdl-request-proxy' header naming the proxy to
    use (or '__noproxy__' for none); the header is removed before the
    request is processed further.
    """

    def __init__(self, proxies=None):
        # Install fall-back http/https openers first; the base class
        # constructor then replaces them for every scheme it has a proxy for.
        for scheme in ('http', 'https'):
            def default_open(r, proxy='__noproxy__', type=scheme, meth=self.proxy_open):
                return meth(r, proxy, type)
            setattr(self, scheme + '_open', default_open)
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A proxy named by the request itself overrides the configured one.
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # Only record the SOCKS proxy on the request; youtube-dl's
            # http/https handlers wrap the socket with socks themselves.
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
3493
3494
3495 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3496 # released into Public Domain
3497 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3498
def long_to_bytes(n, blocksize=0):
    """Convert an integer to its big-endian byte string.

    The result has no leading zero bytes; zero (and any non-positive n)
    yields a single zero byte.

    If blocksize is given and greater than zero, the front of the byte
    string is padded with zero bytes so that its length becomes a multiple
    of blocksize.
    """
    n = int(n)
    # Collect 32-bit words, least significant first.
    words = []
    while n > 0:
        words.append(compat_struct_pack('>I', n & 0xffffffff))
        n >>= 32
    # The top word may bring up to three leading zero bytes; drop them but
    # keep at least one byte.
    s = b''.join(reversed(words)).lstrip(b'\000') or b'\000'
    remainder = len(s) % blocksize if blocksize > 0 else 0
    if remainder:
        s = b'\000' * (blocksize - remainder) + s
    return s
3527
3528
def bytes_to_long(s):
    """Convert a big-endian byte string to an integer.

    This is (essentially) the inverse of long_to_bytes(). An empty string
    yields 0.
    """
    # Left-pad with zero bytes to a whole number of 32-bit words.
    pad = -len(s) % 4
    if pad:
        s = b'\000' * pad + s
    acc = 0
    for offset in range(0, len(s), 4):
        word, = compat_struct_unpack('>I', s[offset:offset + 4])
        acc = (acc << 32) | word
    return acc
3544
3545
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt one block with OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The bytes are reversed before being read as one hexadecimal number.
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    return '%x' % pow(payload, exponent, modulus)
3561
3562
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout [0, 2] + padding + [0] + data, where padding
    consists of random non-zero octets.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data is longer than length - 11
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The padding octets must all be non-zero: the first zero octet after
    # the [0, 2] header marks the end of the padding, so a zero inside it
    # would make the receiver split the message at the wrong place.
    # NOTE(review): random is not a cryptographically secure generator.
    padding = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + padding + [0] + data
3576
3577
def encode_base_n(num, n, table=None):
    """
    Convert a non-negative integer to its base-n string representation

    @param num    integer to encode, must not be negative
    @param n      base to encode in
    @param table  optional sequence of digit symbols; defaults to the first
                  n characters of 0-9, a-z, A-Z
    @returns      string of digit symbols, most significant first
    @raises ValueError  if n exceeds the table length, if num is negative,
                        or if n is below 2 for a non-zero num
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    # Floor division never brings a negative number to zero, so the digit
    # loop below would spin forever
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)

    if num == 0:
        return table[0]

    # With a base below 2 the quotient never shrinks towards zero either
    if n < 2:
        raise ValueError('base %d is too small to encode %d' % (n, num))

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    # Digits were produced least significant first
    return ''.join(reversed(digits))
3594
3595
def decode_packed_codes(code):
    """
    Expand packed code by substituting its base-n tokens with real symbols

    PACKED_CODES_RE must yield four groups: the packed source, the base,
    the number of symbols and the '|'-separated symbol list.
    Every word of the packed source is looked up in the symbol table.
    """
    match = re.search(PACKED_CODES_RE, code)
    packed_source, radix, remaining, words = match.groups()
    radix = int(radix)
    remaining = int(remaining)
    words = words.split('|')

    lookup = {}
    while remaining:
        remaining -= 1
        token = encode_base_n(remaining, radix)
        # An empty symbol means that the token stands for itself
        lookup[token] = words[remaining] or token

    def _substitute(word_match):
        return lookup[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', _substitute, packed_source)
3612
3613
def parse_m3u8_attributes(attrib):
    """
    Parse a comma separated KEY=VALUE attribute list into a dict

    Surrounding double quotes of quoted values are removed. When a key
    occurs more than once the last value wins.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return dict(
        (name, raw[1:-1] if raw.startswith('"') else raw)
        for name, raw in pairs)
3621
3622
def urshift(val, n):
    """
    Shift val right by n bits without sign extension

    A negative val is offset by 2**32 first, i.e. it is treated as the
    unsigned 32-bit number with the same bit pattern.
    """
    if val < 0:
        val += 0x100000000
    return val >> n
3625
3626
3627 # Based on png2str() written by @gdkchan and improved by @yokrysty
3628 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """
    Decode a PNG image into rows of unfiltered colour bytes

    Scanlines are always read as 3 bytes per pixel; the bit depth and
    colour type fields of IHDR as well as the chunk CRCs are not inspected.

    @param png_data  raw content of the PNG file
    @returns         tuple (width, height, pixels) where pixels holds one
                     list of width * 3 integers per row
    @raises IOError  if the signature or leading IHDR chunk is missing or
                     the file carries no IDAT data
    """
    # Reference: https://www.w3.org/TR/PNG/
    signature = png_data[:8]
    body = png_data[8:]

    if signature != b'\x89PNG\x0d\x0a\x1a\x0a' or body[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}

    def unpack_integer(raw):
        return compat_struct_unpack(int_map[len(raw)], raw)[0]

    # Walk the chunk stream: 4 byte length, 4 byte type, payload, 4 byte CRC
    chunks = []
    pos = 0
    while pos < len(body):
        length = unpack_integer(body[pos:pos + 4])
        chunk_type = body[pos + 4:pos + 8]
        chunk_data = body[pos + 8:pos + 8 + length]
        chunks.append((chunk_type, chunk_data))
        pos += 12 + length  # length + type + payload + CRC (not verified)

    ihdr = chunks[0][1]

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # The compressed stream may be spread over several IDAT chunks
    idat = b''
    for chunk_type, chunk_data in chunks:
        if chunk_type == b'IDAT':
            idat += chunk_data

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []
    previous_row = None

    for row_index in range(height):
        # Every scanline is preceded by a single filter type byte
        row_start = row_index * (1 + stride)
        filter_type = decompressed_data[row_start]

        row = []
        pixels.append(row)

        for col in range(stride):
            color = decompressed_data[row_start + 1 + col]
            has_left = col > 2
            has_up = row_index > 0

            # Neighbours outside of the image count as zero
            left = row[col - 3] if has_left else 0
            up = previous_row[col] if has_up else 0

            if filter_type == 1:  # Sub
                predictor = left
            elif filter_type == 2:  # Up
                predictor = up
            elif filter_type == 3:  # Average
                predictor = (left + up) >> 1
            elif filter_type == 4:  # Paeth
                upper_left = previous_row[col - 3] if has_left and has_up else 0

                estimate = left + up - upper_left

                dist_left = abs(estimate - left)
                dist_up = abs(estimate - up)
                dist_upper_left = abs(estimate - upper_left)

                if dist_left <= dist_up and dist_left <= dist_upper_left:
                    predictor = left
                elif dist_up <= dist_upper_left:
                    predictor = up
                else:
                    predictor = upper_left
            else:  # any other filter type leaves the byte untouched
                predictor = 0

            row.append((color + predictor) & 0xff)

        previous_row = row

    return width, height, pixels
3732
3733
def write_xattr(path, key, value):
    """
    Set the extended attribute key of the file at path to value

    The python xattr module is used when it can be imported. Otherwise the
    value is written to an NTFS Alternate Data Stream on Windows, or handed
    to the setfattr / xattr command line tools elsewhere.

    @param path   file to set the attribute on
    @param key    name of the attribute
    @param value  attribute value as bytes (decoded as UTF-8 for the
                  command line tools)
    @raises XAttrMetadataError     if setting the attribute fails
    @raises XAttrUnavailableError  if no usable xattr implementation exists
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            try:
                with open(path + ':' + key, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
            return

        user_has_setfattr = check_executable('setfattr', ['--version'])
        user_has_xattr = check_executable('xattr', ['-h'])

        if not (user_has_setfattr or user_has_xattr):
            # On Unix, and can't find pyxattr, setfattr, or xattr.
            if sys.platform.startswith('linux'):
                raise XAttrUnavailableError(
                    "Couldn't find a tool to set the xattrs. "
                    "Install either the python 'pyxattr' or 'xattr' "
                    "modules, or the GNU 'attr' package "
                    "(which contains the 'setfattr' tool).")
            raise XAttrUnavailableError(
                "Couldn't find a tool to set the xattrs. "
                "Install either the python 'xattr' module, "
                "or the 'xattr' binary.")

        value = value.decode('utf-8')
        # setfattr is preferred when both tools are present
        if user_has_setfattr:
            executable, opts = 'setfattr', ['-n', key, '-v', value]
        else:
            executable, opts = 'xattr', ['-w', key, value]

        cmd = [encodeFilename(executable, True)]
        cmd.extend(encodeArgument(o) for o in opts)
        cmd.append(encodeFilename(path, True))

        try:
            p = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        stdout, stderr = p.communicate()
        stderr = stderr.decode('utf-8', 'replace')
        if p.returncode != 0:
            raise XAttrMetadataError(p.returncode, stderr)
3816
3817
def cookie_to_dict(cookie):
    """
    Convert a cookie object into a plain dict

    'name' and 'value' are always present. 'port', 'domain' and 'path' are
    only copied when the matching *_specified flag of the cookie is set,
    'expires', 'secure' and 'discard' only when they are not None.
    'httponly' is set to True when the cookie carries a nonstandard
    httpOnly attribute in any of the probed spellings.

    @param cookie  object exposing the attributes read below and a
                   has_nonstandard_attr() method
    @returns       dict with the fields described above
    """
    cookie_dict = {
        'name': cookie.name,
        'value': cookie.value,
    }
    if cookie.port_specified:
        cookie_dict['port'] = cookie.port
    if cookie.domain_specified:
        cookie_dict['domain'] = cookie.domain
    if cookie.path_specified:
        cookie_dict['path'] = cookie.path
    for attr in ('expires', 'secure', 'discard'):
        attr_value = getattr(cookie, attr)
        if attr_value is not None:
            cookie_dict[attr] = attr_value
    try:
        if any(cookie.has_nonstandard_attr(spelling)
               for spelling in ('httpOnly', 'httponly', 'HttpOnly')):
            cookie_dict['httponly'] = True
    except TypeError:
        # Deliberately ignored: without usable nonstandard attributes the
        # cookie is simply not reported as httponly
        pass
    return cookie_dict
3843
3844
def cookie_jar_to_list(cookie_jar):
    """Return a list with the cookie_to_dict() form of every cookie in cookie_jar"""
    converted = []
    for cookie in cookie_jar:
        converted.append(cookie_to_dict(cookie))
    return converted
3847
3848
class PhantomJSwrapper(object):
    """PhantomJS wrapper class

    Renders a page with the PhantomJS executable: a script built from
    _TEMPLATE loads the HTML from a temporary file, runs the given JS and
    writes the resulting page content and cookies back to temporary files.
    """

    # JS skeleton filled in with str.format(); literal braces are doubled
    _TEMPLATE = r'''
        phantom.onError = function(msg, trace) {{
          var msgStack = ['PHANTOM ERROR: ' + msg];
          if(trace && trace.length) {{
            msgStack.push('TRACE:');
            trace.forEach(function(t) {{
              msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line
                + (t.function ? ' (in function ' + t.function +')' : ''));
            }});
          }}
          console.error(msgStack.join('\n'));
          phantom.exit(1);
        }};
        var page = require('webpage').create();
        var fs = require('fs');
        var read = {{ mode: 'r', charset: 'utf-8' }};
        var write = {{ mode: 'w', charset: 'utf-8' }};
        JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{
          phantom.addCookie(x);
        }});
        page.settings.resourceTimeout = {timeout};
        page.settings.userAgent = "{ua}";
        page.onLoadStarted = function() {{
          page.evaluate(function() {{
            delete window._phantom;
            delete window.callPhantom;
          }});
        }};
        var saveAndExit = function() {{
          fs.write("{html}", page.content, write);
          fs.write("{cookies}", JSON.stringify(phantom.cookies), write);
          phantom.exit();
        }};
        page.onLoadFinished = function(status) {{
          if(page.url === "") {{
            page.setContent(fs.read("{html}", read), "{url}");
          }}
          else {{
            {jscode}
          }}
        }};
        page.open("");
    '''

    # One temporary file is created per name
    _TMP_FILE_NAMES = ['script', 'html', 'cookies']

    @staticmethod
    def _js_quote(value):
        """Escape value for use inside a double quoted JS string literal"""
        return value.replace('\\', '\\\\').replace('"', '\\"')

    def __init__(self, extractor, required_version=None, timeout=10000):
        """
        Params:
            extractor: extractor used for downloading, reporting and
                       cookie handling
            required_version: optional, a warning is reported when the
                              installed PhantomJS is older than this
            timeout: value for page.settings.resourceTimeout

        Raises ExtractorError if no phantomjs executable is found.
        """
        self.exe = check_executable('phantomjs', ['-v'])
        if not self.exe:
            raise ExtractorError('PhantomJS executable not found in PATH, '
                                 'download it from http://phantomjs.org',
                                 expected=True)

        self.extractor = extractor

        if required_version:
            version = get_exe_version(self.exe, version_re=r'([0-9.]+)')
            if is_outdated_version(version, required_version):
                self.extractor._downloader.report_warning(
                    'Your copy of PhantomJS is outdated, update it to version '
                    '%s or newer if you encounter any errors.' % required_version)

        self.options = {
            'timeout': timeout,
        }
        self._TMP_FILES = {}
        for name in self._TMP_FILE_NAMES:
            # Only the path is needed; the files are reopened on demand and
            # removed in __del__
            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.close()
            self._TMP_FILES[name] = tmp

    def __del__(self):
        for name in self._TMP_FILE_NAMES:
            try:
                os.remove(self._TMP_FILES[name].name)
            except Exception:
                # Best-effort cleanup: the file may be gone already, or
                # __init__ may have failed before _TMP_FILES was populated
                pass

    def _save_cookies(self, url):
        """Dump the downloader's cookie jar as JSON into the cookies file"""
        cookies = cookie_jar_to_list(self.extractor._downloader.cookiejar)
        for cookie in cookies:
            # Fill in defaults for fields the jar did not specify
            if 'path' not in cookie:
                cookie['path'] = '/'
            if 'domain' not in cookie:
                cookie['domain'] = compat_urlparse.urlparse(url).netloc
        with open(self._TMP_FILES['cookies'].name, 'wb') as f:
            f.write(json.dumps(cookies).encode('utf-8'))

    def _load_cookies(self):
        """Hand the cookies written by the script to the extractor"""
        with open(self._TMP_FILES['cookies'].name, 'rb') as f:
            cookies = json.loads(f.read().decode('utf-8'))
        for cookie in cookies:
            # A cookie without the key is treated like a non-httponly one
            if cookie.get('httponly') is True:
                cookie['rest'] = {'httpOnly': None}
            if 'expiry' in cookie:
                cookie['expire_time'] = cookie['expiry']
            self.extractor._set_cookie(**cookie)

    def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers=None, jscode='saveAndExit();'):
        """
        Downloads webpage (if needed) and executes JS

        Params:
            url: website url
            html: optional, html code of website
            video_id: video id
            note: optional, displayed when downloading webpage
            note2: optional, displayed when executing JS
            headers: custom http headers
            jscode: code to be executed when page is loaded

        Returns tuple with:
            * downloaded website (after JS execution)
            * anything you print with `console.log` (but not inside `page.execute`!)

        Raises ExtractorError if `jscode` lacks `saveAndExit();` or if
        PhantomJS exits with a non-zero status.

        In most cases you don't need to add any `jscode`.
        It is executed in `page.onLoadFinished`.
        `saveAndExit();` is mandatory, use it instead of `phantom.exit()`
        It is possible to wait for some element on the webpage, for example:
            var check = function() {
              var elementFound = page.evaluate(function() {
                return document.querySelector('#b.done') !== null;
              });
              if(elementFound)
                saveAndExit();
              else
                window.setTimeout(check, 500);
            }

            page.evaluate(function(){
              document.querySelector('#a').click();
            });
            check();
        """
        if headers is None:
            headers = {}
        if 'saveAndExit();' not in jscode:
            raise ExtractorError('`saveAndExit();` not found in `jscode`')
        if not html:
            html = self.extractor._download_webpage(url, video_id, note=note, headers=headers)
        with open(self._TMP_FILES['html'].name, 'wb') as f:
            f.write(html.encode('utf-8'))

        self._save_cookies(url)

        # Work on a copy so that per-call values do not leak into the
        # options shared by all calls on this instance
        replaces = dict(self.options)
        # Everything placed between double quotes in the template must not
        # be able to terminate the JS string literal
        replaces['url'] = self._js_quote(url)
        user_agent = headers.get('User-Agent') or std_headers['User-Agent']
        replaces['ua'] = self._js_quote(user_agent)
        replaces['jscode'] = jscode

        for x in self._TMP_FILE_NAMES:
            replaces[x] = self._js_quote(self._TMP_FILES[x].name)

        with open(self._TMP_FILES['script'].name, 'wb') as f:
            f.write(self._TEMPLATE.format(**replaces).encode('utf-8'))

        if video_id is None:
            self.extractor.to_screen('%s' % (note2,))
        else:
            self.extractor.to_screen('%s: %s' % (video_id, note2))

        p = subprocess.Popen(
            [self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = p.communicate()
        if p.returncode != 0:
            raise ExtractorError('Executing JS failed\n:'
                                 + encodeArgument(err))
        # saveAndExit() has overwritten the html file with the final page
        with open(self._TMP_FILES['html'].name, 'rb') as f:
            html = f.read().decode('utf-8')

        self._load_cookies()

        return (html, encodeArgument(out))
4025
4026
def random_birthday(year_field, month_field, day_field):
    """
    Generate a random, valid date of birth between 1950 and 1995

    @param year_field   key to store the year under
    @param month_field  key to store the month under
    @param day_field    key to store the day of month under
    @returns            dict mapping the three keys to decimal strings
    """
    year = random.randint(1950, 1995)
    month = random.randint(1, 12)
    # Cap the day at the real length of the month so that impossible dates
    # such as 31 February are never produced
    day = random.randint(1, calendar.monthrange(year, month)[1])
    return {
        year_field: str(year),
        month_field: str(month),
        day_field: str(day),
    }