_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import contextlib
  11 import ctypes
  12 import datetime
  13 import email.utils
  14 import errno
  15 import functools
  16 import gzip
  17 import io
  18 import itertools
  19 import json
  20 import locale
  21 import math
  22 import operator
  23 import os
  24 import pipes
  25 import platform
  26 import re
  27 import socket
  28 import ssl
  29 import subprocess
  30 import sys
  31 import tempfile
  32 import traceback
  33 import xml.etree.ElementTree
  34 import zlib
  35
  36 from .compat import (
  37     compat_HTMLParser,
  38     compat_basestring,
  39     compat_chr,
  40     compat_etree_fromstring,
  41     compat_html_entities,
  42     compat_html_entities_html5,
  43     compat_http_client,
  44     compat_kwargs,
  45     compat_parse_qs,
  46     compat_shlex_quote,
  47     compat_socket_create_connection,
  48     compat_str,
  49     compat_struct_pack,
  50     compat_urllib_error,
  51     compat_urllib_parse,
  52     compat_urllib_parse_urlencode,
  53     compat_urllib_parse_urlparse,
  54     compat_urllib_parse_unquote_plus,
  55     compat_urllib_request,
  56     compat_urlparse,
  57     compat_xpath,
  58 )
  59
  60 from .socks import (
  61     ProxyType,
  62     sockssocket,
  63 )
  64
  65
  66 def register_socks_protocols():
  67     # "Register" SOCKS protocols
  68     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  69     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  70     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  71         if scheme not in compat_urlparse.uses_netloc:
  72             compat_urlparse.uses_netloc.append(scheme)
  73
  74
# This is not clearly defined otherwise: derive the type of compiled regular
# expressions from an actual compiled pattern
compiled_regex_type = type(re.compile(''))

# Default HTTP headers; YoutubeDLHandler.http_request adds each of them to a
# request that does not already carry a header of that name
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Sentinel meaning "no default was supplied", so that None stays usable as an
# explicit default value (see the xpath_* helpers)
NO_DEFAULT = object()

# English month names in calendar order
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# File name extensions (without the leading dot) known to this module
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps every accented character of the first string to its ASCII replacement.
# Replacements longer than one character ('AE', 'OE', 'ss', 'ae', 'oe') are
# chained in as one-item lists so that zip() pairs each of them with exactly
# one source character.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
 112
 113 DATE_FORMATS = (
 114     '%d %B %Y',
 115     '%d %b %Y',
 116     '%B %d %Y',
 117     '%b %d %Y',
 118     '%b %dst %Y %I:%M',
 119     '%b %dnd %Y %I:%M',
 120     '%b %dth %Y %I:%M',
 121     '%Y %m %d',
 122     '%Y-%m-%d',
 123     '%Y/%m/%d',
 124     '%Y/%m/%d %H:%M:%S',
 125     '%Y-%m-%d %H:%M:%S',
 126     '%Y-%m-%d %H:%M:%S.%f',
 127     '%d.%m.%Y %H:%M',
 128     '%d.%m.%Y %H.%M',
 129     '%Y-%m-%dT%H:%M:%SZ',
 130     '%Y-%m-%dT%H:%M:%S.%fZ',
 131     '%Y-%m-%dT%H:%M:%S.%f0Z',
 132     '%Y-%m-%dT%H:%M:%S',
 133     '%Y-%m-%dT%H:%M:%S.%f',
 134     '%Y-%m-%dT%H:%M',
 135 )
 136
 137 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 138 DATE_FORMATS_DAY_FIRST.extend([
 139     '%d-%m-%Y',
 140     '%d.%m.%Y',
 141     '%d.%m.%y',
 142     '%d/%m/%Y',
 143     '%d/%m/%y',
 144     '%d/%m/%Y %H:%M:%S',
 145 ])
 146
 147 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 148 DATE_FORMATS_MONTH_FIRST.extend([
 149     '%m-%d-%Y',
 150     '%m.%d.%Y',
 151     '%m/%d/%Y',
 152     '%m/%d/%y',
 153     '%m/%d/%Y %H:%M:%S',
 154 ])
 155
 156
 157 def preferredencoding():
 158     """Get preferred encoding.
 159
 160     Returns the best encoding scheme for the system, based on
 161     locale.getpreferredencoding() and some further tweaks.
 162     """
 163     try:
 164         pref = locale.getpreferredencoding()
 165         'TEST'.encode(pref)
 166     except Exception:
 167         pref = 'UTF-8'
 168
 169     return pref
 170
 171
 172 def write_json_file(obj, fn):
 173     """ Encode obj as JSON and write it to fn, atomically if possible """
 174
 175     fn = encodeFilename(fn)
 176     if sys.version_info < (3, 0) and sys.platform != 'win32':
 177         encoding = get_filesystem_encoding()
 178         # os.path.basename returns a bytes object, but NamedTemporaryFile
 179         # will fail if the filename contains non ascii characters unless we
 180         # use a unicode object
 181         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 182         # the same for os.path.dirname
 183         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 184     else:
 185         path_basename = os.path.basename
 186         path_dirname = os.path.dirname
 187
 188     args = {
 189         'suffix': '.tmp',
 190         'prefix': path_basename(fn) + '.',
 191         'dir': path_dirname(fn),
 192         'delete': False,
 193     }
 194
 195     # In Python 2.x, json.dump expects a bytestream.
 196     # In Python 3.x, it writes to a character stream
 197     if sys.version_info < (3, 0):
 198         args['mode'] = 'wb'
 199     else:
 200         args.update({
 201             'mode': 'w',
 202             'encoding': 'utf-8',
 203         })
 204
 205     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 206
 207     try:
 208         with tf:
 209             json.dump(obj, tf)
 210         if sys.platform == 'win32':
 211             # Need to remove existing file on Windows, else os.rename raises
 212             # WindowsError or FileExistsError.
 213             try:
 214                 os.unlink(fn)
 215             except OSError:
 216                 pass
 217         os.rename(tf.name, fn)
 218     except Exception:
 219         try:
 220             os.remove(tf.name)
 221         except OSError:
 222             pass
 223         raise
 224
 225
 226 if sys.version_info >= (2, 7):
 227     def find_xpath_attr(node, xpath, key, val=None):
 228         """ Find the xpath xpath[@key=val] """
 229         assert re.match(r'^[a-zA-Z_-]+$', key)
 230         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 231         return node.find(expr)
 232 else:
 233     def find_xpath_attr(node, xpath, key, val=None):
 234         for f in node.findall(compat_xpath(xpath)):
 235             if key not in f.attrib:
 236                 continue
 237             if val is None or f.attrib.get(key) == val:
 238                 return f
 239         return None
 240
 241 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 242 # the namespace parameter
 243
 244
 245 def xpath_with_ns(path, ns_map):
 246     components = [c.split(':') for c in path.split('/')]
 247     replaced = []
 248     for c in components:
 249         if len(c) == 1:
 250             replaced.append(c[0])
 251         else:
 252             ns, tag = c
 253             replaced.append('{%s}%s' % (ns_map[ns], tag))
 254     return '/'.join(replaced)
 255
 256
 257 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 258     def _find_xpath(xpath):
 259         return node.find(compat_xpath(xpath))
 260
 261     if isinstance(xpath, (str, compat_str)):
 262         n = _find_xpath(xpath)
 263     else:
 264         for xp in xpath:
 265             n = _find_xpath(xp)
 266             if n is not None:
 267                 break
 268
 269     if n is None:
 270         if default is not NO_DEFAULT:
 271             return default
 272         elif fatal:
 273             name = xpath if name is None else name
 274             raise ExtractorError('Could not find XML element %s' % name)
 275         else:
 276             return None
 277     return n
 278
 279
 280 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 281     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 282     if n is None or n == default:
 283         return n
 284     if n.text is None:
 285         if default is not NO_DEFAULT:
 286             return default
 287         elif fatal:
 288             name = xpath if name is None else name
 289             raise ExtractorError('Could not find XML element\'s text %s' % name)
 290         else:
 291             return None
 292     return n.text
 293
 294
 295 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 296     n = find_xpath_attr(node, xpath, key)
 297     if n is None:
 298         if default is not NO_DEFAULT:
 299             return default
 300         elif fatal:
 301             name = '%s[@%s]' % (xpath, key) if name is None else name
 302             raise ExtractorError('Could not find XML attribute %s' % name)
 303         else:
 304             return None
 305     return n.attrib[key]
 306
 307
 308 def get_element_by_id(id, html):
 309     """Return the content of the tag with the specified ID in the passed HTML document"""
 310     return get_element_by_attribute('id', id, html)
 311
 312
 313 def get_element_by_attribute(attribute, value, html):
 314     """Return the content of the tag with the specified attribute in the passed HTML document"""
 315
 316     m = re.search(r'''(?xs)
 317         <([a-zA-Z0-9:._-]+)
 318          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 319          \s+%s=['"]?%s['"]?
 320          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 321         \s*>
 322         (?P<content>.*?)
 323         </\1>
 324     ''' % (re.escape(attribute), re.escape(value)), html)
 325
 326     if not m:
 327         return None
 328     res = m.group('content')
 329
 330     if res.startswith('"') or res.startswith("'"):
 331         res = res[1:-1]
 332
 333     return unescapeHTML(res)
 334
 335
 336 class HTMLAttributeParser(compat_HTMLParser):
 337     """Trivial HTML parser to gather the attributes for a single element"""
 338     def __init__(self):
 339         self.attrs = {}
 340         compat_HTMLParser.__init__(self)
 341
 342     def handle_starttag(self, tag, attrs):
 343         self.attrs = dict(attrs)
 344
 345
 346 def extract_attributes(html_element):
 347     """Given a string for an HTML element such as
 348     <el
 349          a="foo" B="bar" c="&98;az" d=boz
 350          empty= noval entity="&amp;"
 351          sq='"' dq="'"
 352     >
 353     Decode and return a dictionary of attributes.
 354     {
 355         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 356         'empty': '', 'noval': None, 'entity': '&',
 357         'sq': '"', 'dq': '\''
 358     }.
 359     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 360     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 361     """
 362     parser = HTMLAttributeParser()
 363     parser.feed(html_element)
 364     parser.close()
 365     return parser.attrs
 366
 367
 368 def clean_html(html):
 369     """Clean an HTML snippet into a readable string"""
 370
 371     if html is None:  # Convenience for sanitizing descriptions etc.
 372         return html
 373
 374     # Newline vs <br />
 375     html = html.replace('\n', ' ')
 376     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 377     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 378     # Strip html tags
 379     html = re.sub('<.*?>', '', html)
 380     # Replace html entities
 381     html = unescapeHTML(html)
 382     return html.strip()
 383
 384
 385 def sanitize_open(filename, open_mode):
 386     """Try to open the given filename, and slightly tweak it if this fails.
 387
 388     Attempts to open the given filename. If this fails, it tries to change
 389     the filename slightly, step by step, until it's either able to open it
 390     or it fails and raises a final exception, like the standard open()
 391     function.
 392
 393     It returns the tuple (stream, definitive_file_name).
 394     """
 395     try:
 396         if filename == '-':
 397             if sys.platform == 'win32':
 398                 import msvcrt
 399                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 400             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 401         stream = open(encodeFilename(filename), open_mode)
 402         return (stream, filename)
 403     except (IOError, OSError) as err:
 404         if err.errno in (errno.EACCES,):
 405             raise
 406
 407         # In case of error, try to remove win32 forbidden chars
 408         alt_filename = sanitize_path(filename)
 409         if alt_filename == filename:
 410             raise
 411         else:
 412             # An exception here should be caught in the caller
 413             stream = open(encodeFilename(alt_filename), open_mode)
 414             return (stream, alt_filename)
 415
 416
 417 def timeconvert(timestr):
 418     """Convert RFC 2822 defined time string into system timestamp"""
 419     timestamp = None
 420     timetuple = email.utils.parsedate_tz(timestr)
 421     if timetuple is not None:
 422         timestamp = email.utils.mktime_tz(timetuple)
 423     return timestamp
 424
 425
 426 def sanitize_filename(s, restricted=False, is_id=False):
 427     """Sanitizes a string so it could be used as part of a filename.
 428     If restricted is set, use a stricter subset of allowed characters.
 429     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 430     """
 431     def replace_insane(char):
 432         if restricted and char in ACCENT_CHARS:
 433             return ACCENT_CHARS[char]
 434         if char == '?' or ord(char) < 32 or ord(char) == 127:
 435             return ''
 436         elif char == '"':
 437             return '' if restricted else '\''
 438         elif char == ':':
 439             return '_-' if restricted else ' -'
 440         elif char in '\\/|*<>':
 441             return '_'
 442         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 443             return '_'
 444         if restricted and ord(char) > 127:
 445             return '_'
 446         return char
 447
 448     # Handle timestamps
 449     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 450     result = ''.join(map(replace_insane, s))
 451     if not is_id:
 452         while '__' in result:
 453             result = result.replace('__', '_')
 454         result = result.strip('_')
 455         # Common case of "Foreign band name - English song title"
 456         if restricted and result.startswith('-_'):
 457             result = result[2:]
 458         if result.startswith('-'):
 459             result = '_' + result[len('-'):]
 460         result = result.lstrip('.')
 461         if not result:
 462             result = '_'
 463     return result
 464
 465
 466 def sanitize_path(s):
 467     """Sanitizes and normalizes path on Windows"""
 468     if sys.platform != 'win32':
 469         return s
 470     drive_or_unc, _ = os.path.splitdrive(s)
 471     if sys.version_info < (2, 7) and not drive_or_unc:
 472         drive_or_unc, _ = os.path.splitunc(s)
 473     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 474     if drive_or_unc:
 475         norm_path.pop(0)
 476     sanitized_path = [
 477         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
 478         for path_part in norm_path]
 479     if drive_or_unc:
 480         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 481     return os.path.join(*sanitized_path)
 482
 483
 484 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
 485 # unwanted failures due to missing protocol
 486 def sanitize_url(url):
 487     return 'http:%s' % url if url.startswith('//') else url
 488
 489
 490 def sanitized_Request(url, *args, **kwargs):
 491     return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
 492
 493
 494 def orderedSet(iterable):
 495     """ Remove all duplicates from the input iterable """
 496     res = []
 497     for el in iterable:
 498         if el not in res:
 499             res.append(el)
 500     return res
 501
 502
 503 def _htmlentity_transform(entity_with_semicolon):
 504     """Transforms an HTML entity to a character."""
 505     entity = entity_with_semicolon[:-1]
 506
 507     # Known non-numeric HTML entity
 508     if entity in compat_html_entities.name2codepoint:
 509         return compat_chr(compat_html_entities.name2codepoint[entity])
 510
 511     # TODO: HTML5 allows entities without a semicolon. For example,
 512     # '&Eacuteric' should be decoded as 'Éric'.
 513     if entity_with_semicolon in compat_html_entities_html5:
 514         return compat_html_entities_html5[entity_with_semicolon]
 515
 516     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 517     if mobj is not None:
 518         numstr = mobj.group(1)
 519         if numstr.startswith('x'):
 520             base = 16
 521             numstr = '0%s' % numstr
 522         else:
 523             base = 10
 524         # See https://github.com/rg3/youtube-dl/issues/7518
 525         try:
 526             return compat_chr(int(numstr, base))
 527         except ValueError:
 528             pass
 529
 530     # Unknown entity in name, return its literal representation
 531     return '&%s;' % entity
 532
 533
 534 def unescapeHTML(s):
 535     if s is None:
 536         return None
 537     assert type(s) == compat_str
 538
 539     return re.sub(
 540         r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 541
 542
 543 def get_subprocess_encoding():
 544     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 545         # For subprocess calls, encode with locale encoding
 546         # Refer to http://stackoverflow.com/a/9951851/35070
 547         encoding = preferredencoding()
 548     else:
 549         encoding = sys.getfilesystemencoding()
 550     if encoding is None:
 551         encoding = 'utf-8'
 552     return encoding
 553
 554
 555 def encodeFilename(s, for_subprocess=False):
 556     """
 557     @param s The name of the file
 558     """
 559
 560     assert type(s) == compat_str
 561
 562     # Python 3 has a Unicode API
 563     if sys.version_info >= (3, 0):
 564         return s
 565
 566     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 567     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 568     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 569     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 570         return s
 571
 572     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 573     if sys.platform.startswith('java'):
 574         return s
 575
 576     return s.encode(get_subprocess_encoding(), 'ignore')
 577
 578
 579 def decodeFilename(b, for_subprocess=False):
 580
 581     if sys.version_info >= (3, 0):
 582         return b
 583
 584     if not isinstance(b, bytes):
 585         return b
 586
 587     return b.decode(get_subprocess_encoding(), 'ignore')
 588
 589
 590 def encodeArgument(s):
 591     if not isinstance(s, compat_str):
 592         # Legacy code that uses byte strings
 593         # Uncomment the following line after fixing all post processors
 594         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 595         s = s.decode('ascii')
 596     return encodeFilename(s, True)
 597
 598
 599 def decodeArgument(b):
 600     return decodeFilename(b, True)
 601
 602
 603 def decodeOption(optval):
 604     if optval is None:
 605         return optval
 606     if isinstance(optval, bytes):
 607         optval = optval.decode(preferredencoding())
 608
 609     assert isinstance(optval, compat_str)
 610     return optval
 611
 612
 613 def formatSeconds(secs):
 614     if secs > 3600:
 615         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 616     elif secs > 60:
 617         return '%d:%02d' % (secs // 60, secs % 60)
 618     else:
 619         return '%d' % secs
 620
 621
 622 def make_HTTPS_handler(params, **kwargs):
 623     opts_no_check_certificate = params.get('nocheckcertificate', False)
 624     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 625         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 626         if opts_no_check_certificate:
 627             context.check_hostname = False
 628             context.verify_mode = ssl.CERT_NONE
 629         try:
 630             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 631         except TypeError:
 632             # Python 2.7.8
 633             # (create_default_context present but HTTPSHandler has no context=)
 634             pass
 635
 636     if sys.version_info < (3, 2):
 637         return YoutubeDLHTTPSHandler(params, **kwargs)
 638     else:  # Python < 3.4
 639         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 640         context.verify_mode = (ssl.CERT_NONE
 641                                if opts_no_check_certificate
 642                                else ssl.CERT_REQUIRED)
 643         context.set_default_verify_paths()
 644         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 645
 646
 647 def bug_reports_message():
 648     if ytdl_is_updateable():
 649         update_cmd = 'type  youtube-dl -U  to update'
 650     else:
 651         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 652     msg = '; please report this issue on https://yt-dl.org/bug .'
 653     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 654     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 655     return msg
 656
 657
 658 class ExtractorError(Exception):
 659     """Error during info extraction."""
 660
 661     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 662         """ tb, if given, is the original traceback (so that it can be printed out).
 663         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 664         """
 665
 666         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 667             expected = True
 668         if video_id is not None:
 669             msg = video_id + ': ' + msg
 670         if cause:
 671             msg += ' (caused by %r)' % cause
 672         if not expected:
 673             msg += bug_reports_message()
 674         super(ExtractorError, self).__init__(msg)
 675
 676         self.traceback = tb
 677         self.exc_info = sys.exc_info()  # preserve original exception
 678         self.cause = cause
 679         self.video_id = video_id
 680
 681     def format_traceback(self):
 682         if self.traceback is None:
 683             return None
 684         return ''.join(traceback.format_tb(self.traceback))
 685
 686
 687 class UnsupportedError(ExtractorError):
 688     def __init__(self, url):
 689         super(UnsupportedError, self).__init__(
 690             'Unsupported URL: %s' % url, expected=True)
 691         self.url = url
 692
 693
 694 class RegexNotFoundError(ExtractorError):
 695     """Error when a regex didn't match"""
 696     pass
 697
 698
 699 class DownloadError(Exception):
 700     """Download Error exception.
 701
 702     This exception may be thrown by FileDownloader objects if they are not
 703     configured to continue on errors. They will contain the appropriate
 704     error message.
 705     """
 706
 707     def __init__(self, msg, exc_info=None):
 708         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 709         super(DownloadError, self).__init__(msg)
 710         self.exc_info = exc_info
 711
 712
 713 class SameFileError(Exception):
 714     """Same File exception.
 715
 716     This exception will be thrown by FileDownloader objects if they detect
 717     multiple files would have to be downloaded to the same file on disk.
 718     """
 719     pass
 720
 721
 722 class PostProcessingError(Exception):
 723     """Post Processing exception.
 724
 725     This exception may be raised by PostProcessor's .run() method to
 726     indicate an error in the postprocessing task.
 727     """
 728
 729     def __init__(self, msg):
 730         self.msg = msg
 731
 732
 733 class MaxDownloadsReached(Exception):
 734     """ --max-downloads limit has been reached. """
 735     pass
 736
 737
 738 class UnavailableVideoError(Exception):
 739     """Unavailable Format exception.
 740
 741     This exception will be thrown when a video is requested
 742     in a format that is not available for that video.
 743     """
 744     pass
 745
 746
 747 class ContentTooShortError(Exception):
 748     """Content Too Short exception.
 749
 750     This exception may be raised by FileDownloader objects when a file they
 751     download is too small for what the server announced first, indicating
 752     the connection was probably interrupted.
 753     """
 754
 755     def __init__(self, downloaded, expected):
 756         # Both in bytes
 757         self.downloaded = downloaded
 758         self.expected = expected
 759
 760
 761 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 762     # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
 763     # expected HTTP responses to meet HTTP/1.0 or later (see also
 764     # https://github.com/rg3/youtube-dl/issues/6727)
 765     if sys.version_info < (3, 0):
 766         kwargs[b'strict'] = True
 767     hc = http_class(*args, **kwargs)
 768     source_address = ydl_handler._params.get('source_address')
 769     if source_address is not None:
 770         sa = (source_address, 0)
 771         if hasattr(hc, 'source_address'):  # Python 2.7+
 772             hc.source_address = sa
 773         else:  # Python 2.6
 774             def _hc_connect(self, *args, **kwargs):
 775                 sock = compat_socket_create_connection(
 776                     (self.host, self.port), self.timeout, sa)
 777                 if is_https:
 778                     self.sock = ssl.wrap_socket(
 779                         sock, self.key_file, self.cert_file,
 780                         ssl_version=ssl.PROTOCOL_TLSv1)
 781                 else:
 782                     self.sock = sock
 783             hc.connect = functools.partial(_hc_connect, hc)
 784
 785     return hc
 786
 787
 788 def handle_youtubedl_headers(headers):
 789     filtered_headers = headers
 790
 791     if 'Youtubedl-no-compression' in filtered_headers:
 792         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 793         del filtered_headers['Youtubedl-no-compression']
 794
 795     return filtered_headers
 796
 797
 798 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 799     """Handler for HTTP requests and responses.
 800
 801     This class, when installed with an OpenerDirector, automatically adds
 802     the standard headers to every HTTP request and handles gzipped and
 803     deflated responses from web servers. If compression is to be avoided in
 804     a particular request, the original request in the program code only has
 805     to include the HTTP header "Youtubedl-no-compression", which will be
 806     removed before making the real request.
 807
 808     Part of this code was copied from:
 809
 810     http://techknack.net/python-urllib2-handlers/
 811
 812     Andrew Rowls, the author of that code, agreed to release it to the
 813     public domain.
 814     """
 815
 816     def __init__(self, params, *args, **kwargs):
 817         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
 818         self._params = params
 819
 820     def http_open(self, req):
 821         conn_class = compat_http_client.HTTPConnection
 822
 823         socks_proxy = req.headers.get('Ytdl-socks-proxy')
 824         if socks_proxy:
 825             conn_class = make_socks_conn_class(conn_class, socks_proxy)
 826             del req.headers['Ytdl-socks-proxy']
 827
 828         return self.do_open(functools.partial(
 829             _create_http_connection, self, conn_class, False),
 830             req)
 831
 832     @staticmethod
 833     def deflate(data):
 834         try:
 835             return zlib.decompress(data, -zlib.MAX_WBITS)
 836         except zlib.error:
 837             return zlib.decompress(data)
 838
 839     @staticmethod
 840     def addinfourl_wrapper(stream, headers, url, code):
 841         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 842             return compat_urllib_request.addinfourl(stream, headers, url, code)
 843         ret = compat_urllib_request.addinfourl(stream, headers, url)
 844         ret.code = code
 845         return ret
 846
 847     def http_request(self, req):
 848         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
 849         # always respected by websites, some tend to give out URLs with non percent-encoded
 850         # non-ASCII characters (see telemb.py, ard.py [#3412])
 851         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
 852         # To work around aforementioned issue we will replace request's original URL with
 853         # percent-encoded one
 854         # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
 855         # the code of this workaround has been moved here from YoutubeDL.urlopen()
 856         url = req.get_full_url()
 857         url_escaped = escape_url(url)
 858
 859         # Substitute URL if any change after escaping
 860         if url != url_escaped:
 861             req = update_Request(req, url=url_escaped)
 862
 863         for h, v in std_headers.items():
 864             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
 865             # The dict keys are capitalized because of this bug by urllib
 866             if h.capitalize() not in req.headers:
 867                 req.add_header(h, v)
 868
 869         req.headers = handle_youtubedl_headers(req.headers)
 870
 871         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 872             # Python 2.6 is brain-dead when it comes to fragments
 873             req._Request__original = req._Request__original.partition('#')[0]
 874             req._Request__r_type = req._Request__r_type.partition('#')[0]
 875
 876         return req
 877
 878     def http_response(self, req, resp):
 879         old_resp = resp
 880         # gzip
 881         if resp.headers.get('Content-encoding', '') == 'gzip':
 882             content = resp.read()
 883             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 884             try:
 885                 uncompressed = io.BytesIO(gz.read())
 886             except IOError as original_ioerror:
 887                 # There may be junk add the end of the file
 888                 # See http://stackoverflow.com/q/4928560/35070 for details
 889                 for i in range(1, 1024):
 890                     try:
 891                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 892                         uncompressed = io.BytesIO(gz.read())
 893                     except IOError:
 894                         continue
 895                     break
 896                 else:
 897                     raise original_ioerror
 898             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 899             resp.msg = old_resp.msg
 900             del resp.headers['Content-encoding']
 901         # deflate
 902         if resp.headers.get('Content-encoding', '') == 'deflate':
 903             gz = io.BytesIO(self.deflate(resp.read()))
 904             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 905             resp.msg = old_resp.msg
 906             del resp.headers['Content-encoding']
 907         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
 908         # https://github.com/rg3/youtube-dl/issues/6457).
 909         if 300 <= resp.code < 400:
 910             location = resp.headers.get('Location')
 911             if location:
 912                 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
 913                 if sys.version_info >= (3, 0):
 914                     location = location.encode('iso-8859-1').decode('utf-8')
 915                 else:
 916                     location = location.decode('utf-8')
 917                 location_escaped = escape_url(location)
 918                 if location != location_escaped:
 919                     del resp.headers['Location']
 920                     if sys.version_info < (3, 0):
 921                         location_escaped = location_escaped.encode('utf-8')
 922                     resp.headers['Location'] = location_escaped
 923         return resp
 924
    # HTTPS traffic goes through the same request/response processing as HTTP
    https_request = http_request
    https_response = http_response
 927
 928
 929 def make_socks_conn_class(base_class, socks_proxy):
 930     assert issubclass(base_class, (
 931         compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
 932
 933     url_components = compat_urlparse.urlparse(socks_proxy)
 934     if url_components.scheme.lower() == 'socks5':
 935         socks_type = ProxyType.SOCKS5
 936     elif url_components.scheme.lower() in ('socks', 'socks4'):
 937         socks_type = ProxyType.SOCKS4
 938     elif url_components.scheme.lower() == 'socks4a':
 939         socks_type = ProxyType.SOCKS4A
 940
 941     def unquote_if_non_empty(s):
 942         if not s:
 943             return s
 944         return compat_urllib_parse_unquote_plus(s)
 945
 946     proxy_args = (
 947         socks_type,
 948         url_components.hostname, url_components.port or 1080,
 949         True,  # Remote DNS
 950         unquote_if_non_empty(url_components.username),
 951         unquote_if_non_empty(url_components.password),
 952     )
 953
 954     class SocksConnection(base_class):
 955         def connect(self):
 956             self.sock = sockssocket()
 957             self.sock.setproxy(*proxy_args)
 958             if type(self.timeout) in (int, float):
 959                 self.sock.settimeout(self.timeout)
 960             self.sock.connect((self.host, self.port))
 961
 962             if isinstance(self, compat_http_client.HTTPSConnection):
 963                 if hasattr(self, '_context'):  # Python > 2.6
 964                     self.sock = self._context.wrap_socket(
 965                         self.sock, server_hostname=self.host)
 966                 else:
 967                     self.sock = ssl.wrap_socket(self.sock)
 968
 969     return SocksConnection
 970
 971
 972 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 973     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 974         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 975         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 976         self._params = params
 977
 978     def https_open(self, req):
 979         kwargs = {}
 980         conn_class = self._https_conn_class
 981
 982         if hasattr(self, '_context'):  # python > 2.6
 983             kwargs['context'] = self._context
 984         if hasattr(self, '_check_hostname'):  # python 3.x
 985             kwargs['check_hostname'] = self._check_hostname
 986
 987         socks_proxy = req.headers.get('Ytdl-socks-proxy')
 988         if socks_proxy:
 989             conn_class = make_socks_conn_class(conn_class, socks_proxy)
 990             del req.headers['Ytdl-socks-proxy']
 991
 992         return self.do_open(functools.partial(
 993             _create_http_connection, self, conn_class, True),
 994             req, **kwargs)
 995
 996
 997 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
 998     def __init__(self, cookiejar=None):
 999         compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1000
1001     def http_response(self, request, response):
1002         # Python 2 will choke on next HTTP request in row if there are non-ASCII
1003         # characters in Set-Cookie HTTP header of last response (see
1004         # https://github.com/rg3/youtube-dl/issues/6769).
1005         # In order to at least prevent crashing we will percent encode Set-Cookie
1006         # header before HTTPCookieProcessor starts processing it.
1007         # if sys.version_info < (3, 0) and response.headers:
1008         #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1009         #         set_cookie = response.headers.get(set_cookie_header)
1010         #         if set_cookie:
1011         #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1012         #             if set_cookie != set_cookie_escaped:
1013         #                 del response.headers[set_cookie_header]
1014         #                 response.headers[set_cookie_header] = set_cookie_escaped
1015         return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1016
1017     https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1018     https_response = http_response
1019
1020
def extract_timezone(date_str):
    """Split a trailing UTC offset off date_str.

    Returns a (timedelta, remaining_string) tuple. A trailing 'Z' or no
    recognisable offset yields a zero timedelta; in the latter case the
    string is returned unchanged.
    """
    match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if match is None:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(match.group('tz'))]
    sign_char = match.group('sign')
    if not sign_char:
        # Plain 'Z' suffix: UTC
        return datetime.timedelta(), remainder

    direction = 1 if sign_char == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(match.group('hours')),
        minutes=direction * int(match.group('minutes')))
    return offset, remainder
1037
1038
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    delimiter is the separator between the date and time parts. timezone
    is a timedelta offset; when it is None the offset is taken from the
    string itself. Returns None for None input or an unparseable string.
    """
    if date_str is None:
        return None

    # Fractional seconds are dropped
    without_fraction = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, without_fraction = extract_timezone(without_fraction)

    try:
        layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        utc_moment = datetime.datetime.strptime(without_fraction, layout) - timezone
        return calendar.timegm(utc_moment.timetuple())
    except ValueError:
        return None
1056
1057
def date_formats(day_first=True):
    """Return the strptime format list for day-first or month-first dates."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1060
1061
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    upload_date = None
    # Every format is tried; the last one that parses wins
    for fmt in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Fall back to the RFC 2822 style parser
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            try:
                upload_date = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if upload_date is None:
        return None
    return compat_str(upload_date)
1088
1089
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    The formats returned by date_formats(day_first) are tried first, then
    email.utils.parsedate_tz(). A trailing numeric UTC offset is honoured
    and 12 hours are added when the string contains 'PM'. Returns None
    when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    date_str = date_str.replace(',', ' ')

    pm_delta = datetime.timedelta(hours=12 if re.search(r'(?i)PM', date_str) else 0)
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + pm_delta
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # parsedate_tz() returns a plain 10-tuple (not a datetime), which
        # calendar.timegm() accepts directly. Apply the same offset and
        # AM/PM correction as the strptime path above.
        adjustment = pm_delta - timezone
        return (calendar.timegm(timetuple) +
                adjustment.days * 24 * 60 * 60 + adjustment.seconds)
1111
1112
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from url, falling back to default_ext.

    The text after the last '.' of the part before '?' is used when it is
    purely alphanumeric, or when it is a known extension followed by
    slashes.
    """
    if url is None:
        return default_ext

    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1124
1125
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the last extension of filename with '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1128
1129
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    'now', 'today', 'yesterday' or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError when the string matches none of these forms.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta keyword arguments are plural ('days', 'weeks')
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1157
1158
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format; any other
    string is returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1167
1168
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; None leaves that side of the range open.

        Raises ValueError when start is later than end."""
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a date object or a date string) is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
1198
1199
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte strings are decoded with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1208
1209
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The string is written with the Win32 WriteConsoleW call when out is
    stdout or stderr and is attached to a real console; in every other
    case nothing is written and the caller has to output s itself.
    Raises OSError when WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps file descriptors 1 and 2 to the ids passed to GetStdHandle
    # (-11 and -12 are the Win32 STD_OUTPUT_HANDLE / STD_ERROR_HANDLE values)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for invalid handles, non-character devices and handles for
        # which GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call and stop before the next
        # non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; it is written
        # on its own with a length of 2
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1283
1284
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On win32, when no encoding is given, _windows_write_string() is tried
    first. Otherwise s is encoded (ignoring unencodable characters) for
    binary streams and for streams exposing a 'buffer', and written as-is
    to any other stream.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(stream, 'fileno'):
        if _windows_write_string(s, stream):
            return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(stream, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        codec = encoding or getattr(stream, 'encoding', None) or preferredencoding()
        stream.buffer.write(s.encode(codec, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
1305
1306
def bytes_to_intlist(bs):
    """Return the items of a byte string as a list of integers."""
    if not bs:
        return []
    # Indexing bytes yields ints on Python 3 and 1-char strings on Python 2
    if isinstance(bs[0], int):
        return list(bs)
    return [ord(item) for item in bs]
1314
1315
def intlist_to_bytes(xs):
    """Pack a sequence of integers into a byte string, one byte each."""
    if xs:
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
1320
1321
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx on win32, fcntl.flock elsewhere, and
# stubs that raise IOError when fcntl is not available.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure passed to LockFileEx/UnlockFileEx; Offset/OffsetHigh
        # hold the start of the locked byte range
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f from offset 0; raises OSError when LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so _unlock_file can pass the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 is the Win32 LOCKFILE_EXCLUSIVE_LOCK flag
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock() lock on f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock() lock on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1395
1396
class locked_file(object):
    """Context manager wrapping a file that is locked while in use.

    The file is opened on construction; entering the context takes a
    shared lock for mode 'r' and an exclusive lock for 'a' and 'w', and
    leaving it unlocks and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 implementation raises OSError, which is not an
            # IOError on Python 2; close the file in both cases
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even when unlocking fails
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1426
1427
def get_filesystem_encoding():
    """Return the filesystem encoding, or 'utf-8' when Python reports none."""
    detected = sys.getfilesystemencoding()
    if detected is None:
        return 'utf-8'
    return detected
1431
1432
def shell_quote(args):
    """Return the arguments quoted for a shell and joined with spaces.

    Byte-string arguments are decoded with the filesystem encoding first.
    """
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # compat_shlex_quote replaces pipes.quote: the pipes module is
        # deprecated and no longer exists on Python 3.13+
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
1442
1443
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded and appended as a '#__youtubedl_smuggle=...'
    fragment. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse_urlencode(payload)
    return '#'.join((url, fragment))
1450
1451
def unsmuggle_url(smug_url, default=None):
    """Return (url, data) from a URL produced by smuggle_url.

    When the URL carries no smuggled data it is returned unchanged
    together with default.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    plain_url, _, fragment = smug_url.rpartition('#')
    encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return plain_url, json.loads(encoded)
1459
1460
def format_bytes(bytes):
    """Format a byte count as a string with a binary unit suffix.

    bytes may be a number, a numeric str, or None (which gives 'N/A').
    The value is shown with two decimals in the largest unit from B to
    YiB that fits.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the available suffixes: tiny values used to produce a
        # negative index ('YiB') and huge ones an IndexError
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1473
1474
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' at the start of s using unit_table.

    unit_table maps unit names to multipliers. The number may use ',' or
    '.' as its decimal separator. Returns the integer product, or None
    when s does not start with a number followed by a known unit.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives
    found = re.match(pattern, s)
    if found is None:
        return None
    number = float(found.group('num').replace(',', '.'))
    multiplier = unit_table[found.group('unit')]
    return int(number * multiplier)
1484
1485
def parse_filesize(s):
    """Parse a file size such as '5 MiB' or '1,5GB' into a byte count.

    Returns None when s is None or cannot be parsed.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    units = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        units[prefix + 'iB'] = binary           # e.g. 'KiB'
        units[prefix + 'B'] = decimal           # e.g. 'KB'
        units[prefix.lower() + 'B'] = binary    # e.g. 'kB'
        units[prefix + 'b'] = decimal           # e.g. 'Kb'

    return lookup_unit_table(units, s)
1530
1531
def parse_count(s):
    """Parse a count such as '1,234' or '1.5M' into an integer.

    Returns None when s is None or has no recognised form.
    """
    if s is None:
        return None

    text = s.strip()

    # Plain numbers, possibly with thousands separators
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    thousand = 1000
    million = 1000 ** 2
    multipliers = {
        'k': thousand,
        'K': thousand,
        'm': million,
        'M': million,
        'kk': million,
        'KK': million,
    }
    return lookup_unit_table(multipliers, text)
1551
1552
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name,
        or None for an unknown name """

    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1560
1561
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations (the first three letters of the month name), or
        None for an unknown abbreviation """

    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
1570
1571
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start a recognised entity
    (amp, lt, gt, apos, quot or a short numeric reference) by '&amp;'"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1578
1579
def setproctitle(title):
    """Set the process name via prctl() from libc.so.6 (best effort).

    Does nothing on Jython, when libc.so.6 cannot be loaded, or when the
    loaded library has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one byte larger than
    # the title, so it is NUL-terminated; a buffer of exactly
    # len(title_bytes) has no terminator for prctl to stop at
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1599
1600
def remove_start(s, start):
    """Return s without the prefix start; s is returned as-is when it is
    None or does not begin with start."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1603
1604
def remove_end(s, end):
    """Return s without the suffix end; s is returned as-is when it is
    None or does not finish with end."""
    # An empty suffix must be a no-op: s[:-0] would be the empty string
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
1607
1608
def remove_quotes(s):
    """Strip one pair of matching single or double quotes around s."""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1616
1617
def url_basename(url):
    """Return the last segment of the path component of url."""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
1621
1622
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        """Return the HTTP method name for this request."""
        return 'HEAD'
1626
1627
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, returning default when that is not possible.

    When get_attr is given, the attribute of that name is read from v
    first. The result is int(v) * invscale // scale. None, the empty
    string and values int() rejects all yield default.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # int() raises TypeError (not ValueError) for lists, dicts etc.
        return default
1640
1641
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1644
1645
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: ',', '.' and '+' characters
    are dropped before conversion """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1652
1653
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, returning default when that is not possible.

    The result is float(v) * invscale / scale. None and values float()
    rejects yield default.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # float() raises TypeError (not ValueError) for lists, dicts etc.
        return default
1661
1662
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Three notations are tried in turn: colon separated ('1:02:03.5'),
    unit suffixed ('1h 2m 3s', optionally prefixed by 'PT'), and a lone
    hours or minutes value ('1.5 hours', '3 min'). Returns None when s is
    not a string or matches none of them.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days = hours = mins = secs = ms = None
    found = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if not found:
        found = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
    if found:
        # Both patterns define the same five groups in the same order
        days, hours, mins, secs, ms = found.groups()
    else:
        found = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
        if not found:
            return None
        hours, mins = found.groups()

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        # ms holds the fractional part including its leading dot
        duration += float(ms)
    return duration
1709
1710
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the real extension of filename.

    When expected_real_ext is given and differs from the actual
    extension, ext is appended to the whole filename instead.
    """
    stem, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return '{0}.{1}{2}'.format(stem, ext, real_ext)
    return '{0}.{1}'.format(filename, ext)
1717
1718
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of filename with ext.

    When expected_real_ext is given and differs from the actual
    extension, ext is appended to the whole filename instead.
    """
    stem, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        base = stem
    else:
        base = filename
    return '{0}.{1}'.format(base, ext)
1724
1725
def check_executable(exe, args=[]):
    """ Checks if the given binary can be launched, and returns its name
    (False when launching it raises OSError).
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1734
1735
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    The executable is run with args and its combined stdout/stderr is
    handed to detect_exe_version. """
    try:
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1749
1750
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    version_re must contain one capturing group; by default the token
    following the word 'version' is used. Returns unrecognized when the
    pattern does not match.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1760
1761
class PagedList(object):
    """ Base class for paged lists; subclasses provide getslice(). """

    def __len__(self):
        # Fetches the complete slice just to count it, so this is only
        # useful for tests
        everything = self.getslice()
        return len(everything)
1766
1767
class OnDemandPagedList(PagedList):
    """ Paged list whose pages are fetched one by one while slicing.

    pagefunc is called with a page number and must return an iterable with
    the entries of that page; pagesize is the number of entries on a full
    page. With use_cache, fetched pages are kept per page number. """

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """ Returns the entries with indices in [start, end) as a list. """
        pagesize = self._pagesize
        collected = []
        for pagenum in itertools.count(start // pagesize):
            page_first = pagenum * pagesize
            page_limit = page_first + pagesize
            if start >= page_limit:
                continue

            if self._use_cache:
                page = self._cache.get(pagenum)
                if page is None:
                    page = list(self._pagefunc(pagenum))
                self._cache[pagenum] = page
            else:
                page = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry inside this page
            if page_first <= start < page_limit:
                lower = start % pagesize
            else:
                lower = 0

            # Offset just past the last wanted entry inside this page
            if end is not None and page_first <= end <= page_limit:
                upper = ((end - 1) % pagesize) + 1
            else:
                upper = None

            if lower != 0 or upper is not None:
                page = page[lower:upper]
            collected.extend(page)

            # A page that is not "full" (fewer than pagesize entries) is
            # assumed to be the last one, so further pages are not queried
            if len(page) + lower < pagesize:
                break

            # The whole page was used and the requested range ends exactly
            # here, so the next page is not interesting
            if end == page_limit:
                break
        return collected
1818
1819
class InAdvancePagedList(PagedList):
    """ Paged list for which the number of pages is known up front.

    pagefunc is called with a page number and must return an iterable with
    the entries of that page. """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Returns the entries with indices in [start, end) as a list. """
        pagesize = self._pagesize
        first_page = start // pagesize
        # Entries to drop from the beginning of the first fetched page
        offset = start - first_page * pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // pagesize + 1
            remaining = end - start

        items = []
        for pagenum in range(first_page, stop_page):
            page = list(self._pagefunc(pagenum))
            if offset:
                page = page[offset:]
                offset = 0
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested range
                    items.extend(page[:remaining])
                    break
                remaining -= len(page)
            items.extend(page)
        return items
1847
1848
def uppercase_escape(s):
    """ Decodes every backslash + uppercase U + 8 hex digits sequence in s
    into the character it denotes; everything else is left untouched. """
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        # The decoder returns a (text, consumed_length) pair
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
1855
1856
def lowercase_escape(s):
    """ Decodes every backslash + lowercase u + 4 hex digits sequence in s
    into the character it denotes; everything else is left untouched. """
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        # The decoder returns a (text, consumed_length) pair
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _expand, s)
1863
1864
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    running_python2 = sys.version_info < (3, 0)
    if running_python2 and isinstance(s, compat_str):
        # On Python 2 the text is passed to quote() as UTF-8 bytes
        s = s.encode('utf-8')
    # Characters that are left unescaped
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1870
1871
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host is IDNA-encoded, every other component is percent-escaped
    idna_netloc = parts.netloc.encode('idna').decode('ascii')
    escaped = parts._replace(
        netloc=idna_netloc,
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
1882
1883
def read_batch_urls(batch_fd):
    """ Reads the URLs of a batch file, one per line, and closes batch_fd.

    batch_fd is an iterable of lines (bytes or text) with a close() method.
    Byte lines are decoded as UTF-8 (undecodable bytes are replaced), a
    leading UTF-8 byte order mark is removed, surrounding whitespace is
    stripped, and empty lines as well as lines starting with '#', ';' or ']'
    are skipped. Returns the remaining URLs as a list. """
    # A UTF-8 BOM shows up as U+FEFF when the input was decoded as UTF-8, and
    # as the three characters \xef\xbb\xbf when its bytes were decoded one to
    # one (e.g. as latin-1). str.strip() removes neither, so handle both.
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1898
1899
def urlencode_postdata(*args, **kargs):
    """ URL-encodes the given arguments and returns the result as ASCII
    bytes. All arguments are passed on to compat_urllib_parse_urlencode. """
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1902
1903
def update_url_query(url, query):
    """ Returns url with its query string updated from the query mapping.

    Parameters already present in url are kept unless query overrides them.
    url is returned unchanged when query is empty. """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parts.query)
    merged.update(query)
    # True: sequence values are encoded as repeated parameters
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
1912
1913
def update_Request(req, url=None, data=None, headers={}, query={}):
    """ Builds a new request from req with some of its parts overridden.

    headers are merged over the headers of req, url and data replace the
    ones of req when given, and query is merged into the URL. A HEAD request
    stays a HEAD request; a timeout attribute of req is carried over. """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)
    if req.get_method() == 'HEAD':
        request_class = HEADRequest
    else:
        request_class = compat_urllib_request.Request
    clone = request_class(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        clone.timeout = req.timeout
    return clone
1926
1927
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """ Looks up one key, or the first usable of several keys, in d.

    For a single key this is d.get(key, default). For a list or tuple of
    keys, the value of the first key that is present and not None (and not
    falsy, unless skip_false_values is False) is returned, else default. """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
1936
1937
def try_get(src, getter, expected_type=None):
    """ Returns getter(src), or None when that is not possible.

    None is returned when getter raises AttributeError, KeyError, TypeError
    or IndexError, or when expected_type is given and the value is not an
    instance of it. """
    try:
        value = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is None or isinstance(value, expected_type):
        return value
    return None
1946
1947
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """ Returns string as compat_str, converting it with the given encoding
    and error handling when it is not a compat_str already.

    Note that the default encoding is determined once, when the function
    is defined. """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1950
1951
# Age limit that parse_age_limit() returns for each US rating label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1959
1960
def parse_age_limit(s):
    """ Parses an age limit string into an integer.

    Accepts one or two digits with an optional trailing '+' (e.g. '18+'),
    or a label from US_RATINGS. Returns None for None and for strings that
    are neither. """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s)
1966
1967
def strip_jsonp(code):
    """ Removes a JSONP wrapper, i.e. turns 'callback(<payload>);' into
    '<payload>'. Trailing // comments after the call are dropped as well.
    Input that does not look like a JSONP call is returned unchanged. """
    jsonp_call = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_call, r'\1', code)
1971
1972
def js_to_json(code):
    """ Converts a JavaScript object/array literal into JSON text.

    Handled constructs: single-quoted strings, unquoted identifiers used as
    keys or values, /* ... */ comments, trailing commas before ']' or '}',
    hexadecimal and octal integer literals (also as keys), and decimal
    integer keys. Everything else is copied through unchanged.

    Quoted strings always stay strings: the integer conversion is applied
    to unquoted tokens only, so '"0123"' is not rewritten to 83. """
    # (pattern, base) pairs for integer notations JSON does not know
    INTEGER_TABLE = (
        (r'^0[xX][0-9a-fA-F]+', 16),
        (r'^0+[0-7]+', 8),
    )
    # Rewrites applied inside string literals; other escapes are kept as is
    STRING_ESCAPES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }

    def fix_string_part(m):
        return STRING_ESCAPES.get(m.group(0), m.group(0))

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v == ',':
            # Comments and trailing commas are dropped
            return ""

        if v[0] in ("'", '"'):
            # String literal: re-quote with double quotes and stop here, so
            # that its contents are never mistaken for an integer literal
            return '"%s"' % re.sub(r'(?s)\\.|"', fix_string_part, v[1:-1])

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(0), base)
                # A token ending in ':' is an object key and must be quoted
                return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
2010
2011
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values.

    Returns a function mapping a quality id to its index in quality_ids,
    or to -1 when the id is not in the list. """
    def rank(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return rank
2020
2021
# Default output template, built from the %(title)s, %(id)s and %(ext)s fields
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2023
2024
def limit_length(s, length):
    """ Add ellipses to overly long strings.

    Strings longer than length are cut so that the result, including the
    trailing '...', is length characters long. None is passed through. """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ellipses = '...'
    return s[:length - len(ellipses)] + ellipses
2033
2034
def version_tuple(v):
    """ Splits a version string at '.' and '-' and returns its parts as a
    tuple of integers. Raises ValueError if a part is not an integer. """
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
2037
2038
def is_outdated_version(version, limit, assume_new=True):
    """ Returns whether version is older than limit.

    When version is empty or one of the two version strings cannot be
    parsed, the answer is determined by assume_new: the version counts as
    outdated only if assume_new is False. """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current = version_tuple(version)
        minimum = version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum
2046
2047
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Either this module was loaded through a zip importer ...
    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    # ... or the interpreter carries a "frozen" attribute
    return hasattr(sys, 'frozen')
2053
2054
def args_to_str(args):
    """ Get a short string representation for a subprocess command:
    every argument is shell-quoted and the results are joined by spaces. """
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2058
2059
def error_to_compat_str(err):
    """ Returns the message of err as text. """
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2067
2068
def mimetype2ext(mt):
    """ Maps a MIME type to a file extension.

    A few complete MIME types and a few subtypes have dedicated extensions;
    for everything else the part after the last '/' is returned as is.
    None is passed through. """
    if mt is None:
        return None

    # Complete MIME types that cannot be decided from the subtype alone
    full_type_exts = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }
    if mt in full_type_exts:
        return full_type_exts[mt]

    subtype = mt.rpartition('/')[2]
    subtype_exts = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
    }
    return subtype_exts.get(subtype, subtype)
2095
2096
def urlhandle_detect_ext(url_handle):
    """ Determines a file extension from the headers of url_handle.

    The filename of a Content-Disposition attachment header is preferred;
    otherwise the extension is derived from the Content-Type header. """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        attachment = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if attachment:
            ext = determine_ext(attachment.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
2109
2110
def encode_data_uri(data, mime_type):
    """ Returns a base64 "data:" URI for the bytes in data, labelled with
    mime_type. """
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2113
2114
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # Nothing is blocked when no limit is set (age_limit is None) or when
    # the content is available for everyone (content_limit is None)
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2123
2124
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    The bytes are decoded according to their byte order mark (UTF-8 when
    there is none). Returns a match object if the text starts with optional
    whitespace followed by '<', else None. """

    # The 4-byte UTF-32 marks are listed before the 2-byte UTF-16 marks
    # because the UTF-32-LE mark begins with the UTF-16-LE one
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    encoding, payload = 'utf-8', first_bytes
    for bom, bom_encoding in BOMS:
        if first_bytes.startswith(bom):
            encoding, payload = bom_encoding, first_bytes[len(bom):]
            break

    text = payload.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
2143
2144
def determine_protocol(info_dict):
    """ Returns the protocol of the format described by info_dict.

    An explicit 'protocol' entry wins. Otherwise the protocol is derived
    from info_dict['url']: first from its rtmp/mms/rtsp prefix, then from
    an m3u8 or f4m extension, and finally from the URL scheme. """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2165
2166
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values.

    Every column except the last one is left-aligned and padded to the
    width of its longest value plus one; rows are joined by newlines. """
    rows = [header_row] + data
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*rows)]
    padded_columns = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded_columns) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
2173
2174
2175 def _match_one(filter_part, dct):
2176     COMPARISON_OPERATORS = {
2177         '<': operator.lt,
2178         '<=': operator.le,
2179         '>': operator.gt,
2180         '>=': operator.ge,
2181         '=': operator.eq,
2182         '!=': operator.ne,
2183     }
2184     operator_rex = re.compile(r'''(?x)\s*
2185         (?P<key>[a-z_]+)
2186         \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2187         (?:
2188             (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2189             (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2190         )
2191         \s*$
2192         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2193     m = operator_rex.search(filter_part)
2194     if m:
2195         op = COMPARISON_OPERATORS[m.group('op')]
2196         if m.group('strval') is not None:
2197             if m.group('op') not in ('=', '!='):
2198                 raise ValueError(
2199                     'Operator %s does not support string values!' % m.group('op'))
2200             comparison_value = m.group('strval')
2201         else:
2202             try:
2203                 comparison_value = int(m.group('intval'))
2204             except ValueError:
2205                 comparison_value = parse_filesize(m.group('intval'))
2206                 if comparison_value is None:
2207                     comparison_value = parse_filesize(m.group('intval') + 'B')
2208                 if comparison_value is None:
2209                     raise ValueError(
2210                         'Invalid integer value %r in filter part %r' % (
2211                             m.group('intval'), filter_part))
2212         actual_value = dct.get(m.group('key'))
2213         if actual_value is None:
2214             return m.group('none_inclusive')
2215         return op(actual_value, comparison_value)
2216
2217     UNARY_OPERATORS = {
2218         '': lambda v: v is not None,
2219         '!': lambda v: v is None,
2220     }
2221     operator_rex = re.compile(r'''(?x)\s*
2222         (?P<op>%s)\s*(?P<key>[a-z_]+)
2223         \s*$
2224         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2225     m = operator_rex.search(filter_part)
2226     if m:
2227         op = UNARY_OPERATORS[m.group('op')]
2228         actual_value = dct.get(m.group('key'))
2229         return op(actual_value)
2230
2231     raise ValueError('Invalid filter part %r' % filter_part)
2232
2233
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false

    filter_str consists of parts separated by '&'; every part has to
    match for the dictionary to pass. """
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2239
2240
def match_filter_func(filter_str):
    """ Returns a function that checks an info dict against filter_str.

    The returned function yields None when the info dict passes the filter
    and a message explaining that the video is skipped otherwise. """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        # Name the video by its title, falling back to its id
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2249
2250
def parse_dfxp_time_expr(time_expr):
    """ Parses a DFXP time expression into seconds (float).

    Supported are plain offsets in seconds with an optional 's' suffix
    ('1.5', '1.5s') and clock values 'H:MM:SS' whose seconds may carry a
    fraction separated by '.' or ':'. Returns None for empty input and for
    expressions in any other form. """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if not clock:
        return None
    hours, minutes, seconds = clock.groups()
    # A ':' in front of the fraction is treated like a decimal point
    return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
2262
2263
def srt_subtitles_timecode(seconds):
    """ Formats a number of seconds as an SRT timecode 'HH:MM:SS,mmm'. """
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d truncates the fractional part of each value
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2266
2267
def dfxp2srt(dfxp_data):
    """ Converts a DFXP/TTML subtitle document (text) into SRT text.

    Raises ValueError when the document contains no <p> elements.
    Paragraphs without a begin time, or with neither an end time nor a
    duration, are left out; the remaining ones keep the index of their
    position in the document. """
    etree = xml.etree.ElementTree
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Parser target that collects the text of a paragraph and turns
        # <br> elements into newlines
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Serialise the paragraph and re-parse it with the text collector
        parser = etree.XMLParser(target=TTMLPElementParser())
        parser.feed(etree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))

    # Try the known namespaces in turn, then un-namespaced <p> elements
    for namespaced_query in ('.//ttml:p', './/ttaf1:p', './/ttaf1_0604:p'):
        paras = dfxp.findall(_x(namespaced_query))
        if paras:
            break
    else:
        paras = dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    cues = []
    for index, para in enumerate(paras, 1):
        attrs = para.attrib
        begin_time = parse_dfxp_time_expr(attrs.get('begin'))
        end_time = parse_dfxp_time_expr(attrs.get('end'))
        dur = parse_dfxp_time_expr(attrs.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(cues)
2321
2322
def cli_option(params, command_option, param):
    """ Returns [command_option, value] for the value of param in params,
    or an empty list when that value is missing or None. """
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2326
2327
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """ Renders the boolean value of param in params as a command line option.

    Returns [command_option, value], or the single argument
    [command_option + separator + value] when a separator is given, where
    value is true_value or false_value. The parameter must be a bool. """
    flag = params.get(param)
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
2334
2335
def cli_valueless_option(params, command_option, param, expected_value=True):
    """ Returns [command_option] when the value of param in params equals
    expected_value, and an empty list otherwise. """
    if params.get(param) == expected_value:
        return [command_option]
    return []
2339
2340
def cli_configuration_args(params, param, default=[]):
    """ Returns the list of extra command line arguments stored under param
    in params, or default when there is none (missing or None).

    A list default is returned as a copy, so that callers extending the
    result cannot modify the default seen by later calls. A configured
    value must be a list and is returned as is. """
    ex_args = params.get(param)
    if ex_args is None:
        # The default list object is shared between all calls; never hand
        # out the object itself
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2347
2348
class ISO639Utils(object):
    """ Conversion between two-letter (ISO 639-1) and three-letter
    (ISO 639-2/T) language codes. """

    # Two-letter code -> three-letter code
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are looked up, so longer tags such
        # as 'en-US' resolve through their 'en' prefix
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        candidates = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        # None when no two-letter code maps to the given code
        return next(candidates, None)
2549
2550
2551 class ISO3166Utils(object):
2552     # From http://data.okfn.org/data/core/country-list
2553     _country_map = {
2554         'AF': 'Afghanistan',
2555         'AX': 'Åland Islands',
2556         'AL': 'Albania',
2557         'DZ': 'Algeria',
2558         'AS': 'American Samoa',
2559         'AD': 'Andorra',
2560         'AO': 'Angola',
2561         'AI': 'Anguilla',
2562         'AQ': 'Antarctica',
2563         'AG': 'Antigua and Barbuda',
2564         'AR': 'Argentina',
2565         'AM': 'Armenia',
2566         'AW': 'Aruba',
2567         'AU': 'Australia',
2568         'AT': 'Austria',
2569         'AZ': 'Azerbaijan',
2570         'BS': 'Bahamas',
2571         'BH': 'Bahrain',
2572         'BD': 'Bangladesh',
2573         'BB': 'Barbados',
2574         'BY': 'Belarus',
2575         'BE': 'Belgium',
2576         'BZ': 'Belize',
2577         'BJ': 'Benin',
2578         'BM': 'Bermuda',
2579         'BT': 'Bhutan',
2580         'BO': 'Bolivia, Plurinational State of',
2581         'BQ': 'Bonaire, Sint Eustatius and Saba',
2582         'BA': 'Bosnia and Herzegovina',
2583         'BW': 'Botswana',
2584         'BV': 'Bouvet Island',
2585         'BR': 'Brazil',
2586         'IO': 'British Indian Ocean Territory',
2587         'BN': 'Brunei Darussalam',
2588         'BG': 'Bulgaria',
2589         'BF': 'Burkina Faso',
2590         'BI': 'Burundi',
2591         'KH': 'Cambodia',
2592         'CM': 'Cameroon',
2593         'CA': 'Canada',
2594         'CV': 'Cape Verde',
2595         'KY': 'Cayman Islands',
2596         'CF': 'Central African Republic',
2597         'TD': 'Chad',
2598         'CL': 'Chile',
2599         'CN': 'China',
2600         'CX': 'Christmas Island',
2601         'CC': 'Cocos (Keeling) Islands',
2602         'CO': 'Colombia',
2603         'KM': 'Comoros',
2604         'CG': 'Congo',
2605         'CD': 'Congo, the Democratic Republic of the',
2606         'CK': 'Cook Islands',
2607         'CR': 'Costa Rica',
2608         'CI': 'Côte d\'Ivoire',
2609         'HR': 'Croatia',
2610         'CU': 'Cuba',
2611         'CW': 'Curaçao',
2612         'CY': 'Cyprus',
2613         'CZ': 'Czech Republic',
2614         'DK': 'Denmark',
2615         'DJ': 'Djibouti',
2616         'DM': 'Dominica',
2617         'DO': 'Dominican Republic',
2618         'EC': 'Ecuador',
2619         'EG': 'Egypt',
2620         'SV': 'El Salvador',
2621         'GQ': 'Equatorial Guinea',
2622         'ER': 'Eritrea',
2623         'EE': 'Estonia',
2624         'ET': 'Ethiopia',
2625         'FK': 'Falkland Islands (Malvinas)',
2626         'FO': 'Faroe Islands',
2627         'FJ': 'Fiji',
2628         'FI': 'Finland',
2629         'FR': 'France',
2630         'GF': 'French Guiana',
2631         'PF': 'French Polynesia',
2632         'TF': 'French Southern Territories',
2633         'GA': 'Gabon',
2634         'GM': 'Gambia',
2635         'GE': 'Georgia',
2636         'DE': 'Germany',
2637         'GH': 'Ghana',
2638         'GI': 'Gibraltar',
2639         'GR': 'Greece',
2640         'GL': 'Greenland',
2641         'GD': 'Grenada',
2642         'GP': 'Guadeloupe',
2643         'GU': 'Guam',
2644         'GT': 'Guatemala',
2645         'GG': 'Guernsey',
2646         'GN': 'Guinea',
2647         'GW': 'Guinea-Bissau',
2648         'GY': 'Guyana',
2649         'HT': 'Haiti',
2650         'HM': 'Heard Island and McDonald Islands',
2651         'VA': 'Holy See (Vatican City State)',
2652         'HN': 'Honduras',
2653         'HK': 'Hong Kong',
2654         'HU': 'Hungary',
2655         'IS': 'Iceland',
2656         'IN': 'India',
2657         'ID': 'Indonesia',
2658         'IR': 'Iran, Islamic Republic of',
2659         'IQ': 'Iraq',
2660         'IE': 'Ireland',
2661         'IM': 'Isle of Man',
2662         'IL': 'Israel',
2663         'IT': 'Italy',
2664         'JM': 'Jamaica',
2665         'JP': 'Japan',
2666         'JE': 'Jersey',
2667         'JO': 'Jordan',
2668         'KZ': 'Kazakhstan',
2669         'KE': 'Kenya',
2670         'KI': 'Kiribati',
2671         'KP': 'Korea, Democratic People\'s Republic of',
2672         'KR': 'Korea, Republic of',
2673         'KW': 'Kuwait',
2674         'KG': 'Kyrgyzstan',
2675         'LA': 'Lao People\'s Democratic Republic',
2676         'LV': 'Latvia',
2677         'LB': 'Lebanon',
2678         'LS': 'Lesotho',
2679         'LR': 'Liberia',
2680         'LY': 'Libya',
2681         'LI': 'Liechtenstein',
2682         'LT': 'Lithuania',
2683         'LU': 'Luxembourg',
2684         'MO': 'Macao',
2685         'MK': 'Macedonia, the Former Yugoslav Republic of',
2686         'MG': 'Madagascar',
2687         'MW': 'Malawi',
2688         'MY': 'Malaysia',
2689         'MV': 'Maldives',
2690         'ML': 'Mali',
2691         'MT': 'Malta',
2692         'MH': 'Marshall Islands',
2693         'MQ': 'Martinique',
2694         'MR': 'Mauritania',
2695         'MU': 'Mauritius',
2696         'YT': 'Mayotte',
2697         'MX': 'Mexico',
2698         'FM': 'Micronesia, Federated States of',
2699         'MD': 'Moldova, Republic of',
2700         'MC': 'Monaco',
2701         'MN': 'Mongolia',
2702         'ME': 'Montenegro',
2703         'MS': 'Montserrat',
2704         'MA': 'Morocco',
2705         'MZ': 'Mozambique',
2706         'MM': 'Myanmar',
2707         'NA': 'Namibia',
2708         'NR': 'Nauru',
2709         'NP': 'Nepal',
2710         'NL': 'Netherlands',
2711         'NC': 'New Caledonia',
2712         'NZ': 'New Zealand',
2713         'NI': 'Nicaragua',
2714         'NE': 'Niger',
2715         'NG': 'Nigeria',
2716         'NU': 'Niue',
2717         'NF': 'Norfolk Island',
2718         'MP': 'Northern Mariana Islands',
2719         'NO': 'Norway',
2720         'OM': 'Oman',
2721         'PK': 'Pakistan',
2722         'PW': 'Palau',
2723         'PS': 'Palestine, State of',
2724         'PA': 'Panama',
2725         'PG': 'Papua New Guinea',
2726         'PY': 'Paraguay',
2727         'PE': 'Peru',
2728         'PH': 'Philippines',
2729         'PN': 'Pitcairn',
2730         'PL': 'Poland',
2731         'PT': 'Portugal',
2732         'PR': 'Puerto Rico',
2733         'QA': 'Qatar',
2734         'RE': 'Réunion',
2735         'RO': 'Romania',
2736         'RU': 'Russian Federation',
2737         'RW': 'Rwanda',
2738         'BL': 'Saint Barthélemy',
2739         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2740         'KN': 'Saint Kitts and Nevis',
2741         'LC': 'Saint Lucia',
2742         'MF': 'Saint Martin (French part)',
2743         'PM': 'Saint Pierre and Miquelon',
2744         'VC': 'Saint Vincent and the Grenadines',
2745         'WS': 'Samoa',
2746         'SM': 'San Marino',
2747         'ST': 'Sao Tome and Principe',
2748         'SA': 'Saudi Arabia',
2749         'SN': 'Senegal',
2750         'RS': 'Serbia',
2751         'SC': 'Seychelles',
2752         'SL': 'Sierra Leone',
2753         'SG': 'Singapore',
2754         'SX': 'Sint Maarten (Dutch part)',
2755         'SK': 'Slovakia',
2756         'SI': 'Slovenia',
2757         'SB': 'Solomon Islands',
2758         'SO': 'Somalia',
2759         'ZA': 'South Africa',
2760         'GS': 'South Georgia and the South Sandwich Islands',
2761         'SS': 'South Sudan',
2762         'ES': 'Spain',
2763         'LK': 'Sri Lanka',
2764         'SD': 'Sudan',
2765         'SR': 'Suriname',
2766         'SJ': 'Svalbard and Jan Mayen',
2767         'SZ': 'Swaziland',
2768         'SE': 'Sweden',
2769         'CH': 'Switzerland',
2770         'SY': 'Syrian Arab Republic',
2771         'TW': 'Taiwan, Province of China',
2772         'TJ': 'Tajikistan',
2773         'TZ': 'Tanzania, United Republic of',
2774         'TH': 'Thailand',
2775         'TL': 'Timor-Leste',
2776         'TG': 'Togo',
2777         'TK': 'Tokelau',
2778         'TO': 'Tonga',
2779         'TT': 'Trinidad and Tobago',
2780         'TN': 'Tunisia',
2781         'TR': 'Turkey',
2782         'TM': 'Turkmenistan',
2783         'TC': 'Turks and Caicos Islands',
2784         'TV': 'Tuvalu',
2785         'UG': 'Uganda',
2786         'UA': 'Ukraine',
2787         'AE': 'United Arab Emirates',
2788         'GB': 'United Kingdom',
2789         'US': 'United States',
2790         'UM': 'United States Minor Outlying Islands',
2791         'UY': 'Uruguay',
2792         'UZ': 'Uzbekistan',
2793         'VU': 'Vanuatu',
2794         'VE': 'Venezuela, Bolivarian Republic of',
2795         'VN': 'Viet Nam',
2796         'VG': 'Virgin Islands, British',
2797         'VI': 'Virgin Islands, U.S.',
2798         'WF': 'Wallis and Futuna',
2799         'EH': 'Western Sahara',
2800         'YE': 'Yemen',
2801         'ZM': 'Zambia',
2802         'ZW': 'Zimbabwe',
2803     }
2804
2805     @classmethod
2806     def short2full(cls, code):
2807         """Convert an ISO 3166-2 country code to the corresponding full name"""
2808         return cls._country_map.get(code.upper())
2809
2810
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """
    ProxyHandler that lets every request choose its own proxy.

    A request carrying a 'Ytdl-request-proxy' header is routed through the
    proxy named there (or through no proxy at all for the value
    '__noproxy__') instead of the proxy configured for its scheme. The header
    is removed from the request once it has been read.
    """

    def __init__(self, proxies=None):
        # Install default openers for http and https so that proxy_open()
        # is consulted for these schemes even when no proxy is configured;
        # the base initializer runs afterwards with the given proxies.
        bound_proxy_open = self.proxy_open
        for scheme in ('http', 'https'):
            opener = (
                lambda r, proxy='__noproxy__', type=scheme, meth=bound_proxy_open:
                meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, opener)
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """
        Apply the proxy for a single request.

        A 'Ytdl-request-proxy' header on req overrides proxy. Returns None
        when no proxy is to be used or when the proxy is a SOCKS one (which
        is only recorded in a 'Ytdl-socks-proxy' header); otherwise returns
        whatever the base class proxy_open() returns.
        """
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = per_request_proxy

        if proxy == '__noproxy__':
            # No proxy for this request
            return None

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # youtube-dl's http/https handlers wrap the socket with SOCKS
            # themselves; just pass the proxy URL on to them in a header
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
2834
2835
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt data with OHDave's RSA algorithm (http://www.ohdave.com/rsa/).

    Input:
        data: bytes-like object holding the data to encrypt
        exponent, modulus: integers, parameters e and N of the RSA algorithm
    Output: the encrypted data as a lowercase hex string

    Limitation: only a single block is encrypted
    '''

    # The bytes are reversed before conversion, so the first byte of data
    # becomes the least significant byte of the integer payload
    reversed_hex = binascii.hexlify(data[::-1])
    plain_number = int(reversed_hex, 16)
    return '%x' % pow(plain_number, exponent, modulus)
2851
2852
def encode_base_n(num, n, table=None):
    """
    Convert a non-negative integer to its string representation in base n.

    Input:
        num: non-negative integer to encode
        n: the base
        table: optional string of digit characters indexed by digit value;
            when omitted or empty, the first n characters of
            0-9, a-z, A-Z are used
    Output: the digits of num taken from table, most significant first

    Raises ValueError if n exceeds the table length, if num is negative or
    if a non-zero num is to be encoded in a base smaller than 2.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Repeated floor division never reaches 0 for a negative number or for
    # base 1 (and base 0 divides by zero), so reject these explicitly
    # instead of looping forever.
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)
    if n < 2:
        raise ValueError('base %d is too small' % n)

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
2869
2870
def decode_packed_codes(code):
    """
    Decode JavaScript that was obfuscated in "packed" form, i.e. code
    containing a call of the shape
        }('<payload>',<base>,<count>,'<sym0>|<sym1>|...'.split('|')

    Each word of the payload that is the base-<base> encoding of the index
    of a non-empty symbol (index below <count>) is replaced by that symbol.
    All other words, including those whose symbol is empty, are kept as is.

    Input:
        code: string containing the packed call
    Output: the payload with the symbols substituted

    Raises AttributeError if code contains no packed call, since the search
    result is used unchecked.
    """
    mobj = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # Only non-empty symbols need an entry: every other word stands for
    # itself, which is what the lookup fallback below yields. Slicing also
    # tolerates a declared count larger than the number of symbols.
    symbol_table = {}
    for index, symbol in enumerate(symbols[:count]):
        if symbol:
            symbol_table[encode_base_n(index, base)] = symbol

    # Words that are not in the table (e.g. non-ASCII words, which \w may
    # match) are left untouched instead of raising KeyError
    return re.sub(
        r'\b(\w+)\b',
        lambda word: symbol_table.get(word.group(0), word.group(0)),
        obfuscated_code)
2889
2890
def parse_m3u8_attributes(attrib):
    """
    Parse an m3u8 attribute list (KEY=value,KEY="quoted value",...) into a
    dict mapping attribute names to string values.

    Surrounding double quotes are stripped from quoted values, which may
    themselves contain commas. If a key occurs more than once, the last
    occurrence wins.
    """
    pairs = re.findall(
        r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return dict(
        (name, value[1:-1] if value.startswith('"') else value)
        for name, value in pairs)