git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import contextlib
  11 import ctypes
  12 import datetime
  13 import email.utils
  14 import errno
  15 import functools
  16 import gzip
  17 import io
  18 import itertools
  19 import json
  20 import locale
  21 import math
  22 import operator
  23 import os
  24 import pipes
  25 import platform
  26 import re
  27 import socket
  28 import ssl
  29 import subprocess
  30 import sys
  31 import tempfile
  32 import traceback
  33 import xml.etree.ElementTree
  34 import zlib
  35
  36 from .compat import (
  37     compat_HTMLParser,
  38     compat_basestring,
  39     compat_chr,
  40     compat_etree_fromstring,
  41     compat_html_entities,
  42     compat_http_client,
  43     compat_kwargs,
  44     compat_parse_qs,
  45     compat_socket_create_connection,
  46     compat_str,
  47     compat_urllib_error,
  48     compat_urllib_parse,
  49     compat_urllib_parse_urlencode,
  50     compat_urllib_parse_urlparse,
  51     compat_urllib_request,
  52     compat_urlparse,
  53     compat_xpath,
  54     shlex_quote,
  55     struct_pack,
  56 )
  57
  58 from .socks import (
  59     ProxyType,
  60     sockssocket,
  61 )
  62
  63
  64 # This is not clearly defined otherwise
  65 compiled_regex_type = type(re.compile(''))
  66
  67 std_headers = {
  68     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
  69     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  70     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  71     'Accept-Encoding': 'gzip, deflate',
  72     'Accept-Language': 'en-us,en;q=0.5',
  73 }
  74
  75
  76 NO_DEFAULT = object()
  77
  78 ENGLISH_MONTH_NAMES = [
  79     'January', 'February', 'March', 'April', 'May', 'June',
  80     'July', 'August', 'September', 'October', 'November', 'December']
  81
  82 KNOWN_EXTENSIONS = (
  83     'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
  84     'flv', 'f4v', 'f4a', 'f4b',
  85     'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
  86     'mkv', 'mka', 'mk3d',
  87     'avi', 'divx',
  88     'mov',
  89     'asf', 'wmv', 'wma',
  90     '3gp', '3g2',
  91     'mp3',
  92     'flac',
  93     'ape',
  94     'wav',
  95     'f4f', 'f4m', 'm3u8', 'smil')
  96
  97 # needed for sanitizing filenames in restricted mode
  98 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ',
  99                         itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'],
 100                                         'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy')))
 101
 102
 103 def preferredencoding():
 104     """Get preferred encoding.
 105
 106     Returns the best encoding scheme for the system, based on
 107     locale.getpreferredencoding() and some further tweaks.
 108     """
 109     try:
 110         pref = locale.getpreferredencoding()
 111         'TEST'.encode(pref)
 112     except Exception:
 113         pref = 'UTF-8'
 114
 115     return pref
 116
 117
 118 def write_json_file(obj, fn):
 119     """ Encode obj as JSON and write it to fn, atomically if possible """
 120
 121     fn = encodeFilename(fn)
 122     if sys.version_info < (3, 0) and sys.platform != 'win32':
 123         encoding = get_filesystem_encoding()
 124         # os.path.basename returns a bytes object, but NamedTemporaryFile
 125         # will fail if the filename contains non ascii characters unless we
 126         # use a unicode object
 127         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 128         # the same for os.path.dirname
 129         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 130     else:
 131         path_basename = os.path.basename
 132         path_dirname = os.path.dirname
 133
 134     args = {
 135         'suffix': '.tmp',
 136         'prefix': path_basename(fn) + '.',
 137         'dir': path_dirname(fn),
 138         'delete': False,
 139     }
 140
 141     # In Python 2.x, json.dump expects a bytestream.
 142     # In Python 3.x, it writes to a character stream
 143     if sys.version_info < (3, 0):
 144         args['mode'] = 'wb'
 145     else:
 146         args.update({
 147             'mode': 'w',
 148             'encoding': 'utf-8',
 149         })
 150
 151     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 152
 153     try:
 154         with tf:
 155             json.dump(obj, tf)
 156         if sys.platform == 'win32':
 157             # Need to remove existing file on Windows, else os.rename raises
 158             # WindowsError or FileExistsError.
 159             try:
 160                 os.unlink(fn)
 161             except OSError:
 162                 pass
 163         os.rename(tf.name, fn)
 164     except Exception:
 165         try:
 166             os.remove(tf.name)
 167         except OSError:
 168             pass
 169         raise
 170
 171
 172 if sys.version_info >= (2, 7):
 173     def find_xpath_attr(node, xpath, key, val=None):
 174         """ Find the xpath xpath[@key=val] """
 175         assert re.match(r'^[a-zA-Z_-]+$', key)
 176         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 177         return node.find(expr)
 178 else:
 179     def find_xpath_attr(node, xpath, key, val=None):
 180         for f in node.findall(compat_xpath(xpath)):
 181             if key not in f.attrib:
 182                 continue
 183             if val is None or f.attrib.get(key) == val:
 184                 return f
 185         return None
 186
 187 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 188 # the namespace parameter
 189
 190
 191 def xpath_with_ns(path, ns_map):
 192     components = [c.split(':') for c in path.split('/')]
 193     replaced = []
 194     for c in components:
 195         if len(c) == 1:
 196             replaced.append(c[0])
 197         else:
 198             ns, tag = c
 199             replaced.append('{%s}%s' % (ns_map[ns], tag))
 200     return '/'.join(replaced)
 201
 202
 203 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 204     def _find_xpath(xpath):
 205         return node.find(compat_xpath(xpath))
 206
 207     if isinstance(xpath, (str, compat_str)):
 208         n = _find_xpath(xpath)
 209     else:
 210         for xp in xpath:
 211             n = _find_xpath(xp)
 212             if n is not None:
 213                 break
 214
 215     if n is None:
 216         if default is not NO_DEFAULT:
 217             return default
 218         elif fatal:
 219             name = xpath if name is None else name
 220             raise ExtractorError('Could not find XML element %s' % name)
 221         else:
 222             return None
 223     return n
 224
 225
 226 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 227     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 228     if n is None or n == default:
 229         return n
 230     if n.text is None:
 231         if default is not NO_DEFAULT:
 232             return default
 233         elif fatal:
 234             name = xpath if name is None else name
 235             raise ExtractorError('Could not find XML element\'s text %s' % name)
 236         else:
 237             return None
 238     return n.text
 239
 240
 241 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 242     n = find_xpath_attr(node, xpath, key)
 243     if n is None:
 244         if default is not NO_DEFAULT:
 245             return default
 246         elif fatal:
 247             name = '%s[@%s]' % (xpath, key) if name is None else name
 248             raise ExtractorError('Could not find XML attribute %s' % name)
 249         else:
 250             return None
 251     return n.attrib[key]
 252
 253
 254 def get_element_by_id(id, html):
 255     """Return the content of the tag with the specified ID in the passed HTML document"""
 256     return get_element_by_attribute('id', id, html)
 257
 258
 259 def get_element_by_attribute(attribute, value, html):
 260     """Return the content of the tag with the specified attribute in the passed HTML document"""
 261
 262     m = re.search(r'''(?xs)
 263         <([a-zA-Z0-9:._-]+)
 264          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 265          \s+%s=['"]?%s['"]?
 266          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 267         \s*>
 268         (?P<content>.*?)
 269         </\1>
 270     ''' % (re.escape(attribute), re.escape(value)), html)
 271
 272     if not m:
 273         return None
 274     res = m.group('content')
 275
 276     if res.startswith('"') or res.startswith("'"):
 277         res = res[1:-1]
 278
 279     return unescapeHTML(res)
 280
 281
 282 class HTMLAttributeParser(compat_HTMLParser):
 283     """Trivial HTML parser to gather the attributes for a single element"""
 284     def __init__(self):
 285         self.attrs = {}
 286         compat_HTMLParser.__init__(self)
 287
 288     def handle_starttag(self, tag, attrs):
 289         self.attrs = dict(attrs)
 290
 291
 292 def extract_attributes(html_element):
 293     """Given a string for an HTML element such as
 294     <el
 295          a="foo" B="bar" c="&98;az" d=boz
 296          empty= noval entity="&amp;"
 297          sq='"' dq="'"
 298     >
 299     Decode and return a dictionary of attributes.
 300     {
 301         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 302         'empty': '', 'noval': None, 'entity': '&',
 303         'sq': '"', 'dq': '\''
 304     }.
 305     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 306     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 307     """
 308     parser = HTMLAttributeParser()
 309     parser.feed(html_element)
 310     parser.close()
 311     return parser.attrs
 312
 313
 314 def clean_html(html):
 315     """Clean an HTML snippet into a readable string"""
 316
 317     if html is None:  # Convenience for sanitizing descriptions etc.
 318         return html
 319
 320     # Newline vs <br />
 321     html = html.replace('\n', ' ')
 322     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 323     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 324     # Strip html tags
 325     html = re.sub('<.*?>', '', html)
 326     # Replace html entities
 327     html = unescapeHTML(html)
 328     return html.strip()
 329
 330
 331 def sanitize_open(filename, open_mode):
 332     """Try to open the given filename, and slightly tweak it if this fails.
 333
 334     Attempts to open the given filename. If this fails, it tries to change
 335     the filename slightly, step by step, until it's either able to open it
 336     or it fails and raises a final exception, like the standard open()
 337     function.
 338
 339     It returns the tuple (stream, definitive_file_name).
 340     """
 341     try:
 342         if filename == '-':
 343             if sys.platform == 'win32':
 344                 import msvcrt
 345                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 346             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 347         stream = open(encodeFilename(filename), open_mode)
 348         return (stream, filename)
 349     except (IOError, OSError) as err:
 350         if err.errno in (errno.EACCES,):
 351             raise
 352
 353         # In case of error, try to remove win32 forbidden chars
 354         alt_filename = sanitize_path(filename)
 355         if alt_filename == filename:
 356             raise
 357         else:
 358             # An exception here should be caught in the caller
 359             stream = open(encodeFilename(alt_filename), open_mode)
 360             return (stream, alt_filename)
 361
 362
 363 def timeconvert(timestr):
 364     """Convert RFC 2822 defined time string into system timestamp"""
 365     timestamp = None
 366     timetuple = email.utils.parsedate_tz(timestr)
 367     if timetuple is not None:
 368         timestamp = email.utils.mktime_tz(timetuple)
 369     return timestamp
 370
 371
 372 def sanitize_filename(s, restricted=False, is_id=False):
 373     """Sanitizes a string so it could be used as part of a filename.
 374     If restricted is set, use a stricter subset of allowed characters.
 375     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 376     """
 377     def replace_insane(char):
 378         if restricted and char in ACCENT_CHARS:
 379             return ACCENT_CHARS[char]
 380         if char == '?' or ord(char) < 32 or ord(char) == 127:
 381             return ''
 382         elif char == '"':
 383             return '' if restricted else '\''
 384         elif char == ':':
 385             return '_-' if restricted else ' -'
 386         elif char in '\\/|*<>':
 387             return '_'
 388         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 389             return '_'
 390         if restricted and ord(char) > 127:
 391             return '_'
 392         return char
 393
 394     # Handle timestamps
 395     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 396     result = ''.join(map(replace_insane, s))
 397     if not is_id:
 398         while '__' in result:
 399             result = result.replace('__', '_')
 400         result = result.strip('_')
 401         # Common case of "Foreign band name - English song title"
 402         if restricted and result.startswith('-_'):
 403             result = result[2:]
 404         if result.startswith('-'):
 405             result = '_' + result[len('-'):]
 406         result = result.lstrip('.')
 407         if not result:
 408             result = '_'
 409     return result
 410
 411
 412 def sanitize_path(s):
 413     """Sanitizes and normalizes path on Windows"""
 414     if sys.platform != 'win32':
 415         return s
 416     drive_or_unc, _ = os.path.splitdrive(s)
 417     if sys.version_info < (2, 7) and not drive_or_unc:
 418         drive_or_unc, _ = os.path.splitunc(s)
 419     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 420     if drive_or_unc:
 421         norm_path.pop(0)
 422     sanitized_path = [
 423         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
 424         for path_part in norm_path]
 425     if drive_or_unc:
 426         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 427     return os.path.join(*sanitized_path)
 428
 429
 430 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
 431 # unwanted failures due to missing protocol
 432 def sanitize_url(url):
 433     return 'http:%s' % url if url.startswith('//') else url
 434
 435
 436 def sanitized_Request(url, *args, **kwargs):
 437     return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
 438
 439
 440 def orderedSet(iterable):
 441     """ Remove all duplicates from the input iterable """
 442     res = []
 443     for el in iterable:
 444         if el not in res:
 445             res.append(el)
 446     return res
 447
 448
 449 def _htmlentity_transform(entity):
 450     """Transforms an HTML entity to a character."""
 451     # Known non-numeric HTML entity
 452     if entity in compat_html_entities.name2codepoint:
 453         return compat_chr(compat_html_entities.name2codepoint[entity])
 454
 455     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 456     if mobj is not None:
 457         numstr = mobj.group(1)
 458         if numstr.startswith('x'):
 459             base = 16
 460             numstr = '0%s' % numstr
 461         else:
 462             base = 10
 463         # See https://github.com/rg3/youtube-dl/issues/7518
 464         try:
 465             return compat_chr(int(numstr, base))
 466         except ValueError:
 467             pass
 468
 469     # Unknown entity in name, return its literal representation
 470     return '&%s;' % entity
 471
 472
 473 def unescapeHTML(s):
 474     if s is None:
 475         return None
 476     assert type(s) == compat_str
 477
 478     return re.sub(
 479         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 480
 481
 482 def get_subprocess_encoding():
 483     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 484         # For subprocess calls, encode with locale encoding
 485         # Refer to http://stackoverflow.com/a/9951851/35070
 486         encoding = preferredencoding()
 487     else:
 488         encoding = sys.getfilesystemencoding()
 489     if encoding is None:
 490         encoding = 'utf-8'
 491     return encoding
 492
 493
 494 def encodeFilename(s, for_subprocess=False):
 495     """
 496     @param s The name of the file
 497     """
 498
 499     assert type(s) == compat_str
 500
 501     # Python 3 has a Unicode API
 502     if sys.version_info >= (3, 0):
 503         return s
 504
 505     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 506     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 507     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 508     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 509         return s
 510
 511     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 512     if sys.platform.startswith('java'):
 513         return s
 514
 515     return s.encode(get_subprocess_encoding(), 'ignore')
 516
 517
 518 def decodeFilename(b, for_subprocess=False):
 519
 520     if sys.version_info >= (3, 0):
 521         return b
 522
 523     if not isinstance(b, bytes):
 524         return b
 525
 526     return b.decode(get_subprocess_encoding(), 'ignore')
 527
 528
 529 def encodeArgument(s):
 530     if not isinstance(s, compat_str):
 531         # Legacy code that uses byte strings
 532         # Uncomment the following line after fixing all post processors
 533         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 534         s = s.decode('ascii')
 535     return encodeFilename(s, True)
 536
 537
 538 def decodeArgument(b):
 539     return decodeFilename(b, True)
 540
 541
 542 def decodeOption(optval):
 543     if optval is None:
 544         return optval
 545     if isinstance(optval, bytes):
 546         optval = optval.decode(preferredencoding())
 547
 548     assert isinstance(optval, compat_str)
 549     return optval
 550
 551
 552 def formatSeconds(secs):
 553     if secs > 3600:
 554         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 555     elif secs > 60:
 556         return '%d:%02d' % (secs // 60, secs % 60)
 557     else:
 558         return '%d' % secs
 559
 560
 561 def make_HTTPS_handler(params, **kwargs):
 562     opts_no_check_certificate = params.get('nocheckcertificate', False)
 563     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 564         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 565         if opts_no_check_certificate:
 566             context.check_hostname = False
 567             context.verify_mode = ssl.CERT_NONE
 568         try:
 569             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 570         except TypeError:
 571             # Python 2.7.8
 572             # (create_default_context present but HTTPSHandler has no context=)
 573             pass
 574
 575     if sys.version_info < (3, 2):
 576         return YoutubeDLHTTPSHandler(params, **kwargs)
 577     else:  # Python < 3.4
 578         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 579         context.verify_mode = (ssl.CERT_NONE
 580                                if opts_no_check_certificate
 581                                else ssl.CERT_REQUIRED)
 582         context.set_default_verify_paths()
 583         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 584
 585
 586 def bug_reports_message():
 587     if ytdl_is_updateable():
 588         update_cmd = 'type  youtube-dl -U  to update'
 589     else:
 590         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 591     msg = '; please report this issue on https://yt-dl.org/bug .'
 592     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 593     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 594     return msg
 595
 596
 597 class ExtractorError(Exception):
 598     """Error during info extraction."""
 599
 600     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 601         """ tb, if given, is the original traceback (so that it can be printed out).
 602         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 603         """
 604
 605         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 606             expected = True
 607         if video_id is not None:
 608             msg = video_id + ': ' + msg
 609         if cause:
 610             msg += ' (caused by %r)' % cause
 611         if not expected:
 612             msg += bug_reports_message()
 613         super(ExtractorError, self).__init__(msg)
 614
 615         self.traceback = tb
 616         self.exc_info = sys.exc_info()  # preserve original exception
 617         self.cause = cause
 618         self.video_id = video_id
 619
 620     def format_traceback(self):
 621         if self.traceback is None:
 622             return None
 623         return ''.join(traceback.format_tb(self.traceback))
 624
 625
 626 class UnsupportedError(ExtractorError):
 627     def __init__(self, url):
 628         super(UnsupportedError, self).__init__(
 629             'Unsupported URL: %s' % url, expected=True)
 630         self.url = url
 631
 632
 633 class RegexNotFoundError(ExtractorError):
 634     """Error when a regex didn't match"""
 635     pass
 636
 637
 638 class DownloadError(Exception):
 639     """Download Error exception.
 640
 641     This exception may be thrown by FileDownloader objects if they are not
 642     configured to continue on errors. They will contain the appropriate
 643     error message.
 644     """
 645
 646     def __init__(self, msg, exc_info=None):
 647         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 648         super(DownloadError, self).__init__(msg)
 649         self.exc_info = exc_info
 650
 651
 652 class SameFileError(Exception):
 653     """Same File exception.
 654
 655     This exception will be thrown by FileDownloader objects if they detect
 656     multiple files would have to be downloaded to the same file on disk.
 657     """
 658     pass
 659
 660
 661 class PostProcessingError(Exception):
 662     """Post Processing exception.
 663
 664     This exception may be raised by PostProcessor's .run() method to
 665     indicate an error in the postprocessing task.
 666     """
 667
 668     def __init__(self, msg):
 669         self.msg = msg
 670
 671
 672 class MaxDownloadsReached(Exception):
 673     """ --max-downloads limit has been reached. """
 674     pass
 675
 676
 677 class UnavailableVideoError(Exception):
 678     """Unavailable Format exception.
 679
 680     This exception will be thrown when a video is requested
 681     in a format that is not available for that video.
 682     """
 683     pass
 684
 685
 686 class ContentTooShortError(Exception):
 687     """Content Too Short exception.
 688
 689     This exception may be raised by FileDownloader objects when a file they
 690     download is too small for what the server announced first, indicating
 691     the connection was probably interrupted.
 692     """
 693
 694     def __init__(self, downloaded, expected):
 695         # Both in bytes
 696         self.downloaded = downloaded
 697         self.expected = expected
 698
 699
 700 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 701     # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
 702     # expected HTTP responses to meet HTTP/1.0 or later (see also
 703     # https://github.com/rg3/youtube-dl/issues/6727)
 704     if sys.version_info < (3, 0):
 705         kwargs[b'strict'] = True
 706     hc = http_class(*args, **kwargs)
 707     source_address = ydl_handler._params.get('source_address')
 708     if source_address is not None:
 709         sa = (source_address, 0)
 710         if hasattr(hc, 'source_address'):  # Python 2.7+
 711             hc.source_address = sa
 712         else:  # Python 2.6
 713             def _hc_connect(self, *args, **kwargs):
 714                 sock = compat_socket_create_connection(
 715                     (self.host, self.port), self.timeout, sa)
 716                 if is_https:
 717                     self.sock = ssl.wrap_socket(
 718                         sock, self.key_file, self.cert_file,
 719                         ssl_version=ssl.PROTOCOL_TLSv1)
 720                 else:
 721                     self.sock = sock
 722             hc.connect = functools.partial(_hc_connect, hc)
 723
 724     return hc
 725
 726
 727 def handle_youtubedl_headers(headers):
 728     filtered_headers = headers
 729
 730     if 'Youtubedl-no-compression' in filtered_headers:
 731         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 732         del filtered_headers['Youtubedl-no-compression']
 733
 734     return filtered_headers
 735
 736
 737 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 738     """Handler for HTTP requests and responses.
 739
 740     This class, when installed with an OpenerDirector, automatically adds
 741     the standard headers to every HTTP request and handles gzipped and
 742     deflated responses from web servers. If compression is to be avoided in
 743     a particular request, the original request in the program code only has
 744     to include the HTTP header "Youtubedl-no-compression", which will be
 745     removed before making the real request.
 746
 747     Part of this code was copied from:
 748
 749     http://techknack.net/python-urllib2-handlers/
 750
 751     Andrew Rowls, the author of that code, agreed to release it to the
 752     public domain.
 753     """
 754
 755     def __init__(self, params, *args, **kwargs):
 756         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
 757         self._params = params
 758
 759     def http_open(self, req):
 760         conn_class = compat_http_client.HTTPConnection
 761
 762         socks_proxy = req.headers.get('Ytdl-socks-proxy')
 763         if socks_proxy:
 764             conn_class = make_socks_conn_class(conn_class, socks_proxy)
 765             del req.headers['Ytdl-socks-proxy']
 766
 767         return self.do_open(functools.partial(
 768             _create_http_connection, self, conn_class, False),
 769             req)
 770
 771     @staticmethod
 772     def deflate(data):
 773         try:
 774             return zlib.decompress(data, -zlib.MAX_WBITS)
 775         except zlib.error:
 776             return zlib.decompress(data)
 777
 778     @staticmethod
 779     def addinfourl_wrapper(stream, headers, url, code):
 780         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 781             return compat_urllib_request.addinfourl(stream, headers, url, code)
 782         ret = compat_urllib_request.addinfourl(stream, headers, url)
 783         ret.code = code
 784         return ret
 785
 786     def http_request(self, req):
 787         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
 788         # always respected by websites, some tend to give out URLs with non percent-encoded
 789         # non-ASCII characters (see telemb.py, ard.py [#3412])
 790         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
 791         # To work around aforementioned issue we will replace request's original URL with
 792         # percent-encoded one
 793         # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
 794         # the code of this workaround has been moved here from YoutubeDL.urlopen()
 795         url = req.get_full_url()
 796         url_escaped = escape_url(url)
 797
 798         # Substitute URL if any change after escaping
 799         if url != url_escaped:
 800             req = update_Request(req, url=url_escaped)
 801
 802         for h, v in std_headers.items():
 803             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
 804             # The dict keys are capitalized because of this bug by urllib
 805             if h.capitalize() not in req.headers:
 806                 req.add_header(h, v)
 807
 808         req.headers = handle_youtubedl_headers(req.headers)
 809
 810         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 811             # Python 2.6 is brain-dead when it comes to fragments
 812             req._Request__original = req._Request__original.partition('#')[0]
 813             req._Request__r_type = req._Request__r_type.partition('#')[0]
 814
 815         return req
 816
 817     def http_response(self, req, resp):
 818         old_resp = resp
 819         # gzip
 820         if resp.headers.get('Content-encoding', '') == 'gzip':
 821             content = resp.read()
 822             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 823             try:
 824                 uncompressed = io.BytesIO(gz.read())
 825             except IOError as original_ioerror:
 826                 # There may be junk add the end of the file
 827                 # See http://stackoverflow.com/q/4928560/35070 for details
 828                 for i in range(1, 1024):
 829                     try:
 830                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 831                         uncompressed = io.BytesIO(gz.read())
 832                     except IOError:
 833                         continue
 834                     break
 835                 else:
 836                     raise original_ioerror
 837             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 838             resp.msg = old_resp.msg
 839             del resp.headers['Content-encoding']
 840         # deflate
 841         if resp.headers.get('Content-encoding', '') == 'deflate':
 842             gz = io.BytesIO(self.deflate(resp.read()))
 843             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 844             resp.msg = old_resp.msg
 845             del resp.headers['Content-encoding']
 846         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
 847         # https://github.com/rg3/youtube-dl/issues/6457).
 848         if 300 <= resp.code < 400:
 849             location = resp.headers.get('Location')
 850             if location:
 851                 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
 852                 if sys.version_info >= (3, 0):
 853                     location = location.encode('iso-8859-1').decode('utf-8')
 854                 location_escaped = escape_url(location)
 855                 if location != location_escaped:
 856                     del resp.headers['Location']
 857                     resp.headers['Location'] = location_escaped
 858         return resp
 859
 860     https_request = http_request
 861     https_response = http_response
 862
 863
 864 def make_socks_conn_class(base_class, socks_proxy):
 865     assert issubclass(base_class, (
 866         compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
 867
 868     url_components = compat_urlparse.urlparse(socks_proxy)
 869     if url_components.scheme.lower() == 'socks5':
 870         socks_type = ProxyType.SOCKS5
 871     elif url_components.scheme.lower() in ('socks', 'socks4'):
 872         socks_type = ProxyType.SOCKS4
 873
 874     proxy_args = (
 875         socks_type,
 876         url_components.hostname, url_components.port or 1080,
 877         True,  # Remote DNS
 878         url_components.username, url_components.password
 879     )
 880
 881     class SocksConnection(base_class):
 882         def connect(self):
 883             self.sock = sockssocket()
 884             self.sock.setproxy(*proxy_args)
 885             if type(self.timeout) in (int, float):
 886                 self.sock.settimeout(self.timeout)
 887             self.sock.connect((self.host, self.port))
 888
 889             if isinstance(self, compat_http_client.HTTPSConnection):
 890                 if hasattr(self, '_context'):  # Python > 2.6
 891                     self.sock = self._context.wrap_socket(
 892                         self.sock, server_hostname=self.host)
 893                 else:
 894                     self.sock = ssl.wrap_socket(self.sock)
 895
 896     return SocksConnection
 897
 898
 899 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 900     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 901         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 902         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 903         self._params = params
 904
 905     def https_open(self, req):
 906         kwargs = {}
 907         conn_class = self._https_conn_class
 908
 909         if hasattr(self, '_context'):  # python > 2.6
 910             kwargs['context'] = self._context
 911         if hasattr(self, '_check_hostname'):  # python 3.x
 912             kwargs['check_hostname'] = self._check_hostname
 913
 914         socks_proxy = req.headers.get('Ytdl-socks-proxy')
 915         if socks_proxy:
 916             conn_class = make_socks_conn_class(conn_class, socks_proxy)
 917             del req.headers['Ytdl-socks-proxy']
 918
 919         return self.do_open(functools.partial(
 920             _create_http_connection, self, conn_class, True),
 921             req, **kwargs)
 922
 923
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that applies the same handling to HTTP and HTTPS."""

    def __init__(self, cookiejar=None):
        """Initialise the underlying HTTPCookieProcessor with cookiejar."""
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        """Delegate to HTTPCookieProcessor.http_response unchanged."""
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # NOTE: the workaround below is commented out, so the header is
        # currently handed to HTTPCookieProcessor as received.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # HTTPS traffic goes through exactly the same request/response hooks
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
 946
 947
 948 def parse_iso8601(date_str, delimiter='T', timezone=None):
 949     """ Return a UNIX timestamp from the given date """
 950
 951     if date_str is None:
 952         return None
 953
 954     date_str = re.sub(r'\.[0-9]+', '', date_str)
 955
 956     if timezone is None:
 957         m = re.search(
 958             r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 959             date_str)
 960         if not m:
 961             timezone = datetime.timedelta()
 962         else:
 963             date_str = date_str[:-len(m.group(0))]
 964             if not m.group('sign'):
 965                 timezone = datetime.timedelta()
 966             else:
 967                 sign = 1 if m.group('sign') == '+' else -1
 968                 timezone = datetime.timedelta(
 969                     hours=sign * int(m.group('hours')),
 970                     minutes=sign * int(m.group('minutes')))
 971     try:
 972         date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 973         dt = datetime.datetime.strptime(date_str, date_format) - timezone
 974         return calendar.timegm(dt.timetuple())
 975     except ValueError:
 976         pass
 977
 978
 979 def unified_strdate(date_str, day_first=True):
 980     """Return a string with the date in the format YYYYMMDD"""
 981
 982     if date_str is None:
 983         return None
 984     upload_date = None
 985     # Replace commas
 986     date_str = date_str.replace(',', ' ')
 987     # %z (UTC offset) is only supported in python>=3.2
 988     if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
 989         date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 990     # Remove AM/PM + timezone
 991     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
 992
 993     format_expressions = [
 994         '%d %B %Y',
 995         '%d %b %Y',
 996         '%B %d %Y',
 997         '%b %d %Y',
 998         '%b %dst %Y %I:%M',
 999         '%b %dnd %Y %I:%M',
1000         '%b %dth %Y %I:%M',
1001         '%Y %m %d',
1002         '%Y-%m-%d',
1003         '%Y/%m/%d',
1004         '%Y/%m/%d %H:%M:%S',
1005         '%Y-%m-%d %H:%M:%S',
1006         '%Y-%m-%d %H:%M:%S.%f',
1007         '%d.%m.%Y %H:%M',
1008         '%d.%m.%Y %H.%M',
1009         '%Y-%m-%dT%H:%M:%SZ',
1010         '%Y-%m-%dT%H:%M:%S.%fZ',
1011         '%Y-%m-%dT%H:%M:%S.%f0Z',
1012         '%Y-%m-%dT%H:%M:%S',
1013         '%Y-%m-%dT%H:%M:%S.%f',
1014         '%Y-%m-%dT%H:%M',
1015     ]
1016     if day_first:
1017         format_expressions.extend([
1018             '%d-%m-%Y',
1019             '%d.%m.%Y',
1020             '%d/%m/%Y',
1021             '%d/%m/%y',
1022             '%d/%m/%Y %H:%M:%S',
1023         ])
1024     else:
1025         format_expressions.extend([
1026             '%m-%d-%Y',
1027             '%m.%d.%Y',
1028             '%m/%d/%Y',
1029             '%m/%d/%y',
1030             '%m/%d/%Y %H:%M:%S',
1031         ])
1032     for expression in format_expressions:
1033         try:
1034             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1035         except ValueError:
1036             pass
1037     if upload_date is None:
1038         timetuple = email.utils.parsedate_tz(date_str)
1039         if timetuple:
1040             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1041     if upload_date is not None:
1042         return compat_str(upload_date)
1043
1044
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url, falling back to default_ext.

    The query string is ignored. Whatever follows the last dot is returned
    when it is purely alphanumeric, or when, stripped of trailing slashes,
    it is one of KNOWN_EXTENSIONS.
    """
    if url is None:
        return default_ext

    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1056
1057
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1060
1061
def date_from_str(date_str):
    """
    Return a datetime.date from a string in the format YYYYMMDD,
    'now', 'today', 'yesterday' or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError if the string matches none of these forms."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        # timedelta takes plural keyword names ('days', 'weeks')
        unit += 's'
        return today + datetime.timedelta(**{unit: amount})
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1089
1090
def hyphenate_date(date_str):
    """
    Turn a 'YYYYMMDD' date into 'YYYY-MM-DD'; any other string is returned
    unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1099
1100
class DateRange(object):
    """An inclusive interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end are strings understood by date_from_str; a missing
        bound defaults to the earliest/latest representable date"""
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Build the range covering exactly the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Tell whether date (a date object or date string) lies in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1130
1131
def platform_name():
    """ Return platform.platform() as a compat_str, decoding it if needed """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1140
1141
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s is the text to write and out the target stream. The special method is
    the WriteConsoleW API, used only when out is file descriptor 1 or 2 and
    the corresponding standard handle is an actual console. Raises OSError
    if WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters each WriteConsoleW call wrote
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A console is a character-type handle for which GetConsoleMode
        # succeeds; anything else (redirected file, pipe, ...) is rejected
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters, stopping each chunk before
    # a non-BMP character; such a character is then written on its own with
    # a length of 2
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1215
1216
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On Windows, when no encoding is forced, the console API is tried first.
    Otherwise s is encoded for binary streams (and always on Python 2),
    written to out.buffer when the stream has one, or written as text.
    Characters that cannot be encoded are dropped.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    use_console_api = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if use_console_api and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        target_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1237
1238
def bytes_to_intlist(bs):
    """Convert a byte string into the list of its byte values."""
    if not bs:
        return []
    # Indexing bytes yields ints on Python 3 but 1-char strings on Python 2
    already_ints = isinstance(bs[0], int)
    return list(bs) if already_ints else [ord(char) for char in bs]
1246
1247
def intlist_to_bytes(xs):
    """Pack a list of byte values (0-255) into a byte string."""
    if not xs:
        return b''
    # One unsigned-char field per element
    layout = '%dB' % len(xs)
    return struct_pack(layout, *xs)
1252
1253
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx on Windows, fcntl.flock elsewhere, and
# stubs that raise IOError where fcntl cannot be imported.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f from offset 0; raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can pass the same
        # structure back
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock on f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock held on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            """Always raise IOError: locking is unavailable here."""
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            """Always raise IOError: locking is unavailable here."""
            raise IOError(UNSUPPORTED_MSG)
1327
1328
class locked_file(object):
    """File wrapper that holds a lock on the file inside a with block.

    Reading ('r') takes a shared lock; 'a' and 'w' take an exclusive one.
    The file is opened on construction and closed when the block exits.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.mode = mode
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        wants_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, wants_exclusive)
        except IOError:
            # Do not leak the open file if the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1358
1359
def get_filesystem_encoding():
    """Return the filesystem encoding, or 'utf-8' when Python reports none."""
    reported = sys.getfilesystemencoding()
    if reported is None:
        return 'utf-8'
    return reported
1363
1364
def shell_quote(args):
    """Return args joined into a single shell-escaped command line string.

    Byte-string arguments are decoded with the filesystem encoding first.
    """
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # shlex_quote comes from .compat; the pipes module used previously is
        # deprecated and no longer exists in recent Python versions
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
1374
1375
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialised to JSON and appended to url as a URL-encoded
    fragment; unsmuggle_url reverses this. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse_urlencode(payload)
    return url + '#' + fragment
1382
1383
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    URLs without smuggled data are returned unchanged together with default.
    """
    marker = '#__youtubedl_smuggle'
    if marker not in smug_url:
        return smug_url, default

    plain_url, _, fragment = smug_url.rpartition('#')
    serialised = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return plain_url, json.loads(serialised)
1391
1392
def format_bytes(bytes):
    """Format a byte count as a human-readable string such as '1.50KiB'.

    bytes may be a number, a numeric str, or None (which yields 'N/A').
    Binary (1024-based) units from B up to YiB are used.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the exponent inside the suffix table: very large values used to
        # raise IndexError and tiny fractions wrapped around to 'YiB'
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1405
1406
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' from the start of s using unit_table.

    unit_table maps unit names to multipliers. The number may use ',' or '.'
    as decimal separator. Returns int(number * multiplier), or None if s does
    not start with a number followed by a known unit.
    """
    alternatives = '|'.join(re.escape(unit) for unit in unit_table)
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives
    found = re.match(pattern, s)
    if found is None:
        return None
    number = float(found.group('num').replace(',', '.'))
    multiplier = unit_table[found.group('unit')]
    return int(number * multiplier)
1416
1417
def parse_filesize(s):
    """Parse a size such as '1.5 MiB' or '10KB' into a number of bytes.

    Returns None for None or for input that lookup_unit_table cannot parse.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        _UNIT_TABLE[prefix + 'iB'] = binary
        _UNIT_TABLE[prefix + 'B'] = decimal
        _UNIT_TABLE[prefix.lower() + 'B'] = binary
        _UNIT_TABLE[prefix + 'b'] = decimal

    return lookup_unit_table(_UNIT_TABLE, s)
1462
1463
def parse_count(s):
    """Parse a count such as '1,234' or '1.2M' into an int.

    Returns None for None or for input that cannot be parsed.
    """
    if s is None:
        return None

    text = s.strip()

    # Plain digits with separators need no unit lookup
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }
    return lookup_unit_table(multipliers, text)
1483
1484
def month_by_name(name):
    """ Return the 1-based number of the month with the given full English
        name (independent of the locale), or None if it is not one """

    if name not in ENGLISH_MONTH_NAMES:
        return None
    return ENGLISH_MONTH_NAMES.index(name) + 1
1492
1493
def month_by_abbreviation(abbrev):
    """ Return the 1-based number of the month whose English name starts
        with the given three-letter abbreviation (independent of the
        locale), or None if there is none """

    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
1502
1503
def fix_xml_ampersands(xml_str):
    """Replace each '&' that does not start an entity by '&amp;' in XML"""
    # Ampersands introducing a predefined entity or a numeric character
    # reference are left alone
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1510
1511
def setproctitle(title):
    """Best-effort attempt to set the process name to title via prctl.

    title must be a compat_str. Does nothing on Jython, when 'libc.so.6'
    cannot be loaded, or when the loaded libc has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    # Initialising the buffer from the bytes makes it one byte longer than the
    # title, so the C string handed to prctl is NUL-terminated. A buffer of
    # exactly len(title_bytes) has no room for the terminator.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1531
1532
def remove_start(s, start):
    """Return s without the prefix start; s is returned as is if it lacks it."""
    return s[len(start):] if s.startswith(start) else s
1537
1538
def remove_end(s, end):
    """Return s without the suffix end; s is returned as is if it lacks it."""
    if s.endswith(end):
        # Slice by absolute position: s[:-len(end)] would be s[:0], i.e. the
        # empty string, when end is empty
        return s[:len(s) - len(end)]
    return s
1543
1544
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s.

    None and strings shorter than two characters are returned unchanged.
    """
    if s is None or len(s) < 2:
        return s
    opening, closing = s[0], s[-1]
    if opening == closing and opening in ('"', "'", ):
        return s[1:-1]
    return s
1552
1553
def url_basename(url):
    """Return the last segment of the URL's path, ignoring surrounding slashes."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1557
1558
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        # Overrides the method that Request would otherwise report
        return 'HEAD'
1562
1563
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, returning default when that is not possible.

    If get_attr is given, the attribute of that name is read from v first.
    The result is int(v) * invscale // scale. None, the empty string and any
    value int() cannot convert yield default.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        # TypeError: unsupported types such as list or dict
        # OverflowError: float infinity
        return default
1576
1577
def str_or_none(v, default=None):
    """Return v converted to compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1580
1581
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are dropped before conversion. None is passed through """
    if int_str is None:
        return None
    cleaned = re.sub(r'[,\.\+]', '', int_str)
    return int(cleaned)
1588
1589
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, returning default when that is not possible.

    The result is float(v) * invscale / scale. None and any value float()
    cannot convert yield default.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError: unsupported types such as list or dict
        return default
1597
1598
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Understands clock notation ('1:02:03.5'), unit notation ('3h 11m 53s',
    'PT30M38S', '3 hours 11 minutes') and a single fractional amount of hours
    or minutes ('2.5 hours', '87 Min.'). Returns None for non-string or
    unrecognised input.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days = hours = mins = secs = ms = None

    # Clock notation first, then unit notation; both capture the same groups
    parsed = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if not parsed:
        parsed = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
    if parsed:
        days, hours, mins, secs, ms = parsed.groups()
    else:
        # Last chance: one possibly fractional amount of hours or minutes
        single = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
        if not single:
            return None
        hours, mins = single.groups()

    total = 0
    if secs:
        total += float(secs)
    if mins:
        total += float(mins) * 60
    if hours:
        total += float(hours) * 60 * 60
    if days:
        total += float(days) * 24 * 60 * 60
    if ms:
        # ms keeps its leading dot, so it is already a fraction of a second
        total += float(ms)
    return total
1645
1646
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension of filename.

    If expected_real_ext is given and differs from the actual extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return '{0}.{1}{2}'.format(name, ext, real_ext)
    return '{0}.{1}'.format(filename, ext)
1653
1654
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of filename with ext.

    If expected_real_ext is given and differs from the actual extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        stem = name
    else:
        stem = filename
    return '{0}.{1}'.format(stem, ext)
1660
1661
def check_executable(exe, args=[]):
    """ Try to run the given binary and return its name, or False if it
    could not be started.
    args can be a list of arguments for a short output (like -version) """
    try:
        process = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Drain both pipes and wait for the process to finish
        process.communicate()
    except OSError:
        return False
    return exe
1670
1671
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Run the executable with args and return the version found in its
    combined output (see detect_exe_version), or False if the executable
    could not be started """
    try:
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1685
1686
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    version_re must capture the version in its first group; by default the
    token following the word 'version' is taken. Returns unrecognized when
    the pattern does not match.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    if not found:
        return unrecognized
    return found.group(1)
1696
1697
class PagedList(object):
    """Base class for paged lists; getslice() is not defined here and is
    called on self by __len__."""

    def __len__(self):
        # This is only useful for tests
        # (getslice() with no arguments is evaluated in full just to count)
        return len(self.getslice())
1702
1703
class OnDemandPagedList(PagedList):
    """Paged list that calls pagefunc for pages only as they are needed.

    pagefunc(pagenum) must return an iterable with the entries of the
    0-based page pagenum; pagesize is the number of entries on a full page.
    With use_cache, fetched pages are remembered per instance.
    """

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            # Maps page number -> list of entries on that page
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries from index start up to, not including, end
        (or up to the last available entry when end is None)."""
        res = []
        # Begin at the page containing start; the loop ends via the breaks below
        for pagenum in itertools.count(start // self._pagesize):
            # Global indices of the first entry on this page and on the next
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset inside this page at which the requested range starts
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset inside this page at which the requested range stops,
            # or None when the range continues past this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1754
1755
class InAdvancePagedList(PagedList):
    """Paged list for sources whose total number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        """Store the page callback, the total page count and the page size.

        pagefunc is called with a page number and its result is turned
        into a list; pagecount is the number of pages that exist.
        """
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list.

        With end=None all pages from the one containing start up to the
        last one are fetched.  Otherwise fetching stops with the page that
        contains end, but never goes past the known page count.
        """
        res = []
        start_page = start // self._pagesize
        # Clamp to the known page count so that an end beyond the last
        # entry does not request pages that do not exist
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        # Entries to drop from the front of the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Number of entries still wanted; None means "no upper bound"
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page satisfies the request: cut it and stop
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1783
1784
def uppercase_escape(s):
    """Decode every \\UXXXXXXXX (8 hex digits) escape sequence found in s."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns a (text, length) pair; only the text is needed
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1791
1792
def lowercase_escape(s):
    """Decode every \\uXXXX (4 hex digits) escape sequence found in s."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns a (text, length) pair; only the text is needed
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
1799
1800
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Characters that are passed through unquoted
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    on_python2 = sys.version_info < (3, 0)
    # On Python 2 the text is handed to quote() as UTF-8 bytes
    if on_python2 and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, safe_chars)
1806
1807
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host name is IDNA-encoded; every other component is percent-escaped
    idna_netloc = parts.netloc.encode('idna').decode('ascii')
    escaped = parts._replace(
        netloc=idna_netloc,
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
1818
1819
def read_batch_urls(batch_fd):
    """Read URLs from the file-like object batch_fd, one per line.

    batch_fd is closed once it has been read.  Byte lines are decoded as
    UTF-8 (undecodable bytes are replaced), a leading byte order mark and
    surrounding whitespace are removed, and empty lines as well as lines
    starting with '#', ';' or ']' are dropped.

    Returns the list of remaining URLs.
    """
    # A UTF-8 byte order mark appears as '\ufeff' once the line has been
    # decoded as UTF-8, and as '\xef\xbb\xbf' when its three bytes were
    # decoded one character each; both forms have to be recognized.
    BOM_MARKERS = ('\ufeff', '\xef\xbb\xbf')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOM_MARKERS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1834
1835
def urlencode_postdata(*args, **kargs):
    """URL-encode the given arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1838
1839
def update_url_query(url, query):
    """Return url with the parameters from the mapping query merged in.

    Parameters already present in url are overwritten by those in query.
    url is returned unchanged when query is empty.
    """
    if query:
        parts = compat_urlparse.urlparse(url)
        merged = compat_parse_qs(parts.query)
        merged.update(query)
        new_query = compat_urllib_parse_urlencode(merged, True)
        return compat_urlparse.urlunparse(parts._replace(query=new_query))
    return url
1848
1849
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request from req with the given parts overridden.

    headers are merged over a copy of req's headers, data and url fall
    back to the values of req when not given (or falsy), and query is
    merged into the URL.  A HEAD request stays a HEAD request.  The
    origin_req_host, unverifiable and (if present) timeout attributes are
    carried over.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)
    if req.get_method() == 'HEAD':
        request_class = HEADRequest
    else:
        request_class = compat_urllib_request.Request
    new_req = request_class(
        target_url,
        data=payload,
        headers=merged_headers,
        origin_req_host=req.origin_req_host,
        unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
1862
1863
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up a single key, or the first usable one of several keys, in d.

    When key_or_keys is a list or tuple, the value of the first key that
    is present and not None (and, with skip_false_values, not falsy) is
    returned, or default if there is none.  Any other key_or_keys is
    passed straight to d.get().
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)

    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
1872
1873
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as compat_str, decoding it with encoding/errors unless it already is one."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1876
1877
# Age limits for the US rating labels; parse_age_limit() falls back to this
# table for strings that are not plain numbers
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1885
1886
def parse_age_limit(s):
    """Turn an age limit string into a number of years.

    Accepts one or two digits optionally followed by '+' (e.g. '18+'), or
    one of the labels in US_RATINGS.  Returns None for None and for
    strings that match neither form.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s)
1892
1893
def strip_jsonp(code):
    """Remove a JSONP wrapper of the form callback(...) and return its argument text.

    An optional trailing semicolon and trailing // comments are removed
    as well.  Text that does not look like a JSONP call is returned
    unchanged.
    """
    jsonp_rex = re.compile(
        r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$')
    return jsonp_rex.sub(r'\1', code)
1897
1898
def js_to_json(code):
    """Rewrite a JavaScript object literal so that it becomes valid JSON.

    String literals and bare identifiers are converted to double-quoted
    JSON strings (true, false and null are kept as they are), and a comma
    directly before a closing ']' or '}' is removed.
    """
    # Replacements applied inside single-quoted strings
    SINGLE_QUOTED_FIXES = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null'):
            return token
        if token.startswith('"'):
            # An escaped single quote needs no escaping in JSON
            inner = re.sub(r"\\'", "'", token[1:-1])
        elif token.startswith("'"):
            inner = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: SINGLE_QUOTED_FIXES[esc.group(0)],
                token[1:-1])
        else:
            # Bare identifier: only gets quoted
            inner = token
        return '"%s"' % inner

    quoted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas in front of a closing bracket or brace
    return re.sub(r',(\s*[\]}])', lambda m: m.group(1), quoted)
1922
1923
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def rank(quality_id):
        # The position in quality_ids is the rank; unknown ids rank lowest
        try:
            position = quality_ids.index(quality_id)
        except ValueError:
            position = -1
        return position
    return rank
1932
1933
# Default output template: "<title>-<id>.<ext>", written with %-style named fields
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1935
1936
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    ELLIPSES = '...'
    # None and strings that already fit are handed back untouched
    if s is None or len(s) <= length:
        return s
    keep = length - len(ELLIPSES)
    return s[:keep] + ELLIPSES
1945
1946
def version_tuple(v):
    """Split the version string v at '.' and '-' and return its parts as a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
1949
1950
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    When version is empty or one of the two cannot be parsed, the answer
    is `not assume_new`.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        outdated = fallback
    return outdated
1958
1959
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded from a zip archive ...
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    # ... or when the interpreter reports a frozen executable
    return hasattr(sys, 'frozen')
1965
1966
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
1970
1971
def error_to_compat_str(err):
    """Return the message of the exception err as a text string."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
1979
1980
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    mt may carry parameters (e.g. 'text/vtt; charset=utf-8'); they are
    ignored.  Known types are translated through the tables below; for
    any other type the subtype (the part after the last '/') is returned
    as is.  Returns None when mt is None.
    """
    if mt is None:
        return None

    # Header values such as Content-Type often carry parameters after a
    # ';' which would otherwise end up in the returned extension
    mt = mt.split(';')[0].strip()

    # Types that need the full "type/subtype" to be told apart
    ext = {
        'audio/mp4': 'm4a',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
    }.get(res, res)
2004
2005
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the response headers of url_handle.

    The file name of an attachment Content-Disposition header is tried
    first; if that yields nothing, the Content-Type header is mapped with
    mimetype2ext().
    """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(name):
            return url_handle.headers[name]

    content_disposition = getheader('Content-Disposition')
    if content_disposition:
        mobj = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"',
            content_disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
2022
2023
def encode_data_uri(data, mime_type):
    """Return a base64 data: URI carrying the bytes data with the given mime_type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2026
2027
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked when no limit is set or when the content is
    # available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2036
2037
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # Longer marks come before the shorter marks they start with
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Without a byte order mark the data is read as UTF-8
    encoding, offset = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, offset = enc, len(bom)
            break
    text = first_bytes[offset:].decode(encoding, 'replace')

    # The result is the match object (truthy) or None
    return re.match(r'^\s*<', text)
2056
2057
def determine_protocol(info_dict):
    """Return the download protocol for the format described by info_dict.

    An explicit 'protocol' entry wins.  Otherwise the protocol is derived
    from the 'url' entry: from its prefix for rtmp/mms/rtsp URLs, from
    its extension for m3u8 and f4m manifests, and from the URL scheme in
    all other cases.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    if ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
2078
2079
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell of every column
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*rows)]
    # Every column but the last is left-aligned and padded to width + 1
    padded = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
2086
2087
2088 def _match_one(filter_part, dct):
2089     COMPARISON_OPERATORS = {
2090         '<': operator.lt,
2091         '<=': operator.le,
2092         '>': operator.gt,
2093         '>=': operator.ge,
2094         '=': operator.eq,
2095         '!=': operator.ne,
2096     }
2097     operator_rex = re.compile(r'''(?x)\s*
2098         (?P<key>[a-z_]+)
2099         \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2100         (?:
2101             (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2102             (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2103         )
2104         \s*$
2105         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2106     m = operator_rex.search(filter_part)
2107     if m:
2108         op = COMPARISON_OPERATORS[m.group('op')]
2109         if m.group('strval') is not None:
2110             if m.group('op') not in ('=', '!='):
2111                 raise ValueError(
2112                     'Operator %s does not support string values!' % m.group('op'))
2113             comparison_value = m.group('strval')
2114         else:
2115             try:
2116                 comparison_value = int(m.group('intval'))
2117             except ValueError:
2118                 comparison_value = parse_filesize(m.group('intval'))
2119                 if comparison_value is None:
2120                     comparison_value = parse_filesize(m.group('intval') + 'B')
2121                 if comparison_value is None:
2122                     raise ValueError(
2123                         'Invalid integer value %r in filter part %r' % (
2124                             m.group('intval'), filter_part))
2125         actual_value = dct.get(m.group('key'))
2126         if actual_value is None:
2127             return m.group('none_inclusive')
2128         return op(actual_value, comparison_value)
2129
2130     UNARY_OPERATORS = {
2131         '': lambda v: v is not None,
2132         '!': lambda v: v is None,
2133     }
2134     operator_rex = re.compile(r'''(?x)\s*
2135         (?P<op>%s)\s*(?P<key>[a-z_]+)
2136         \s*$
2137         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2138     m = operator_rex.search(filter_part)
2139     if m:
2140         op = UNARY_OPERATORS[m.group('op')]
2141         actual_value = dct.get(m.group('key'))
2142         return op(actual_value)
2143
2144     raise ValueError('Invalid filter part %r' % filter_part)
2145
2146
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The clauses are joined by '&'; evaluation stops at the first one
    # that does not hold
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2152
2153
def match_filter_func(filter_str):
    """Build a function that checks an info dict against filter_str.

    The returned function yields None when the info dict passes the
    filter and a message explaining the skip otherwise.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        # Name the video by its title, falling back to its id
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2162
2163
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP time expression into a number of seconds.

    Understands plain offsets such as '12.5' or '12.5s' and clock values
    of the form H:MM:SS with an optional fraction introduced by '.' or
    ':'.  Returns None for empty input and for anything else.
    """
    if not time_expr:
        return

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        # A ':' before the fraction is treated like a decimal point
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
2175
2176
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d drops the fractional part of each component
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2179
2180
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup (text) to SRT text.

    Every <p> element with a begin time and either an end time or a
    duration becomes one SRT cue; other paragraphs are left out, but
    still use up a cue number.  Raises ValueError when the document
    contains no <p> element at all.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Collects the text of one paragraph, turning <br> into newlines
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Serialize the element again and run it through the text collector
        collector = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=collector)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))

    # Look for paragraphs in each known namespace, then without namespace
    for prefixed_path in ('.//ttml:p', './/ttaf1:p', './/ttaf1_0604:p'):
        paras = dfxp.findall(_x(prefixed_path))
        if paras:
            break
    else:
        paras = dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    cues = []
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(cues)
2234
2235
def cli_option(params, command_option, param):
    """Return [command_option, value] for params[param], or [] when it is unset (None)."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2239
2240
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command line arguments for a boolean option.

    params[param] selects between true_value and false_value.  With a
    separator the result is the single argument
    command_option + separator + value, otherwise the two arguments
    [command_option, value].  An unset option (missing or None)
    produces no arguments at all.
    """
    param = params.get(param)
    # An option the user never set must not end up on the command line
    # (and must not trip the type check below)
    if param is None:
        return []
    assert isinstance(param, bool)
    value = true_value if param else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
2247
2248
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2252
2253
def cli_configuration_args(params, param, default=[]):
    """Return the extra command line arguments stored in params[param].

    When the option is unset (missing or None), default is used instead;
    a list default is handed back as a copy so that callers modifying
    the result cannot alter the default seen by later calls.
    """
    ex_args = params.get(param)
    if ex_args is None:
        # The default list object in the signature is shared between all
        # calls; never expose it directly
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2260
2261
class ISO639Utils(object):
    """Conversion between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters take part in the lookup
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup; the result is None for unknown codes
        matches = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matches, None)
2462
2463
class ISO3166Utils(object):
    """Lookup of full country names by two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter country code (ISO 3166-1 alpha-2) to the full country name.

        The lookup ignores the case of code; unknown codes give None.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
2722
2723
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets a single request choose its own proxy.

    A request selects a proxy through its 'Ytdl-request-proxy' header;
    the special value '__noproxy__' stands for a direct connection.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        # NOTE(review): presumably the base initializer below replaces
        # these for every scheme that has a configured proxy - confirm
        # against the urllib ProxyHandler implementation
        for scheme in ('http', 'https'):
            setattr(
                self, '%s_open' % scheme,
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A proxy named by the request itself takes precedence; the
        # header is removed from the request once it has been read
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            proxy = override
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
2747
2748
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt a single block with OHDave's RSA scheme
    (see http://www.ohdave.com/rsa/).

    Input:
        data: the bytes-like payload to encrypt
        exponent, modulus: the integers e and N of the RSA algorithm
    Output: the encrypted value as a lower-case hex string (no '0x' prefix)

    Limitation: only one block is encrypted, i.e. the whole payload is
    treated as a single integer.
    '''

    # The payload bytes are reversed before being read as one big
    # hexadecimal number.
    reversed_hex = binascii.hexlify(data[::-1])
    plain_number = int(reversed_hex, 16)

    # Textbook RSA: cipher = plain ** e mod N
    cipher_number = pow(plain_number, exponent, modulus)
    return '%x' % cipher_number
2764
2765
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer `num` as a string in base `n`.

    Input:
        num: the integer to encode, must be >= 0
        n: the base; must be >= 2 for any non-zero `num`
        table: optional string (or sequence) of digit symbols; symbol i
            represents the digit value i. Defaults to the first `n`
            characters of '0-9a-zA-Z', which limits the default to base 62.
    Output: the encoded string, most significant digit first

    Raises ValueError if `n` exceeds the number of available digit symbols,
    if `num` is negative, or if `n` is smaller than 2 for a non-zero `num`.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Without these guards the digit loop below never terminates: floor
    # division keeps a negative num at -1 forever, and dividing by 1 never
    # shrinks num. Bases below 1 would divide by zero or yield garbage.
    if n < 2:
        raise ValueError('base %d is too small, must be at least 2' % n)
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)

    # Collect digits least significant first, then reverse once at the end
    # instead of prepending to a string on every iteration.
    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
2782
2783
def decode_packed_codes(code):
    """Expand packed JavaScript found in `code` and return the plain source.

    The function looks for a call tail of the form
        }('<payload>',<base>,<count>,'<sym0>|<sym1>|...'.split('|')
    (presumably the output of a "p,a,c,k,e,r"-style JavaScript packer).
    Every word in <payload> is the base-<base> encoding of an index into the
    symbol list; each word is replaced by the symbol at that index, or kept
    as is when that symbol is empty.

    No error handling is done here: a `code` without such a call, a symbol
    list shorter than <count>, or a payload word outside the symbol table
    results in an exception.
    """
    match = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    packed_payload, radix, total, words = match.groups()
    radix = int(radix)
    words = words.split('|')

    # Map each encoded index to its replacement word, walking the indices
    # from the highest down to zero.
    replacements = {}
    for index in reversed(range(int(total))):
        encoded_index = encode_base_n(index, radix)
        # An empty symbol means the word stands for itself.
        replacements[encoded_index] = words[index] or encoded_index

    def _expand(word_match):
        return replacements[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', _expand, packed_payload)