_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import functools
  14 import gzip
  15 import itertools
  16 import io
  17 import json
  18 import locale
  19 import math
  20 import operator
  21 import os
  22 import pipes
  23 import platform
  24 import re
  25 import ssl
  26 import socket
  27 import struct
  28 import subprocess
  29 import sys
  30 import tempfile
  31 import traceback
  32 import xml.etree.ElementTree
  33 import zlib
  34
  35 from .compat import (
  36     compat_basestring,
  37     compat_chr,
  38     compat_html_entities,
  39     compat_http_client,
  40     compat_kwargs,
  41     compat_parse_qs,
  42     compat_socket_create_connection,
  43     compat_str,
  44     compat_urllib_error,
  45     compat_urllib_parse,
  46     compat_urllib_parse_urlparse,
  47     compat_urllib_request,
  48     compat_urlparse,
  49     shlex_quote,
  50 )
  51
  52
# The type of compiled regular expression objects, obtained by compiling an
# empty pattern because this type is not clearly defined otherwise.
compiled_regex_type = type(re.compile(''))

# Default HTTP headers. YoutubeDLHandler.http_request adds each of them to a
# request that does not already carry a header of that name.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Unique sentinel used as a parameter default (see xpath_text) so that an
# explicitly passed None can be told apart from "no default given".
NO_DEFAULT = object()

# English month names in calendar order, January first.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
  70
  71
  72 def preferredencoding():
  73     """Get preferred encoding.
  74
  75     Returns the best encoding scheme for the system, based on
  76     locale.getpreferredencoding() and some further tweaks.
  77     """
  78     try:
  79         pref = locale.getpreferredencoding()
  80         'TEST'.encode(pref)
  81     except Exception:
  82         pref = 'UTF-8'
  83
  84     return pref
  85
  86
  87 def write_json_file(obj, fn):
  88     """ Encode obj as JSON and write it to fn, atomically if possible """
  89
  90     fn = encodeFilename(fn)
  91     if sys.version_info < (3, 0) and sys.platform != 'win32':
  92         encoding = get_filesystem_encoding()
  93         # os.path.basename returns a bytes object, but NamedTemporaryFile
  94         # will fail if the filename contains non ascii characters unless we
  95         # use a unicode object
  96         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  97         # the same for os.path.dirname
  98         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  99     else:
 100         path_basename = os.path.basename
 101         path_dirname = os.path.dirname
 102
 103     args = {
 104         'suffix': '.tmp',
 105         'prefix': path_basename(fn) + '.',
 106         'dir': path_dirname(fn),
 107         'delete': False,
 108     }
 109
 110     # In Python 2.x, json.dump expects a bytestream.
 111     # In Python 3.x, it writes to a character stream
 112     if sys.version_info < (3, 0):
 113         args['mode'] = 'wb'
 114     else:
 115         args.update({
 116             'mode': 'w',
 117             'encoding': 'utf-8',
 118         })
 119
 120     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 121
 122     try:
 123         with tf:
 124             json.dump(obj, tf)
 125         if sys.platform == 'win32':
 126             # Need to remove existing file on Windows, else os.rename raises
 127             # WindowsError or FileExistsError.
 128             try:
 129                 os.unlink(fn)
 130             except OSError:
 131                 pass
 132         os.rename(tf.name, fn)
 133     except Exception:
 134         try:
 135             os.remove(tf.name)
 136         except OSError:
 137             pass
 138         raise
 139
 140
 141 if sys.version_info >= (2, 7):
 142     def find_xpath_attr(node, xpath, key, val):
 143         """ Find the xpath xpath[@key=val] """
 144         assert re.match(r'^[a-zA-Z-]+$', key)
 145         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 146         expr = xpath + "[@%s='%s']" % (key, val)
 147         return node.find(expr)
 148 else:
 149     def find_xpath_attr(node, xpath, key, val):
 150         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 151         # .//node does not match if a node is a direct child of . !
 152         if isinstance(xpath, compat_str):
 153             xpath = xpath.encode('ascii')
 154
 155         for f in node.findall(xpath):
 156             if f.attrib.get(key) == val:
 157                 return f
 158         return None
 159
 160 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 161 # the namespace parameter
 162
 163
 164 def xpath_with_ns(path, ns_map):
 165     components = [c.split(':') for c in path.split('/')]
 166     replaced = []
 167     for c in components:
 168         if len(c) == 1:
 169             replaced.append(c[0])
 170         else:
 171             ns, tag = c
 172             replaced.append('{%s}%s' % (ns_map[ns], tag))
 173     return '/'.join(replaced)
 174
 175
 176 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 177     if sys.version_info < (2, 7):  # Crazy 2.6
 178         xpath = xpath.encode('ascii')
 179
 180     n = node.find(xpath)
 181     if n is None or n.text is None:
 182         if default is not NO_DEFAULT:
 183             return default
 184         elif fatal:
 185             name = xpath if name is None else name
 186             raise ExtractorError('Could not find XML element %s' % name)
 187         else:
 188             return None
 189     return n.text
 190
 191
 192 def get_element_by_id(id, html):
 193     """Return the content of the tag with the specified ID in the passed HTML document"""
 194     return get_element_by_attribute("id", id, html)
 195
 196
 197 def get_element_by_attribute(attribute, value, html):
 198     """Return the content of the tag with the specified attribute in the passed HTML document"""
 199
 200     m = re.search(r'''(?xs)
 201         <([a-zA-Z0-9:._-]+)
 202          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 203          \s+%s=['"]?%s['"]?
 204          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 205         \s*>
 206         (?P<content>.*?)
 207         </\1>
 208     ''' % (re.escape(attribute), re.escape(value)), html)
 209
 210     if not m:
 211         return None
 212     res = m.group('content')
 213
 214     if res.startswith('"') or res.startswith("'"):
 215         res = res[1:-1]
 216
 217     return unescapeHTML(res)
 218
 219
 220 def clean_html(html):
 221     """Clean an HTML snippet into a readable string"""
 222
 223     if html is None:  # Convenience for sanitizing descriptions etc.
 224         return html
 225
 226     # Newline vs <br />
 227     html = html.replace('\n', ' ')
 228     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 229     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 230     # Strip html tags
 231     html = re.sub('<.*?>', '', html)
 232     # Replace html entities
 233     html = unescapeHTML(html)
 234     return html.strip()
 235
 236
 237 def sanitize_open(filename, open_mode):
 238     """Try to open the given filename, and slightly tweak it if this fails.
 239
 240     Attempts to open the given filename. If this fails, it tries to change
 241     the filename slightly, step by step, until it's either able to open it
 242     or it fails and raises a final exception, like the standard open()
 243     function.
 244
 245     It returns the tuple (stream, definitive_file_name).
 246     """
 247     try:
 248         if filename == '-':
 249             if sys.platform == 'win32':
 250                 import msvcrt
 251                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 252             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 253         stream = open(encodeFilename(filename), open_mode)
 254         return (stream, filename)
 255     except (IOError, OSError) as err:
 256         if err.errno in (errno.EACCES,):
 257             raise
 258
 259         # In case of error, try to remove win32 forbidden chars
 260         alt_filename = sanitize_path(filename)
 261         if alt_filename == filename:
 262             raise
 263         else:
 264             # An exception here should be caught in the caller
 265             stream = open(encodeFilename(alt_filename), open_mode)
 266             return (stream, alt_filename)
 267
 268
 269 def timeconvert(timestr):
 270     """Convert RFC 2822 defined time string into system timestamp"""
 271     timestamp = None
 272     timetuple = email.utils.parsedate_tz(timestr)
 273     if timetuple is not None:
 274         timestamp = email.utils.mktime_tz(timetuple)
 275     return timestamp
 276
 277
 278 def sanitize_filename(s, restricted=False, is_id=False):
 279     """Sanitizes a string so it could be used as part of a filename.
 280     If restricted is set, use a stricter subset of allowed characters.
 281     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 282     """
 283     def replace_insane(char):
 284         if char == '?' or ord(char) < 32 or ord(char) == 127:
 285             return ''
 286         elif char == '"':
 287             return '' if restricted else '\''
 288         elif char == ':':
 289             return '_-' if restricted else ' -'
 290         elif char in '\\/|*<>':
 291             return '_'
 292         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 293             return '_'
 294         if restricted and ord(char) > 127:
 295             return '_'
 296         return char
 297
 298     # Handle timestamps
 299     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 300     result = ''.join(map(replace_insane, s))
 301     if not is_id:
 302         while '__' in result:
 303             result = result.replace('__', '_')
 304         result = result.strip('_')
 305         # Common case of "Foreign band name - English song title"
 306         if restricted and result.startswith('-_'):
 307             result = result[2:]
 308         if result.startswith('-'):
 309             result = '_' + result[len('-'):]
 310         result = result.lstrip('.')
 311         if not result:
 312             result = '_'
 313     return result
 314
 315
 316 def sanitize_path(s):
 317     """Sanitizes and normalizes path on Windows"""
 318     if sys.platform != 'win32':
 319         return s
 320     drive_or_unc, _ = os.path.splitdrive(s)
 321     if sys.version_info < (2, 7) and not drive_or_unc:
 322         drive_or_unc, _ = os.path.splitunc(s)
 323     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 324     if drive_or_unc:
 325         norm_path.pop(0)
 326     sanitized_path = [
 327         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
 328         for path_part in norm_path]
 329     if drive_or_unc:
 330         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 331     return os.path.join(*sanitized_path)
 332
 333
 334 def orderedSet(iterable):
 335     """ Remove all duplicates from the input iterable """
 336     res = []
 337     for el in iterable:
 338         if el not in res:
 339             res.append(el)
 340     return res
 341
 342
 343 def _htmlentity_transform(entity):
 344     """Transforms an HTML entity to a character."""
 345     # Known non-numeric HTML entity
 346     if entity in compat_html_entities.name2codepoint:
 347         return compat_chr(compat_html_entities.name2codepoint[entity])
 348
 349     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 350     if mobj is not None:
 351         numstr = mobj.group(1)
 352         if numstr.startswith('x'):
 353             base = 16
 354             numstr = '0%s' % numstr
 355         else:
 356             base = 10
 357         return compat_chr(int(numstr, base))
 358
 359     # Unknown entity in name, return its literal representation
 360     return ('&%s;' % entity)
 361
 362
 363 def unescapeHTML(s):
 364     if s is None:
 365         return None
 366     assert type(s) == compat_str
 367
 368     return re.sub(
 369         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 370
 371
 372 def get_subprocess_encoding():
 373     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 374         # For subprocess calls, encode with locale encoding
 375         # Refer to http://stackoverflow.com/a/9951851/35070
 376         encoding = preferredencoding()
 377     else:
 378         encoding = sys.getfilesystemencoding()
 379     if encoding is None:
 380         encoding = 'utf-8'
 381     return encoding
 382
 383
 384 def encodeFilename(s, for_subprocess=False):
 385     """
 386     @param s The name of the file
 387     """
 388
 389     assert type(s) == compat_str
 390
 391     # Python 3 has a Unicode API
 392     if sys.version_info >= (3, 0):
 393         return s
 394
 395     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 396     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 397     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 398     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 399         return s
 400
 401     return s.encode(get_subprocess_encoding(), 'ignore')
 402
 403
 404 def decodeFilename(b, for_subprocess=False):
 405
 406     if sys.version_info >= (3, 0):
 407         return b
 408
 409     if not isinstance(b, bytes):
 410         return b
 411
 412     return b.decode(get_subprocess_encoding(), 'ignore')
 413
 414
 415 def encodeArgument(s):
 416     if not isinstance(s, compat_str):
 417         # Legacy code that uses byte strings
 418         # Uncomment the following line after fixing all post processors
 419         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 420         s = s.decode('ascii')
 421     return encodeFilename(s, True)
 422
 423
 424 def decodeArgument(b):
 425     return decodeFilename(b, True)
 426
 427
 428 def decodeOption(optval):
 429     if optval is None:
 430         return optval
 431     if isinstance(optval, bytes):
 432         optval = optval.decode(preferredencoding())
 433
 434     assert isinstance(optval, compat_str)
 435     return optval
 436
 437
 438 def formatSeconds(secs):
 439     if secs > 3600:
 440         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 441     elif secs > 60:
 442         return '%d:%02d' % (secs // 60, secs % 60)
 443     else:
 444         return '%d' % secs
 445
 446
 447 def make_HTTPS_handler(params, **kwargs):
 448     opts_no_check_certificate = params.get('nocheckcertificate', False)
 449     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 450         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 451         if opts_no_check_certificate:
 452             context.check_hostname = False
 453             context.verify_mode = ssl.CERT_NONE
 454         try:
 455             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 456         except TypeError:
 457             # Python 2.7.8
 458             # (create_default_context present but HTTPSHandler has no context=)
 459             pass
 460
 461     if sys.version_info < (3, 2):
 462         return YoutubeDLHTTPSHandler(params, **kwargs)
 463     else:  # Python < 3.4
 464         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 465         context.verify_mode = (ssl.CERT_NONE
 466                                if opts_no_check_certificate
 467                                else ssl.CERT_REQUIRED)
 468         context.set_default_verify_paths()
 469         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 470
 471
 472 def bug_reports_message():
 473     if ytdl_is_updateable():
 474         update_cmd = 'type  youtube-dl -U  to update'
 475     else:
 476         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 477     msg = '; please report this issue on https://yt-dl.org/bug .'
 478     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 479     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 480     return msg
 481
 482
 483 class ExtractorError(Exception):
 484     """Error during info extraction."""
 485
 486     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 487         """ tb, if given, is the original traceback (so that it can be printed out).
 488         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 489         """
 490
 491         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 492             expected = True
 493         if video_id is not None:
 494             msg = video_id + ': ' + msg
 495         if cause:
 496             msg += ' (caused by %r)' % cause
 497         if not expected:
 498             msg += bug_reports_message()
 499         super(ExtractorError, self).__init__(msg)
 500
 501         self.traceback = tb
 502         self.exc_info = sys.exc_info()  # preserve original exception
 503         self.cause = cause
 504         self.video_id = video_id
 505
 506     def format_traceback(self):
 507         if self.traceback is None:
 508             return None
 509         return ''.join(traceback.format_tb(self.traceback))
 510
 511
 512 class UnsupportedError(ExtractorError):
 513     def __init__(self, url):
 514         super(UnsupportedError, self).__init__(
 515             'Unsupported URL: %s' % url, expected=True)
 516         self.url = url
 517
 518
 519 class RegexNotFoundError(ExtractorError):
 520     """Error when a regex didn't match"""
 521     pass
 522
 523
 524 class DownloadError(Exception):
 525     """Download Error exception.
 526
 527     This exception may be thrown by FileDownloader objects if they are not
 528     configured to continue on errors. They will contain the appropriate
 529     error message.
 530     """
 531
 532     def __init__(self, msg, exc_info=None):
 533         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 534         super(DownloadError, self).__init__(msg)
 535         self.exc_info = exc_info
 536
 537
 538 class SameFileError(Exception):
 539     """Same File exception.
 540
 541     This exception will be thrown by FileDownloader objects if they detect
 542     multiple files would have to be downloaded to the same file on disk.
 543     """
 544     pass
 545
 546
 547 class PostProcessingError(Exception):
 548     """Post Processing exception.
 549
 550     This exception may be raised by PostProcessor's .run() method to
 551     indicate an error in the postprocessing task.
 552     """
 553
 554     def __init__(self, msg):
 555         self.msg = msg
 556
 557
 558 class MaxDownloadsReached(Exception):
 559     """ --max-downloads limit has been reached. """
 560     pass
 561
 562
 563 class UnavailableVideoError(Exception):
 564     """Unavailable Format exception.
 565
 566     This exception will be thrown when a video is requested
 567     in a format that is not available for that video.
 568     """
 569     pass
 570
 571
 572 class ContentTooShortError(Exception):
 573     """Content Too Short exception.
 574
 575     This exception may be raised by FileDownloader objects when a file they
 576     download is too small for what the server announced first, indicating
 577     the connection was probably interrupted.
 578     """
 579     # Both in bytes
 580     downloaded = None
 581     expected = None
 582
 583     def __init__(self, downloaded, expected):
 584         self.downloaded = downloaded
 585         self.expected = expected
 586
 587
 588 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 589     hc = http_class(*args, **kwargs)
 590     source_address = ydl_handler._params.get('source_address')
 591     if source_address is not None:
 592         sa = (source_address, 0)
 593         if hasattr(hc, 'source_address'):  # Python 2.7+
 594             hc.source_address = sa
 595         else:  # Python 2.6
 596             def _hc_connect(self, *args, **kwargs):
 597                 sock = compat_socket_create_connection(
 598                     (self.host, self.port), self.timeout, sa)
 599                 if is_https:
 600                     self.sock = ssl.wrap_socket(
 601                         sock, self.key_file, self.cert_file,
 602                         ssl_version=ssl.PROTOCOL_TLSv1)
 603                 else:
 604                     self.sock = sock
 605             hc.connect = functools.partial(_hc_connect, hc)
 606
 607     return hc
 608
 609
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Store params as self._params; the remaining arguments go to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req over a connection created by _create_http_connection."""
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a zlib-wrapped one second."""
        try:
            # Negative wbits: raw deflate stream without zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying code, whether or not addinfourl accepts it."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        # No getcode(): construct without the code and attach it afterwards
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the missing std_headers to req and honour Youtubedl-no-compression."""
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            # The marker header is internal: drop it, together with
            # Accept-encoding, so that no compression is requested.
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip- or deflate-encoded body transparently decompressed."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off; if no attempt
                # succeeds, the for-else re-raises the original error.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same request / response processing
    https_request = http_request
    https_response = http_response
 701
 702
 703 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 704     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 705         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 706         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 707         self._params = params
 708
 709     def https_open(self, req):
 710         kwargs = {}
 711         if hasattr(self, '_context'):  # python > 2.6
 712             kwargs['context'] = self._context
 713         if hasattr(self, '_check_hostname'):  # python 3.x
 714             kwargs['check_hostname'] = self._check_hostname
 715         return self.do_open(functools.partial(
 716             _create_http_connection, self, self._https_conn_class, True),
 717             req, **kwargs)
 718
 719
 720 def parse_iso8601(date_str, delimiter='T', timezone=None):
 721     """ Return a UNIX timestamp from the given date """
 722
 723     if date_str is None:
 724         return None
 725
 726     if timezone is None:
 727         m = re.search(
 728             r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 729             date_str)
 730         if not m:
 731             timezone = datetime.timedelta()
 732         else:
 733             date_str = date_str[:-len(m.group(0))]
 734             if not m.group('sign'):
 735                 timezone = datetime.timedelta()
 736             else:
 737                 sign = 1 if m.group('sign') == '+' else -1
 738                 timezone = datetime.timedelta(
 739                     hours=sign * int(m.group('hours')),
 740                     minutes=sign * int(m.group('minutes')))
 741     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 742     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 743     return calendar.timegm(dt.timetuple())
 744
 745
 746 def unified_strdate(date_str, day_first=True):
 747     """Return a string with the date in the format YYYYMMDD"""
 748
 749     if date_str is None:
 750         return None
 751     upload_date = None
 752     # Replace commas
 753     date_str = date_str.replace(',', ' ')
 754     # %z (UTC offset) is only supported in python>=3.2
 755     if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
 756         date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 757     # Remove AM/PM + timezone
 758     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
 759
 760     format_expressions = [
 761         '%d %B %Y',
 762         '%d %b %Y',
 763         '%B %d %Y',
 764         '%b %d %Y',
 765         '%b %dst %Y %I:%M%p',
 766         '%b %dnd %Y %I:%M%p',
 767         '%b %dth %Y %I:%M%p',
 768         '%Y %m %d',
 769         '%Y-%m-%d',
 770         '%Y/%m/%d',
 771         '%Y/%m/%d %H:%M:%S',
 772         '%Y-%m-%d %H:%M:%S',
 773         '%Y-%m-%d %H:%M:%S.%f',
 774         '%d.%m.%Y %H:%M',
 775         '%d.%m.%Y %H.%M',
 776         '%Y-%m-%dT%H:%M:%SZ',
 777         '%Y-%m-%dT%H:%M:%S.%fZ',
 778         '%Y-%m-%dT%H:%M:%S.%f0Z',
 779         '%Y-%m-%dT%H:%M:%S',
 780         '%Y-%m-%dT%H:%M:%S.%f',
 781         '%Y-%m-%dT%H:%M',
 782     ]
 783     if day_first:
 784         format_expressions.extend([
 785             '%d-%m-%Y',
 786             '%d.%m.%Y',
 787             '%d/%m/%Y',
 788             '%d/%m/%y',
 789             '%d/%m/%Y %H:%M:%S',
 790         ])
 791     else:
 792         format_expressions.extend([
 793             '%m-%d-%Y',
 794             '%m.%d.%Y',
 795             '%m/%d/%Y',
 796             '%m/%d/%y',
 797             '%m/%d/%Y %H:%M:%S',
 798         ])
 799     for expression in format_expressions:
 800         try:
 801             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 802         except ValueError:
 803             pass
 804     if upload_date is None:
 805         timetuple = email.utils.parsedate_tz(date_str)
 806         if timetuple:
 807             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 808     return upload_date
 809
 810
 811 def determine_ext(url, default_ext='unknown_video'):
 812     if url is None:
 813         return default_ext
 814     guess = url.partition('?')[0].rpartition('.')[2]
 815     if re.match(r'^[A-Za-z0-9]+$', guess):
 816         return guess
 817     else:
 818         return default_ext
 819
 820
 821 def subtitles_filename(filename, sub_lang, sub_format):
 822     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 823
 824
 825 def date_from_str(date_str):
 826     """
 827     Return a datetime object from a string in the format YYYYMMDD or
 828     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 829     today = datetime.date.today()
 830     if date_str in ('now', 'today'):
 831         return today
 832     if date_str == 'yesterday':
 833         return today - datetime.timedelta(days=1)
 834     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 835     if match is not None:
 836         sign = match.group('sign')
 837         time = int(match.group('time'))
 838         if sign == '-':
 839             time = -time
 840         unit = match.group('unit')
 841         # A bad aproximation?
 842         if unit == 'month':
 843             unit = 'day'
 844             time *= 30
 845         elif unit == 'year':
 846             unit = 'day'
 847             time *= 365
 848         unit += 's'
 849         delta = datetime.timedelta(**{unit: time})
 850         return today + delta
 851     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 852
 853
 854 def hyphenate_date(date_str):
 855     """
 856     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 857     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 858     if match is not None:
 859         return '-'.join(match.groups())
 860     else:
 861         return date_str
 862
 863
 864 class DateRange(object):
 865     """Represents a time interval between two dates"""
 866
 867     def __init__(self, start=None, end=None):
 868         """start and end must be strings in the format accepted by date"""
 869         if start is not None:
 870             self.start = date_from_str(start)
 871         else:
 872             self.start = datetime.datetime.min.date()
 873         if end is not None:
 874             self.end = date_from_str(end)
 875         else:
 876             self.end = datetime.datetime.max.date()
 877         if self.start > self.end:
 878             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 879
 880     @classmethod
 881     def day(cls, day):
 882         """Returns a range that only contains the given day"""
 883         return cls(day, day)
 884
 885     def __contains__(self, date):
 886         """Check if the date is in the range"""
 887         if not isinstance(date, datetime.date):
 888             date = date_from_str(date)
 889         return self.start <= date <= self.end
 890
 891     def __str__(self):
 892         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 893
 894
 895 def platform_name():
 896     """ Returns the platform name as a compat_str """
 897     res = platform.platform()
 898     if isinstance(res, bytes):
 899         res = res.decode(preferredencoding())
 900
 901     assert isinstance(res, compat_str)
 902     return res
 903
 904
def _windows_write_string(s, out):
    """ Write the text s to the Windows console behind out, if there is one.

    Returns True if the string was written using special methods,
    False if it has yet to be written out (out has no usable file
    descriptor, is not stdout/stderr, or is not attached to a console).
    Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> identifier passed to GetStdHandle
    # (Win32 STD_OUTPUT_HANDLE is -11 and STD_ERROR_HANDLE is -12)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Bind the kernel32 entry points with explicit prototypes
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters WriteConsoleW reports as written
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A console handle is a character device (ignoring the "remote" bit)
        # for which GetConsoleMode succeeds
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call, stopping before the next
        # non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count is 0 when s starts with a non-BMP character; 2 is then passed
        # as the length - presumably the two UTF-16 code units of that character
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Drop only what was actually written; the rest is retried
            s = s[written.value:]
    return True
 978
 979
 980 def write_string(s, out=None, encoding=None):
 981     if out is None:
 982         out = sys.stderr
 983     assert type(s) == compat_str
 984
 985     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 986         if _windows_write_string(s, out):
 987             return
 988
 989     if ('b' in getattr(out, 'mode', '') or
 990             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 991         byt = s.encode(encoding or preferredencoding(), 'ignore')
 992         out.write(byt)
 993     elif hasattr(out, 'buffer'):
 994         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 995         byt = s.encode(enc, 'ignore')
 996         out.buffer.write(byt)
 997     else:
 998         out.write(s)
 999     out.flush()
1000
1001
def bytes_to_intlist(bs):
    """ Convert a byte string into the list of its integer byte values """
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Indexing yields one-character strings here (e.g. Python 2 str)
        return [ord(ch) for ch in bs]
    return list(bs)  # Python 3
1009
1010
def intlist_to_bytes(xs):
    """ Pack a sequence of integers, one unsigned byte each, into a byte string """
    return struct_pack('%dB' % len(xs), *xs) if xs else b''
1015
1016
# Cross-platform file locking
# Both branches define the same pair of helpers:
#   _lock_file(f, exclusive) - lock the open file f (exclusive or shared)
#   _unlock_file(f)          - release the lock taken by _lock_file
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure handed to LockFileEx/UnlockFileEx; here it only carries
        # the offset at which the locked range starts
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high halves of the byte count to lock, starting at offset 0
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """ Lock the file f; raises OSError if LockFileEx fails """
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can pass the same
        # structure back to UnlockFileEx
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """ Unlock a file locked by _lock_file; raises OSError on failure """
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """ Take an exclusive or shared flock() lock on the file f """
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """ Release the flock() lock held on the file f """
        fcntl.flock(f, fcntl.LOCK_UN)
1080
1081
class locked_file(object):
    """ File wrapper that holds a lock on the file while used as a context
    manager

    The file is opened in the constructor. Entering the context locks it
    (shared for mode 'r', exclusive for 'a' and 'w'); leaving the context
    unlocks and closes it. """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is a different
            # class from IOError on Python 2; close the file in both cases
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1111
1112
def get_filesystem_encoding():
    """ Return sys.getfilesystemencoding(), or 'utf-8' when that is None """
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1116
1117
def shell_quote(args):
    """ Quote every argument for a POSIX shell and join them with spaces

    Byte string arguments are decoded with the filesystem encoding first.
    Quoting goes through shlex_quote, as in args_to_str, because the pipes
    module is deprecated and no longer available in recent Python versions. """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
1127
1128
def smuggle_url(url, data):
    """ Append JSON-serializable data to a URL, inside its fragment, for
    internal use. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return '#'.join((url, compat_urllib_parse.urlencode(payload)))
1135
1136
def unsmuggle_url(smug_url, default=None):
    """ Split a URL carrying smuggled data into (url, data)

    A URL without the smuggle marker is returned as (smug_url, default). """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        smuggled = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(smuggled)
    return smug_url, default
1144
1145
def format_bytes(bytes):
    """ Format a byte count as a string with a binary (1024-based) suffix

    bytes may be a number, a numeric str, or None (which yields 'N/A').
    The result has two decimals, e.g. '1.50KiB'. Values below one byte are
    shown in 'B' and values beyond the largest suffix in 'YiB'.
    Raises ValueError for negative values (math.log rejects them). """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the exponent inside the suffix table: a negative one would
        # index from the end of the list, a too large one would overflow it
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1158
1159
def parse_filesize(s):
    """ Parse a file size such as '5 GB' or '1,24 KiB' into a number of bytes

    Returns an int, or None if s is None or does not start with a number
    followed by a known unit. Both '.' and ',' are accepted as the decimal
    separator. """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary, decimal = 1024 ** power, 1000 ** power
        unit_table[prefix + 'iB'] = binary
        unit_table[prefix + 'B'] = decimal
        unit_table[prefix.lower() + 'B'] = binary
        unit_table[prefix + 'b'] = decimal

    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % '|'.join(
        re.escape(unit) for unit in unit_table)
    found = re.match(pattern, s)
    if found is None:
        return None

    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1212
1213
def month_by_name(name):
    """ Return the month number (1-12) for a full English month name,
    independently of the locale, or None if the name is unknown """

    if name not in ENGLISH_MONTH_NAMES:
        return None
    return ENGLISH_MONTH_NAMES.index(name) + 1
1221
1222
def month_by_abbreviation(abbrev):
    """ Return the month number (1-12) for a three-letter English month
        abbreviation, independently of the locale, or None if unknown """

    for number, month in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month[:3] == abbrev:
            return number
    return None
1231
1232
def fix_xml_ampersands(xml_str):
    """Escape as '&amp;' every '&' that does not already start one of the
    entities amp, lt, gt, apos, quot or a numeric character reference of up
    to four digits"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1239
1240
def setproctitle(title):
    """ Best-effort attempt to set the name of the current process

    Calls prctl() from libc.so.6 with the UTF-8 encoded title. Does nothing
    when that library cannot be loaded or exposes no prctl function. """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Creating the buffer from the bytes themselves makes it one byte larger
    # than the title, so that prctl receives a NUL-terminated string; a
    # buffer of exactly len(title_bytes) bytes has no room for the terminator
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1254
1255
def remove_start(s, start):
    """ Return s without its leading start, or s itself if it does not
    begin with start """
    return s[len(start):] if s.startswith(start) else s
1260
1261
def remove_end(s, end):
    """ Return s without its trailing end, or s itself if it does not
    finish with end """
    # An empty end must be skipped: s[:-0] is s[:0], i.e. the empty string
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1266
1267
def url_basename(url):
    """ Return the last segment of the URL path, ignoring leading and
    trailing slashes """
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').rpartition('/')[2]
1271
1272
class HEADRequest(compat_urllib_request.Request):
    """ Request whose HTTP method is always HEAD """

    def get_method(self):
        return 'HEAD'
1276
1277
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to an int, multiplied by invscale and floor-divided by scale

    Returns default when v is None or the empty string. With get_attr, the
    attribute of that name is read from v first (a missing attribute counts
    as None). """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1285
1286
def str_or_none(v, default=None):
    """ Convert v to a compat_str, or return default when v is None """
    if v is None:
        return default
    return compat_str(v)
1289
1290
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are dropped from the string before conversion """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1297
1298
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to a float, multiplied by invscale and divided by scale;
    return default when v is None """
    if v is None:
        return default
    return float(v) * invscale / scale
1301
1302
def parse_duration(s):
    """ Parse a textual duration into a number of seconds

    Returns None when s is not a string or is not in a recognized format.
    The result is a float when the input has a fractional part or consists
    only of minutes or only of hours, and an int otherwise. """
    if not isinstance(s, compat_basestring):
        return None

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if not m:
        return None
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    # Integer components and the number of seconds each unit is worth
    seconds_per_unit = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    res = 0
    for group_name, factor in seconds_per_unit:
        value = m.group(group_name)
        if value:
            res += int(value) * factor
    # The fractional part of the seconds is captured with its leading dot
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
1347
1348
def prepend_extension(filename, ext, expected_real_ext=None):
    """ Insert ext in front of the extension of filename

    If expected_real_ext is given and filename does not carry that
    extension, ext is appended to the whole filename instead. """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1355
1356
def replace_extension(filename, ext, expected_real_ext=None):
    """ Replace the extension of filename with ext

    If expected_real_ext is given and filename does not carry that
    extension, ext is appended to the whole filename instead. """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        base = filename
    else:
        base = name
    return '{0}.{1}'.format(base, ext)
1362
1363
def check_executable(exe, args=[]):
    """ Try to run the given binary and return its name, or False if it
    could not be started.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1372
1373
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Run the executable with args and extract its version from the
    combined stdout/stderr output through detect_exe_version.
    Returns False if the executable could not be started """
    command = [encodeArgument(exe)] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = process.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1387
1388
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Return the first group of version_re found in output, or
    unrecognized when nothing matches. Without version_re, the word
    following 'version' is looked up """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1398
1399
class PagedList(object):
    """ Base class of the paged lists; getslice() is expected on instances """

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1404
1405
class OnDemandPagedList(PagedList):
    """ Paged list that calls pagefunc(pagenum) for each page it needs,
    until a page comes back with fewer than pagesize entries """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with indices in [start, end) as a list """
        size = self._pagesize
        collected = []
        for pagenum in itertools.count(start // size):
            firstid = pagenum * size
            nextfirstid = firstid + size
            if start >= nextfirstid:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry inside this page
            startv = start % size if firstid <= start < nextfirstid else 0

            # Offset just past the last wanted entry when the slice ends
            # within this page
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % size) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                entries = entries[startv:endv]
            collected.extend(entries)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(entries) + startv < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return collected
1447
1448
class InAdvancePagedList(PagedList):
    """ Paged list whose number of pages is known up front """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with indices in [start, end) as a list """
        size = self._pagesize
        start_page = start // size
        if end is None:
            end_page = self._pagecount
            only_more = None
        else:
            end_page = end // size + 1
            # Number of entries still to be collected
            only_more = end - start
        # Entries to drop at the beginning of the first fetched page
        skip_elems = start - start_page * size
        res = []
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) >= only_more:
                    res.extend(page[:only_more])
                    break
                only_more -= len(page)
            res.extend(page)
        return res
1476
1477
def uppercase_escape(s):
    """ Decode the backslash-U escape sequences (eight hex digits) found in s """
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1484
1485
def lowercase_escape(s):
    """ Decode the backslash-u escape sequences (four hex digits) found in s """
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
1492
1493
def escape_rfc3986(s):
    """Percent-encode s as suggested by RFC 3986, leaving the reserved
    characters and '%' untouched"""
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    quotable = s.encode('utf-8') if needs_encoding else s
    return compat_urllib_parse.quote(quotable, b"%/;:@&=+$,!~*'()?#[]")
1499
1500
def escape_url(url):
    """Escape the path, params, query and fragment of a URL as suggested by
    RFC 3986; the other components are left as they are"""
    parts = compat_urllib_parse_urlparse(url)
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1510
# struct_pack/struct_unpack accept a text format specification everywhere
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        spec_bytes = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(spec_bytes, *args)

    def struct_unpack(spec, *args):
        spec_bytes = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(spec_bytes, *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
1527
1528
def read_batch_urls(batch_fd):
    """ Read the URLs listed in a batch file, one per line

    batch_fd is an iterable of lines (text or UTF-8 encoded bytes) and is
    closed when done. A leading byte order mark is removed, lines are
    stripped, and empty lines as well as lines starting with '#', ';' or
    ']' are skipped. Returns the list of remaining URLs. """
    # The UTF-8 byte order mark, either read as one character per byte or
    # properly decoded as UTF-8
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1543
1544
def urlencode_postdata(*args, **kargs):
    """ URL-encode the arguments like urlencode does and return the result
    as ASCII bytes """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1547
1548
# etree_iter(node) iterates over an element tree node
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1553
1554
def parse_xml(s):
    """ Parse the XML document held in the text string s and return its
    root element; doctype declarations are ignored """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None and not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
    return tree
1570
1571
# Age limit associated with each US rating label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """ Return the age limit described by s as an int

    s is either one or two digits with an optional trailing '+', or a key
    of US_RATINGS. Returns None for None and for anything else. """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s)
1586
1587
def strip_jsonp(code):
    """ Return the argument of a JSONP call 'callback(...)', dropping the
    callback name, an optional ';' and trailing '//' comments; code that
    does not look like such a call is returned unchanged """
    jsonp_call = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_call, r'\1', code)
1591
1592
def js_to_json(code):
    """ Rewrite a JavaScript object literal towards JSON

    Single-quoted strings and bare identifiers are turned into
    double-quoted strings (true, false and null are kept), and a comma
    directly preceding a closing bracket or brace is removed. """
    # Rewrites applied inside a single-quoted string once its quotes are gone
    single_quoted_fixes = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null') or v.startswith('"'):
            return v
        if v.startswith("'"):
            v = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: single_quoted_fixes[esc.group(0)],
                v[1:-1])
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop the commas that directly precede a closing bracket or brace
    return re.sub(r',(\s*[\]}])', r'\1', res)
1616
1617
def qualities(quality_ids):
    """ Return a function mapping a quality id to its position in
    quality_ids, or to -1 for an unknown id """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1626
1627
# %-style template with the named fields title, id and ext
# (presumably the default output filename template - confirm at the use sites)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1629
1630
def limit_length(s, length):
    """ Cut strings longer than length down to length characters, the last
    three of them being an ellipsis; None is passed through """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
1639
1640
def version_tuple(v):
    """ Split a version string on '.' and '-' into a tuple of ints """
    return tuple(map(int, re.split(r'[-.]', v)))
1643
1644
def is_outdated_version(version, limit, assume_new=True):
    """ Tell whether version is older than limit

    A missing or unparsable version is treated as new when assume_new is
    true, as outdated otherwise. """
    if not version:
        return not assume_new
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        outdated = not assume_new
    return outdated
1652
1653
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U, i.e. this module was
    loaded by a zipimporter or sys has a 'frozen' attribute """
    from zipimport import zipimporter

    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
1659
1660
def args_to_str(args):
    """ Get a short string representation for a subprocess command: every
    argument is quoted with shlex_quote and they are joined with spaces """
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
1664
1665
def mimetype2ext(mt):
    """ Derive a file extension from a MIME type: the part after the last
    '/', except for a few subtypes that map to a different extension """
    subtype = mt.rpartition('/')[2]
    special_cases = {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
        'ttml+xml': 'ttml',
    }
    return special_cases.get(subtype, subtype)
1674
1675
def urlhandle_detect_ext(url_handle):
    """ Guess the file extension of a response from its headers

    The filename of an attachment Content-Disposition header is used when
    it yields an extension; otherwise the Content-Type is mapped through
    mimetype2ext. """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(h):
            return url_handle.headers[h]

    cd = getheader('Content-Disposition')
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd) if cd else None
    if m:
        e = determine_ext(m.group('filename'), default_ext=None)
        if e:
            return e

    return mimetype2ext(getheader('Content-Type'))
1692
1693
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked, i.e. both limits
    are set and age_limit is below content_limit """

    if age_limit is None or content_limit is None:
        # No limit set, or content available for everyone
        return False
    return age_limit < content_limit
1702
1703
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes:
    once decoded, they must start with optional whitespace and a '<'.
    Returns the regex match object (truthy) or None. """

    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Without a byte order mark the data is taken as UTF-8
    payload, encoding = first_bytes, 'utf-8'
    for bom, bom_encoding in BOMS:
        if first_bytes.startswith(bom):
            payload, encoding = first_bytes[len(bom):], bom_encoding
            break

    return re.match(r'^\s*<', payload.decode(encoding, 'replace'))
1722
1723
def determine_protocol(info_dict):
    """ Return the protocol of a format: its 'protocol' entry when set,
    else a guess based on the start of its URL, then on the URL extension
    (m3u8, f4m), and finally the URL scheme """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
1744
1745
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values, below a header
    row; every column but the last is left-aligned and padded to its
    widest cell plus one """
    table = [header_row] + data
    max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
    column_formats = ['%%-%ds' % (width + 1) for width in max_lens[:-1]]
    format_str = ' '.join(column_formats) + '%s'
    rendered_rows = [format_str % tuple(row) for row in table]
    return '\n'.join(rendered_rows)
1752
1753
1754 def _match_one(filter_part, dct):
1755     COMPARISON_OPERATORS = {
1756         '<': operator.lt,
1757         '<=': operator.le,
1758         '>': operator.gt,
1759         '>=': operator.ge,
1760         '=': operator.eq,
1761         '!=': operator.ne,
1762     }
1763     operator_rex = re.compile(r'''(?x)\s*
1764         (?P<key>[a-z_]+)
1765         \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1766         (?:
1767             (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1768             (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1769         )
1770         \s*$
1771         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1772     m = operator_rex.search(filter_part)
1773     if m:
1774         op = COMPARISON_OPERATORS[m.group('op')]
1775         if m.group('strval') is not None:
1776             if m.group('op') not in ('=', '!='):
1777                 raise ValueError(
1778                     'Operator %s does not support string values!' % m.group('op'))
1779             comparison_value = m.group('strval')
1780         else:
1781             try:
1782                 comparison_value = int(m.group('intval'))
1783             except ValueError:
1784                 comparison_value = parse_filesize(m.group('intval'))
1785                 if comparison_value is None:
1786                     comparison_value = parse_filesize(m.group('intval') + 'B')
1787                 if comparison_value is None:
1788                     raise ValueError(
1789                         'Invalid integer value %r in filter part %r' % (
1790                             m.group('intval'), filter_part))
1791         actual_value = dct.get(m.group('key'))
1792         if actual_value is None:
1793             return m.group('none_inclusive')
1794         return op(actual_value, comparison_value)
1795
1796     UNARY_OPERATORS = {
1797         '': lambda v: v is not None,
1798         '!': lambda v: v is None,
1799     }
1800     operator_rex = re.compile(r'''(?x)\s*
1801         (?P<op>%s)\s*(?P<key>[a-z_]+)
1802         \s*$
1803         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1804     m = operator_rex.search(filter_part)
1805     if m:
1806         op = UNARY_OPERATORS[m.group('op')]
1807         actual_value = dct.get(m.group('key'))
1808         return op(actual_value)
1809
1810     raise ValueError('Invalid filter part %r' % filter_part)
1811
1812
def match_str(filter_str, dct):
    """Filter a dictionary with a simple string syntax.

    filter_str is split on '&' and every resulting part is checked with
    _match_one().  Returns True if all parts pass and False as soon as one
    part does not.
    """
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
1818
1819
def match_filter_func(filter_str):
    """Build a callable that checks an info dict against filter_str.

    The returned function yields None when the info dict passes the filter
    and otherwise a message explaining that the video is being skipped.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            # Prefer the title, then the id, then a generic placeholder
            fallback_name = info_dict.get('id', 'video')
            video_title = info_dict.get('title', fallback_name)
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
1828
1829
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP time expression to a number of seconds.

    Accepted forms are a plain offset such as '12.5' or '12.5s' and a clock
    value of the form 'H:MM:SS' with optional fractional seconds.  An empty
    or missing expression gives 0.0; an unrecognised one gives None.
    """
    if not time_expr:
        return 0.0

    offset_match = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, secs = clock_match.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(secs)

    return None
1841
1842
def srt_subtitles_timecode(seconds):
    """Format a time offset given in seconds as an SRT timecode.

    The result has the form 'HH:MM:SS,mmm'.  The offset is rounded to the
    nearest millisecond before being split into its fields.
    """
    # Work in whole milliseconds.  Taking (seconds % 1) * 1000 and letting
    # '%d' truncate it drops a millisecond whenever the float is a hair below
    # the intended value (e.g. 1.001 would be rendered as ',000').
    total_msecs = int(round(seconds * 1000))
    total_secs, msecs = divmod(total_msecs, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, mins = divmod(total_mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, msecs)
1845
1846
def dfxp2srt(dfxp_data):
    """Convert a DFXP/TTML subtitle document (as text) to SRT text.

    Paragraph elements are looked up in the 'ttml' namespace first, then in
    'ttaf1', then without a namespace.  Each paragraph becomes one numbered
    SRT cue; its end time comes from the 'end' attribute or, failing that,
    from 'begin' plus 'dur'.

    Raises ValueError if the document contains no paragraph elements.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })
    str_or_empty = functools.partial(str_or_none, default='')

    def parse_node(node):
        # Flatten a node into text: <br> becomes a newline followed by its
        # tail, <span> is expanded recursively (its tail is not appended),
        # and any other element is kept in serialized form.
        text = str_or_empty(node.text)

        for child in node:
            tag = child.tag
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                text += '\n' + str_or_empty(child.tail)
            elif tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                text += str_or_empty(parse_node(child))
            else:
                text += str_or_empty(xml.etree.ElementTree.tostring(child))

        return text

    root = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
    paragraphs = (
        root.findall(_x('.//ttml:p')) or
        root.findall(_x('.//ttaf1:p')) or
        root.findall('.//p'))

    if not paragraphs:
        raise ValueError('Invalid dfxp/TTML subtitle')

    cues = []
    for number, para in enumerate(paragraphs, 1):
        attrs = para.attrib
        start = parse_dfxp_time_expr(attrs['begin'])
        finish = parse_dfxp_time_expr(attrs.get('end'))
        if not finish:
            # No usable 'end' attribute: derive it from the duration
            finish = start + parse_dfxp_time_expr(attrs['dur'])
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            number,
            srt_subtitles_timecode(start),
            srt_subtitles_timecode(finish),
            parse_node(para)))

    return ''.join(cues)
1887
1888
class ISO639Utils(object):
    """Conversions between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Maps two-letter codes to their three-letter counterparts
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked up; None is
        returned when they are not in the table.
        """
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        Returns None when no entry maps to code.
        """
        matches = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matches, None)
2089
2090
class ISO3166Utils(object):
    """Lookup of full country names by two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    # Keys are upper-case two-letter codes, values are country names
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-1 alpha-2 (two-letter) country code to the full name

        The lookup is case-insensitive; None is returned for unknown codes.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
2349
2350
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets a single request choose its own proxy.

    A request carrying a 'Ytdl-request-proxy' header uses that header's value
    as its proxy; the header is removed before the request is passed on.
    The special value '__noproxy__' means the request is not proxied.
    """

    def __init__(self, proxies=None):
        # Set default handlers for both schemes before the base class is
        # initialised; they call proxy_open() with the '__noproxy__' marker.
        # The lambda defaults bind the current scheme and the bound method.
        for scheme in ('http', 'https'):
            default_open = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, default_open)
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            # The header is internal: strip it and let it override the proxy
            del req.headers['Ytdl-request-proxy']
            proxy = per_request_proxy

        if proxy == '__noproxy__':
            return None  # No Proxy

        base_proxy_open = compat_urllib_request.ProxyHandler.proxy_open
        return base_proxy_open(self, req, proxy, type)