2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
32 import xml.etree.ElementTree
42 compat_socket_create_connection,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
# Type of a compiled regular expression. The re module does not expose
# this type under a public name, so derive it from a sample pattern.
compiled_regex_type = type(re.compile(''))
57 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
58 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
59 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
60 'Accept-Encoding': 'gzip, deflate',
61 'Accept-Language': 'en-us,en;q=0.5',
# Locale-independent English month names; used to map month names to
# month numbers regardless of the current locale.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
72 def preferredencoding():
73 """Get preferred encoding.
75 Returns the best encoding scheme for the system, based on
76 locale.getpreferredencoding() and some further tweaks.
79 pref = locale.getpreferredencoding()
87 def write_json_file(obj, fn):
88 """ Encode obj as JSON and write it to fn, atomically if possible """
90 fn = encodeFilename(fn)
91 if sys.version_info < (3, 0) and sys.platform != 'win32':
92 encoding = get_filesystem_encoding()
93 # os.path.basename returns a bytes object, but NamedTemporaryFile
94 # will fail if the filename contains non ascii characters unless we
95 # use a unicode object
96 path_basename = lambda f: os.path.basename(fn).decode(encoding)
97 # the same for os.path.dirname
98 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
100 path_basename = os.path.basename
101 path_dirname = os.path.dirname
105 'prefix': path_basename(fn) + '.',
106 'dir': path_dirname(fn),
110 # In Python 2.x, json.dump expects a bytestream.
111 # In Python 3.x, it writes to a character stream
112 if sys.version_info < (3, 0):
120 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
125 if sys.platform == 'win32':
126 # Need to remove existing file on Windows, else os.rename raises
127 # WindowsError or FileExistsError.
132 os.rename(tf.name, fn)
141 if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val):
    """Return the first element under *node* matching xpath[@key=val]."""
    # Only simple attribute names and values are supported; ElementTree's
    # limited XPath dialect offers no quoting for anything fancier.
    assert re.match(r'^[a-zA-Z-]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
    return node.find("%s[@%s='%s']" % (xpath, key, val))
149 def find_xpath_attr(node, xpath, key, val):
150 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
151 # .//node does not match if a node is a direct child of . !
152 if isinstance(xpath, compat_str):
153 xpath = xpath.encode('ascii')
155 for f in node.findall(xpath):
156 if f.attrib.get(key) == val:
160 # On python2.6 the xml.etree.ElementTree.Element methods don't support
161 # the namespace parameter
164 def xpath_with_ns(path, ns_map):
165 components = [c.split(':') for c in path.split('/')]
169 replaced.append(c[0])
172 replaced.append('{%s}%s' % (ns_map[ns], tag))
173 return '/'.join(replaced)
176 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
177 if sys.version_info < (2, 7): # Crazy 2.6
178 xpath = xpath.encode('ascii')
181 if n is None or n.text is None:
182 if default is not NO_DEFAULT:
185 name = xpath if name is None else name
186 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document.

    Thin convenience wrapper around get_element_by_attribute.
    """
    attribute_name = "id"
    return get_element_by_attribute(attribute_name, id, html)
197 def get_element_by_attribute(attribute, value, html):
198 """Return the content of the tag with the specified attribute in the passed HTML document"""
200 m = re.search(r'''(?xs)
202 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
204 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
208 ''' % (re.escape(attribute), re.escape(value)), html)
212 res = m.group('content')
214 if res.startswith('"') or res.startswith("'"):
217 return unescapeHTML(res)
220 def clean_html(html):
221 """Clean an HTML snippet into a readable string"""
223 if html is None: # Convenience for sanitizing descriptions etc.
227 html = html.replace('\n', ' ')
228 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
229 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
231 html = re.sub('<.*?>', '', html)
232 # Replace html entities
233 html = unescapeHTML(html)
237 def sanitize_open(filename, open_mode):
238 """Try to open the given filename, and slightly tweak it if this fails.
240 Attempts to open the given filename. If this fails, it tries to change
241 the filename slightly, step by step, until it's either able to open it
242 or it fails and raises a final exception, like the standard open()
245 It returns the tuple (stream, definitive_file_name).
249 if sys.platform == 'win32':
251 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
252 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
253 stream = open(encodeFilename(filename), open_mode)
254 return (stream, filename)
255 except (IOError, OSError) as err:
256 if err.errno in (errno.EACCES,):
259 # In case of error, try to remove win32 forbidden chars
260 alt_filename = sanitize_path(filename)
261 if alt_filename == filename:
264 # An exception here should be caught in the caller
265 stream = open(encodeFilename(alt_filename), open_mode)
266 return (stream, alt_filename)
269 def timeconvert(timestr):
270 """Convert RFC 2822 defined time string into system timestamp"""
272 timetuple = email.utils.parsedate_tz(timestr)
273 if timetuple is not None:
274 timestamp = email.utils.mktime_tz(timetuple)
278 def sanitize_filename(s, restricted=False, is_id=False):
279 """Sanitizes a string so it could be used as part of a filename.
280 If restricted is set, use a stricter subset of allowed characters.
281 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
283 def replace_insane(char):
284 if char == '?' or ord(char) < 32 or ord(char) == 127:
287 return '' if restricted else '\''
289 return '_-' if restricted else ' -'
290 elif char in '\\/|*<>':
292 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
294 if restricted and ord(char) > 127:
299 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
300 result = ''.join(map(replace_insane, s))
302 while '__' in result:
303 result = result.replace('__', '_')
304 result = result.strip('_')
305 # Common case of "Foreign band name - English song title"
306 if restricted and result.startswith('-_'):
308 if result.startswith('-'):
309 result = '_' + result[len('-'):]
310 result = result.lstrip('.')
316 def sanitize_path(s):
317 """Sanitizes and normalizes path on Windows"""
318 if sys.platform != 'win32':
320 drive_or_unc, _ = os.path.splitdrive(s)
321 if sys.version_info < (2, 7) and not drive_or_unc:
322 drive_or_unc, _ = os.path.splitunc(s)
323 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
327 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
328 for path_part in norm_path]
330 sanitized_path.insert(0, drive_or_unc + os.path.sep)
331 return os.path.join(*sanitized_path)
334 def orderedSet(iterable):
335 """ Remove all duplicates from the input iterable """
343 def _htmlentity_transform(entity):
344 """Transforms an HTML entity to a character."""
345 # Known non-numeric HTML entity
346 if entity in compat_html_entities.name2codepoint:
347 return compat_chr(compat_html_entities.name2codepoint[entity])
349 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
351 numstr = mobj.group(1)
352 if numstr.startswith('x'):
354 numstr = '0%s' % numstr
357 return compat_chr(int(numstr, base))
359 # Unknown entity in name, return its literal representation
360 return ('&%s;' % entity)
366 assert type(s) == compat_str
369 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
372 def get_subprocess_encoding():
373 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
374 # For subprocess calls, encode with locale encoding
375 # Refer to http://stackoverflow.com/a/9951851/35070
376 encoding = preferredencoding()
378 encoding = sys.getfilesystemencoding()
384 def encodeFilename(s, for_subprocess=False):
386 @param s The name of the file
389 assert type(s) == compat_str
391 # Python 3 has a Unicode API
392 if sys.version_info >= (3, 0):
395 # Pass '' directly to use Unicode APIs on Windows 2000 and up
396 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
397 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
398 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
401 return s.encode(get_subprocess_encoding(), 'ignore')
404 def decodeFilename(b, for_subprocess=False):
406 if sys.version_info >= (3, 0):
409 if not isinstance(b, bytes):
412 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument so it can be passed to a subprocess."""
    if isinstance(s, compat_str):
        return encodeFilename(s, for_subprocess=True)
    # Legacy code path that still passes byte strings.
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), for_subprocess=True)
def decodeArgument(b):
    """Decode a subprocess argument back to text (see decodeFilename)."""
    return decodeFilename(b, for_subprocess=True)
428 def decodeOption(optval):
431 if isinstance(optval, bytes):
432 optval = optval.decode(preferredencoding())
434 assert isinstance(optval, compat_str)
438 def formatSeconds(secs):
440 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
442 return '%d:%02d' % (secs // 60, secs % 60)
447 def make_HTTPS_handler(params, **kwargs):
448 opts_no_check_certificate = params.get('nocheckcertificate', False)
449 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
450 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
451 if opts_no_check_certificate:
452 context.check_hostname = False
453 context.verify_mode = ssl.CERT_NONE
455 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
458 # (create_default_context present but HTTPSHandler has no context=)
461 if sys.version_info < (3, 2):
462 return YoutubeDLHTTPSHandler(params, **kwargs)
464 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
465 context.verify_mode = (ssl.CERT_NONE
466 if opts_no_check_certificate
467 else ssl.CERT_REQUIRED)
468 context.set_default_verify_paths()
469 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
472 def bug_reports_message():
473 if ytdl_is_updateable():
474 update_cmd = 'type youtube-dl -U to update'
476 update_cmd = 'see https://yt-dl.org/update on how to update'
477 msg = '; please report this issue on https://yt-dl.org/bug .'
478 msg += ' Make sure you are using the latest version; %s.' % update_cmd
479 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
483 class ExtractorError(Exception):
484 """Error during info extraction."""
486 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
487 """ tb, if given, is the original traceback (so that it can be printed out).
488 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
491 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
493 if video_id is not None:
494 msg = video_id + ': ' + msg
496 msg += ' (caused by %r)' % cause
498 msg += bug_reports_message()
499 super(ExtractorError, self).__init__(msg)
502 self.exc_info = sys.exc_info() # preserve original exception
504 self.video_id = video_id
506 def format_traceback(self):
507 if self.traceback is None:
509 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised for URLs that no extractor knows how to handle."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        # expected=True: this is a normal condition, not a youtube-dl bug.
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Raised when an expected regular expression did not match."""
524 class DownloadError(Exception):
525 """Download Error exception.
527 This exception may be thrown by FileDownloader objects if they are not
528 configured to continue on errors. They will contain the appropriate
532 def __init__(self, msg, exc_info=None):
533 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
534 super(DownloadError, self).__init__(msg)
535 self.exc_info = exc_info
538 class SameFileError(Exception):
539 """Same File exception.
541 This exception will be thrown by FileDownloader objects if they detect
542 multiple files would have to be downloaded to the same file on disk.
547 class PostProcessingError(Exception):
548 """Post Processing exception.
550 This exception may be raised by PostProcessor's .run() method to
551 indicate an error in the postprocessing task.
554 def __init__(self, msg):
class MaxDownloadsReached(Exception):
    """Raised once the --max-downloads limit has been reached."""
563 class UnavailableVideoError(Exception):
564 """Unavailable Format exception.
566 This exception will be thrown when a video is requested
567 in a format that is not available for that video.
572 class ContentTooShortError(Exception):
573 """Content Too Short exception.
575 This exception may be raised by FileDownloader objects when a file they
576 download is too small for what the server announced first, indicating
577 the connection was probably interrupted.
580 def __init__(self, downloaded, expected):
582 self.downloaded = downloaded
583 self.expected = expected
586 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
587 hc = http_class(*args, **kwargs)
588 source_address = ydl_handler._params.get('source_address')
589 if source_address is not None:
590 sa = (source_address, 0)
591 if hasattr(hc, 'source_address'): # Python 2.7+
592 hc.source_address = sa
594 def _hc_connect(self, *args, **kwargs):
595 sock = compat_socket_create_connection(
596 (self.host, self.port), self.timeout, sa)
598 self.sock = ssl.wrap_socket(
599 sock, self.key_file, self.cert_file,
600 ssl_version=ssl.PROTOCOL_TLSv1)
603 hc.connect = functools.partial(_hc_connect, hc)
608 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
609 """Handler for HTTP requests and responses.
611 This class, when installed with an OpenerDirector, automatically adds
612 the standard headers to every HTTP request and handles gzipped and
613 deflated responses from web servers. If compression is to be avoided in
614 a particular request, the original request in the program code only has
615 to include the HTTP header "Youtubedl-No-Compression", which will be
616 removed before making the real request.
618 Part of this code was copied from:
620 http://techknack.net/python-urllib2-handlers/
622 Andrew Rowls, the author of that code, agreed to release it to the
def __init__(self, params, *args, **kwargs):
    """Remember the youtube-dl params dict and initialize the base HTTPHandler."""
    self._params = params
    compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
630 def http_open(self, req):
631 return self.do_open(functools.partial(
632 _create_http_connection, self, compat_http_client.HTTPConnection, False),
638 return zlib.decompress(data, -zlib.MAX_WBITS)
640 return zlib.decompress(data)
643 def addinfourl_wrapper(stream, headers, url, code):
644 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
645 return compat_urllib_request.addinfourl(stream, headers, url, code)
646 ret = compat_urllib_request.addinfourl(stream, headers, url)
650 def http_request(self, req):
651 for h, v in std_headers.items():
652 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
653 # The dict keys are capitalized because of this bug by urllib
654 if h.capitalize() not in req.headers:
656 if 'Youtubedl-no-compression' in req.headers:
657 if 'Accept-encoding' in req.headers:
658 del req.headers['Accept-encoding']
659 del req.headers['Youtubedl-no-compression']
661 if sys.version_info < (2, 7) and '#' in req.get_full_url():
662 # Python 2.6 is brain-dead when it comes to fragments
663 req._Request__original = req._Request__original.partition('#')[0]
664 req._Request__r_type = req._Request__r_type.partition('#')[0]
668 def http_response(self, req, resp):
671 if resp.headers.get('Content-encoding', '') == 'gzip':
672 content = resp.read()
673 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
675 uncompressed = io.BytesIO(gz.read())
676 except IOError as original_ioerror:
677 # There may be junk add the end of the file
678 # See http://stackoverflow.com/q/4928560/35070 for details
679 for i in range(1, 1024):
681 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
682 uncompressed = io.BytesIO(gz.read())
687 raise original_ioerror
688 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
689 resp.msg = old_resp.msg
691 if resp.headers.get('Content-encoding', '') == 'deflate':
692 gz = io.BytesIO(self.deflate(resp.read()))
693 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
694 resp.msg = old_resp.msg
697 https_request = http_request
698 https_response = http_response
701 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
def __init__(self, params, https_conn_class=None, *args, **kwargs):
    """Initialize the base HTTPSHandler and remember params plus the
    HTTPS connection class to instantiate (defaults to the compat one)."""
    compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
    self._params = params
    self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
707 def https_open(self, req):
709 if hasattr(self, '_context'): # python > 2.6
710 kwargs['context'] = self._context
711 if hasattr(self, '_check_hostname'): # python 3.x
712 kwargs['check_hostname'] = self._check_hostname
713 return self.do_open(functools.partial(
714 _create_http_connection, self, self._https_conn_class, True),
718 def parse_iso8601(date_str, delimiter='T', timezone=None):
719 """ Return a UNIX timestamp from the given date """
726 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
729 timezone = datetime.timedelta()
731 date_str = date_str[:-len(m.group(0))]
732 if not m.group('sign'):
733 timezone = datetime.timedelta()
735 sign = 1 if m.group('sign') == '+' else -1
736 timezone = datetime.timedelta(
737 hours=sign * int(m.group('hours')),
738 minutes=sign * int(m.group('minutes')))
739 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
740 dt = datetime.datetime.strptime(date_str, date_format) - timezone
741 return calendar.timegm(dt.timetuple())
744 def unified_strdate(date_str, day_first=True):
745 """Return a string with the date in the format YYYYMMDD"""
751 date_str = date_str.replace(',', ' ')
752 # %z (UTC offset) is only supported in python>=3.2
753 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
754 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
755 # Remove AM/PM + timezone
756 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
758 format_expressions = [
763 '%b %dst %Y %I:%M%p',
764 '%b %dnd %Y %I:%M%p',
765 '%b %dth %Y %I:%M%p',
771 '%Y-%m-%d %H:%M:%S.%f',
774 '%Y-%m-%dT%H:%M:%SZ',
775 '%Y-%m-%dT%H:%M:%S.%fZ',
776 '%Y-%m-%dT%H:%M:%S.%f0Z',
778 '%Y-%m-%dT%H:%M:%S.%f',
782 format_expressions.extend([
790 format_expressions.extend([
797 for expression in format_expressions:
799 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
802 if upload_date is None:
803 timetuple = email.utils.parsedate_tz(date_str)
805 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
809 def determine_ext(url, default_ext='unknown_video'):
812 guess = url.partition('?')[0].rpartition('.')[2]
813 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name from the media filename, language code
    and subtitle format, replacing the media extension."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
823 def date_from_str(date_str):
825 Return a datetime object from a string in the format YYYYMMDD or
826 (now|today)[+-][0-9](day|week|month|year)(s)?"""
827 today = datetime.date.today()
828 if date_str in ('now', 'today'):
830 if date_str == 'yesterday':
831 return today - datetime.timedelta(days=1)
832 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
833 if match is not None:
834 sign = match.group('sign')
835 time = int(match.group('time'))
838 unit = match.group('unit')
839 # A bad aproximation?
847 delta = datetime.timedelta(**{unit: time})
849 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
852 def hyphenate_date(date_str):
854 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
855 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
856 if match is not None:
857 return '-'.join(match.groups())
862 class DateRange(object):
863 """Represents a time interval between two dates"""
865 def __init__(self, start=None, end=None):
866 """start and end must be strings in the format accepted by date"""
867 if start is not None:
868 self.start = date_from_str(start)
870 self.start = datetime.datetime.min.date()
872 self.end = date_from_str(end)
874 self.end = datetime.datetime.max.date()
875 if self.start > self.end:
876 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
880 """Returns a range that only contains the given day"""
def __contains__(self, date):
    """Check whether *date* (a datetime.date, or a string parseable by
    date_from_str) lies within [self.start, self.end]."""
    candidate = date if isinstance(date, datetime.date) else date_from_str(date)
    return self.start <= candidate <= self.end
890 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
894 """ Returns the platform name as a compat_str """
895 res = platform.platform()
896 if isinstance(res, bytes):
897 res = res.decode(preferredencoding())
899 assert isinstance(res, compat_str)
903 def _windows_write_string(s, out):
904 """ Returns True if the string was written using special methods,
905 False if it has yet to be written out."""
906 # Adapted from http://stackoverflow.com/a/3259271/35070
909 import ctypes.wintypes
917 fileno = out.fileno()
918 except AttributeError:
919 # If the output stream doesn't have a fileno, it's virtual
921 except io.UnsupportedOperation:
922 # Some strange Windows pseudo files?
924 if fileno not in WIN_OUTPUT_IDS:
927 GetStdHandle = ctypes.WINFUNCTYPE(
928 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
929 (b"GetStdHandle", ctypes.windll.kernel32))
930 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
932 WriteConsoleW = ctypes.WINFUNCTYPE(
933 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
934 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
935 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
936 written = ctypes.wintypes.DWORD(0)
938 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
939 FILE_TYPE_CHAR = 0x0002
940 FILE_TYPE_REMOTE = 0x8000
941 GetConsoleMode = ctypes.WINFUNCTYPE(
942 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
943 ctypes.POINTER(ctypes.wintypes.DWORD))(
944 (b"GetConsoleMode", ctypes.windll.kernel32))
945 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
947 def not_a_console(handle):
948 if handle == INVALID_HANDLE_VALUE or handle is None:
950 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
951 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
956 def next_nonbmp_pos(s):
958 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
959 except StopIteration:
963 count = min(next_nonbmp_pos(s), 1024)
966 h, s, count if count else 2, ctypes.byref(written), None)
968 raise OSError('Failed to write string')
969 if not count: # We just wrote a non-BMP character
970 assert written.value == 2
973 assert written.value > 0
974 s = s[written.value:]
978 def write_string(s, out=None, encoding=None):
981 assert type(s) == compat_str
983 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
984 if _windows_write_string(s, out):
987 if ('b' in getattr(out, 'mode', '') or
988 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
989 byt = s.encode(encoding or preferredencoding(), 'ignore')
991 elif hasattr(out, 'buffer'):
992 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
993 byt = s.encode(enc, 'ignore')
994 out.buffer.write(byt)
1000 def bytes_to_intlist(bs):
1003 if isinstance(bs[0], int): # Python 3
1006 return [ord(c) for c in bs]
1009 def intlist_to_bytes(xs):
1012 return struct_pack('%dB' % len(xs), *xs)
1015 # Cross-platform file locking
1016 if sys.platform == 'win32':
1017 import ctypes.wintypes
1020 class OVERLAPPED(ctypes.Structure):
1022 ('Internal', ctypes.wintypes.LPVOID),
1023 ('InternalHigh', ctypes.wintypes.LPVOID),
1024 ('Offset', ctypes.wintypes.DWORD),
1025 ('OffsetHigh', ctypes.wintypes.DWORD),
1026 ('hEvent', ctypes.wintypes.HANDLE),
1029 kernel32 = ctypes.windll.kernel32
1030 LockFileEx = kernel32.LockFileEx
1031 LockFileEx.argtypes = [
1032 ctypes.wintypes.HANDLE, # hFile
1033 ctypes.wintypes.DWORD, # dwFlags
1034 ctypes.wintypes.DWORD, # dwReserved
1035 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1036 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1037 ctypes.POINTER(OVERLAPPED) # Overlapped
1039 LockFileEx.restype = ctypes.wintypes.BOOL
1040 UnlockFileEx = kernel32.UnlockFileEx
1041 UnlockFileEx.argtypes = [
1042 ctypes.wintypes.HANDLE, # hFile
1043 ctypes.wintypes.DWORD, # dwReserved
1044 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1045 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1046 ctypes.POINTER(OVERLAPPED) # Overlapped
1048 UnlockFileEx.restype = ctypes.wintypes.BOOL
1049 whole_low = 0xffffffff
1050 whole_high = 0x7fffffff
def _lock_file(f, exclusive):
    # Lock the whole file via the Win32 LockFileEx API.
    overlapped = OVERLAPPED()
    overlapped.Offset = 0
    overlapped.OffsetHigh = 0
    overlapped.hEvent = 0
    # Stash the pointer on the file object so the OVERLAPPED structure
    # stays alive until the matching _unlock_file call.
    f._lock_file_overlapped_p = ctypes.pointer(overlapped)
    handle = msvcrt.get_osfhandle(f.fileno())
    # 0x2 == LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock.
    if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                      whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Locking file failed: %r' % ctypes.FormatError())
def _unlock_file(f):
    # Release the byte range locked by _lock_file via UnlockFileEx,
    # reusing the OVERLAPPED pointer stored on the file object.
    assert f._lock_file_overlapped_p
    handle = msvcrt.get_osfhandle(f.fileno())
    if not UnlockFileEx(handle, 0,
                        whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1073 def _lock_file(f, exclusive):
1074 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1076 def _unlock_file(f):
1077 fcntl.flock(f, fcntl.LOCK_UN)
1080 class locked_file(object):
1081 def __init__(self, filename, mode, encoding=None):
1082 assert mode in ['r', 'a', 'w']
1083 self.f = io.open(filename, mode, encoding=encoding)
1086 def __enter__(self):
1087 exclusive = self.mode != 'r'
1089 _lock_file(self.f, exclusive)
1095 def __exit__(self, etype, value, traceback):
1097 _unlock_file(self.f)
def write(self, *args):
    # Pass-through to the underlying file object opened by this wrapper.
    return self.f.write(*args)
def read(self, *args):
    # Pass-through read on the underlying file object.
    return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to utf-8 when the
    interpreter cannot determine one."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        enc = 'utf-8'
    return enc
1116 def shell_quote(args):
1118 encoding = get_filesystem_encoding()
1120 if isinstance(a, bytes):
1121 # We may get a filename encoded with 'encodeFilename'
1122 a = a.decode(encoding)
1123 quoted_args.append(pipes.quote(a))
1124 return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # JSON-encode the payload and stash it in the fragment, where it does
    # not affect the request sent to the server.
    payload = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, payload)
1135 def unsmuggle_url(smug_url, default=None):
1136 if '#__youtubedl_smuggle' not in smug_url:
1137 return smug_url, default
1138 url, _, sdata = smug_url.rpartition('#')
1139 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1140 data = json.loads(jsond)
1144 def format_bytes(bytes):
1147 if type(bytes) is str:
1148 bytes = float(bytes)
1152 exponent = int(math.log(bytes, 1024.0))
1153 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1154 converted = float(bytes) / float(1024 ** exponent)
1155 return '%.2f%s' % (converted, suffix)
1158 def parse_filesize(s):
1162 # The lower-case forms are of course incorrect and inofficial,
1163 # but we support those too
1201 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1203 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1207 num_str = m.group('num').replace(',', '.')
1208 mult = _UNIT_TABLE[m.group('unit')]
1209 return int(float(num_str) * mult)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name,
    or None if the name is not a full English month name. """
    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        # Unknown month name: report "not found" instead of raising,
        # consistent with the other *_or_none style helpers in this file.
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    three-letter abbreviation, or None if it is not recognized. """
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        # Unknown abbreviation: mirror month_by_name's None-on-miss contract.
        return None
1231 def fix_xml_ampersands(xml_str):
1232 """Replace all the '&' by '&' in XML"""
1234 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1239 def setproctitle(title):
1240 assert isinstance(title, compat_str)
1242 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1245 title_bytes = title.encode('utf-8')
1246 buf = ctypes.create_string_buffer(len(title_bytes))
1247 buf.value = title_bytes
1249 libc.prctl(15, buf, 0, 0, 0)
1250 except AttributeError:
1251 return # Strange libc, just skip this
def remove_start(s, start):
    """Return s with the prefix *start* removed, or s unchanged when the
    prefix is absent."""
    if s.startswith(start):
        return s[len(start):]
    # Explicitly return the original string so callers (e.g. sanitize_path)
    # always receive a str, never None.
    return s
def remove_end(s, end):
    """Return s with the suffix *end* removed, or s unchanged when the
    suffix is absent or empty."""
    # Guard against end == '': s.endswith('') is always True and
    # s[:-0] would wrongly yield the empty string.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the last path segment of *url* (query and fragment excluded
    by urlparse; surrounding slashes stripped)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
1271 class HEADRequest(compat_urllib_request.Request):
1272 def get_method(self):
1276 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1279 v = getattr(v, get_attr, None)
1282 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Coerce v to compat_str, passing *default* through when v is None."""
    if v is None:
        return default
    return compat_str(v)
1289 def str_to_int(int_str):
1290 """ A more relaxed version of int_or_none """
1293 int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, multiplied by invscale and divided by scale;
    return *default* when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1301 def parse_duration(s):
1302 if not isinstance(s, compat_basestring):
1310 (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
1311 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1313 \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
1316 (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1317 (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1319 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1321 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1326 if m.group('only_mins'):
1327 return float_or_none(m.group('only_mins'), invscale=60)
1328 if m.group('only_hours'):
1329 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1331 res += int(m.group('secs'))
1332 if m.group('mins_reversed'):
1333 res += int(m.group('mins_reversed')) * 60
1335 res += int(m.group('mins')) * 60
1336 if m.group('hours'):
1337 res += int(m.group('hours')) * 60 * 60
1338 if m.group('hours_reversed'):
1339 res += int(m.group('hours_reversed')) * 60 * 60
1341 res += int(m.group('days')) * 24 * 60 * 60
1343 res += float(m.group('ms'))
1347 def prepend_extension(filename, ext, expected_real_ext=None):
1348 name, real_ext = os.path.splitext(filename)
1350 '{0}.{1}{2}'.format(name, ext, real_ext)
1351 if not expected_real_ext or real_ext[1:] == expected_real_ext
1352 else '{0}.{1}'.format(filename, ext))
1355 def replace_extension(filename, ext, expected_real_ext=None):
1356 name, real_ext = os.path.splitext(filename)
1357 return '{0}.{1}'.format(
1358 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1362 def check_executable(exe, args=[]):
1363 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1364 args can be a list of arguments for a short output (like -version) """
1366 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1372 def get_exe_version(exe, args=['--version'],
1373 version_re=None, unrecognized='present'):
1374 """ Returns the version of the specified executable,
1375 or False if the executable is not present """
1377 out, _ = subprocess.Popen(
1378 [encodeArgument(exe)] + args,
1379 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1382 if isinstance(out, bytes): # Python 2.x
1383 out = out.decode('ascii', 'ignore')
1384 return detect_exe_version(out, version_re, unrecognized)
1387 def detect_exe_version(output, version_re=None, unrecognized='present'):
1388 assert isinstance(output, compat_str)
1389 if version_re is None:
1390 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1391 m = re.search(version_re, output)
1398 class PagedList(object):
1400 # This is only useful for tests
1401 return len(self.getslice())
1404 class OnDemandPagedList(PagedList):
1405 def __init__(self, pagefunc, pagesize):
1406 self._pagefunc = pagefunc
1407 self._pagesize = pagesize
1409 def getslice(self, start=0, end=None):
1411 for pagenum in itertools.count(start // self._pagesize):
1412 firstid = pagenum * self._pagesize
1413 nextfirstid = pagenum * self._pagesize + self._pagesize
1414 if start >= nextfirstid:
1417 page_results = list(self._pagefunc(pagenum))
1420 start % self._pagesize
1421 if firstid <= start < nextfirstid
1425 ((end - 1) % self._pagesize) + 1
1426 if (end is not None and firstid <= end <= nextfirstid)
1429 if startv != 0 or endv is not None:
1430 page_results = page_results[startv:endv]
1431 res.extend(page_results)
1433 # A little optimization - if current page is not "full", ie. does
1434 # not contain page_size videos then we can assume that this page
1435 # is the last one - there are no more ids on further pages -
1436 # i.e. no need to query again.
1437 if len(page_results) + startv < self._pagesize:
1440 # If we got the whole page, but the next page is not interesting,
1441 # break out early as well
1442 if end == nextfirstid:
1447 class InAdvancePagedList(PagedList):
1448 def __init__(self, pagefunc, pagecount, pagesize):
1449 self._pagefunc = pagefunc
1450 self._pagecount = pagecount
1451 self._pagesize = pagesize
1453 def getslice(self, start=0, end=None):
1455 start_page = start // self._pagesize
1457 self._pagecount if end is None else (end // self._pagesize + 1))
1458 skip_elems = start - start_page * self._pagesize
1459 only_more = None if end is None else end - start
1460 for pagenum in range(start_page, end_page):
1461 page = list(self._pagefunc(pagenum))
1463 page = page[skip_elems:]
1465 if only_more is not None:
1466 if len(page) < only_more:
1467 only_more -= len(page)
1469 page = page[:only_more]
1476 def uppercase_escape(s):
1477 unicode_escape = codecs.getdecoder('unicode_escape')
1479 r'\\U[0-9a-fA-F]{8}',
1480 lambda m: unicode_escape(m.group(0))[0],
1484 def lowercase_escape(s):
1485 unicode_escape = codecs.getdecoder('unicode_escape')
1487 r'\\u[0-9a-fA-F]{4}',
1488 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() cannot handle unicode objects directly,
    # so pre-encode them to UTF-8 bytes first.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    safe_characters = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_characters)
1499 def escape_url(url):
1500 """Escape URL as suggested by RFC 3986"""
1501 url_parsed = compat_urllib_parse_urlparse(url)
1502 return url_parsed._replace(
1503 path=escape_rfc3986(url_parsed.path),
1504 params=escape_rfc3986(url_parsed.params),
1505 query=escape_rfc3986(url_parsed.query),
1506 fragment=escape_rfc3986(url_parsed.fragment)
1510 struct.pack('!I', 0)
1512 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1513 def struct_pack(spec, *args):
1514 if isinstance(spec, compat_str):
1515 spec = spec.encode('ascii')
1516 return struct.pack(spec, *args)
1518 def struct_unpack(spec, *args):
1519 if isinstance(spec, compat_str):
1520 spec = spec.encode('ascii')
1521 return struct.unpack(spec, *args)
1523 struct_pack = struct.pack
1524 struct_unpack = struct.unpack
1527 def read_batch_urls(batch_fd):
1529 if not isinstance(url, compat_str):
1530 url = url.decode('utf-8', 'replace')
1531 BOM_UTF8 = '\xef\xbb\xbf'
1532 if url.startswith(BOM_UTF8):
1533 url = url[len(BOM_UTF8):]
1535 if url.startswith(('#', ';', ']')):
1539 with contextlib.closing(batch_fd) as fd:
1540 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return it as an ASCII byte string,
    suitable for use as the body of an HTTP POST request."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1548 etree_iter = xml.etree.ElementTree.Element.iter
1549 except AttributeError: # Python <=2.6
1550 etree_iter = lambda n: n.findall('.//*')
1554 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1555 def doctype(self, name, pubid, system):
1556 pass # Ignore doctypes
1558 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1559 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1560 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1561 # Fix up XML parser in Python 2.x
1562 if sys.version_info < (3, 0):
1563 for n in etree_iter(tree):
1564 if n.text is not None:
1565 if not isinstance(n.text, compat_str):
1566 n.text = n.text.decode('utf-8')
1579 def parse_age_limit(s):
1582 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1583 return int(m.group('age')) if m else US_RATINGS.get(s, None)
1586 def strip_jsonp(code):
1588 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1591 def js_to_json(code):
1594 if v in ('true', 'false', 'null'):
1596 if v.startswith('"'):
1598 if v.startswith("'"):
1600 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1607 res = re.sub(r'''(?x)
1608 "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
1609 '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
1610 [a-zA-Z_][.a-zA-Z_0-9]*
1612 res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
1616 def qualities(quality_ids):
1617 """ Get a numeric quality value out of a list of possible values """
1620 return quality_ids.index(qid)
1626 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1629 def limit_length(s, length):
1630 """ Add ellipses to overly long strings """
1635 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a version string on dots and dashes into a tuple of ints,
    so versions can be compared numerically rather than lexically."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
1643 def is_outdated_version(version, limit, assume_new=True):
1645 return not assume_new
1647 return version_tuple(version) < version_tuple(limit)
1649 return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # -U only works when running from the distributed zip bundle
    # (module loaded by zipimporter) or from a frozen executable.
    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    """Return a short shell-style string representation of a subprocess
    command (each argument quoted, joined by spaces)."""
    quoted_args = (shlex_quote(arg) for arg in args)
    return ' '.join(quoted_args)
1664 def mimetype2ext(mt):
1665 _, _, res = mt.rpartition('/')
1669 'x-mp4-fragmented': 'mp4',
1674 def urlhandle_detect_ext(url_handle):
1677 getheader = lambda h: url_handle.headers[h]
1678 except AttributeError: # Python < 3
1679 getheader = url_handle.info().getheader
1681 cd = getheader('Content-Disposition')
1683 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1685 e = determine_ext(m.group('filename'), default_ext=None)
1689 return mimetype2ext(getheader('Content-Type'))
1692 def age_restricted(content_limit, age_limit):
1693 """ Returns True iff the content should be blocked """
1695 if age_limit is None: # No limit set
1697 if content_limit is None:
1698 return False # Content available for everyone
1699 return age_limit < content_limit
1702 def is_html(first_bytes):
1703 """ Detect whether a file contains HTML by examining its first bytes. """
1706 (b'\xef\xbb\xbf', 'utf-8'),
1707 (b'\x00\x00\xfe\xff', 'utf-32-be'),
1708 (b'\xff\xfe\x00\x00', 'utf-32-le'),
1709 (b'\xff\xfe', 'utf-16-le'),
1710 (b'\xfe\xff', 'utf-16-be'),
1712 for bom, enc in BOMS:
1713 if first_bytes.startswith(bom):
1714 s = first_bytes[len(bom):].decode(enc, 'replace')
1717 s = first_bytes.decode('utf-8', 'replace')
1719 return re.match(r'^\s*<', s)
1722 def determine_protocol(info_dict):
1723 protocol = info_dict.get('protocol')
1724 if protocol is not None:
1727 url = info_dict['url']
1728 if url.startswith('rtmp'):
1730 elif url.startswith('mms'):
1732 elif url.startswith('rtsp'):
1735 ext = determine_ext(url)
1741 return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Width of each column = longest cell in it (stringified).
    widths = [max(len(compat_str(cell)) for cell in column)
              for column in zip(*rows)]
    # Left-align every column but the last, padding to width + 1;
    # the ' '.join adds one more separating space between columns.
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
1752 def _match_one(filter_part, dct):
1753 COMPARISON_OPERATORS = {
1761 operator_rex = re.compile(r'''(?x)\s*
1763 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1765 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1766 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1769 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1770 m = operator_rex.search(filter_part)
1772 op = COMPARISON_OPERATORS[m.group('op')]
1773 if m.group('strval') is not None:
1774 if m.group('op') not in ('=', '!='):
1776 'Operator %s does not support string values!' % m.group('op'))
1777 comparison_value = m.group('strval')
1780 comparison_value = int(m.group('intval'))
1782 comparison_value = parse_filesize(m.group('intval'))
1783 if comparison_value is None:
1784 comparison_value = parse_filesize(m.group('intval') + 'B')
1785 if comparison_value is None:
1787 'Invalid integer value %r in filter part %r' % (
1788 m.group('intval'), filter_part))
1789 actual_value = dct.get(m.group('key'))
1790 if actual_value is None:
1791 return m.group('none_inclusive')
1792 return op(actual_value, comparison_value)
1795 '': lambda v: v is not None,
1796 '!': lambda v: v is None,
1798 operator_rex = re.compile(r'''(?x)\s*
1799 (?P<op>%s)\s*(?P<key>[a-z_]+)
1801 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1802 m = operator_rex.search(filter_part)
1804 op = UNARY_OPERATORS[m.group('op')]
1805 actual_value = dct.get(m.group('key'))
1806 return op(actual_value)
1808 raise ValueError('Invalid filter part %r' % filter_part)
1811 def match_str(filter_str, dct):
1812 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
1815 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
1818 def match_filter_func(filter_str):
1819 def _match_func(info_dict):
1820 if match_str(filter_str, info_dict):
1823 video_title = info_dict.get('title', info_dict.get('id', 'video'))
1824 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
1828 def parse_dfxp_time_expr(time_expr):
1832 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
1834 return float(mobj.group('time_offset'))
1836 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
1838 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode: HH:MM:SS,mmm."""
    whole = int(seconds)
    hours, remainder = divmod(whole, 3600)
    minutes, secs = divmod(remainder, 60)
    millis = int((seconds - whole) * 1000)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
1845 def dfxp2srt(dfxp_data):
1846 _x = functools.partial(xpath_with_ns, ns_map={
1847 'ttml': 'http://www.w3.org/ns/ttml',
1848 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
1851 def parse_node(node):
1852 str_or_empty = functools.partial(str_or_none, default='')
1854 out = str_or_empty(node.text)
1857 if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
1858 out += '\n' + str_or_empty(child.tail)
1859 elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
1860 out += str_or_empty(parse_node(child))
1862 out += str_or_empty(xml.etree.ElementTree.tostring(child))
1866 dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
1868 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
1871 raise ValueError('Invalid dfxp/TTML subtitle')
1873 for para, index in zip(paras, itertools.count(1)):
1874 begin_time = parse_dfxp_time_expr(para.attrib['begin'])
1875 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
1877 end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
1878 out.append('%d\n%s --> %s\n%s\n\n' % (
1880 srt_subtitles_timecode(begin_time),
1881 srt_subtitles_timecode(end_time),
1887 class ISO639Utils(object):
1888 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
def short2long(cls, code):
    """Convert language code from ISO 639-1 to ISO 639-2/T"""
    # Only the first two characters identify the ISO 639-1 code;
    # anything after (e.g. a region suffix) is ignored.
    prefix = code[:2]
    return cls._lang_map.get(prefix)
2082 def long2short(cls, code):
2083 """Convert language code from ISO 639-2/T to ISO 639-1"""
2084 for short_name, long_name in cls._lang_map.items():
2085 if long_name == code:
2089 class ISO3166Utils(object):
2090 # From http://data.okfn.org/data/core/country-list
2092 'AF': 'Afghanistan',
2093 'AX': 'Ã…land Islands',
2096 'AS': 'American Samoa',
2101 'AG': 'Antigua and Barbuda',
2118 'BO': 'Bolivia, Plurinational State of',
2119 'BQ': 'Bonaire, Sint Eustatius and Saba',
2120 'BA': 'Bosnia and Herzegovina',
2122 'BV': 'Bouvet Island',
2124 'IO': 'British Indian Ocean Territory',
2125 'BN': 'Brunei Darussalam',
2127 'BF': 'Burkina Faso',
2133 'KY': 'Cayman Islands',
2134 'CF': 'Central African Republic',
2138 'CX': 'Christmas Island',
2139 'CC': 'Cocos (Keeling) Islands',
2143 'CD': 'Congo, the Democratic Republic of the',
2144 'CK': 'Cook Islands',
2146 'CI': 'Côte d\'Ivoire',
2151 'CZ': 'Czech Republic',
2155 'DO': 'Dominican Republic',
2158 'SV': 'El Salvador',
2159 'GQ': 'Equatorial Guinea',
2163 'FK': 'Falkland Islands (Malvinas)',
2164 'FO': 'Faroe Islands',
2168 'GF': 'French Guiana',
2169 'PF': 'French Polynesia',
2170 'TF': 'French Southern Territories',
2185 'GW': 'Guinea-Bissau',
2188 'HM': 'Heard Island and McDonald Islands',
2189 'VA': 'Holy See (Vatican City State)',
2196 'IR': 'Iran, Islamic Republic of',
2199 'IM': 'Isle of Man',
2209 'KP': 'Korea, Democratic People\'s Republic of',
2210 'KR': 'Korea, Republic of',
2213 'LA': 'Lao People\'s Democratic Republic',
2219 'LI': 'Liechtenstein',
2223 'MK': 'Macedonia, the Former Yugoslav Republic of',
2230 'MH': 'Marshall Islands',
2236 'FM': 'Micronesia, Federated States of',
2237 'MD': 'Moldova, Republic of',
2248 'NL': 'Netherlands',
2249 'NC': 'New Caledonia',
2250 'NZ': 'New Zealand',
2255 'NF': 'Norfolk Island',
2256 'MP': 'Northern Mariana Islands',
2261 'PS': 'Palestine, State of',
2263 'PG': 'Papua New Guinea',
2266 'PH': 'Philippines',
2270 'PR': 'Puerto Rico',
2274 'RU': 'Russian Federation',
2276 'BL': 'Saint Barthélemy',
2277 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2278 'KN': 'Saint Kitts and Nevis',
2279 'LC': 'Saint Lucia',
2280 'MF': 'Saint Martin (French part)',
2281 'PM': 'Saint Pierre and Miquelon',
2282 'VC': 'Saint Vincent and the Grenadines',
2285 'ST': 'Sao Tome and Principe',
2286 'SA': 'Saudi Arabia',
2290 'SL': 'Sierra Leone',
2292 'SX': 'Sint Maarten (Dutch part)',
2295 'SB': 'Solomon Islands',
2297 'ZA': 'South Africa',
2298 'GS': 'South Georgia and the South Sandwich Islands',
2299 'SS': 'South Sudan',
2304 'SJ': 'Svalbard and Jan Mayen',
2307 'CH': 'Switzerland',
2308 'SY': 'Syrian Arab Republic',
2309 'TW': 'Taiwan, Province of China',
2311 'TZ': 'Tanzania, United Republic of',
2313 'TL': 'Timor-Leste',
2317 'TT': 'Trinidad and Tobago',
2320 'TM': 'Turkmenistan',
2321 'TC': 'Turks and Caicos Islands',
2325 'AE': 'United Arab Emirates',
2326 'GB': 'United Kingdom',
2327 'US': 'United States',
2328 'UM': 'United States Minor Outlying Islands',
2332 'VE': 'Venezuela, Bolivarian Republic of',
2334 'VG': 'Virgin Islands, British',
2335 'VI': 'Virgin Islands, U.S.',
2336 'WF': 'Wallis and Futuna',
2337 'EH': 'Western Sahara',
def short2full(cls, code):
    """Convert an ISO 3166-2 country code to the corresponding full name"""
    # The map is keyed by uppercase codes; normalize the lookup key.
    return cls._country_map.get(code.upper())
2349 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2350 def __init__(self, proxies=None):
2351 # Set default handlers
2352 for type in ('http', 'https'):
2353 setattr(self, '%s_open' % type,
2354 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2355 meth(r, proxy, type))
2356 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2358 def proxy_open(self, req, proxy, type):
2359 req_proxy = req.headers.get('Ytdl-request-proxy')
2360 if req_proxy is not None:
2362 del req.headers['Ytdl-request-proxy']
2364 if proxy == '__noproxy__':
2365 return None # No Proxy
2366 return compat_urllib_request.ProxyHandler.proxy_open(
2367 self, req, proxy, type)