_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import functools
  14 import gzip
  15 import itertools
  16 import io
  17 import json
  18 import locale
  19 import math
  20 import operator
  21 import os
  22 import pipes
  23 import platform
  24 import re
  25 import ssl
  26 import socket
  27 import struct
  28 import subprocess
  29 import sys
  30 import tempfile
  31 import traceback
  32 import xml.etree.ElementTree
  33 import zlib
  34
  35 from .compat import (
  36     compat_basestring,
  37     compat_chr,
  38     compat_html_entities,
  39     compat_http_client,
  40     compat_kwargs,
  41     compat_parse_qs,
  42     compat_socket_create_connection,
  43     compat_str,
  44     compat_urllib_error,
  45     compat_urllib_parse,
  46     compat_urllib_parse_urlparse,
  47     compat_urllib_request,
  48     compat_urlparse,
  49     shlex_quote,
  50 )
  51
  52
# This is not clearly defined otherwise
# (obtained here as the type of the object returned by re.compile)
compiled_regex_type = type(re.compile(''))

# Default HTTP headers. YoutubeDLHandler.http_request adds each of them to an
# outgoing request unless the request already carries that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Unique sentinel meaning "no default was supplied", so that None can still
# be passed as a genuine default value (see xpath_text).
NO_DEFAULT = object()

# Full English month names in calendar order (index 0 is January).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
  70
  71
  72 def preferredencoding():
  73     """Get preferred encoding.
  74
  75     Returns the best encoding scheme for the system, based on
  76     locale.getpreferredencoding() and some further tweaks.
  77     """
  78     try:
  79         pref = locale.getpreferredencoding()
  80         'TEST'.encode(pref)
  81     except Exception:
  82         pref = 'UTF-8'
  83
  84     return pref
  85
  86
  87 def write_json_file(obj, fn):
  88     """ Encode obj as JSON and write it to fn, atomically if possible """
  89
  90     fn = encodeFilename(fn)
  91     if sys.version_info < (3, 0) and sys.platform != 'win32':
  92         encoding = get_filesystem_encoding()
  93         # os.path.basename returns a bytes object, but NamedTemporaryFile
  94         # will fail if the filename contains non ascii characters unless we
  95         # use a unicode object
  96         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  97         # the same for os.path.dirname
  98         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  99     else:
 100         path_basename = os.path.basename
 101         path_dirname = os.path.dirname
 102
 103     args = {
 104         'suffix': '.tmp',
 105         'prefix': path_basename(fn) + '.',
 106         'dir': path_dirname(fn),
 107         'delete': False,
 108     }
 109
 110     # In Python 2.x, json.dump expects a bytestream.
 111     # In Python 3.x, it writes to a character stream
 112     if sys.version_info < (3, 0):
 113         args['mode'] = 'wb'
 114     else:
 115         args.update({
 116             'mode': 'w',
 117             'encoding': 'utf-8',
 118         })
 119
 120     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 121
 122     try:
 123         with tf:
 124             json.dump(obj, tf)
 125         if sys.platform == 'win32':
 126             # Need to remove existing file on Windows, else os.rename raises
 127             # WindowsError or FileExistsError.
 128             try:
 129                 os.unlink(fn)
 130             except OSError:
 131                 pass
 132         os.rename(tf.name, fn)
 133     except Exception:
 134         try:
 135             os.remove(tf.name)
 136         except OSError:
 137             pass
 138         raise
 139
 140
 141 if sys.version_info >= (2, 7):
 142     def find_xpath_attr(node, xpath, key, val=None):
 143         """ Find the xpath xpath[@key=val] """
 144         assert re.match(r'^[a-zA-Z-]+$', key)
 145         if val:
 146             assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 147         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 148         return node.find(expr)
 149 else:
 150     def find_xpath_attr(node, xpath, key, val=None):
 151         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 152         # .//node does not match if a node is a direct child of . !
 153         if isinstance(xpath, compat_str):
 154             xpath = xpath.encode('ascii')
 155
 156         for f in node.findall(xpath):
 157             if key not in f.attrib:
 158                 continue
 159             if val is None or f.attrib.get(key) == val:
 160                 return f
 161         return None
 162
 163 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 164 # the namespace parameter
 165
 166
 167 def xpath_with_ns(path, ns_map):
 168     components = [c.split(':') for c in path.split('/')]
 169     replaced = []
 170     for c in components:
 171         if len(c) == 1:
 172             replaced.append(c[0])
 173         else:
 174             ns, tag = c
 175             replaced.append('{%s}%s' % (ns_map[ns], tag))
 176     return '/'.join(replaced)
 177
 178
 179 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 180     if sys.version_info < (2, 7):  # Crazy 2.6
 181         xpath = xpath.encode('ascii')
 182
 183     n = node.find(xpath)
 184     if n is None or n.text is None:
 185         if default is not NO_DEFAULT:
 186             return default
 187         elif fatal:
 188             name = xpath if name is None else name
 189             raise ExtractorError('Could not find XML element %s' % name)
 190         else:
 191             return None
 192     return n.text
 193
 194
 195 def get_element_by_id(id, html):
 196     """Return the content of the tag with the specified ID in the passed HTML document"""
 197     return get_element_by_attribute("id", id, html)
 198
 199
 200 def get_element_by_attribute(attribute, value, html):
 201     """Return the content of the tag with the specified attribute in the passed HTML document"""
 202
 203     m = re.search(r'''(?xs)
 204         <([a-zA-Z0-9:._-]+)
 205          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 206          \s+%s=['"]?%s['"]?
 207          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 208         \s*>
 209         (?P<content>.*?)
 210         </\1>
 211     ''' % (re.escape(attribute), re.escape(value)), html)
 212
 213     if not m:
 214         return None
 215     res = m.group('content')
 216
 217     if res.startswith('"') or res.startswith("'"):
 218         res = res[1:-1]
 219
 220     return unescapeHTML(res)
 221
 222
 223 def clean_html(html):
 224     """Clean an HTML snippet into a readable string"""
 225
 226     if html is None:  # Convenience for sanitizing descriptions etc.
 227         return html
 228
 229     # Newline vs <br />
 230     html = html.replace('\n', ' ')
 231     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 232     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 233     # Strip html tags
 234     html = re.sub('<.*?>', '', html)
 235     # Replace html entities
 236     html = unescapeHTML(html)
 237     return html.strip()
 238
 239
 240 def sanitize_open(filename, open_mode):
 241     """Try to open the given filename, and slightly tweak it if this fails.
 242
 243     Attempts to open the given filename. If this fails, it tries to change
 244     the filename slightly, step by step, until it's either able to open it
 245     or it fails and raises a final exception, like the standard open()
 246     function.
 247
 248     It returns the tuple (stream, definitive_file_name).
 249     """
 250     try:
 251         if filename == '-':
 252             if sys.platform == 'win32':
 253                 import msvcrt
 254                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 255             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 256         stream = open(encodeFilename(filename), open_mode)
 257         return (stream, filename)
 258     except (IOError, OSError) as err:
 259         if err.errno in (errno.EACCES,):
 260             raise
 261
 262         # In case of error, try to remove win32 forbidden chars
 263         alt_filename = sanitize_path(filename)
 264         if alt_filename == filename:
 265             raise
 266         else:
 267             # An exception here should be caught in the caller
 268             stream = open(encodeFilename(alt_filename), open_mode)
 269             return (stream, alt_filename)
 270
 271
 272 def timeconvert(timestr):
 273     """Convert RFC 2822 defined time string into system timestamp"""
 274     timestamp = None
 275     timetuple = email.utils.parsedate_tz(timestr)
 276     if timetuple is not None:
 277         timestamp = email.utils.mktime_tz(timetuple)
 278     return timestamp
 279
 280
 281 def sanitize_filename(s, restricted=False, is_id=False):
 282     """Sanitizes a string so it could be used as part of a filename.
 283     If restricted is set, use a stricter subset of allowed characters.
 284     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 285     """
 286     def replace_insane(char):
 287         if char == '?' or ord(char) < 32 or ord(char) == 127:
 288             return ''
 289         elif char == '"':
 290             return '' if restricted else '\''
 291         elif char == ':':
 292             return '_-' if restricted else ' -'
 293         elif char in '\\/|*<>':
 294             return '_'
 295         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 296             return '_'
 297         if restricted and ord(char) > 127:
 298             return '_'
 299         return char
 300
 301     # Handle timestamps
 302     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 303     result = ''.join(map(replace_insane, s))
 304     if not is_id:
 305         while '__' in result:
 306             result = result.replace('__', '_')
 307         result = result.strip('_')
 308         # Common case of "Foreign band name - English song title"
 309         if restricted and result.startswith('-_'):
 310             result = result[2:]
 311         if result.startswith('-'):
 312             result = '_' + result[len('-'):]
 313         result = result.lstrip('.')
 314         if not result:
 315             result = '_'
 316     return result
 317
 318
 319 def sanitize_path(s):
 320     """Sanitizes and normalizes path on Windows"""
 321     if sys.platform != 'win32':
 322         return s
 323     drive_or_unc, _ = os.path.splitdrive(s)
 324     if sys.version_info < (2, 7) and not drive_or_unc:
 325         drive_or_unc, _ = os.path.splitunc(s)
 326     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 327     if drive_or_unc:
 328         norm_path.pop(0)
 329     sanitized_path = [
 330         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
 331         for path_part in norm_path]
 332     if drive_or_unc:
 333         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 334     return os.path.join(*sanitized_path)
 335
 336
 337 def orderedSet(iterable):
 338     """ Remove all duplicates from the input iterable """
 339     res = []
 340     for el in iterable:
 341         if el not in res:
 342             res.append(el)
 343     return res
 344
 345
 346 def _htmlentity_transform(entity):
 347     """Transforms an HTML entity to a character."""
 348     # Known non-numeric HTML entity
 349     if entity in compat_html_entities.name2codepoint:
 350         return compat_chr(compat_html_entities.name2codepoint[entity])
 351
 352     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 353     if mobj is not None:
 354         numstr = mobj.group(1)
 355         if numstr.startswith('x'):
 356             base = 16
 357             numstr = '0%s' % numstr
 358         else:
 359             base = 10
 360         return compat_chr(int(numstr, base))
 361
 362     # Unknown entity in name, return its literal representation
 363     return ('&%s;' % entity)
 364
 365
 366 def unescapeHTML(s):
 367     if s is None:
 368         return None
 369     assert type(s) == compat_str
 370
 371     return re.sub(
 372         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 373
 374
 375 def get_subprocess_encoding():
 376     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 377         # For subprocess calls, encode with locale encoding
 378         # Refer to http://stackoverflow.com/a/9951851/35070
 379         encoding = preferredencoding()
 380     else:
 381         encoding = sys.getfilesystemencoding()
 382     if encoding is None:
 383         encoding = 'utf-8'
 384     return encoding
 385
 386
 387 def encodeFilename(s, for_subprocess=False):
 388     """
 389     @param s The name of the file
 390     """
 391
 392     assert type(s) == compat_str
 393
 394     # Python 3 has a Unicode API
 395     if sys.version_info >= (3, 0):
 396         return s
 397
 398     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 399     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 400     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 401     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 402         return s
 403
 404     return s.encode(get_subprocess_encoding(), 'ignore')
 405
 406
 407 def decodeFilename(b, for_subprocess=False):
 408
 409     if sys.version_info >= (3, 0):
 410         return b
 411
 412     if not isinstance(b, bytes):
 413         return b
 414
 415     return b.decode(get_subprocess_encoding(), 'ignore')
 416
 417
 418 def encodeArgument(s):
 419     if not isinstance(s, compat_str):
 420         # Legacy code that uses byte strings
 421         # Uncomment the following line after fixing all post processors
 422         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 423         s = s.decode('ascii')
 424     return encodeFilename(s, True)
 425
 426
 427 def decodeArgument(b):
 428     return decodeFilename(b, True)
 429
 430
 431 def decodeOption(optval):
 432     if optval is None:
 433         return optval
 434     if isinstance(optval, bytes):
 435         optval = optval.decode(preferredencoding())
 436
 437     assert isinstance(optval, compat_str)
 438     return optval
 439
 440
 441 def formatSeconds(secs):
 442     if secs > 3600:
 443         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 444     elif secs > 60:
 445         return '%d:%02d' % (secs // 60, secs % 60)
 446     else:
 447         return '%d' % secs
 448
 449
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler suited to the running Python version.

    params is a mapping of options; a true 'nocheckcertificate' entry turns
    certificate verification off. Additional keyword arguments are passed on
    to YoutubeDLHTTPSHandler.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname is switched off before verify_mode is relaxed
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    # Reached when create_default_context is missing or the handler rejected
    # the context argument above
    if sys.version_info < (3, 2):
        # No SSL context is passed to the handler on these versions
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 473
 474
 475 def bug_reports_message():
 476     if ytdl_is_updateable():
 477         update_cmd = 'type  youtube-dl -U  to update'
 478     else:
 479         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 480     msg = '; please report this issue on https://yt-dl.org/bug .'
 481     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 482     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 483     return msg
 484
 485
 486 class ExtractorError(Exception):
 487     """Error during info extraction."""
 488
 489     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 490         """ tb, if given, is the original traceback (so that it can be printed out).
 491         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 492         """
 493
 494         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 495             expected = True
 496         if video_id is not None:
 497             msg = video_id + ': ' + msg
 498         if cause:
 499             msg += ' (caused by %r)' % cause
 500         if not expected:
 501             msg += bug_reports_message()
 502         super(ExtractorError, self).__init__(msg)
 503
 504         self.traceback = tb
 505         self.exc_info = sys.exc_info()  # preserve original exception
 506         self.cause = cause
 507         self.video_id = video_id
 508
 509     def format_traceback(self):
 510         if self.traceback is None:
 511             return None
 512         return ''.join(traceback.format_tb(self.traceback))
 513
 514
 515 class UnsupportedError(ExtractorError):
 516     def __init__(self, url):
 517         super(UnsupportedError, self).__init__(
 518             'Unsupported URL: %s' % url, expected=True)
 519         self.url = url
 520
 521
 522 class RegexNotFoundError(ExtractorError):
 523     """Error when a regex didn't match"""
 524     pass
 525
 526
 527 class DownloadError(Exception):
 528     """Download Error exception.
 529
 530     This exception may be thrown by FileDownloader objects if they are not
 531     configured to continue on errors. They will contain the appropriate
 532     error message.
 533     """
 534
 535     def __init__(self, msg, exc_info=None):
 536         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 537         super(DownloadError, self).__init__(msg)
 538         self.exc_info = exc_info
 539
 540
 541 class SameFileError(Exception):
 542     """Same File exception.
 543
 544     This exception will be thrown by FileDownloader objects if they detect
 545     multiple files would have to be downloaded to the same file on disk.
 546     """
 547     pass
 548
 549
 550 class PostProcessingError(Exception):
 551     """Post Processing exception.
 552
 553     This exception may be raised by PostProcessor's .run() method to
 554     indicate an error in the postprocessing task.
 555     """
 556
 557     def __init__(self, msg):
 558         self.msg = msg
 559
 560
 561 class MaxDownloadsReached(Exception):
 562     """ --max-downloads limit has been reached. """
 563     pass
 564
 565
 566 class UnavailableVideoError(Exception):
 567     """Unavailable Format exception.
 568
 569     This exception will be thrown when a video is requested
 570     in a format that is not available for that video.
 571     """
 572     pass
 573
 574
 575 class ContentTooShortError(Exception):
 576     """Content Too Short exception.
 577
 578     This exception may be raised by FileDownloader objects when a file they
 579     download is too small for what the server announced first, indicating
 580     the connection was probably interrupted.
 581     """
 582
 583     def __init__(self, downloaded, expected):
 584         # Both in bytes
 585         self.downloaded = downloaded
 586         self.expected = expected
 587
 588
 589 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 590     hc = http_class(*args, **kwargs)
 591     source_address = ydl_handler._params.get('source_address')
 592     if source_address is not None:
 593         sa = (source_address, 0)
 594         if hasattr(hc, 'source_address'):  # Python 2.7+
 595             hc.source_address = sa
 596         else:  # Python 2.6
 597             def _hc_connect(self, *args, **kwargs):
 598                 sock = compat_socket_create_connection(
 599                     (self.host, self.port), self.timeout, sa)
 600                 if is_https:
 601                     self.sock = ssl.wrap_socket(
 602                         sock, self.key_file, self.cert_file,
 603                         ssl_version=ssl.PROTOCOL_TLSv1)
 604                 else:
 605                     self.sock = sock
 606             hc.connect = functools.partial(_hc_connect, hc)
 607
 608     return hc
 609
 610
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Keep params on the handler; other arguments go to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req over a connection built by _create_http_connection."""
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, with or without the zlib wrapper."""
        try:
            # Negative wbits: treat the data as a raw deflate stream
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Otherwise retry with zlib's default settings
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Create an addinfourl response carrying the status code.

        The code is passed to the constructor when addinfourl has a getcode
        attribute; otherwise it is set as an attribute afterwards.
        """
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to req and honour the no-compression marker."""
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            # The marker header is internal: drop it together with
            # Accept-encoding so that no compression is requested
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate encoded body decompressed.

        The whole body is read into memory and wrapped in a new response
        object that reuses the headers, URL, code and msg of the original.
        Responses with any other Content-encoding are returned unchanged.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off; the first
                # length that decompresses cleanly wins
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the original failure
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same request and response processing
    https_request = http_request
    https_response = http_response
 702
 703
 704 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 705     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 706         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 707         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 708         self._params = params
 709
 710     def https_open(self, req):
 711         kwargs = {}
 712         if hasattr(self, '_context'):  # python > 2.6
 713             kwargs['context'] = self._context
 714         if hasattr(self, '_check_hostname'):  # python 3.x
 715             kwargs['check_hostname'] = self._check_hostname
 716         return self.do_open(functools.partial(
 717             _create_http_connection, self, self._https_conn_class, True),
 718             req, **kwargs)
 719
 720
 721 def parse_iso8601(date_str, delimiter='T', timezone=None):
 722     """ Return a UNIX timestamp from the given date """
 723
 724     if date_str is None:
 725         return None
 726
 727     if timezone is None:
 728         m = re.search(
 729             r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 730             date_str)
 731         if not m:
 732             timezone = datetime.timedelta()
 733         else:
 734             date_str = date_str[:-len(m.group(0))]
 735             if not m.group('sign'):
 736                 timezone = datetime.timedelta()
 737             else:
 738                 sign = 1 if m.group('sign') == '+' else -1
 739                 timezone = datetime.timedelta(
 740                     hours=sign * int(m.group('hours')),
 741                     minutes=sign * int(m.group('minutes')))
 742     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 743     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 744     return calendar.timegm(dt.timetuple())
 745
 746
 747 def unified_strdate(date_str, day_first=True):
 748     """Return a string with the date in the format YYYYMMDD"""
 749
 750     if date_str is None:
 751         return None
 752     upload_date = None
 753     # Replace commas
 754     date_str = date_str.replace(',', ' ')
 755     # %z (UTC offset) is only supported in python>=3.2
 756     if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
 757         date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 758     # Remove AM/PM + timezone
 759     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
 760
 761     format_expressions = [
 762         '%d %B %Y',
 763         '%d %b %Y',
 764         '%B %d %Y',
 765         '%b %d %Y',
 766         '%b %dst %Y %I:%M%p',
 767         '%b %dnd %Y %I:%M%p',
 768         '%b %dth %Y %I:%M%p',
 769         '%Y %m %d',
 770         '%Y-%m-%d',
 771         '%Y/%m/%d',
 772         '%Y/%m/%d %H:%M:%S',
 773         '%Y-%m-%d %H:%M:%S',
 774         '%Y-%m-%d %H:%M:%S.%f',
 775         '%d.%m.%Y %H:%M',
 776         '%d.%m.%Y %H.%M',
 777         '%Y-%m-%dT%H:%M:%SZ',
 778         '%Y-%m-%dT%H:%M:%S.%fZ',
 779         '%Y-%m-%dT%H:%M:%S.%f0Z',
 780         '%Y-%m-%dT%H:%M:%S',
 781         '%Y-%m-%dT%H:%M:%S.%f',
 782         '%Y-%m-%dT%H:%M',
 783     ]
 784     if day_first:
 785         format_expressions.extend([
 786             '%d-%m-%Y',
 787             '%d.%m.%Y',
 788             '%d/%m/%Y',
 789             '%d/%m/%y',
 790             '%d/%m/%Y %H:%M:%S',
 791         ])
 792     else:
 793         format_expressions.extend([
 794             '%m-%d-%Y',
 795             '%m.%d.%Y',
 796             '%m/%d/%Y',
 797             '%m/%d/%y',
 798             '%m/%d/%Y %H:%M:%S',
 799         ])
 800     for expression in format_expressions:
 801         try:
 802             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 803         except ValueError:
 804             pass
 805     if upload_date is None:
 806         timetuple = email.utils.parsedate_tz(date_str)
 807         if timetuple:
 808             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 809     return upload_date
 810
 811
 812 def determine_ext(url, default_ext='unknown_video'):
 813     if url is None:
 814         return default_ext
 815     guess = url.partition('?')[0].rpartition('.')[2]
 816     if re.match(r'^[A-Za-z0-9]+$', guess):
 817         return guess
 818     else:
 819         return default_ext
 820
 821
 822 def subtitles_filename(filename, sub_lang, sub_format):
 823     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 824
 825
 826 def date_from_str(date_str):
 827     """
 828     Return a datetime object from a string in the format YYYYMMDD or
 829     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 830     today = datetime.date.today()
 831     if date_str in ('now', 'today'):
 832         return today
 833     if date_str == 'yesterday':
 834         return today - datetime.timedelta(days=1)
 835     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 836     if match is not None:
 837         sign = match.group('sign')
 838         time = int(match.group('time'))
 839         if sign == '-':
 840             time = -time
 841         unit = match.group('unit')
 842         # A bad aproximation?
 843         if unit == 'month':
 844             unit = 'day'
 845             time *= 30
 846         elif unit == 'year':
 847             unit = 'day'
 848             time *= 365
 849         unit += 's'
 850         delta = datetime.timedelta(**{unit: time})
 851         return today + delta
 852     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 853
 854
 855 def hyphenate_date(date_str):
 856     """
 857     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 858     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 859     if match is not None:
 860         return '-'.join(match.groups())
 861     else:
 862         return date_str
 863
 864
 865 class DateRange(object):
 866     """Represents a time interval between two dates"""
 867
 868     def __init__(self, start=None, end=None):
 869         """start and end must be strings in the format accepted by date"""
 870         if start is not None:
 871             self.start = date_from_str(start)
 872         else:
 873             self.start = datetime.datetime.min.date()
 874         if end is not None:
 875             self.end = date_from_str(end)
 876         else:
 877             self.end = datetime.datetime.max.date()
 878         if self.start > self.end:
 879             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 880
 881     @classmethod
 882     def day(cls, day):
 883         """Returns a range that only contains the given day"""
 884         return cls(day, day)
 885
 886     def __contains__(self, date):
 887         """Check if the date is in the range"""
 888         if not isinstance(date, datetime.date):
 889             date = date_from_str(date)
 890         return self.start <= date <= self.end
 891
 892     def __str__(self):
 893         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 894
 895
 896 def platform_name():
 897     """ Returns the platform name as a compat_str """
 898     res = platform.platform()
 899     if isinstance(res, bytes):
 900         res = res.decode(preferredencoding())
 901
 902     assert isinstance(res, compat_str)
 903     return res
 904
 905
def _windows_write_string(s, out):
    """ Write s to a Windows console through WriteConsoleW.

    Returns True if the string was written using special methods,
    False if it has yet to be written out (out has no usable fileno, is not
    stdout/stderr, or its handle is not a console).
    Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> GetStdHandle id (-11 is STD_OUTPUT_HANDLE,
    # -12 is STD_ERROR_HANDLE in the Win32 API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the count reported back by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for invalid handles, non-character file types, or handles
        # for which GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters and stop before any non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written on its own with a length of 2
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
 979
 980
 981 def write_string(s, out=None, encoding=None):
 982     if out is None:
 983         out = sys.stderr
 984     assert type(s) == compat_str
 985
 986     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 987         if _windows_write_string(s, out):
 988             return
 989
 990     if ('b' in getattr(out, 'mode', '') or
 991             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 992         byt = s.encode(encoding or preferredencoding(), 'ignore')
 993         out.write(byt)
 994     elif hasattr(out, 'buffer'):
 995         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 996         byt = s.encode(enc, 'ignore')
 997         out.buffer.write(byt)
 998     else:
 999         out.write(s)
1000     out.flush()
1001
1002
def bytes_to_intlist(bs):
    """ Convert a byte string into a list of integer byte values """
    if not bs:
        return []
    first = bs[0]
    # Indexing bytes yields ints on Python 3 and 1-character strings on Python 2
    return list(bs) if isinstance(first, int) else [ord(c) for c in bs]
1010
1011
def intlist_to_bytes(xs):
    """ Pack a sequence of integer byte values into a byte string """
    if xs:
        return struct_pack('%dB' % len(xs), *xs)
    return b''
1016
1017
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx on win32, fcntl.flock elsewhere.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes layout passed to LockFileEx/UnlockFileEx as the Overlapped argument
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high words of the byte count to lock, starting at offset 0
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the file object f; raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so _unlock_file can pass the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK in the Win32 API; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError if UnlockFileEx fails."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an flock on f: LOCK_EX when exclusive, LOCK_SH otherwise."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
1081
1082
class locked_file(object):
    """ File wrapper that holds a lock while used as a context manager.

    The file is opened in the constructor (mode must be 'r', 'a' or 'w').
    Entering the context takes a shared lock for 'r' and an exclusive lock
    otherwise; leaving it unlocks and closes the file. """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is not an IOError
            # on Python 2; close the file in that case too
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1112
1113
def get_filesystem_encoding():
    """ Return sys.getfilesystemencoding(), or 'utf-8' when that is None """
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1117
1118
def shell_quote(args):
    """ Return args as one space-separated string with every argument shell-quoted.

    Byte strings are decoded with the filesystem encoding first. """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # shlex_quote (imported from .compat) replaces the undocumented
        # pipes.quote; the pipes module no longer exists in Python 3.13
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
1128
1129
def smuggle_url(url, data):
    """ Append data, JSON-encoded, to url as a fragment for internal use. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
1136
1137
def unsmuggle_url(smug_url, default=None):
    """ Split a smuggled URL into (url, data); returns (smug_url, default) if nothing was smuggled. """
    if '#__youtubedl_smuggle' in smug_url:
        plain_url, _, fragment = smug_url.rpartition('#')
        encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return plain_url, json.loads(encoded)
    return smug_url, default
1145
1146
def format_bytes(bytes):
    """ Format a byte count with a binary suffix, e.g. 1536 -> '1.50KiB'.

    None yields 'N/A'; a str is converted with float() first.
    Values below 1 are shown in B and values beyond the table in YiB. """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the table: a negative exponent would pick a suffix from
        # the end of the list, a too large one would raise IndexError
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1159
1160
def parse_filesize(s):
    """ Parse a size such as '1.5 MiB' into a number of bytes.

    Returns None for None or for text that does not start with a number
    followed by a known unit. A comma is accepted as decimal separator. """
    if s is None:
        return None

    # The lower-case forms are incorrect and unofficial, but supported:
    # XiB and xB are powers of 1024, XB and Xb powers of 1000
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        _UNIT_TABLE[prefix + 'iB'] = 1024 ** power
        _UNIT_TABLE[prefix + 'B'] = 1000 ** power
        _UNIT_TABLE[prefix.lower() + 'B'] = 1024 ** power
        _UNIT_TABLE[prefix + 'b'] = 1000 ** power

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    size_re = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re
    found = re.match(size_re, s)
    if not found:
        return None

    number = float(found.group('num').replace(',', '.'))
    return int(number * _UNIT_TABLE[found.group('unit')])
1213
1214
def month_by_name(name):
    """ Return the 1-based number of the month with this English name, or None """

    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1222
1223
def month_by_abbreviation(abbrev):
    """ Return the 1-based number of the month whose English name starts
        with this three-letter abbreviation, or None """

    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    if abbrev not in abbreviations:
        return None
    return abbreviations.index(abbrev) + 1
1232
1233
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity by '&amp;' in XML

    The five predefined entities and numeric character references are left
    alone. References may have up to 6 hex or 7 decimal digits so that code
    points above U+FFFF (e.g. '&#x1F600;') are not mangled."""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,6};|#[0-9]{,7};)',
        '&amp;',
        xml_str)
1240
1241
def setproctitle(title):
    """ Set the process name through prctl of libc.so.6 (best effort).

    Silently does nothing when libc.so.6 cannot be loaded or has no prctl. """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one item larger, so it
    # ends with a NUL. A buffer of exactly len(title_bytes) would carry no
    # terminator and prctl would read past its end.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1255
1256
def remove_start(s, start):
    """ Return s without the prefix start, or s unchanged if it has no such prefix """
    return s[len(start):] if s.startswith(start) else s
1261
1262
def remove_end(s, end):
    """ Return s without the suffix end, or s unchanged if it has no such suffix """
    # An empty end must be skipped: s[:-0] is s[:0], i.e. the empty string
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1267
1268
def url_basename(url):
    """ Return the last segment of the URL path, ignoring surrounding slashes """
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1272
1273
class HEADRequest(compat_urllib_request.Request):
    """ Request whose get_method() always reports HEAD """
    def get_method(self):
        return "HEAD"
1277
1278
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Return int(v) * invscale // scale, or default when v has no integer value.

    If get_attr is given and v is not None, the value is read from that
    attribute of v first. None, '' and anything int() rejects give default. """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        number = int(v)
    except (ValueError, TypeError):
        # Unparsable input: fall back to default instead of raising
        return default
    return number * invscale // scale
1286
1287
def str_or_none(v, default=None):
    """ Return v converted with compat_str, or default when v is None """
    if v is None:
        return default
    return compat_str(v)
1290
1291
def str_to_int(int_str):
    """ Parse an integer after dropping ',', '.' and '+' characters; None gives None """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1298
1299
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Return float(v) * invscale / scale, or default when v has no float value.

    None and anything float() rejects (e.g. '' or 'abc') give default. """
    if v is None:
        return default
    try:
        number = float(v)
    except (ValueError, TypeError):
        # Unparsable input: fall back to default instead of raising
        return default
    return number * invscale / scale
1302
1303
def parse_duration(s):
    """ Parse a duration string ('1:02:03', '2h 5m', '3 min', 'PT1H3S', ...)
    into a number of seconds. Returns None for non-strings and unparsable text. """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None

    # A lone minutes or hours figure may be fractional and is converted directly
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    # Integer components and the number of seconds each unit is worth
    weights = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    total = 0
    for group_name, unit_seconds in weights:
        captured = m.group(group_name)
        if captured:
            total += int(captured) * unit_seconds
    # The fractional part is added last and turns the result into a float
    if m.group('ms'):
        total += float(m.group('ms'))
    return total
1348
1349
def prepend_extension(filename, ext, expected_real_ext=None):
    """ Insert ext before the real extension of filename.

    If expected_real_ext is given and differs from the real extension,
    ext is appended to the whole filename instead. """
    stem, dotted_ext = os.path.splitext(filename)
    if expected_real_ext and dotted_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(stem, ext, dotted_ext)
1356
1357
def replace_extension(filename, ext, expected_real_ext=None):
    """ Replace the real extension of filename by ext.

    If expected_real_ext is given and differs from the real extension,
    ext is appended to the whole filename instead. """
    stem, dotted_ext = os.path.splitext(filename)
    if expected_real_ext and dotted_ext[1:] != expected_real_ext:
        base = filename
    else:
        base = stem
    return '{0}.{1}'.format(base, ext)
1363
1364
def check_executable(exe, args=[]):
    """ Try to run exe with args (e.g. ['-version'] for a short output).
    Returns exe if it could be started, False if starting it raised OSError. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1373
1374
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Run exe with args and extract a version from its combined output.
    Returns False if the executable could not be started. """
    try:
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1388
1389
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Return group 1 of the first version_re match in output, or unrecognized.
    Without version_re, the text following 'version' is looked for. """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1399
1400
class PagedList(object):
    """ Base class for paged lists; subclasses provide getslice() """

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1405
1406
class OnDemandPagedList(PagedList):
    """ Paged list that calls pagefunc(pagenum) for each page it needs.

    pagefunc takes a zero-based page number and returns an iterable of
    entries; pagesize is the number of entries on a full page. """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries from index start up to (not including) end;
        end=None reads until a page comes back short. """
        res = []
        # Begin with the page that contains index start
        for pagenum in itertools.count(start // self._pagesize):
            # Global indices covered by this page: [firstid, nextfirstid)
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset inside this page at which the slice starts
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset inside this page at which the slice ends, if it ends here
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1448
1449
class InAdvancePagedList(PagedList):
    """ Paged list for which the number of pages is given up front.

    pagefunc takes a zero-based page number and returns an iterable of
    entries; pagecount is the number of pages; pagesize is used to map
    indices to pages. """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries from index start up to (not including) end;
        end=None reads through page pagecount - 1. """
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Entries to drop from the front of the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Entries still wanted; None means no upper bound
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page holds the rest of the requested entries
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1477
1478
def uppercase_escape(s):
    """ Decode every \\UXXXXXXXX escape (8 hex digits) found in s """
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
1485
1486
def lowercase_escape(s):
    """ Decode every \\uXXXX escape (4 hex digits) found in s """
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _expand, s)
1493
1494
def escape_rfc3986(s):
    """Percent-escape s as suggested by RFC 3986, keeping reserved characters"""
    needs_bytes = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_bytes:
        # Text is passed to quote() as UTF-8 bytes on Python 2
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1500
1501
def escape_url(url):
    """Escape the path, params, query and fragment of url as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    escaped = dict(
        (component, escape_rfc3986(getattr(parts, component)))
        for component in ('path', 'params', 'query', 'fragment'))
    return parts._replace(**escaped).geturl()
1511
# struct_pack / struct_unpack: struct.pack / struct.unpack that accept a
# text format string on every supported Python version
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def _bytes_spec(spec):
        return spec.encode('ascii') if isinstance(spec, compat_str) else spec

    def struct_pack(spec, *args):
        return struct.pack(_bytes_spec(spec), *args)

    def struct_unpack(spec, *args):
        return struct.unpack(_bytes_spec(spec), *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
1528
1529
def read_batch_urls(batch_fd):
    """ Read the URLs of a batch file, one per line, and close batch_fd.

    Byte lines are decoded as UTF-8. A leading byte order mark and
    surrounding whitespace are removed. Empty lines and lines starting
    with '#', ';' or ']' are skipped. """
    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        # '\ufeff' is a correctly decoded BOM; '\xef\xbb\xbf' is what the
        # BOM bytes become when the file was decoded as Latin-1
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1544
1545
def urlencode_postdata(*args, **kargs):
    """ URL-encode the arguments like urlencode() and return the result as ASCII bytes """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1548
1549
# etree_iter(n): Element.iter where available, all descendants otherwise
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1554
1555
def parse_xml(s):
    """ Parse the XML text s, ignoring any doctype, and return the root element """
    class _DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=_DoctypeIgnoringBuilder())
    extra = {}
    if sys.version_info >= (2, 7):
        extra['parser'] = parser
    root = xml.etree.ElementTree.XML(s.encode('utf-8'), **extra)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(root):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return root
1571
1572
# Age limit looked up by parse_age_limit for each rating label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """ Parse an age such as '18' or '18+', or a US_RATINGS label, into an int.
    Returns None for None and for anything unrecognized. """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
1587
1588
def strip_jsonp(code):
    """ Remove a 'callback( ... );' wrapper (and trailing // comments) around code """
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_wrapper, r'\1', code)
1592
1593
def js_to_json(code):
    """ Convert a JavaScript object literal to JSON text: single-quoted
    strings and bare identifiers become double-quoted strings, and commas
    directly before a closing bracket are dropped. """
    def fix_kv(m):
        token = m.group(0)
        # JSON literals and double-quoted strings are already valid
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            # Drop the quotes, unescape \' and escape " for the new quoting
            token = re.sub(r"\\\\|\\'|\"", lambda esc: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[esc.group(0)], token[1:-1])
        return '"%s"' % token

    converted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*[\]}])', lambda m: m.group(1), converted)
1617
1618
def qualities(quality_ids):
    """ Return a function mapping a quality id to its index in quality_ids (-1 if absent) """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1627
1628
# %-style template filled from the keys 'title', 'id' and 'ext';
# presumably the default output filename pattern - confirm against callers
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1630
1631
def limit_length(s, length):
    """ Shorten s to length characters, ending in '...' if it was cut; None stays None """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
1640
1641
def version_tuple(v):
    """ Split a version string on '.' and '-' into a tuple of ints """
    return tuple(map(int, re.split(r'[-.]', v)))
1644
1645
def is_outdated_version(version, limit, assume_new=True):
    """ Tell whether version is older than limit.
    A missing or unparsable version counts as outdated only if assume_new is false. """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
1653
1654
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U: true when this module
    was loaded by a zipimporter or when sys.frozen is set """
    from zipimport import zipimporter

    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
1660
1661
def args_to_str(args):
    """ Get a short string representation for a subprocess command """
    return ' '.join(map(shlex_quote, args))
1665
1666
def mimetype2ext(mt):
    """ Map a MIME type to a file extension.

    Returns None for None. Parameters after ';' are ignored. The subtype is
    returned as is unless it has an entry in the translation table. """
    if mt is None:
        return None

    # Drop parameters such as '; charset=utf-8' before taking the subtype
    _, _, res = mt.split(';')[0].strip().rpartition('/')

    return {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
        'ttml+xml': 'ttml',
    }.get(res, res)
1675
1676
def urlhandle_detect_ext(url_handle):
    """ Guess a file extension from the headers of an open URL handle.

    The filename of a Content-Disposition attachment is preferred; otherwise
    the Content-Type is mapped with mimetype2ext. Returns None if neither
    header gives an answer. """
    try:
        # .get returns None for a missing header; indexing the headers
        # object raises KeyError for a missing header on Python 2
        getheader = url_handle.headers.get
    except AttributeError:
        # Handle without a headers attribute
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    content_type = getheader('Content-Type')
    if content_type is None:
        return None
    return mimetype2ext(content_type)
1693
1694
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
1703
1704
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.
    Returns a match object if the decoded text starts with optional
    whitespace followed by '<', None otherwise. """

    # The UTF-32-LE mark must be tried before UTF-16-LE, which is a prefix of it
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    text = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            text = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if text is None:
        # No byte order mark: treat the data as UTF-8
        text = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', text)
1723
1724
def determine_protocol(info_dict):
    """ Return the protocol of a format: its explicit 'protocol' entry if
    set, otherwise a guess from the start of the URL, its extension
    (m3u8, f4m) or its scheme. """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
1745
1746
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*rows)]
    # Every column except the last is left-aligned and padded to width + 1
    padded = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
1753
1754
def _match_one(filter_part, dct):
    """ Evaluate a single filter expression against the dictionary dct.

    Accepted forms are '<key> <op> <value>' (op one of < <= > >= = !=, with
    an optional '?' after the operator) and '<key>' / '!<key>'.
    Raises ValueError for an unparsable part, for a non-equality operator
    used with a string value, and for an invalid numeric value. """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    # key, operator, optional '?' and either a number (with an optional
    # size suffix) or a plain alphanumeric word
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Not a plain integer: try it as a file size, first as
                # written and then with a 'B' appended
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # Missing key: the result is the matched '?' text (truthy) or None
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    # A bare key tests for presence, '!key' for absence
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
1812
1813
def match_str(filter_str, dct):
    """Check a dictionary against a filter string of '&'-separated parts.

    Each part is evaluated with _match_one(); evaluation stops at the first
    part that does not pass.  Returns True when every part passes, False
    otherwise.
    """
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
1819
1820
def match_filter_func(filter_str):
    """Build a predicate that checks info dicts against filter_str.

    The returned callable takes an info dict and returns None when it passes
    the filter, or a human-readable skip message when it does not.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            # Prefer the title for the message, then the id, then a
            # generic placeholder.
            fallback_name = info_dict.get('id', 'video')
            video_title = info_dict.get('title', fallback_name)
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
1829
1830
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP time expression to a number of seconds.

    Recognised forms are a plain offset in seconds with an optional trailing
    's' (e.g. '12.5s') and a clock value 'H:MM:SS' with optional fractional
    seconds.  An empty or missing expression yields 0.0; an expression in
    any other form yields None.
    """
    if not time_expr:
        return 0.0

    offset_match = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset_match is not None:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if clock_match is None:
        return None

    hours, minutes, seconds = clock_match.groups()
    return 3600 * int(hours) + 60 * int(minutes) + float(seconds)
1842
1843
def srt_subtitles_timecode(seconds):
    """Format a time offset in seconds as an SRT timecode 'HH:MM:SS,mmm'.

    The value is rounded to the nearest millisecond once, up front, and then
    split with integer arithmetic.  Truncating each field separately from
    float arithmetic loses a millisecond for values such as 5.84, whose
    fractional part is not exactly representable.
    """
    total_msec = int(round(seconds * 1000))
    total_secs, msec = divmod(total_msec, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, mins = divmod(total_mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, msec)
1846
1847
def dfxp2srt(dfxp_data):
    """Convert a DFXP/TTML subtitle document (text) to SRT text.

    Paragraph (<p>) elements are looked up in the 'ttml' namespace, then the
    'ttaf1' namespace, then without a namespace.  Each paragraph becomes one
    numbered SRT cue; its end time comes from the 'end' attribute or, when
    that is absent or evaluates to zero/None, from 'begin' plus 'dur'.

    Raises ValueError when the document contains no paragraphs, and KeyError
    when a paragraph lacks 'begin', or lacks 'dur' while 'end' is unusable.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })
    # Tag names are loop-invariant, so resolve them once.
    br_tags = (_x('ttml:br'), _x('ttaf1:br'), 'br')
    span_tags = (_x('ttml:span'), _x('ttaf1:span'), 'span')
    str_or_empty = functools.partial(str_or_none, default='')

    def parse_node(node):
        """Flatten node's text content, turning <br> into newlines."""
        out = str_or_empty(node.text)

        for child in node:
            if child.tag in br_tags:
                out += '\n' + str_or_empty(child.tail)
            elif child.tag in span_tags:
                # ElementTree keeps the text that follows </span> in
                # child.tail; without it that text is lost.
                out += parse_node(child) + str_or_empty(child.tail)
            else:
                # tostring() returns bytes on Python 3 (ASCII with character
                # references by default); decode it so the markup is
                # appended as text.  The serialisation includes the tail.
                out += xml.etree.ElementTree.tostring(child).decode('ascii')

        return out

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        if not end_time:
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
1888
1889
class ISO639Utils(object):
    """Conversions between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Map an ISO 639-1 code to its ISO 639-2/T code (None if unknown).

        Only the first two characters of code are looked at, so a longer
        tag is reduced to its two-character prefix before the lookup.
        """
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Map an ISO 639-2/T code to its ISO 639-1 code (None if unknown)."""
        return next(
            (short_name for short_name, long_name in cls._lang_map.items()
             if long_name == code),
            None)
2090
2091
class ISO3166Utils(object):
    """Lookup of full country names by two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Return the full country name for a two-letter country code.

        The lookup is case-insensitive (code is upper-cased first); None is
        returned for a code that is not in the table.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
2350
2351
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler whose proxy can be overridden per request.

    A request carrying a 'Ytdl-request-proxy' header is sent through the
    proxy named there; the header is removed before the request goes on.
    """

    def __init__(self, proxies=None):
        def _default_opener(scheme):
            # Same call shape as the original per-scheme lambda: only the
            # request is passed positionally, the rest are bound defaults.
            def _open(r, proxy='__noproxy__', type=scheme, meth=self.proxy_open):
                return meth(r, proxy, type)
            return _open

        # Set default handlers, so that http_open/https_open exist on the
        # instance before the base class constructor runs.
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme, _default_opener(scheme))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Open req via proxy, honouring a per-request override header.

        Returns None without delegating when the effective proxy is the
        '__noproxy__' placeholder; otherwise defers to the base class.
        """
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy != '__noproxy__':
            return compat_urllib_request.ProxyHandler.proxy_open(
                self, req, proxy, type)
        return None  # No Proxy