2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
32 import xml.etree.ElementTree
42 compat_socket_create_connection,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
53 # This is not clearly defined otherwise
54 compiled_regex_type = type(re.compile(''))
57 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
58 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
59 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
60 'Accept-Encoding': 'gzip, deflate',
61 'Accept-Language': 'en-us,en;q=0.5',
67 ENGLISH_MONTH_NAMES = [
68 'January', 'February', 'March', 'April', 'May', 'June',
69 'July', 'August', 'September', 'October', 'November', 'December']
72 def preferredencoding():
73 """Get preferred encoding.
75 Returns the best encoding scheme for the system, based on
76 locale.getpreferredencoding() and some further tweaks.
79 pref = locale.getpreferredencoding()
87 def write_json_file(obj, fn):
88 """ Encode obj as JSON and write it to fn, atomically if possible """
90 fn = encodeFilename(fn)
91 if sys.version_info < (3, 0) and sys.platform != 'win32':
92 encoding = get_filesystem_encoding()
93 # os.path.basename returns a bytes object, but NamedTemporaryFile
94 # will fail if the filename contains non ascii characters unless we
95 # use a unicode object
96 path_basename = lambda f: os.path.basename(fn).decode(encoding)
97 # the same for os.path.dirname
98 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
100 path_basename = os.path.basename
101 path_dirname = os.path.dirname
105 'prefix': path_basename(fn) + '.',
106 'dir': path_dirname(fn),
110 # In Python 2.x, json.dump expects a bytestream.
111 # In Python 3.x, it writes to a character stream
112 if sys.version_info < (3, 0):
120 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
125 if sys.platform == 'win32':
126 # Need to remove existing file on Windows, else os.rename raises
127 # WindowsError or FileExistsError.
132 os.rename(tf.name, fn)
141 if sys.version_info >= (2, 7):
142 def find_xpath_attr(node, xpath, key, val=None):
143 """ Find the xpath xpath[@key=val] """
144 assert re.match(r'^[a-zA-Z-]+$', key)
146 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
147 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
148 return node.find(expr)
150 def find_xpath_attr(node, xpath, key, val=None):
151 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
152 # .//node does not match if a node is a direct child of . !
153 if isinstance(xpath, compat_str):
154 xpath = xpath.encode('ascii')
156 for f in node.findall(xpath):
157 if key not in f.attrib:
159 if val is None or f.attrib.get(key) == val:
163 # On python2.6 the xml.etree.ElementTree.Element methods don't support
164 # the namespace parameter
167 def xpath_with_ns(path, ns_map):
168 components = [c.split(':') for c in path.split('/')]
172 replaced.append(c[0])
175 replaced.append('{%s}%s' % (ns_map[ns], tag))
176 return '/'.join(replaced)
179 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
180 if sys.version_info < (2, 7): # Crazy 2.6
181 xpath = xpath.encode('ascii')
184 if n is None or n.text is None:
185 if default is not NO_DEFAULT:
188 name = xpath if name is None else name
189 raise ExtractorError('Could not find XML element %s' % name)
195 def get_element_by_id(id, html):
196 """Return the content of the tag with the specified ID in the passed HTML document"""
# Thin wrapper: delegates to the attribute-based lookup with attribute="id".
# `id` shadows the builtin, but it is the public parameter name and must stay.
197 return get_element_by_attribute("id", id, html)
200 def get_element_by_attribute(attribute, value, html):
201 """Return the content of the tag with the specified attribute in the passed HTML document"""
203 m = re.search(r'''(?xs)
205 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
207 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
211 ''' % (re.escape(attribute), re.escape(value)), html)
215 res = m.group('content')
217 if res.startswith('"') or res.startswith("'"):
220 return unescapeHTML(res)
223 def clean_html(html):
224 """Clean an HTML snippet into a readable string"""
# NOTE(review): the listing elides several lines of this function (the body of
# the None branch and the trailing strip/return are not visible); the comments
# below annotate only the visible statements.
226 if html is None: # Convenience for sanitizing descriptions etc.
# Collapse literal newlines so that only the markers handled below produce
# line breaks in the output.
230 html = html.replace('\n', ' ')
# <br> (any spacing / self-closing form) and </p><p> boundaries become real newlines.
231 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
232 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
# Strip every remaining tag; non-greedy so adjacent tags are removed one by one.
234 html = re.sub('<.*?>', '', html)
235 # Replace html entities
236 html = unescapeHTML(html)
240 def sanitize_open(filename, open_mode):
241 """Try to open the given filename, and slightly tweak it if this fails.
243 Attempts to open the given filename. If this fails, it tries to change
244 the filename slightly, step by step, until it's either able to open it
245 or it fails and raises a final exception, like the standard open()
248 It returns the tuple (stream, definitive_file_name).
252 if sys.platform == 'win32':
254 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
255 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
256 stream = open(encodeFilename(filename), open_mode)
257 return (stream, filename)
258 except (IOError, OSError) as err:
259 if err.errno in (errno.EACCES,):
262 # In case of error, try to remove win32 forbidden chars
263 alt_filename = sanitize_path(filename)
264 if alt_filename == filename:
267 # An exception here should be caught in the caller
268 stream = open(encodeFilename(alt_filename), open_mode)
269 return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns the POSIX timestamp as an int/float, or None when the string
    cannot be parsed. (The visible fragment lacked both the None
    initialization and the return statement, so it always returned None.)
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        # mktime_tz honours the parsed UTC offset, so the result is
        # independent of the local timezone.
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
281 def sanitize_filename(s, restricted=False, is_id=False):
282 """Sanitizes a string so it could be used as part of a filename.
283 If restricted is set, use a stricter subset of allowed characters.
284 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
286 def replace_insane(char):
287 if char == '?' or ord(char) < 32 or ord(char) == 127:
290 return '' if restricted else '\''
292 return '_-' if restricted else ' -'
293 elif char in '\\/|*<>':
295 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
297 if restricted and ord(char) > 127:
302 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
303 result = ''.join(map(replace_insane, s))
305 while '__' in result:
306 result = result.replace('__', '_')
307 result = result.strip('_')
308 # Common case of "Foreign band name - English song title"
309 if restricted and result.startswith('-_'):
311 if result.startswith('-'):
312 result = '_' + result[len('-'):]
313 result = result.lstrip('.')
319 def sanitize_path(s):
320 """Sanitizes and normalizes path on Windows"""
321 if sys.platform != 'win32':
323 drive_or_unc, _ = os.path.splitdrive(s)
324 if sys.version_info < (2, 7) and not drive_or_unc:
325 drive_or_unc, _ = os.path.splitunc(s)
326 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
330 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
331 for path_part in norm_path]
333 sanitized_path.insert(0, drive_or_unc + os.path.sep)
334 return os.path.join(*sanitized_path)
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Preserves first-seen order, unlike set(); the listing elided the body,
    # restored here. O(n^2) membership test is acceptable for the small
    # inputs (format lists, URL lists) this is used on.
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
346 def _htmlentity_transform(entity):
347 """Transforms an HTML entity to a character."""
348 # Known non-numeric HTML entity
349 if entity in compat_html_entities.name2codepoint:
350 return compat_chr(compat_html_entities.name2codepoint[entity])
352 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
354 numstr = mobj.group(1)
355 if numstr.startswith('x'):
357 numstr = '0%s' % numstr
360 return compat_chr(int(numstr, base))
362 # Unknown entity in name, return its literal representation
363 return ('&%s;' % entity)
369 assert type(s) == compat_str
372 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
375 def get_subprocess_encoding():
376 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
377 # For subprocess calls, encode with locale encoding
378 # Refer to http://stackoverflow.com/a/9951851/35070
379 encoding = preferredencoding()
381 encoding = sys.getfilesystemencoding()
387 def encodeFilename(s, for_subprocess=False):
389 @param s The name of the file
392 assert type(s) == compat_str
394 # Python 3 has a Unicode API
395 if sys.version_info >= (3, 0):
398 # Pass '' directly to use Unicode APIs on Windows 2000 and up
399 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
400 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
401 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
404 return s.encode(get_subprocess_encoding(), 'ignore')
407 def decodeFilename(b, for_subprocess=False):
409 if sys.version_info >= (3, 0):
412 if not isinstance(b, bytes):
415 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument like a filename destined for a subprocess.

    Accepts legacy byte strings from old call sites and decodes them as
    ASCII before encoding for the subprocess.
    """
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings; decode so encodeFilename
        # always receives text. (A stricter assert is deferred until all
        # post processors are fixed.)
        s = s.decode('ascii')
    return encodeFilename(s, True)
def decodeArgument(b):
    """Inverse of encodeArgument: decode a subprocess argument to text."""
    return decodeFilename(b, for_subprocess=True)
def decodeOption(optval):
    """Decode a command-line option value to text using the locale encoding.

    None passes through unchanged; anything else must end up a compat_str.
    (The None guard and the final return were elided in the listing; without
    them the function crashed on None and returned nothing.)
    """
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS, M:SS or plain seconds.

    The branch conditions and the plain-seconds fallback were elided in the
    listing (the two format expressions were unreachable as shown).
    """
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
450 def make_HTTPS_handler(params, **kwargs):
451 opts_no_check_certificate = params.get('nocheckcertificate', False)
452 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
453 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
454 if opts_no_check_certificate:
455 context.check_hostname = False
456 context.verify_mode = ssl.CERT_NONE
458 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
461 # (create_default_context present but HTTPSHandler has no context=)
464 if sys.version_info < (3, 2):
465 return YoutubeDLHTTPSHandler(params, **kwargs)
467 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
468 context.verify_mode = (ssl.CERT_NONE
469 if opts_no_check_certificate
470 else ssl.CERT_REQUIRED)
471 context.set_default_verify_paths()
472 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    """Return the standard footer asking users to file a bug report.

    Chooses the update hint based on whether this build can self-update.
    (The `else:` line and the final return were elided in the listing.)
    """
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg
486 class ExtractorError(Exception):
487 """Error during info extraction."""
# NOTE(review): the listing elides several lines of __init__ (the expected/
# network-error adjustment, the cause/traceback attribute assignments);
# comments below cover only the visible statements.
489 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
490 """ tb, if given, is the original traceback (so that it can be printed out).
491 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
# Network failures are treated as "expected": not a youtube-dl bug.
494 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
# Prefix the video id so the user can tell which download failed.
496 if video_id is not None:
497 msg = video_id + ': ' + msg
499 msg += ' (caused by %r)' % cause
# Unexpected errors get the bug-report footer appended.
501 msg += bug_reports_message()
502 super(ExtractorError, self).__init__(msg)
505 self.exc_info = sys.exc_info() # preserve original exception
507 self.video_id = video_id
509 def format_traceback(self):
# Render the stored traceback, if any, for --verbose output.
510 if self.traceback is None:
# NOTE(review): the `return ''` for the None case is elided in the listing.
512 return ''.join(traceback.format_tb(self.traceback))
515 class UnsupportedError(ExtractorError):
# Raised by the generic extractor when no extractor can handle the URL;
# marked expected=True so no bug-report footer is appended.
516 def __init__(self, url):
517 super(UnsupportedError, self).__init__(
518 'Unsupported URL: %s' % url, expected=True)
522 class RegexNotFoundError(ExtractorError):
523 """Error when a regex didn't match"""
# Behaves exactly like ExtractorError; the subclass only narrows the type
# so callers can catch regex-search failures specifically.
527 class DownloadError(Exception):
528 """Download Error exception.
530 This exception may be thrown by FileDownloader objects if they are not
531 configured to continue on errors. They will contain the appropriate
535 def __init__(self, msg, exc_info=None):
536 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
537 super(DownloadError, self).__init__(msg)
# Kept so callers can inspect/re-raise the underlying error.
538 self.exc_info = exc_info
541 class SameFileError(Exception):
542 """Same File exception.
544 This exception will be thrown by FileDownloader objects if they detect
545 multiple files would have to be downloaded to the same file on disk.
550 class PostProcessingError(Exception):
551 """Post Processing exception.
553 This exception may be raised by PostProcessor's .run() method to
554 indicate an error in the postprocessing task.
557 def __init__(self, msg):
561 class MaxDownloadsReached(Exception):
562 """ --max-downloads limit has been reached. """
566 class UnavailableVideoError(Exception):
567 """Unavailable Format exception.
569 This exception will be thrown when a video is requested
570 in a format that is not available for that video.
575 class ContentTooShortError(Exception):
576 """Content Too Short exception.
578 This exception may be raised by FileDownloader objects when a file they
579 download is too small for what the server announced first, indicating
580 the connection was probably interrupted.
583 def __init__(self, downloaded, expected):
# downloaded / expected: sizes of the partial download vs the announced
# Content-Length — presumably both in bytes (TODO confirm against callers).
585 self.downloaded = downloaded
586 self.expected = expected
589 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
590 hc = http_class(*args, **kwargs)
591 source_address = ydl_handler._params.get('source_address')
592 if source_address is not None:
593 sa = (source_address, 0)
594 if hasattr(hc, 'source_address'): # Python 2.7+
595 hc.source_address = sa
597 def _hc_connect(self, *args, **kwargs):
598 sock = compat_socket_create_connection(
599 (self.host, self.port), self.timeout, sa)
601 self.sock = ssl.wrap_socket(
602 sock, self.key_file, self.cert_file,
603 ssl_version=ssl.PROTOCOL_TLSv1)
606 hc.connect = functools.partial(_hc_connect, hc)
611 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
612 """Handler for HTTP requests and responses.
614 This class, when installed with an OpenerDirector, automatically adds
615 the standard headers to every HTTP request and handles gzipped and
616 deflated responses from web servers. If compression is to be avoided in
617 a particular request, the original request in the program code only has
618 to include the HTTP header "Youtubedl-No-Compression", which will be
619 removed before making the real request.
621 Part of this code was copied from:
623 http://techknack.net/python-urllib2-handlers/
625 Andrew Rowls, the author of that code, agreed to release it to the
629 def __init__(self, params, *args, **kwargs):
630 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
631 self._params = params
633 def http_open(self, req):
634 return self.do_open(functools.partial(
635 _create_http_connection, self, compat_http_client.HTTPConnection, False),
641 return zlib.decompress(data, -zlib.MAX_WBITS)
643 return zlib.decompress(data)
646 def addinfourl_wrapper(stream, headers, url, code):
647 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
648 return compat_urllib_request.addinfourl(stream, headers, url, code)
649 ret = compat_urllib_request.addinfourl(stream, headers, url)
653 def http_request(self, req):
654 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
655 # always respected by websites, some tend to give out URLs with non percent-encoded
656 # non-ASCII characters (see telemb.py, ard.py [#3412])
657 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
658 # To work around aforementioned issue we will replace request's original URL with
659 # percent-encoded one
660 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
661 # the code of this workaround has been moved here from YoutubeDL.urlopen()
662 url = req.get_full_url()
663 url_escaped = escape_url(url)
665 # Substitute URL if any change after escaping
666 if url != url_escaped:
667 req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
669 url_escaped, data=req.data, headers=req.headers,
670 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
671 new_req.timeout = req.timeout
674 for h, v in std_headers.items():
675 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
676 # The dict keys are capitalized because of this bug by urllib
677 if h.capitalize() not in req.headers:
679 if 'Youtubedl-no-compression' in req.headers:
680 if 'Accept-encoding' in req.headers:
681 del req.headers['Accept-encoding']
682 del req.headers['Youtubedl-no-compression']
684 if sys.version_info < (2, 7) and '#' in req.get_full_url():
685 # Python 2.6 is brain-dead when it comes to fragments
686 req._Request__original = req._Request__original.partition('#')[0]
687 req._Request__r_type = req._Request__r_type.partition('#')[0]
691 def http_response(self, req, resp):
694 if resp.headers.get('Content-encoding', '') == 'gzip':
695 content = resp.read()
696 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
698 uncompressed = io.BytesIO(gz.read())
699 except IOError as original_ioerror:
700 # There may be junk add the end of the file
701 # See http://stackoverflow.com/q/4928560/35070 for details
702 for i in range(1, 1024):
704 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
705 uncompressed = io.BytesIO(gz.read())
710 raise original_ioerror
711 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
712 resp.msg = old_resp.msg
714 if resp.headers.get('Content-encoding', '') == 'deflate':
715 gz = io.BytesIO(self.deflate(resp.read()))
716 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
717 resp.msg = old_resp.msg
720 https_request = http_request
721 https_response = http_response
724 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
725 def __init__(self, params, https_conn_class=None, *args, **kwargs):
726 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
727 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
728 self._params = params
730 def https_open(self, req):
732 if hasattr(self, '_context'): # python > 2.6
733 kwargs['context'] = self._context
734 if hasattr(self, '_check_hostname'): # python 3.x
735 kwargs['check_hostname'] = self._check_hostname
736 return self.do_open(functools.partial(
737 _create_http_connection, self, self._https_conn_class, True),
741 def parse_iso8601(date_str, delimiter='T', timezone=None):
742 """ Return a UNIX timestamp from the given date """
749 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
752 timezone = datetime.timedelta()
754 date_str = date_str[:-len(m.group(0))]
755 if not m.group('sign'):
756 timezone = datetime.timedelta()
758 sign = 1 if m.group('sign') == '+' else -1
759 timezone = datetime.timedelta(
760 hours=sign * int(m.group('hours')),
761 minutes=sign * int(m.group('minutes')))
762 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
763 dt = datetime.datetime.strptime(date_str, date_format) - timezone
764 return calendar.timegm(dt.timetuple())
767 def unified_strdate(date_str, day_first=True):
768 """Return a string with the date in the format YYYYMMDD"""
774 date_str = date_str.replace(',', ' ')
775 # %z (UTC offset) is only supported in python>=3.2
776 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
777 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
778 # Remove AM/PM + timezone
779 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
781 format_expressions = [
786 '%b %dst %Y %I:%M%p',
787 '%b %dnd %Y %I:%M%p',
788 '%b %dth %Y %I:%M%p',
794 '%Y-%m-%d %H:%M:%S.%f',
797 '%Y-%m-%dT%H:%M:%SZ',
798 '%Y-%m-%dT%H:%M:%S.%fZ',
799 '%Y-%m-%dT%H:%M:%S.%f0Z',
801 '%Y-%m-%dT%H:%M:%S.%f',
805 format_expressions.extend([
813 format_expressions.extend([
820 for expression in format_expressions:
822 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
825 if upload_date is None:
826 timetuple = email.utils.parsedate_tz(date_str)
828 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
832 def determine_ext(url, default_ext='unknown_video'):
835 guess = url.partition('?')[0].rpartition('.')[2]
836 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle filename: media stem + '.<lang>.<format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
846 def date_from_str(date_str):
848 Return a datetime object from a string in the format YYYYMMDD or
849 (now|today)[+-][0-9](day|week|month|year)(s)?"""
850 today = datetime.date.today()
851 if date_str in ('now', 'today'):
853 if date_str == 'yesterday':
854 return today - datetime.timedelta(days=1)
855 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
856 if match is not None:
857 sign = match.group('sign')
858 time = int(match.group('time'))
861 unit = match.group('unit')
862 # A bad aproximation?
870 delta = datetime.timedelta(**{unit: time})
872 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings that do not look like YYYYMMDD are returned unchanged (the
    fallthrough return was elided in the listing, making the function
    return None for them).
    """
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str
885 class DateRange(object):
886 """Represents a time interval between two dates"""
888 def __init__(self, start=None, end=None):
889 """start and end must be strings in the format accepted by date"""
890 if start is not None:
891 self.start = date_from_str(start)
893 self.start = datetime.datetime.min.date()
895 self.end = date_from_str(end)
897 self.end = datetime.datetime.max.date()
898 if self.start > self.end:
899 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
903 """Returns a range that only contains the given day"""
906 def __contains__(self, date):
907 """Check if the date is in the range"""
908 if not isinstance(date, datetime.date):
909 date = date_from_str(date)
910 return self.start <= date <= self.end
913 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
917 """ Returns the platform name as a compat_str """
918 res = platform.platform()
919 if isinstance(res, bytes):
920 res = res.decode(preferredencoding())
922 assert isinstance(res, compat_str)
926 def _windows_write_string(s, out):
927 """ Returns True if the string was written using special methods,
928 False if it has yet to be written out."""
929 # Adapted from http://stackoverflow.com/a/3259271/35070
932 import ctypes.wintypes
940 fileno = out.fileno()
941 except AttributeError:
942 # If the output stream doesn't have a fileno, it's virtual
944 except io.UnsupportedOperation:
945 # Some strange Windows pseudo files?
947 if fileno not in WIN_OUTPUT_IDS:
950 GetStdHandle = ctypes.WINFUNCTYPE(
951 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
952 (b"GetStdHandle", ctypes.windll.kernel32))
953 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
955 WriteConsoleW = ctypes.WINFUNCTYPE(
956 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
957 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
958 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
959 written = ctypes.wintypes.DWORD(0)
961 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
962 FILE_TYPE_CHAR = 0x0002
963 FILE_TYPE_REMOTE = 0x8000
964 GetConsoleMode = ctypes.WINFUNCTYPE(
965 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
966 ctypes.POINTER(ctypes.wintypes.DWORD))(
967 (b"GetConsoleMode", ctypes.windll.kernel32))
968 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
970 def not_a_console(handle):
971 if handle == INVALID_HANDLE_VALUE or handle is None:
973 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
974 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
979 def next_nonbmp_pos(s):
981 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
982 except StopIteration:
986 count = min(next_nonbmp_pos(s), 1024)
989 h, s, count if count else 2, ctypes.byref(written), None)
991 raise OSError('Failed to write string')
992 if not count: # We just wrote a non-BMP character
993 assert written.value == 2
996 assert written.value > 0
997 s = s[written.value:]
1001 def write_string(s, out=None, encoding=None):
1004 assert type(s) == compat_str
1006 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1007 if _windows_write_string(s, out):
1010 if ('b' in getattr(out, 'mode', '') or
1011 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1012 byt = s.encode(encoding or preferredencoding(), 'ignore')
1014 elif hasattr(out, 'buffer'):
1015 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1016 byt = s.encode(enc, 'ignore')
1017 out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Convert a bytes (py3) or str (py2) sequence to a list of byte values.

    Empty input yields [] — the guard was elided in the listing, so the
    visible code would IndexError on b''.
    """
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Inverse of bytes_to_intlist: pack a list of byte values into bytes.

    Empty input yields b'' — struct_pack('0B') handling differs across
    versions, so the guard (elided in the listing) is restored.
    """
    if not xs:
        return b''
    return struct_pack('%dB' % len(xs), *xs)
1038 # Cross-platform file locking
1039 if sys.platform == 'win32':
1040 import ctypes.wintypes
1043 class OVERLAPPED(ctypes.Structure):
1045 ('Internal', ctypes.wintypes.LPVOID),
1046 ('InternalHigh', ctypes.wintypes.LPVOID),
1047 ('Offset', ctypes.wintypes.DWORD),
1048 ('OffsetHigh', ctypes.wintypes.DWORD),
1049 ('hEvent', ctypes.wintypes.HANDLE),
1052 kernel32 = ctypes.windll.kernel32
1053 LockFileEx = kernel32.LockFileEx
1054 LockFileEx.argtypes = [
1055 ctypes.wintypes.HANDLE, # hFile
1056 ctypes.wintypes.DWORD, # dwFlags
1057 ctypes.wintypes.DWORD, # dwReserved
1058 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1059 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1060 ctypes.POINTER(OVERLAPPED) # Overlapped
1062 LockFileEx.restype = ctypes.wintypes.BOOL
1063 UnlockFileEx = kernel32.UnlockFileEx
1064 UnlockFileEx.argtypes = [
1065 ctypes.wintypes.HANDLE, # hFile
1066 ctypes.wintypes.DWORD, # dwReserved
1067 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1068 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1069 ctypes.POINTER(OVERLAPPED) # Overlapped
1071 UnlockFileEx.restype = ctypes.wintypes.BOOL
1072 whole_low = 0xffffffff
1073 whole_high = 0x7fffffff
1075 def _lock_file(f, exclusive):
1076 overlapped = OVERLAPPED()
1077 overlapped.Offset = 0
1078 overlapped.OffsetHigh = 0
1079 overlapped.hEvent = 0
1080 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1081 handle = msvcrt.get_osfhandle(f.fileno())
1082 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1083 whole_low, whole_high, f._lock_file_overlapped_p):
1084 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1086 def _unlock_file(f):
1087 assert f._lock_file_overlapped_p
1088 handle = msvcrt.get_osfhandle(f.fileno())
1089 if not UnlockFileEx(handle, 0,
1090 whole_low, whole_high, f._lock_file_overlapped_p):
1091 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
def _lock_file(f, exclusive):
    """Take a POSIX advisory lock on f: exclusive for writers, shared for readers."""
    if exclusive:
        op = fcntl.LOCK_EX
    else:
        op = fcntl.LOCK_SH
    fcntl.flock(f, op)
def _unlock_file(f):
    """Release any advisory lock held on the open file f."""
    fcntl.flock(f, fcntl.LOCK_UN)
# Context manager wrapping io.open with a cross-platform advisory file lock.
# NOTE(review): the listing elides some lines (e.g. the attribute assignment
# __enter__ relies on for self.mode, and the close calls in __enter__/__exit__
# error paths); comments cover only the visible statements.
1103 class locked_file(object):
1104 def __init__(self, filename, mode, encoding=None):
# Only whole-file read/append/write modes are supported.
1105 assert mode in ['r', 'a', 'w']
1106 self.f = io.open(filename, mode, encoding=encoding)
1109 def __enter__(self):
# Readers take a shared lock; any writing mode takes an exclusive one.
1110 exclusive = self.mode != 'r'
1112 _lock_file(self.f, exclusive)
1118 def __exit__(self, etype, value, traceback):
1120 _unlock_file(self.f)
# Thin pass-throughs to the underlying file object.
1127 def write(self, *args):
1128 return self.f.write(*args)
1130 def read(self, *args):
1131 return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to UTF-8 when Python reports None."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        enc = 'utf-8'
    return enc
1139 def shell_quote(args):
1141 encoding = get_filesystem_encoding()
1143 if isinstance(a, bytes):
1144 # We may get a filename encoded with 'encodeFilename'
1145 a = a.decode(encoding)
1146 quoted_args.append(pipes.quote(a))
1147 return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The payload is JSON-encoded, then urlencoded into the fragment so it
    # survives as an opaque suffix; unsmuggle_url reverses this.
    smuggled = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, smuggled)
def unsmuggle_url(smug_url, default=None):
    """Extract data hidden in a URL fragment by smuggle_url.

    Returns (url, data); (url, default) when the URL carries no smuggled
    payload. (The final return was elided in the listing, so the smuggled
    branch returned None.)
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
1167 def format_bytes(bytes):
1170 if type(bytes) is str:
1171 bytes = float(bytes)
1175 exponent = int(math.log(bytes, 1024.0))
1176 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1177 converted = float(bytes) / float(1024 ** exponent)
1178 return '%.2f%s' % (converted, suffix)
1181 def parse_filesize(s):
1185 # The lower-case forms are of course incorrect and inofficial,
1186 # but we support those too
1224 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1226 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1230 num_str = m.group('num').replace(',', '.')
1231 mult = _UNIT_TABLE[m.group('unit')]
1232 return int(float(num_str) * mult)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    # Unknown names yield None rather than raising; the try/except was
    # elided in the listing.
    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviation (first three letters, e.g. 'Jan') """
    # Unknown abbreviations yield None rather than raising; the try/except
    # was elided in the listing.
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Negative lookahead leaves well-formed entities (&amp; &lt; &gt; &apos;
    # &quot; and numeric/hex character references) untouched.
    # (The replacement argument and return were elided in the listing; the
    # docstring's '&amp;' had also been garbled to a bare '&'.)
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
1262 def setproctitle(title):
1263 assert isinstance(title, compat_str)
1265 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1268 title_bytes = title.encode('utf-8')
1269 buf = ctypes.create_string_buffer(len(title_bytes))
1270 buf.value = title_bytes
1272 libc.prctl(15, buf, 0, 0, 0)
1273 except AttributeError:
1274 return # Strange libc, just skip this
def remove_start(s, start):
    """Return s without the prefix `start`; s unchanged when it is absent.

    The fallthrough `return s` was elided in the listing, so non-matching
    input returned None.
    """
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return s without the suffix `end`; s unchanged when it is absent.

    Guards against end == '' (s.endswith('') is always True and s[:-0]
    would wipe the whole string); non-matching input returns s unchanged.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the last path component of a URL (query and fragment excluded)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    """A Request subclass that issues an HTTP HEAD instead of GET.

    The `return 'HEAD'` body was elided in the listing, which would have
    made get_method return None.
    """
    def get_method(self):
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v to int, scaled by invscale/scale; `default` when v is None/''.

    get_attr, if given, first replaces v with getattr(v, get_attr, None).
    (The get_attr and empty-string guard lines were elided in the listing.)
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Coerce v to compat_str, or return `default` when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    # Strips thousands separators (',' and '.') and '+' before converting.
    # (The None guard and final return were elided in the listing.)
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to float scaled by invscale/scale; `default` when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a human-readable duration ("3 min", "1:02:03", "4.5s", ...) into seconds.

    NOTE(review): this copy looks truncated — the verbose-regex literal's
    opening/closing, the re.match call, the `res = 0` initialisation and the
    final return are not visible here. Verify against the full source.
    """
    if not isinstance(s, compat_basestring):
    (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
    (?P<only_hours>[0-9.]+)\s*(?:hours?)|
    \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
    (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
    # "X minutes" / "X hours" forms return immediately, scaled to seconds.
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
    res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
    res += int(m.group('days')) * 24 * 60 * 60
    res += float(m.group('ms'))
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the real extension: a.mp4 -> a.ext.mp4.

    If expected_real_ext is given and the actual extension differs, `ext`
    is appended after the whole filename instead (a.mp4 -> a.mp4.ext).
    """
    name, real_ext = os.path.splitext(filename)
    # fix: the `return (` line was missing, making this a no-op expression
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the file's extension with `ext` (a.mp4 -> a.ext).

    If expected_real_ext is given and the actual extension differs, `ext`
    is appended to the untouched filename instead (a.mp4 -> a.mp4.ext).
    """
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)  # fix: this final argument was missing, leaving the call unterminated
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        # `args` is only read, never mutated, so the mutable default is safe here.
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False  # not found / not executable
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False  # executable missing or not runnable
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a tool's --version output.

    Falls back to `unrecognized` when no version-like token is found.
    """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized
class PagedList(object):
    """Base class for lazily paged sequences; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages lazily via pagefunc(pagenum)."""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the elements in [start, end) by querying only the pages needed."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset inside this page where the requested slice begins.
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset inside this page where the requested slice ends.
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the elements in [start, end) across the known pages."""
        res = []
        start_page = start // self._pagesize
        end_page = min(
            self._pagecount if end is None else (end // self._pagesize + 1),
            self._pagecount)
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page needs its head trimmed.
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escape sequences in s; other text is untouched."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode \\uXXXX escape sequences in s; other text is untouched."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() needs a byte string; Python 3 accepts str directly.
    needs_bytes = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_bytes:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Escape each component separately, then reassemble the URL.
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
# Probe whether struct accepts a text format spec; define wrappers otherwise.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read URLs from a file-like object, one per line.

    Decodes bytes lines as UTF-8, strips a UTF-8 BOM and whitespace, and
    drops comment lines starting with '#', ';' or ']'. Closes batch_fd.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
# Element.iter was added in Python 2.7 / ElementTree 1.3; fall back to
# findall on ancient interpreters.
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
def parse_xml(s):
    """Parse an XML string into an ElementTree element, ignoring doctypes.

    NOTE(review): the enclosing `def parse_xml(s):` and trailing return were
    not visible in this copy; restored from the surrounding structure.
    """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+' into an int.

    Falls back to the US_RATINGS table for named ratings; None in, None out.
    """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback(...);) and return the inner payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object literal into (mostly) valid JSON.

    Quotes bare identifiers, converts single-quoted strings to double-quoted
    ones, and removes trailing commas before ']' / '}'.
    """
    def fix_kv(m):
        v = m.group(0)
        # JSON keywords pass through untouched.
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            return v
        if v.startswith("'"):
            v = v[1:-1]
            # Re-escape for double-quoted context.
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas, which JSON forbids.
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1  # unknown quality sorts below all known ones
    return q
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Truncate so the result, including the ellipses, fits in `length`.
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Return True when `version` is older than `limit`.

    Empty or unparsable versions yield `not assume_new` (i.e. by default
    they are treated as new enough).
    """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    loader = globals().get('__loader__')
    # Updatable when running from a zip bundle or a frozen executable.
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
def mimetype2ext(mt):
    """Map a MIME type to a file extension; defaults to the subtype itself."""
    _, _, res = mt.rpartition('/')

    # NOTE(review): only the 'x-mp4-fragmented' mapping is visible in this
    # copy; additional subtype mappings may have been elided — verify.
    return {
        'x-mp4-fragmented': 'mp4',
    }.get(res, res)
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for an opened URL handle.

    Prefers the filename in Content-Disposition; falls back to mapping the
    Content-Type MIME type.
    """
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    if age_limit is None:  # No limit set
        return False  # fix: this early return was missing
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Known byte-order marks, longest first so prefixes don't shadow them.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Determine the download protocol for a format dict.

    Uses an explicit 'protocol' key when present, then the URL scheme
    prefix, then the file extension (m3u8/f4m), then the parsed scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Width of each column = widest cell in that column.
    widths = [max(len(compat_str(cell)) for cell in col) for col in zip(*rows)]
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    lines = []
    for row in rows:
        lines.append(fmt % tuple(row))
    return '\n'.join(lines)
1775 def _match_one(filter_part, dct):
1776 COMPARISON_OPERATORS = {
1784 operator_rex = re.compile(r'''(?x)\s*
1786 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1788 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1789 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1792 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1793 m = operator_rex.search(filter_part)
1795 op = COMPARISON_OPERATORS[m.group('op')]
1796 if m.group('strval') is not None:
1797 if m.group('op') not in ('=', '!='):
1799 'Operator %s does not support string values!' % m.group('op'))
1800 comparison_value = m.group('strval')
1803 comparison_value = int(m.group('intval'))
1805 comparison_value = parse_filesize(m.group('intval'))
1806 if comparison_value is None:
1807 comparison_value = parse_filesize(m.group('intval') + 'B')
1808 if comparison_value is None:
1810 'Invalid integer value %r in filter part %r' % (
1811 m.group('intval'), filter_part))
1812 actual_value = dct.get(m.group('key'))
1813 if actual_value is None:
1814 return m.group('none_inclusive')
1815 return op(actual_value, comparison_value)
1818 '': lambda v: v is not None,
1819 '!': lambda v: v is None,
1821 operator_rex = re.compile(r'''(?x)\s*
1822 (?P<op>%s)\s*(?P<key>[a-z_]+)
1824 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1825 m = operator_rex.search(filter_part)
1827 op = UNARY_OPERATORS[m.group('op')]
1828 actual_value = dct.get(m.group('key'))
1829 return op(actual_value)
1831 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # '&' joins sub-filters; all of them must pass.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a match-filter callable: None when info_dict passes the filter,
    otherwise a human-readable skip message."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('12.5s' or 'HH:MM:SS[.f]') to seconds.

    Returns None for empty/unrecognized input.
    """
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
def srt_subtitles_timecode(seconds):
    """Format a (non-negative) duration in seconds as an SRT timecode HH:MM:SS,mmm."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle XML (a string) into SRT-formatted text.

    NOTE(review): several scaffolding lines (the ns_map closer, the child
    loop header, list initialisation and returns) are not visible in this
    copy — verify against the full source before relying on this reading.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',

    def parse_node(node):
        # Treat missing .text as '' so the concatenations never see None.
        str_or_empty = functools.partial(str_or_none, default='')

        out = str_or_empty(node.text)

            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                out += '\n' + str_or_empty(child.tail)
            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                out += str_or_empty(parse_node(child))
                out += str_or_empty(xml.etree.ElementTree.tostring(child))

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))

    # Cues may live under either TTML namespace or be un-namespaced.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')

        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
            # No explicit end: derive it from begin + dur.
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
class ISO639Utils(object):
    """Convert between ISO 639-1 (2-letter) and ISO 639-2/T (3-letter) codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE(review): the `_lang_map` table appears to have been elided from
    # this copy of the file — restore it from the source dataset above.

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
class ISO3166Utils(object):
    """Map ISO 3166-1 alpha-2 country codes to their full country names."""
    # From http://data.okfn.org/data/core/country-list
    # NOTE(review): this copy of the table appears abridged; verify the
    # complete mapping against the source dataset above.
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',  # fixed mojibake ('Ã…land' -> 'Åland')
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler honoring a per-request 'Ytdl-request-proxy' header.

    The header value overrides the handler-level proxy; the sentinel value
    '__noproxy__' disables proxying entirely for that request.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # fix: the override was read and deleted but never applied
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)