[utils] YoutubeDLHandler: don't use 'Youtubedl-user-agent' for overriding the default...

[youtube-dl] / youtube_dl / utils.py
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 620c4fe8a154cf209d3aada650fd7f8db70e7a59..d22b0313460f4dc42453bcd7772bd59f5dcfe074 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1,6 +1,8 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-
  
+from __future__ import unicode_literals
+
  import calendar
  import codecs
  import contextlib
@@ -8,6 +10,7 @@ import ctypes
  import datetime
  import email.utils
  import errno
+import functools
  import gzip
  import itertools
  import io
@@ -32,14 +35,16 @@ from .compat import (
      compat_chr,
      compat_getenv,
      compat_html_entities,
-    compat_html_parser,
+    compat_http_client,
      compat_parse_qs,
+    compat_socket_create_connection,
      compat_str,
      compat_urllib_error,
      compat_urllib_parse,
      compat_urllib_parse_urlparse,
      compat_urllib_request,
      compat_urlparse,
+    shlex_quote,
  )
  
  
@@ -54,6 +59,7 @@ std_headers = {
      'Accept-Language': 'en-us,en;q=0.5',
  }
  
+
  def preferredencoding():
      """Get preferred encoding.
  
@@ -62,7 +68,7 @@ def preferredencoding():
      """
      try:
          pref = locale.getpreferredencoding()
-        u'TEST'.encode(pref)
+        'TEST'.encode(pref)
      except:
          pref = 'UTF-8'
  
@@ -70,12 +76,25 @@ def preferredencoding():
  
  
  def write_json_file(obj, fn):
-    """ Encode obj as JSON and write it to fn, atomically """
+    """ Encode obj as JSON and write it to fn, atomically if possible """
+
+    fn = encodeFilename(fn)
+    if sys.version_info < (3, 0) and sys.platform != 'win32':
+        encoding = get_filesystem_encoding()
+        # os.path.basename returns a bytes object, but NamedTemporaryFile
+        # will fail if the filename contains non ascii characters unless we
+        # use a unicode object
+        path_basename = lambda f: os.path.basename(fn).decode(encoding)
+        # the same for os.path.dirname
+        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
+    else:
+        path_basename = os.path.basename
+        path_dirname = os.path.dirname
  
      args = {
          'suffix': '.tmp',
-        'prefix': os.path.basename(fn) + '.',
-        'dir': os.path.dirname(fn),
+        'prefix': path_basename(fn) + '.',
+        'dir': path_dirname(fn),
          'delete': False,
      }
  
@@ -94,6 +113,13 @@ def write_json_file(obj, fn):
      try:
          with tf:
              json.dump(obj, tf)
+        if sys.platform == 'win32':
+            # Need to remove existing file on Windows, else os.rename raises
+            # WindowsError or FileExistsError.
+            try:
+                os.unlink(fn)
+            except OSError:
+                pass
          os.rename(tf.name, fn)
      except:
          try:
@@ -108,7 +134,7 @@ if sys.version_info >= (2, 7):
          """ Find the xpath xpath[@key=val] """
          assert re.match(r'^[a-zA-Z-]+$', key)
          assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
-        expr = xpath + u"[@%s='%s']" % (key, val)
+        expr = xpath + "[@%s='%s']" % (key, val)
          return node.find(expr)
  else:
      def find_xpath_attr(node, xpath, key, val):
@@ -124,6 +150,8 @@ else:
  
  # On python2.6 the xml.etree.ElementTree.Element methods don't support
  # the namespace parameter
+
+
  def xpath_with_ns(path, ns_map):
      components = [c.split(':') for c in path.split('/')]
      replaced = []
@@ -141,7 +169,7 @@ def xpath_text(node, xpath, name=None, fatal=False):
          xpath = xpath.encode('ascii')
  
      n = node.find(xpath)
-    if n is None:
+    if n is None or n.text is None:
          if fatal:
              name = xpath if name is None else name
              raise ExtractorError('Could not find XML element %s' % name)
@@ -150,131 +178,40 @@ def xpath_text(node, xpath, name=None, fatal=False):
      return n.text
  
  
-compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
-class BaseHTMLParser(compat_html_parser.HTMLParser):
-    def __init(self):
-        compat_html_parser.HTMLParser.__init__(self)
-        self.html = None
-
-    def loads(self, html):
-        self.html = html
-        self.feed(html)
-        self.close()
-
-class AttrParser(BaseHTMLParser):
-    """Modified HTMLParser that isolates a tag with the specified attribute"""
-    def __init__(self, attribute, value):
-        self.attribute = attribute
-        self.value = value
-        self.result = None
-        self.started = False
-        self.depth = {}
-        self.watch_startpos = False
-        self.error_count = 0
-        BaseHTMLParser.__init__(self)
-
-    def error(self, message):
-        if self.error_count > 10 or self.started:
-            raise compat_html_parser.HTMLParseError(message, self.getpos())
-        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
-        self.error_count += 1
-        self.goahead(1)
-
-    def handle_starttag(self, tag, attrs):
-        attrs = dict(attrs)
-        if self.started:
-            self.find_startpos(None)
-        if self.attribute in attrs and attrs[self.attribute] == self.value:
-            self.result = [tag]
-            self.started = True
-            self.watch_startpos = True
-        if self.started:
-            if not tag in self.depth: self.depth[tag] = 0
-            self.depth[tag] += 1
-
-    def handle_endtag(self, tag):
-        if self.started:
-            if tag in self.depth: self.depth[tag] -= 1
-            if self.depth[self.result[0]] == 0:
-                self.started = False
-                self.result.append(self.getpos())
-
-    def find_startpos(self, x):
-        """Needed to put the start position of the result (self.result[1])
-        after the opening tag with the requested id"""
-        if self.watch_startpos:
-            self.watch_startpos = False
-            self.result.append(self.getpos())
-    handle_entityref = handle_charref = handle_data = handle_comment = \
-    handle_decl = handle_pi = unknown_decl = find_startpos
-
-    def get_result(self):
-        if self.result is None:
-            return None
-        if len(self.result) != 3:
-            return None
-        lines = self.html.split('\n')
-        lines = lines[self.result[1][0]-1:self.result[2][0]]
-        lines[0] = lines[0][self.result[1][1]:]
-        if len(lines) == 1:
-            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
-        lines[-1] = lines[-1][:self.result[2][1]]
-        return '\n'.join(lines).strip()
-# Hack for https://github.com/rg3/youtube-dl/issues/662
-if sys.version_info < (2, 7, 3):
-    AttrParser.parse_endtag = (lambda self, i:
-        i + len("</scr'+'ipt>")
-        if self.rawdata[i:].startswith("</scr'+'ipt>")
-        else compat_html_parser.HTMLParser.parse_endtag(self, i))
-
  def get_element_by_id(id, html):
      """Return the content of the tag with the specified ID in the passed HTML document"""
      return get_element_by_attribute("id", id, html)
  
+
  def get_element_by_attribute(attribute, value, html):
      """Return the content of the tag with the specified attribute in the passed HTML document"""
-    parser = AttrParser(attribute, value)
-    try:
-        parser.loads(html)
-    except compat_html_parser.HTMLParseError:
-        pass
-    return parser.get_result()
  
-class MetaParser(BaseHTMLParser):
-    """
-    Modified HTMLParser that isolates a meta tag with the specified name 
-    attribute.
-    """
-    def __init__(self, name):
-        BaseHTMLParser.__init__(self)
-        self.name = name
-        self.content = None
-        self.result = None
-
-    def handle_starttag(self, tag, attrs):
-        if tag != 'meta':
-            return
-        attrs = dict(attrs)
-        if attrs.get('name') == self.name:
-            self.result = attrs.get('content')
+    m = re.search(r'''(?xs)
+        <([a-zA-Z0-9:._-]+)
+         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
+         \s+%s=['"]?%s['"]?
+         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
+        \s*>
+        (?P<content>.*?)
+        </\1>
+    ''' % (re.escape(attribute), re.escape(value)), html)
  
-    def get_result(self):
-        return self.result
+    if not m:
+        return None
+    res = m.group('content')
  
-def get_meta_content(name, html):
-    """
-    Return the content attribute from the meta tag with the given name attribute.
-    """
-    parser = MetaParser(name)
-    try:
-        parser.loads(html)
-    except compat_html_parser.HTMLParseError:
-        pass
-    return parser.get_result()
+    if res.startswith('"') or res.startswith("'"):
+        res = res[1:-1]
+
+    return unescapeHTML(res)
  
  
  def clean_html(html):
      """Clean an HTML snippet into a readable string"""
+
+    if html is None:  # Convenience for sanitizing descriptions etc.
+        return html
+
      # Newline vs <br />
      html = html.replace('\n', ' ')
      html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
@@ -297,7 +234,7 @@ def sanitize_open(filename, open_mode):
      It returns the tuple (stream, definitive_file_name).
      """
      try:
-        if filename == u'-':
+        if filename == '-':
              if sys.platform == 'win32':
                  import msvcrt
                  msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
@@ -310,9 +247,9 @@ def sanitize_open(filename, open_mode):
  
          # In case of error, try to remove win32 forbidden chars
          alt_filename = os.path.join(
-                        re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
-                        for path_part in os.path.split(filename)
-                       )
+            re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
+            for path_part in os.path.split(filename)
+        )
          if alt_filename == filename:
              raise
          else:
@@ -329,6 +266,7 @@ def timeconvert(timestr):
          timestamp = email.utils.mktime_tz(timetuple)
      return timestamp
  
+
  def sanitize_filename(s, restricted=False, is_id=False):
      """Sanitizes a string so it could be used as part of a filename.
      If restricted is set, use a stricter subset of allowed characters.
@@ -349,7 +287,9 @@ def sanitize_filename(s, restricted=False, is_id=False):
              return '_'
          return char
  
-    result = u''.join(map(replace_insane, s))
+    # Handle timestamps
+    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
+    result = ''.join(map(replace_insane, s))
      if not is_id:
          while '__' in result:
              result = result.replace('__', '_')
@@ -361,6 +301,7 @@ def sanitize_filename(s, restricted=False, is_id=False):
              result = '_'
      return result
  
+
  def orderedSet(iterable):
      """ Remove all duplicates from the input iterable """
      res = []
@@ -379,15 +320,15 @@ def _htmlentity_transform(entity):
      mobj = re.match(r'#(x?[0-9]+)', entity)
      if mobj is not None:
          numstr = mobj.group(1)
-        if numstr.startswith(u'x'):
+        if numstr.startswith('x'):
              base = 16
-            numstr = u'0%s' % numstr
+            numstr = '0%s' % numstr
          else:
              base = 10
          return compat_chr(int(numstr, base))
  
      # Unknown entity in name, return its literal representation
-    return (u'&%s;' % entity)
+    return ('&%s;' % entity)
  
  
  def unescapeHTML(s):
@@ -411,7 +352,7 @@ def encodeFilename(s, for_subprocess=False):
          return s
  
      if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
-        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
+        # Pass '' directly to use Unicode APIs on Windows 2000 and up
          # (Detecting Windows NT 4 is tricky because 'major >= 4' would
          # match Windows 9x series as well. Besides, NT 4 is obsolete.)
          if not for_subprocess:
@@ -431,7 +372,7 @@ def encodeArgument(s):
      if not isinstance(s, compat_str):
          # Legacy code that uses byte strings
          # Uncomment the following line after fixing all post processors
-        #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
+        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
          s = s.decode('ascii')
      return encodeFilename(s, True)
  
@@ -445,6 +386,7 @@ def decodeOption(optval):
      assert isinstance(optval, compat_str)
      return optval
  
+
  def formatSeconds(secs):
      if secs > 3600:
          return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
@@ -454,48 +396,34 @@ def formatSeconds(secs):
          return '%d' % secs
  
  
-def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
-    if sys.version_info < (3, 2):
-        import httplib
-
-        class HTTPSConnectionV3(httplib.HTTPSConnection):
-            def __init__(self, *args, **kwargs):
-                httplib.HTTPSConnection.__init__(self, *args, **kwargs)
-
-            def connect(self):
-                sock = socket.create_connection((self.host, self.port), self.timeout)
-                if getattr(self, '_tunnel_host', False):
-                    self.sock = sock
-                    self._tunnel()
-                try:
-                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
-                except ssl.SSLError:
-                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
-
-        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
-            def https_open(self, req):
-                return self.do_open(HTTPSConnectionV3, req)
-        return HTTPSHandlerV3(**kwargs)
-    elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
-        context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
-        context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
+def make_HTTPS_handler(params, **kwargs):
+    opts_no_check_certificate = params.get('nocheckcertificate', False)
+    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
+        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
          if opts_no_check_certificate:
+            context.check_hostname = False
              context.verify_mode = ssl.CERT_NONE
-        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
+        try:
+            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
+        except TypeError:
+            # Python 2.7.8
+            # (create_default_context present but HTTPSHandler has no context=)
+            pass
+
+    if sys.version_info < (3, 2):
+        return YoutubeDLHTTPSHandler(params, **kwargs)
      else:  # Python < 3.4
-        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
+        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
          context.verify_mode = (ssl.CERT_NONE
                                 if opts_no_check_certificate
                                 else ssl.CERT_REQUIRED)
          context.set_default_verify_paths()
-        try:
-            context.load_default_certs()
-        except AttributeError:
-            pass  # Python < 3.4
-        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
+        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
+
  
  class ExtractorError(Exception):
      """Error during info extraction."""
+
      def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
          """ tb, if given, is the original traceback (so that it can be printed out).
          If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
@@ -506,9 +434,15 @@ class ExtractorError(Exception):
          if video_id is not None:
              msg = video_id + ': ' + msg
          if cause:
-            msg += u' (caused by %r)' % cause
+            msg += ' (caused by %r)' % cause
          if not expected:
-            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
+            if ytdl_is_updateable():
+                update_cmd = 'type  youtube-dl -U  to update'
+            else:
+                update_cmd = 'see  https://yt-dl.org/update  on how to update'
+            msg += '; please report this issue on https://yt-dl.org/bug .'
+            msg += ' Make sure you are using the latest version; %s.' % update_cmd
+            msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
          super(ExtractorError, self).__init__(msg)
  
          self.traceback = tb
@@ -519,7 +453,14 @@ class ExtractorError(Exception):
      def format_traceback(self):
          if self.traceback is None:
              return None
-        return u''.join(traceback.format_tb(self.traceback))
+        return ''.join(traceback.format_tb(self.traceback))
+
+
+class UnsupportedError(ExtractorError):
+    def __init__(self, url):
+        super(UnsupportedError, self).__init__(
+            'Unsupported URL: %s' % url, expected=True)
+        self.url = url
  
  
  class RegexNotFoundError(ExtractorError):
@@ -534,6 +475,7 @@ class DownloadError(Exception):
      configured to continue on errors. They will contain the appropriate
      error message.
      """
+
      def __init__(self, msg, exc_info=None):
          """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
          super(DownloadError, self).__init__(msg)
@@ -555,9 +497,11 @@ class PostProcessingError(Exception):
      This exception may be raised by PostProcessor's .run() method to
      indicate an error in the postprocessing task.
      """
+
      def __init__(self, msg):
          self.msg = msg
  
+
  class MaxDownloadsReached(Exception):
      """ --max-downloads limit has been reached. """
      pass
@@ -587,6 +531,29 @@ class ContentTooShortError(Exception):
          self.downloaded = downloaded
          self.expected = expected
  
+
+def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
+    hc = http_class(*args, **kwargs)
+    source_address = ydl_handler._params.get('source_address')
+    if source_address is not None:
+        sa = (source_address, 0)
+        if hasattr(hc, 'source_address'):  # Python 2.7+
+            hc.source_address = sa
+        else:  # Python 2.6
+            def _hc_connect(self, *args, **kwargs):
+                sock = compat_socket_create_connection(
+                    (self.host, self.port), self.timeout, sa)
+                if is_https:
+                    self.sock = ssl.wrap_socket(
+                        sock, self.key_file, self.cert_file,
+                        ssl_version=ssl.PROTOCOL_TLSv1)
+                else:
+                    self.sock = sock
+            hc.connect = functools.partial(_hc_connect, hc)
+
+    return hc
+
+
  class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
      """Handler for HTTP requests and responses.
  
@@ -605,6 +572,15 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
      public domain.
      """
  
+    def __init__(self, params, *args, **kwargs):
+        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
+        self._params = params
+
+    def http_open(self, req):
+        return self.do_open(functools.partial(
+            _create_http_connection, self, compat_http_client.HTTPConnection, False),
+            req)
+
      @staticmethod
      def deflate(data):
          try:
@@ -622,17 +598,14 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
  
      def http_request(self, req):
          for h, v in std_headers.items():
-            if h not in req.headers:
+            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
+            # The dict keys are capitalized because of this bug by urllib
+            if h.capitalize() not in req.headers:
                  req.add_header(h, v)
          if 'Youtubedl-no-compression' in req.headers:
              if 'Accept-encoding' in req.headers:
                  del req.headers['Accept-encoding']
              del req.headers['Youtubedl-no-compression']
-        if 'Youtubedl-user-agent' in req.headers:
-            if 'User-agent' in req.headers:
-                del req.headers['User-agent']
-            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
-            del req.headers['Youtubedl-user-agent']
  
          if sys.version_info < (2, 7) and '#' in req.get_full_url():
              # Python 2.6 is brain-dead when it comes to fragments
@@ -674,6 +647,18 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
      https_response = http_response
  
  
+class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
+    def __init__(self, params, https_conn_class=None, *args, **kwargs):
+        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
+        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
+        self._params = params
+
+    def https_open(self, req):
+        return self.do_open(functools.partial(
+            _create_http_connection, self, self._https_conn_class, True),
+            req)
+
+
  def parse_iso8601(date_str, delimiter='T'):
      """ Return a UNIX timestamp from the given date """
  
@@ -699,17 +684,19 @@ def parse_iso8601(date_str, delimiter='T'):
      return calendar.timegm(dt.timetuple())
  
  
-def unified_strdate(date_str):
+def unified_strdate(date_str, day_first=True):
      """Return a string with the date in the format YYYYMMDD"""
  
      if date_str is None:
          return None
-
      upload_date = None
-    #Replace commas
+    # Replace commas
      date_str = date_str.replace(',', ' ')
      # %z (UTC offset) is only supported in python>=3.2
      date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
+    # Remove AM/PM + timezone
+    date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
+
      format_expressions = [
          '%d %B %Y',
          '%d %b %Y',
@@ -718,13 +705,10 @@ def unified_strdate(date_str):
          '%b %dst %Y %I:%M%p',
          '%b %dnd %Y %I:%M%p',
          '%b %dth %Y %I:%M%p',
+        '%Y %m %d',
          '%Y-%m-%d',
          '%Y/%m/%d',
-        '%d.%m.%Y',
-        '%d/%m/%Y',
-        '%d/%m/%y',
          '%Y/%m/%d %H:%M:%S',
-        '%d/%m/%Y %H:%M:%S',
          '%Y-%m-%d %H:%M:%S',
          '%Y-%m-%d %H:%M:%S.%f',
          '%d.%m.%Y %H:%M',
@@ -736,6 +720,20 @@ def unified_strdate(date_str):
          '%Y-%m-%dT%H:%M:%S.%f',
          '%Y-%m-%dT%H:%M',
      ]
+    if day_first:
+        format_expressions.extend([
+            '%d.%m.%Y',
+            '%d/%m/%Y',
+            '%d/%m/%y',
+            '%d/%m/%Y %H:%M:%S',
+        ])
+    else:
+        format_expressions.extend([
+            '%m.%d.%Y',
+            '%m/%d/%Y',
+            '%m/%d/%y',
+            '%m/%d/%Y %H:%M:%S',
+        ])
      for expression in format_expressions:
          try:
              upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
@@ -747,25 +745,30 @@ def unified_strdate(date_str):
              upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
      return upload_date
  
-def determine_ext(url, default_ext=u'unknown_video'):
+
+def determine_ext(url, default_ext='unknown_video'):
      if url is None:
          return default_ext
-    guess = url.partition(u'?')[0].rpartition(u'.')[2]
+    guess = url.partition('?')[0].rpartition('.')[2]
      if re.match(r'^[A-Za-z0-9]+$', guess):
          return guess
      else:
          return default_ext
  
+
  def subtitles_filename(filename, sub_lang, sub_format):
-    return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
+    return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
+
  
  def date_from_str(date_str):
      """
      Return a datetime object from a string in the format YYYYMMDD or
      (now|today)[+-][0-9](day|week|month|year)(s)?"""
      today = datetime.date.today()
-    if date_str == 'now'or date_str == 'today':
+    if date_str in ('now', 'today'):
          return today
+    if date_str == 'yesterday':
+        return today - datetime.timedelta(days=1)
      match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
      if match is not None:
          sign = match.group('sign')
@@ -773,7 +776,7 @@ def date_from_str(date_str):
          if sign == '-':
              time = -time
          unit = match.group('unit')
-        #A bad aproximation?
+        # A bad aproximation?
          if unit == 'month':
              unit = 'day'
              time *= 30
@@ -784,7 +787,8 @@ def date_from_str(date_str):
          delta = datetime.timedelta(**{unit: time})
          return today + delta
      return datetime.datetime.strptime(date_str, "%Y%m%d").date()
-    
+
+
  def hyphenate_date(date_str):
      """
      Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
@@ -794,8 +798,10 @@ def hyphenate_date(date_str):
      else:
          return date_str
  
+
  class DateRange(object):
      """Represents a time interval between two dates"""
+
      def __init__(self, start=None, end=None):
          """start and end must be strings in the format accepted by date"""
          if start is not None:
@@ -808,17 +814,20 @@ class DateRange(object):
              self.end = datetime.datetime.max.date()
          if self.start > self.end:
              raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
+
      @classmethod
      def day(cls, day):
          """Returns a range that only contains the given day"""
-        return cls(day,day)
+        return cls(day, day)
+
      def __contains__(self, date):
          """Check if the date is in the range"""
          if not isinstance(date, datetime.date):
              date = date_from_str(date)
          return self.start <= date <= self.end
+
      def __str__(self):
-        return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
+        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
  
  
  def platform_name():
@@ -849,27 +858,30 @@ def _windows_write_string(s, out):
      except AttributeError:
          # If the output stream doesn't have a fileno, it's virtual
          return False
+    except io.UnsupportedOperation:
+        # Some strange Windows pseudo files?
+        return False
      if fileno not in WIN_OUTPUT_IDS:
          return False
  
      GetStdHandle = ctypes.WINFUNCTYPE(
          ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
-        ("GetStdHandle", ctypes.windll.kernel32))
+        (b"GetStdHandle", ctypes.windll.kernel32))
      h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
  
      WriteConsoleW = ctypes.WINFUNCTYPE(
          ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
          ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
-        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
+        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
      written = ctypes.wintypes.DWORD(0)
  
-    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
+    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
      FILE_TYPE_CHAR = 0x0002
      FILE_TYPE_REMOTE = 0x8000
      GetConsoleMode = ctypes.WINFUNCTYPE(
          ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
          ctypes.POINTER(ctypes.wintypes.DWORD))(
-        ("GetConsoleMode", ctypes.windll.kernel32))
+        (b"GetConsoleMode", ctypes.windll.kernel32))
      INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
  
      def not_a_console(handle):
@@ -937,10 +949,7 @@ def bytes_to_intlist(bs):
  def intlist_to_bytes(xs):
      if not xs:
          return b''
-    if isinstance(chr(0), bytes):  # Python 2
-        return ''.join([chr(x) for x in xs])
-    else:
-        return bytes(xs)
+    return struct_pack('%dB' % len(xs), *xs)
  
  
  # Cross-platform file locking
@@ -1052,7 +1061,7 @@ def shell_quote(args):
              # We may get a filename encoded with 'encodeFilename'
              a = a.decode(encoding)
          quoted_args.append(pipes.quote(a))
-    return u' '.join(quoted_args)
+    return ' '.join(quoted_args)
  
  
  def takewhile_inclusive(pred, seq):
@@ -1068,31 +1077,85 @@ def smuggle_url(url, data):
      """ Pass additional data in a URL for internal use. """
  
      sdata = compat_urllib_parse.urlencode(
-        {u'__youtubedl_smuggle': json.dumps(data)})
-    return url + u'#' + sdata
+        {'__youtubedl_smuggle': json.dumps(data)})
+    return url + '#' + sdata
  
  
  def unsmuggle_url(smug_url, default=None):
-    if not '#__youtubedl_smuggle' in smug_url:
+    if '#__youtubedl_smuggle' not in smug_url:
          return smug_url, default
-    url, _, sdata = smug_url.rpartition(u'#')
-    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
+    url, _, sdata = smug_url.rpartition('#')
+    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
      data = json.loads(jsond)
      return url, data
  
  
  def format_bytes(bytes):
      if bytes is None:
-        return u'N/A'
+        return 'N/A'
      if type(bytes) is str:
          bytes = float(bytes)
      if bytes == 0.0:
          exponent = 0
      else:
          exponent = int(math.log(bytes, 1024.0))
-    suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
+    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
      converted = float(bytes) / float(1024 ** exponent)
-    return u'%.2f%s' % (converted, suffix)
+    return '%.2f%s' % (converted, suffix)
+
+
+def parse_filesize(s):
+    if s is None:
+        return None
+
+    # The lower-case forms are of course incorrect and inofficial,
+    # but we support those too
+    _UNIT_TABLE = {
+        'B': 1,
+        'b': 1,
+        'KiB': 1024,
+        'KB': 1000,
+        'kB': 1024,
+        'Kb': 1000,
+        'MiB': 1024 ** 2,
+        'MB': 1000 ** 2,
+        'mB': 1024 ** 2,
+        'Mb': 1000 ** 2,
+        'GiB': 1024 ** 3,
+        'GB': 1000 ** 3,
+        'gB': 1024 ** 3,
+        'Gb': 1000 ** 3,
+        'TiB': 1024 ** 4,
+        'TB': 1000 ** 4,
+        'tB': 1024 ** 4,
+        'Tb': 1000 ** 4,
+        'PiB': 1024 ** 5,
+        'PB': 1000 ** 5,
+        'pB': 1024 ** 5,
+        'Pb': 1000 ** 5,
+        'EiB': 1024 ** 6,
+        'EB': 1000 ** 6,
+        'eB': 1024 ** 6,
+        'Eb': 1000 ** 6,
+        'ZiB': 1024 ** 7,
+        'ZB': 1000 ** 7,
+        'zB': 1024 ** 7,
+        'Zb': 1000 ** 7,
+        'YiB': 1024 ** 8,
+        'YB': 1000 ** 8,
+        'yB': 1024 ** 8,
+        'Yb': 1000 ** 8,
+    }
+
+    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
+    m = re.match(
+        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
+    if not m:
+        return None
+
+    num_str = m.group('num').replace(',', '.')
+    mult = _UNIT_TABLE[m.group('unit')]
+    return int(float(num_str) * mult)
  
  
  def get_term_width():
@@ -1115,8 +1178,8 @@ def month_by_name(name):
      """ Return the number of a month by (locale-independently) English name """
  
      ENGLISH_NAMES = [
-        u'January', u'February', u'March', u'April', u'May', u'June',
-        u'July', u'August', u'September', u'October', u'November', u'December']
+        'January', 'February', 'March', 'April', 'May', 'June',
+        'July', 'August', 'September', 'October', 'November', 'December']
      try:
          return ENGLISH_NAMES.index(name) + 1
      except ValueError:
@@ -1127,7 +1190,7 @@ def fix_xml_ampersands(xml_str):
      """Replace all the '&' by '&amp;' in XML"""
      return re.sub(
          r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
-        u'&amp;',
+        '&amp;',
          xml_str)
  
  
@@ -1160,7 +1223,7 @@ def remove_end(s, end):
  
  def url_basename(url):
      path = compat_urlparse.urlparse(url).path
-    return path.strip(u'/').split(u'/')[-1]
+    return path.strip('/').split('/')[-1]
  
  
  class HEADRequest(compat_urllib_request.Request):
@@ -1185,7 +1248,7 @@ def str_to_int(int_str):
      """ A more relaxed version of int_or_none """
      if int_str is None:
          return None
-    int_str = re.sub(r'[,\.\+]', u'', int_str)
+    int_str = re.sub(r'[,\.\+]', '', int_str)
      return int(int_str)
  
  
@@ -1194,28 +1257,44 @@ def float_or_none(v, scale=1, invscale=1, default=None):
  
  
  def parse_duration(s):
-    if s is None:
+    if not isinstance(s, basestring if sys.version_info < (3, 0) else compat_str):
          return None
  
      s = s.strip()
  
      m = re.match(
-        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
+        r'''(?ix)(?:P?T)?
+        (?:
+            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
+            (?P<only_hours>[0-9.]+)\s*(?:hours?)|
+
+            (?:
+                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
+                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
+            )?
+            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
+        )$''', s)
      if not m:
          return None
-    res = int(m.group('secs'))
+    res = 0
+    if m.group('only_mins'):
+        return float_or_none(m.group('only_mins'), invscale=60)
+    if m.group('only_hours'):
+        return float_or_none(m.group('only_hours'), invscale=60 * 60)
+    if m.group('secs'):
+        res += int(m.group('secs'))
      if m.group('mins'):
          res += int(m.group('mins')) * 60
-        if m.group('hours'):
-            res += int(m.group('hours')) * 60 * 60
+    if m.group('hours'):
+        res += int(m.group('hours')) * 60 * 60
      if m.group('ms'):
          res += float(m.group('ms'))
      return res
  
  
  def prepend_extension(filename, ext):
-    name, real_ext = os.path.splitext(filename) 
-    return u'{0}.{1}{2}'.format(name, ext, real_ext)
+    name, real_ext = os.path.splitext(filename)
+    return '{0}.{1}{2}'.format(name, ext, real_ext)
  
  
  def check_executable(exe, args=[]):
@@ -1229,18 +1308,25 @@ def check_executable(exe, args=[]):
  
  
  def get_exe_version(exe, args=['--version'],
-                    version_re=r'version\s+([0-9._-a-zA-Z]+)',
-                    unrecognized=u'present'):
+                    version_re=None, unrecognized='present'):
      """ Returns the version of the specified executable,
      or False if the executable is not present """
      try:
-        out, err = subprocess.Popen(
+        out, _ = subprocess.Popen(
              [exe] + args,
              stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
      except OSError:
          return False
-    firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
-    m = re.search(version_re, firstline)
+    if isinstance(out, bytes):  # Python 2.x
+        out = out.decode('ascii', 'ignore')
+    return detect_exe_version(out, version_re, unrecognized)
+
+
+def detect_exe_version(output, version_re=None, unrecognized='present'):
+    assert isinstance(output, compat_str)
+    if version_re is None:
+        version_re = r'version\s+([-0-9._a-zA-Z]+)'
+    m = re.search(version_re, output)
      if m:
          return m.group(1)
      else:
@@ -1337,7 +1423,7 @@ def escape_rfc3986(s):
      """Escape non-ASCII characters as suggested by RFC 3986"""
      if sys.version_info < (3, 0) and isinstance(s, unicode):
          s = s.encode('utf-8')
-    return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]")
+    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
  
  
  def escape_url(url):
@@ -1351,7 +1437,7 @@ def escape_url(url):
      ).geturl()
  
  try:
-    struct.pack(u'!I', 0)
+    struct.pack('!I', 0)
  except TypeError:
      # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
      def struct_pack(spec, *args):
@@ -1372,7 +1458,7 @@ def read_batch_urls(batch_fd):
      def fixup(url):
          if not isinstance(url, compat_str):
              url = url.decode('utf-8', 'replace')
-        BOM_UTF8 = u'\xef\xbb\xbf'
+        BOM_UTF8 = '\xef\xbb\xbf'
          if url.startswith(BOM_UTF8):
              url = url[len(BOM_UTF8):]
          url = url.strip()
@@ -1428,7 +1514,8 @@ def parse_age_limit(s):
  
  
  def strip_jsonp(code):
-    return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code)
+    return re.sub(
+        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
  
  
  def js_to_json(code):
@@ -1480,7 +1567,7 @@ def limit_length(s, length):
  
  
  def version_tuple(v):
-    return [int(e) for e in v.split('.')]
+    return tuple(int(e) for e in re.split(r'[-.]', v))
  
  
  def is_outdated_version(version, limit, assume_new=True):
@@ -1490,3 +1577,85 @@ def is_outdated_version(version, limit, assume_new=True):
          return version_tuple(version) < version_tuple(limit)
      except ValueError:
          return not assume_new
+
+
+def ytdl_is_updateable():
+    """ Returns if youtube-dl can be updated with -U """
+    from zipimport import zipimporter
+
+    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
+
+
+def args_to_str(args):
+    # Get a short string representation for a subprocess command
+    return ' '.join(shlex_quote(a) for a in args)
+
+
+def urlhandle_detect_ext(url_handle):
+    try:
+        url_handle.headers
+        getheader = lambda h: url_handle.headers[h]
+    except AttributeError:  # Python < 3
+        getheader = url_handle.info().getheader
+
+    cd = getheader('Content-Disposition')
+    if cd:
+        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
+        if m:
+            e = determine_ext(m.group('filename'), default_ext=None)
+            if e:
+                return e
+
+    return getheader('Content-Type').split("/")[1]
+
+
+def age_restricted(content_limit, age_limit):
+    """ Returns True iff the content should be blocked """
+
+    if age_limit is None:  # No limit set
+        return False
+    if content_limit is None:
+        return False  # Content available for everyone
+    return age_limit < content_limit
+
+
+def is_html(first_bytes):
+    """ Detect whether a file contains HTML by examining its first bytes. """
+
+    BOMS = [
+        (b'\xef\xbb\xbf', 'utf-8'),
+        (b'\x00\x00\xfe\xff', 'utf-32-be'),
+        (b'\xff\xfe\x00\x00', 'utf-32-le'),
+        (b'\xff\xfe', 'utf-16-le'),
+        (b'\xfe\xff', 'utf-16-be'),
+    ]
+    for bom, enc in BOMS:
+        if first_bytes.startswith(bom):
+            s = first_bytes[len(bom):].decode(enc, 'replace')
+            break
+    else:
+        s = first_bytes.decode('utf-8', 'replace')
+
+    return re.match(r'^\s*<', s)
+
+
+def determine_protocol(info_dict):
+    protocol = info_dict.get('protocol')
+    if protocol is not None:
+        return protocol
+
+    url = info_dict['url']
+    if url.startswith('rtmp'):
+        return 'rtmp'
+    elif url.startswith('mms'):
+        return 'mms'
+    elif url.startswith('rtsp'):
+        return 'rtsp'
+
+    ext = determine_ext(url)
+    if ext == 'm3u8':
+        return 'm3u8'
+    elif ext == 'f4m':
+        return 'f4m'
+
+    return compat_urllib_parse_urlparse(url).scheme