[core] Decode environment variables with filesystem encoding (Fixes #3854, Fixes...

[youtube-dl] / youtube_dl / utils.py
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 9124c36212481b860b92d4bc35676e4ab3453bd4..afe32ae0582824c0d02b965fb008c83f5c1cc044 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -203,6 +203,48 @@ def compat_ord(c):
      if type(c) is int: return c
      else: return ord(c)
  
+
+# Environment variables should be decoded with the filesystem encoding;
+# otherwise this results in issues like #3854, #2918 and #3217
+if sys.version_info >= (3, 0):
+    # On Python 3 the standard functions are used unchanged
+    compat_getenv = os.getenv
+    compat_expanduser = os.path.expanduser
+else:
+    def compat_getenv(key, default=None):
+        """Return environment variable key decoded with the filesystem encoding.
+
+        os.getenv(key, default) is looked up first; a non-empty result
+        (including a non-empty default) is decoded, while None and empty
+        values are returned unchanged."""
+        env = os.getenv(key, default)
+        if env:
+            env = env.decode(get_filesystem_encoding())
+        return env
+
+    def compat_expanduser(path):
+        """Expand ~ and ~user constructs.
+
+        If user or $HOME is unknown, do nothing."""
+        if path[:1] != '~':
+            return path
+        # Find the end of the leading "~" / "~user" component
+        i, n = 1, len(path)
+        while i < n and path[i] not in '/\\':
+            i += 1
+
+        if 'HOME' in os.environ:
+            userhome = compat_getenv('HOME')
+        elif 'USERPROFILE' in os.environ:
+            userhome = compat_getenv('USERPROFILE')
+        elif 'HOMEPATH' not in os.environ:
+            return path
+        else:
+            # compat_getenv() returns None instead of raising KeyError when
+            # HOMEDRIVE is unset, so fall back to an empty drive explicitly
+            drive = compat_getenv('HOMEDRIVE') or ''
+            userhome = os.path.join(drive, compat_getenv('HOMEPATH'))
+
+        if i != 1:  # ~user
+            userhome = os.path.join(os.path.dirname(userhome), path[1:i])
+
+        return userhome + path[i:]
+
+
  # This is not clearly defined otherwise
  compiled_regex_type = type(re.compile(''))
  
@@ -280,6 +322,11 @@ if sys.version_info >= (2, 7):
          return node.find(expr)
  else:
      def find_xpath_attr(node, xpath, key, val):
+        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
+        # .//node does not match if a node is a direct child of . !
+        if isinstance(xpath, unicode):
+            xpath = xpath.encode('ascii')
+
          for f in node.findall(xpath):
              if f.attrib.get(key) == val:
                  return f
@@ -299,6 +346,20 @@ def xpath_with_ns(path, ns_map):
      return '/'.join(replaced)
  
  
+def xpath_text(node, xpath, name=None, fatal=False):
+    """Return the text of the element found by node.find(xpath).
+
+    When nothing is found, None is returned unless fatal is true, in
+    which case ExtractorError is raised; name (defaulting to the xpath
+    itself) labels the element in the error message."""
+    if sys.version_info < (2, 7):  # Crazy 2.6
+        xpath = xpath.encode('ascii')
+
+    match = node.find(xpath)
+    if match is not None:
+        return match.text
+    if not fatal:
+        return None
+    label = name if name is not None else xpath
+    raise ExtractorError('Could not find XML element %s' % label)
+
+
  compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
  class BaseHTMLParser(compat_html_parser.HTMLParser):
      def __init(self):
@@ -780,6 +841,12 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
                  del req.headers['User-agent']
              req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
              del req.headers['Youtubedl-user-agent']
+
+        if sys.version_info < (2, 7) and '#' in req.get_full_url():
+            # Python 2.6 is brain-dead when it comes to fragments
+            req._Request__original = req._Request__original.partition('#')[0]
+            req._Request__r_type = req._Request__r_type.partition('#')[0]
+
          return req
  
      def http_response(self, req, resp):
@@ -865,6 +932,7 @@ def unified_strdate(date_str):
          '%d/%m/%Y',
          '%d/%m/%y',
          '%Y/%m/%d %H:%M:%S',
+        '%d/%m/%Y %H:%M:%S',
          '%Y-%m-%d %H:%M:%S',
          '%d.%m.%Y %H:%M',
          '%d.%m.%Y %H.%M',
@@ -1178,11 +1246,14 @@ class locked_file(object):
          return self.f.read(*args)
  
  
+def get_filesystem_encoding():
+    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None."""
+    fs_encoding = sys.getfilesystemencoding()
+    if fs_encoding is None:
+        return 'utf-8'
+    return fs_encoding
+
+
  def shell_quote(args):
      quoted_args = []
-    encoding = sys.getfilesystemencoding()
-    if encoding is None:
-        encoding = 'utf-8'
+    encoding = get_filesystem_encoding()
      for a in args:
          if isinstance(a, bytes):
              # We may get a filename encoded with 'encodeFilename'
@@ -1232,7 +1303,7 @@ def format_bytes(bytes):
  
  
  def get_term_width():
-    columns = os.environ.get('COLUMNS', None)
+    columns = compat_getenv('COLUMNS', None)
      if columns:
          return int(columns)
  
@@ -1365,14 +1436,16 @@ def check_executable(exe, args=[]):
  
  
  class PagedList(object):
-    def __init__(self, pagefunc, pagesize):
-        self._pagefunc = pagefunc
-        self._pagesize = pagesize
-
      def __len__(self):
          # This is only useful for tests
          return len(self.getslice())
  
+
+class OnDemandPagedList(PagedList):
+    def __init__(self, pagefunc, pagesize):
+        self._pagefunc = pagefunc
+        self._pagesize = pagesize
+
      def getslice(self, start=0, end=None):
          res = []
          for pagenum in itertools.count(start // self._pagesize):
@@ -1411,6 +1484,35 @@ class PagedList(object):
          return res
  
  
+class InAdvancePagedList(PagedList):
+    """Paged list whose number of pages is given up front.
+
+    pagefunc(pagenum) is called with a zero-based page number and must
+    return an iterable with the entries of that page."""
+
+    def __init__(self, pagefunc, pagecount, pagesize):
+        self._pagefunc = pagefunc
+        self._pagecount = pagecount
+        self._pagesize = pagesize
+
+    def getslice(self, start=0, end=None):
+        """Return the entries from index start up to, but excluding, end.
+
+        With end=None all pages from the one containing start through the
+        last page are fetched."""
+        res = []
+        start_page = start // self._pagesize
+        # Never request pages beyond the known page count, even when end
+        # points past the last entry
+        end_page = (
+            self._pagecount if end is None
+            else min(self._pagecount, end // self._pagesize + 1))
+        skip_elems = start - start_page * self._pagesize
+        only_more = None if end is None else end - start
+        for pagenum in range(start_page, end_page):
+            page = list(self._pagefunc(pagenum))
+            if skip_elems:
+                # Only the first fetched page has leading entries to drop
+                page = page[skip_elems:]
+                skip_elems = None
+            if only_more is not None:
+                if len(page) < only_more:
+                    only_more -= len(page)
+                else:
+                    # This page completes the requested slice
+                    res.extend(page[:only_more])
+                    break
+            res.extend(page)
+        return res
+
+
  def uppercase_escape(s):
      unicode_escape = codecs.getdecoder('unicode_escape')
      return re.sub(
@@ -1423,7 +1525,7 @@ def escape_rfc3986(s):
      """Escape non-ASCII characters as suggested by RFC 3986"""
      if sys.version_info < (3, 0) and isinstance(s, unicode):
          s = s.encode('utf-8')
-    return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]") #"%/;:@&=+$,!~*'()?#[]+"   #?#[]+
+    return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]")
  
  
  def escape_url(url):
@@ -1570,3 +1672,13 @@ except AttributeError:
          if ret:
              raise subprocess.CalledProcessError(ret, p.args, output=output)
          return output
+
+
+def limit_length(s, length):
+    """ Add ellipses to overly long strings
+
+    Strings longer than length are cut to length - 3 characters and
+    '...' is appended; None and shorter strings are returned unchanged. """
+    ELLIPSES = '...'
+    if s is None or len(s) <= length:
+        return s
+    return s[:length - len(ELLIPSES)] + ELLIPSES