hack for apparently broken parse_qs in python2
author Philipp Hagemeister <phihag@phihag.de>
Wed, 28 Nov 2012 01:01:09 +0000 (02:01 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Wed, 28 Nov 2012 01:01:09 +0000 (02:01 +0100)
youtube_dl/utils.py

index a5df62bf81ce0336ff4d641f8bcc93a1c27ff1c8..cf78e9dc843d7bfba008c92a1954add3dca633b4 100644 (file)
@@ -49,7 +49,81 @@ except ImportError: # Python 2
 try:
        from urllib.parse import parse_qs as compat_parse_qs
 except ImportError: # Python 2
-       from urlparse import parse_qs as compat_parse_qs
+       # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
+       # Python 2's version is apparently totally broken
+       def _unquote(string, encoding='utf-8', errors='replace'):
+               if string == '':
+                       return string
+               res = string.split('%')
+               if len(res) == 1:
+                       return string
+               if encoding is None:
+                       encoding = 'utf-8'
+               if errors is None:
+                       errors = 'replace'
+               # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
+               pct_sequence = b''
+               string = res[0]
+               for item in res[1:]:
+                       try:
+                               if not item:
+                                       raise ValueError
+                               pct_sequence += item[:2].decode('hex')
+                               rest = item[2:]
+                               if not rest:
+                                       # This segment was just a single percent-encoded character.
+                                       # May be part of a sequence of code units, so delay decoding.
+                                       # (Stored in pct_sequence).
+                                       continue
+                       except ValueError:
+                               rest = '%' + item
+                       # Encountered non-percent-encoded characters. Flush the current
+                       # pct_sequence.
+                       string += pct_sequence.decode(encoding, errors) + rest
+                       pct_sequence = b''
+               if pct_sequence:
+                       # Flush the final pct_sequence
+                       string += pct_sequence.decode(encoding, errors)
+               return string
+
+       def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
+                               encoding='utf-8', errors='replace'):
+               qs, _coerce_result = qs, unicode
+               pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
+               r = []
+               for name_value in pairs:
+                       if not name_value and not strict_parsing:
+                               continue
+                       nv = name_value.split('=', 1)
+                       if len(nv) != 2:
+                               if strict_parsing:
+                                       raise ValueError("bad query field: %r" % (name_value,))
+                               # Handle case of a control-name with no equal sign
+                               if keep_blank_values:
+                                       nv.append('')
+                               else:
+                                       continue
+                       if len(nv[1]) or keep_blank_values:
+                               name = nv[0].replace('+', ' ')
+                               name = _unquote(name, encoding=encoding, errors=errors)
+                               name = _coerce_result(name)
+                               value = nv[1].replace('+', ' ')
+                               value = _unquote(value, encoding=encoding, errors=errors)
+                               value = _coerce_result(value)
+                               r.append((name, value))
+               return r
+
+       def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
+                               encoding='utf-8', errors='replace'):
+               parsed_result = {}
+               pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
+                                               encoding=encoding, errors=errors)
+               for name, value in pairs:
+                       if name in parsed_result:
+                               parsed_result[name].append(value)
+                       else:
+                               parsed_result[name] = [value]
+               return parsed_result
 
 try:
        compat_str = unicode # Python 2