Merge remote-tracking branch 'dstftw/escape-non-ascii-in-urls'
author Philipp Hagemeister <phihag@phihag.de>
Mon, 15 Sep 2014 13:40:10 +0000 (15:40 +0200)
committer Philipp Hagemeister <phihag@phihag.de>
Mon, 15 Sep 2014 13:40:10 +0000 (15:40 +0200)
Conflicts:
test/test_utils.py

1  2 
test/test_utils.py
youtube_dl/utils.py

diff --combined test/test_utils.py
index 70050d2b836aedfd7310bbee2c2f07037d70c71a,e90caed2961ba24689d135c2fc007ef565a26ae7..3efbed29dd34de570f2db4e6eb4954ec2f4b9c6e
@@@ -40,7 -40,8 +40,9 @@@ from youtube_dl.utils import 
      parse_iso8601,
      strip_jsonp,
      uppercase_escape,
 +    limit_length,
+     escape_rfc3986,
+     escape_url,
  )
  
  
@@@ -287,12 -288,34 +289,41 @@@ class TestUtil(unittest.TestCase)
          self.assertEqual(uppercase_escape('aä'), 'aä')
          self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐')
  
 +    def test_limit_length(self):
 +        self.assertEqual(limit_length(None, 12), None)
 +        self.assertEqual(limit_length('foo', 12), 'foo')
 +        self.assertTrue(
 +            limit_length('foo bar baz asd', 12).startswith('foo bar'))
 +        self.assertTrue('...' in limit_length('foo bar baz asd', 12))
 +
+     def test_escape_rfc3986(self):
+         reserved = "!*'();:@&=+$,/?#[]"
+         unreserved = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~'
+         self.assertEqual(escape_rfc3986(reserved), reserved)
+         self.assertEqual(escape_rfc3986(unreserved), unreserved)
+         self.assertEqual(escape_rfc3986('тест'), '%D1%82%D0%B5%D1%81%D1%82')
+         self.assertEqual(escape_rfc3986('%D1%82%D0%B5%D1%81%D1%82'), '%D1%82%D0%B5%D1%81%D1%82')
+         self.assertEqual(escape_rfc3986('foo bar'), 'foo%20bar')
+         self.assertEqual(escape_rfc3986('foo%20bar'), 'foo%20bar')
+     def test_escape_url(self):
+         self.assertEqual(
+             escape_url('http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavré_FD.mp4'),
+             'http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavre%CC%81_FD.mp4'
+         )
+         self.assertEqual(
+             escape_url('http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erklärt/Das-Erste/Video?documentId=22673108&bcastId=5290'),
+             'http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erkl%C3%A4rt/Das-Erste/Video?documentId=22673108&bcastId=5290'
+         )
+         self.assertEqual(
+             escape_url('http://тест.рф/фрагмент'),
+             'http://тест.рф/%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82'
+         )
+         self.assertEqual(
+             escape_url('http://тест.рф/абв?абв=абв#абв'),
+             'http://тест.рф/%D0%B0%D0%B1%D0%B2?%D0%B0%D0%B1%D0%B2=%D0%B0%D0%B1%D0%B2#%D0%B0%D0%B1%D0%B2'
+         )
+         self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0')
  if __name__ == '__main__':
      unittest.main()
diff --combined youtube_dl/utils.py
index 3ac0f1f541745b1ec34f1245574ea82387c3115d,e924b1688d1d7e61c3ef8aded9fab8d98a873394..b644f4e920bf0353658ec9920abdb0541dbaf0e2
@@@ -280,11 -280,6 +280,11 @@@ if sys.version_info >= (2, 7)
          return node.find(expr)
  else:
      def find_xpath_attr(node, xpath, key, val):
 +        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 +        # .//node does not match if a node is a direct child of . !
 +        if isinstance(xpath, unicode):
 +            xpath = xpath.encode('ascii')
 +
          for f in node.findall(xpath):
              if f.attrib.get(key) == val:
                  return f
@@@ -304,20 -299,6 +304,20 @@@ def xpath_with_ns(path, ns_map)
      return '/'.join(replaced)
  
  
 +def xpath_text(node, xpath, name=None, fatal=False):
 +    if sys.version_info < (2, 7):  # Crazy 2.6
 +        xpath = xpath.encode('ascii')
 +
 +    n = node.find(xpath)
 +    if n is None:
 +        if fatal:
 +            name = xpath if name is None else name
 +            raise ExtractorError('Could not find XML element %s' % name)
 +        else:
 +            return None
 +    return n.text
 +
 +
  compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
  class BaseHTMLParser(compat_html_parser.HTMLParser):
      def __init(self):
@@@ -1437,6 -1418,24 +1437,24 @@@ def uppercase_escape(s)
          lambda m: unicode_escape(m.group(0))[0],
          s)
  
+ def escape_rfc3986(s):
+     """Escape non-ASCII characters as suggested by RFC 3986"""
+     if sys.version_info < (3, 0) and isinstance(s, unicode):
+         s = s.encode('utf-8')
+     return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]")
+ def escape_url(url):
+     """Escape URL as suggested by RFC 3986"""
+     url_parsed = compat_urllib_parse_urlparse(url)
+     return url_parsed._replace(
+         path=escape_rfc3986(url_parsed.path),
+         params=escape_rfc3986(url_parsed.params),
+         query=escape_rfc3986(url_parsed.query),
+         fragment=escape_rfc3986(url_parsed.fragment)
+     ).geturl()
  try:
      struct.pack(u'!I', 0)
  except TypeError:
@@@ -1571,13 -1570,3 +1589,13 @@@ except AttributeError
          if ret:
              raise subprocess.CalledProcessError(ret, p.args, output=output)
          return output
 +
 +
 +def limit_length(s, length):
 +    """ Add ellipses to overly long strings """
 +    if s is None:
 +        return None
 +    ELLIPSES = '...'
 +    if len(s) > length:
 +        return s[:length - len(ELLIPSES)] + ELLIPSES
 +    return s