[utils] Encode hostnames before passing to urllib
author Yen Chi Hsuan <yan12125@gmail.com>
Wed, 23 Mar 2016 14:24:52 +0000 (22:24 +0800)
committer Yen Chi Hsuan <yan12125@gmail.com>
Wed, 23 Mar 2016 14:24:52 +0000 (22:24 +0800)
With IDN (Internationalized Domain Name) and a proxy, non-ASCII URLs
are passed down to urllib/urllib2, causing UnicodeEncodeError.

Fixes #8890

test/test_http.py
youtube_dl/utils.py

index fc59b1aed6ddc2db10598a1a4b954a128e3d3133..15e0ad369d57966bef222bf35c422ad9bdb4e755 100644 (file)
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+# coding: utf-8
 from __future__ import unicode_literals
 
 # Allow direct execution
@@ -120,5 +121,14 @@ class TestProxy(unittest.TestCase):
         response = ydl.urlopen(req).read().decode('utf-8')
         self.assertEqual(response, 'cn: {0}'.format(url))
 
+    def test_proxy_with_idn(self):
+        ydl = YoutubeDL({
+            'proxy': 'localhost:{0}'.format(self.port),
+        })
+        url = 'http://中文.tw/'
+        response = ydl.urlopen(url).read().decode('utf-8')
+        # b'xn--fiq228c' is '中文'.encode('idna')
+        self.assertEqual(response, 'normal: http://xn--fiq228c.tw/')
+
 if __name__ == '__main__':
     unittest.main()
index 067b8a184c67ea152e3769bcdbcc63bf43afca77..03bb7782f492daa247c6ababba6c975d5548d75e 100644 (file)
@@ -1746,6 +1746,7 @@ def escape_url(url):
     """Escape URL as suggested by RFC 3986"""
     url_parsed = compat_urllib_parse_urlparse(url)
     return url_parsed._replace(
+        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
         path=escape_rfc3986(url_parsed.path),
         params=escape_rfc3986(url_parsed.params),
         query=escape_rfc3986(url_parsed.query),