[YoutubeDL] Escape non-ASCII characters in URLs
author Sergey M․ <dstftw@gmail.com>
Fri, 12 Sep 2014 16:20:17 +0000 (23:20 +0700)
committer Sergey M․ <dstftw@gmail.com>
Fri, 12 Sep 2014 16:20:17 +0000 (23:20 +0700)
urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
Work around this by replacing the request's original URL with an escaped one.

youtube_dl/YoutubeDL.py

index 553bf559b3b2e7b155b2e14b2f2f49eda91ba9e9..99cd05e6518a726a3b72e8f1d705d7733790ebe1 100755 (executable)
@@ -28,6 +28,7 @@ from .utils import (
     compat_str,
     compat_urllib_error,
     compat_urllib_request,
+    compat_urllib_parse_urlparse,
     ContentTooShortError,
     date_from_str,
     DateRange,
@@ -1241,6 +1242,31 @@ class YoutubeDL(object):
 
     def urlopen(self, req):
         """ Start an HTTP download """
+
+        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
+        # Working around by replacing request's original URL with escaped one
+
+        url = req if isinstance(req, compat_str) else req.get_full_url()
+
+        def escape(component):
+            return compat_cookiejar.escape_path(component.encode('utf-8'))
+
+        url_parsed = compat_urllib_parse_urlparse(url)
+        url_escaped = url_parsed._replace(
+            path=escape(url_parsed.path),
+            query=escape(url_parsed.query),
+            fragment=escape(url_parsed.fragment)
+        ).geturl()
+
+        # Substitute URL if any change after escaping
+        if url != url_escaped:
+            if isinstance(req, compat_str):
+                req = url_escaped
+            else:
+                req = compat_urllib_request.Request(
+                    url_escaped, data=req.data, headers=req.headers,
+                    origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
+
         return self._opener.open(req, timeout=self._socket_timeout)
 
     def print_debug_header(self):