[utils] Encode hostnames before passing to urllib

[youtube-dl] / youtube_dl / utils.py
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index ec186918cd8672ada2da2d5521e0ba8b22eb273d..03bb7782f492daa247c6ababba6c975d5548d75e 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -50,6 +50,7 @@ from .compat import (
      compat_urllib_parse_urlparse,
      compat_urllib_request,
      compat_urlparse,
+    compat_xpath,
      shlex_quote,
  )
  
@@ -165,12 +166,7 @@ if sys.version_info >= (2, 7):
          return node.find(expr)
  else:
      def find_xpath_attr(node, xpath, key, val=None):
-        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
-        # .//node does not match if a node is a direct child of . !
-        if isinstance(xpath, compat_str):
-            xpath = xpath.encode('ascii')
-
-        for f in node.findall(xpath):
+        for f in node.findall(compat_xpath(xpath)):
              if key not in f.attrib:
                  continue
              if val is None or f.attrib.get(key) == val:
@@ -195,9 +191,7 @@ def xpath_with_ns(path, ns_map):
  
  def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
      def _find_xpath(xpath):
-        if sys.version_info < (2, 7):  # Crazy 2.6
-            xpath = xpath.encode('ascii')
-        return node.find(xpath)
+        return node.find(compat_xpath(xpath))
  
      if isinstance(xpath, (str, compat_str)):
          n = _find_xpath(xpath)
@@ -273,15 +267,17 @@ def get_element_by_attribute(attribute, value, html):
  
      return unescapeHTML(res)
  
+
  class HTMLAttributeParser(compat_HTMLParser):
      """Trivial HTML parser to gather the attributes for a single element"""
      def __init__(self):
-        self.attrs = { }
+        self.attrs = {}
          compat_HTMLParser.__init__(self)
  
      def handle_starttag(self, tag, attrs):
          self.attrs = dict(attrs)
  
+
  def extract_attributes(html_element):
      """Given a string for an HTML element such as
      <el
@@ -303,6 +299,7 @@ def extract_attributes(html_element):
      parser.close()
      return parser.attrs
  
+
  def clean_html(html):
      """Clean an HTML snippet into a readable string"""
  
@@ -1349,7 +1346,7 @@ def format_bytes(bytes):
  def lookup_unit_table(unit_table, s):
      units_re = '|'.join(re.escape(u) for u in unit_table)
      m = re.match(
-        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
+        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
      if not m:
          return None
      num_str = m.group('num').replace(',', '.')
@@ -1749,6 +1746,7 @@ def escape_url(url):
      """Escape URL as suggested by RFC 3986"""
      url_parsed = compat_urllib_parse_urlparse(url)
      return url_parsed._replace(
+        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
          path=escape_rfc3986(url_parsed.path),
          params=escape_rfc3986(url_parsed.params),
          query=escape_rfc3986(url_parsed.query),