[utils] Escape all HTML entities written in hexadecimal form

[youtube-dl] / youtube_dl / utils.py
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index c3135effc18d0df51982787933c06ceaa851e666..245d623d86df365a562fb395bc792b42f68634eb 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -326,6 +326,13 @@ def sanitize_path(s):
      return os.path.join(*sanitized_path)
  
  
+def sanitize_url_path_consecutive_slashes(url):
+    """Collapses consecutive slashes in URLs' path"""
+    parsed_url = list(compat_urlparse.urlparse(url))
+    parsed_url[2] = re.sub(r'/{2,}', '/', parsed_url[2])
+    return compat_urlparse.urlunparse(parsed_url)
+
+
  def orderedSet(iterable):
      """ Remove all duplicates from the input iterable """
      res = []
@@ -341,7 +348,7 @@ def _htmlentity_transform(entity):
      if entity in compat_html_entities.name2codepoint:
          return compat_chr(compat_html_entities.name2codepoint[entity])
  
-    mobj = re.match(r'#(x?[0-9]+)', entity)
+    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
      if mobj is not None:
          numstr = mobj.group(1)
          if numstr.startswith('x'):