[utils] Escape all HTML entities written in hexadecimal form
[youtube-dl] / youtube_dl / utils.py
index c3135effc18d0df51982787933c06ceaa851e666..245d623d86df365a562fb395bc792b42f68634eb 100644 (file)
@@ -326,6 +326,13 @@ def sanitize_path(s):
     return os.path.join(*sanitized_path)
 
 
+def sanitize_url_path_consecutive_slashes(url):
+    """Collapses consecutive slashes in URLs' path"""
+    parsed_url = list(compat_urlparse.urlparse(url))
+    parsed_url[2] = re.sub(r'/{2,}', '/', parsed_url[2])
+    return compat_urlparse.urlunparse(parsed_url)
+
+
 def orderedSet(iterable):
     """ Remove all duplicates from the input iterable """
     res = []
@@ -341,7 +348,7 @@ def _htmlentity_transform(entity):
     if entity in compat_html_entities.name2codepoint:
         return compat_chr(compat_html_entities.name2codepoint[entity])
 
-    mobj = re.match(r'#(x?[0-9]+)', entity)
+    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
     if mobj is not None:
         numstr = mobj.group(1)
         if numstr.startswith('x'):