[utils] Escape all HTML entities written in hexadecimal form
author Naglis Jonaitis <njonaitis@gmail.com>
Thu, 26 Mar 2015 15:15:27 +0000 (17:15 +0200)
committer Naglis Jonaitis <njonaitis@gmail.com>
Thu, 26 Mar 2015 15:15:27 +0000 (17:15 +0200)
test/test_utils.py
youtube_dl/utils.py

index a8ab876850d3645eb615d552162128befdac316d..abaf1ab7331eab044c684b592bd5bfb85f4d39e0 100644 (file)
@@ -200,6 +200,8 @@ class TestUtil(unittest.TestCase):
 
     def test_unescape_html(self):
         self.assertEqual(unescapeHTML('%20;'), '%20;')
+        self.assertEqual(unescapeHTML('&#x2F;'), '/')
+        self.assertEqual(unescapeHTML('&#47;'), '/')
         self.assertEqual(
             unescapeHTML('&eacute;'), 'é')
 
index 472d4df41fda2cb1ffd0392cc4da0f4bdcc2a48a..245d623d86df365a562fb395bc792b42f68634eb 100644 (file)
@@ -348,7 +348,7 @@ def _htmlentity_transform(entity):
     if entity in compat_html_entities.name2codepoint:
         return compat_chr(compat_html_entities.name2codepoint[entity])
 
-    mobj = re.match(r'#(x?[0-9]+)', entity)
+    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
     if mobj is not None:
         numstr = mobj.group(1)
         if numstr.startswith('x'):