Do not override stdlib html parser 'locatestarttagend' regex (fixes #4081)
authorJaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Sun, 2 Nov 2014 16:28:42 +0000 (17:28 +0100)
committerJaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Sun, 2 Nov 2014 18:31:06 +0000 (19:31 +0100)
'<a href="foo" ><img src="bar" / ></a>' wouldn't be parsed right (the problem is '/ >', '/>' worked fine).
We need to change it in python 2.6 (for example the description of youtube videos wouldn't be extracted).

youtube_dl/utils.py

index 233286de840983142f4a89a5ea2cd3db6507fcae..bdd637e4880b902c13e9bd30f99f74d47c0a86de 100644 (file)
@@ -152,7 +152,9 @@ def xpath_text(node, xpath, name=None, fatal=False):
     return n.text
 
 
-compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
+if sys.version_info < (2, 7):
+    compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
+
 class BaseHTMLParser(compat_html_parser.HTMLParser):
     def __init(self):
         compat_html_parser.HTMLParser.__init__(self)