Don't accept '>' inside the content attribute in OpenGraph regexes
authorJaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Fri, 15 Nov 2013 11:54:13 +0000 (12:54 +0100)
committerJaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Fri, 15 Nov 2013 11:54:13 +0000 (12:54 +0100)
youtube_dl/extractor/common.py

index e021768528f9857efc0772db5f844dd54d21a616..45dd01789b786da425545a528edc4838cb91de19 100644 (file)
@@ -316,10 +316,12 @@ class InfoExtractor(object):
     # Helper functions for extracting OpenGraph info
     @staticmethod
     def _og_regexes(prop):
-        esc_prop = re.escape(prop)
+        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
+        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
+        template = r'<meta[^>]+?%s[^>]+?%s'
         return [
-            r'<meta[^>]+?property=[\'"]og:%s[\'"][^>]+?content=(?:"(.+?)"|\'(.+?)\')' % esc_prop,
-            r'<meta[^>]+?content=(?:"(.+?)"|\'(.+?)\')[^>]+?property=[\'"]og:%s[\'"]' % esc_prop,
+            template % (property_re, content_re),
+            template % (content_re, property_re),
         ]
 
     def _og_search_property(self, prop, html, name=None, **kargs):