[extractor/common] Allow angle brackets in attributes in _og_regexes (#7215)
author Sergey M․ <dstftw@gmail.com>
Sun, 18 Oct 2015 03:11:02 +0000 (09:11 +0600)
committer Sergey M․ <dstftw@gmail.com>
Sun, 18 Oct 2015 03:11:02 +0000 (09:11 +0600)
test/test_InfoExtractor.py
youtube_dl/extractor/common.py

index 2a00d09a5a1f6e666d4e4fdd4c837495045a24b4..938466a800122211ab0414d9aa9de831951e2903 100644 (file)
@@ -37,12 +37,16 @@ class TestInfoExtractor(unittest.TestCase):
             <meta property='og:image' content='http://domain.com/pic.jpg?key1=val1&amp;key2=val2'/>
             <meta content='application/x-shockwave-flash' property='og:video:type'>
             <meta content='Foo' property=og:foobar>
+            <meta name="og:test1" content='foo > < bar'/>
+            <meta name="og:test2" content="foo >//< bar"/>
             '''
         self.assertEqual(ie._og_search_title(html), 'Foo')
         self.assertEqual(ie._og_search_description(html), 'Some video\'s description ')
         self.assertEqual(ie._og_search_thumbnail(html), 'http://domain.com/pic.jpg?key1=val1&key2=val2')
         self.assertEqual(ie._og_search_video_url(html, default=None), None)
         self.assertEqual(ie._og_search_property('foobar', html), 'Foo')
+        self.assertEqual(ie._og_search_property('test1', html), 'foo > < bar')
+        self.assertEqual(ie._og_search_property('test2', html), 'foo >//< bar')
 
     def test_html_search_meta(self):
         ie = self.ie
index a0c4af92f2aa284801fcb59b459285dce6933336..4365077f1379e49bc96145e28df4d47f58b91ecb 100644 (file)
@@ -645,7 +645,7 @@ class InfoExtractor(object):
     # Helper functions for extracting OpenGraph info
     @staticmethod
     def _og_regexes(prop):
-        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\'|\s*([^\s"\'=<>`]+?))'
+        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
                        % {'prop': re.escape(prop)})
         template = r'<meta[^>]+?%s[^>]+?%s'