Correct XML ampersand fixup

author Philipp Hagemeister <phihag@phihag.de>

Mon, 20 Jan 2014 21:11:34 +0000 (22:11 +0100)

committer Philipp Hagemeister <phihag@phihag.de>

Mon, 20 Jan 2014 21:11:34 +0000 (22:11 +0100)
author Philipp Hagemeister <phihag@phihag.de>
Mon, 20 Jan 2014 21:11:34 +0000 (22:11 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Mon, 20 Jan 2014 21:11:34 +0000 (22:11 +0100)
diff --git a/test/test_utils.py b/test/test_utils.py

index bee355ee0e0605a5134dc37b8556e9e233728902..a17483ada829345e8e96f23dbdeaeea7a5451294 100644 (file)
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -16,6 +16,7 @@ from youtube_dl.utils import (
      DateRange,
      encodeFilename,
      find_xpath_attr,
+    fix_xml_ampersands,
      get_meta_content,
      orderedSet,
      parse_duration,
@@ -200,5 +201,18 @@ class TestUtil(unittest.TestCase):
          self.assertEqual(parse_duration('9:12:43'), 33163)
          self.assertEqual(parse_duration('x:y'), None)
  
+    def test_fix_xml_ampersands(self):
+        self.assertEqual(
+            fix_xml_ampersands('"&x=y&z=a'), '"&amp;x=y&amp;z=a')
+        self.assertEqual(
+            fix_xml_ampersands('"&amp;x=y&wrong;&z=a'),
+            '"&amp;x=y&amp;wrong;&amp;z=a')
+        self.assertEqual(
+            fix_xml_ampersands('&amp;&apos;&gt;&lt;&quot;'),
+            '&amp;&apos;&gt;&lt;&quot;')
+        self.assertEqual(
+            fix_xml_ampersands('&#1234;&#x1abC;'), '&#1234;&#x1abC;')
+        self.assertEqual(fix_xml_ampersands('&#&#'), '&amp;#&amp;#')
+
  if __name__ == '__main__':
      unittest.main()
diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py

index c60089ad353274adaa380671cee9d4e3ce2e2718..9ab6a4ab69726c5c2a7ad0df7de5933f1f882d33 100644 (file)
--- a/youtube_dl/extractor/clipsyndicate.py
+++ b/youtube_dl/extractor/clipsyndicate.py
@@ -3,7 +3,7 @@ import re
  from .common import InfoExtractor
  from ..utils import (
      find_xpath_attr,
-    fix_xml_all_ampersand,
+    fix_xml_ampersands
  )
  
  
@@ -33,7 +33,7 @@ class ClipsyndicateIE(InfoExtractor):
          pdoc = self._download_xml(
              'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
              video_id, u'Downloading video info',
-            transform_source=fix_xml_all_ampersand) 
+            transform_source=fix_xml_ampersands)
  
          track_doc = pdoc.find('trackList/track')
          def find_param(name):
diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py

index f3ff0e8bb47ac3307d52eeb10ad1b5449c6fb8e3..465ac4916a4596e247f957cac636522227da7f78 100644 (file)
--- a/youtube_dl/extractor/metacritic.py
+++ b/youtube_dl/extractor/metacritic.py
@@ -4,7 +4,7 @@ import re
  
  from .common import InfoExtractor
  from ..utils import (
-    fix_xml_all_ampersand,
+    fix_xml_ampersands,
  )
  
  
@@ -27,7 +27,7 @@ class MetacriticIE(InfoExtractor):
          webpage = self._download_webpage(url, video_id)
          # The xml is not well formatted, there are raw '&'
          info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id,
-            video_id, 'Downloading info xml', transform_source=fix_xml_all_ampersand)
+            video_id, 'Downloading info xml', transform_source=fix_xml_ampersands)
  
          clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)
          formats = []
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py

index f1cf41e2dbf2012764fdb0f2e1745c07ecdef055..c4fa16fb6de697498f9025bee0afd35ff7e20a5d 100644 (file)
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -5,6 +5,7 @@ from .common import InfoExtractor
  from ..utils import (
      compat_urllib_parse,
      ExtractorError,
+    fix_xml_ampersands,
  )
  
  def _media_xml_tag(tag):
@@ -83,12 +84,9 @@ class MTVServicesInfoExtractor(InfoExtractor):
          video_id = self._id_from_uri(uri)
          data = compat_urllib_parse.urlencode({'uri': uri})
  
-        def fix_ampersand(s):
-            """ Fix unencoded ampersand in XML """
-            return s.replace(u'& ', '&amp; ')
          idoc = self._download_xml(
              self._FEED_URL + '?' + data, video_id,
-            u'Downloading info', transform_source=fix_ampersand)
+            u'Downloading info', transform_source=fix_xml_ampersands)
          return [self._get_video_info(item) for item in idoc.findall('.//item')]
  
  
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 73fe1ad0a3a27165d3dffc61927733beb9c5ed33..70f28414981111d63fb74c00f3ea554702042ea7 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1092,9 +1092,12 @@ def month_by_name(name):
          return None
  
  
-def fix_xml_all_ampersand(xml_str):
+def fix_xml_ampersands(xml_str):
      """Replace all the '&' by '&amp;' in XML"""
-    return xml_str.replace(u'&', u'&amp;')
+    return re.sub(
+        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
+        u'&amp;',
+        xml_str)
  
  
  def setproctitle(title):
author	Philipp Hagemeister <phihag@phihag.de>
	Mon, 20 Jan 2014 21:11:34 +0000 (22:11 +0100)
committer	Philipp Hagemeister <phihag@phihag.de>
	Mon, 20 Jan 2014 21:11:34 +0000 (22:11 +0100)
test/test_utils.py		patch | blob | history
youtube_dl/extractor/clipsyndicate.py		patch | blob | history
youtube_dl/extractor/metacritic.py		patch | blob | history
youtube_dl/extractor/mtv.py		patch | blob | history
youtube_dl/utils.py		patch | blob | history