[mtv] Fixup incorrectly encoded XML documents
author Philipp Hagemeister <phihag@phihag.de>
Tue, 10 Dec 2013 11:45:22 +0000 (12:45 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Tue, 10 Dec 2013 11:45:22 +0000 (12:45 +0100)
youtube_dl/extractor/common.py
youtube_dl/extractor/mtv.py

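diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 534908a2b89af8db08f5d7ba0fc7f983c1199fa9..69a083b68aa3cee7d8ec2a6af9af7108a28bdf9b 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py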
@@ -230,9 +230,12 @@ class InfoExtractor(object):
             return content
 
     def _download_xml(self, url_or_request, video_id,
-                      note=u'Downloading XML', errnote=u'Unable to download XML'):
+                      note=u'Downloading XML', errnote=u'Unable to download XML',
+                      transform_source=None):
         """Return the xml as an xml.etree.ElementTree.Element"""
         xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
+        if transform_source:
+            xml_string = transform_source(xml_string)
         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 
     def to_screen(self, msg):
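diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index 6b3feb560768f96c4d5b3bb3adc0989ecf1c1d4f..5b2bd96334e2eb1a090db2260f08ab9cc1a4b882 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py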
@@ -82,8 +82,13 @@ class MTVServicesInfoExtractor(InfoExtractor):
     def _get_videos_info(self, uri):
         video_id = self._id_from_uri(uri)
         data = compat_urllib_parse.urlencode({'uri': uri})
-        idoc = self._download_xml(self._FEED_URL +'?' + data, video_id,
-                                         u'Downloading info')
+
+        def fix_ampersand(s):
+            """ Fix unencoded ampersand in XML """
+            return s.replace(u'& ', '&amp; ')
+        idoc = self._download_xml(
+            self._FEED_URL + '?' + data, video_id,
+            u'Downloading info', transform_source=fix_ampersand)
         return [self._get_video_info(item) for item in idoc.findall('.//item')]