Use `_download_xml` in more extractors
author Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Tue, 10 Dec 2013 20:03:53 +0000 (21:03 +0100)
committer Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Tue, 10 Dec 2013 20:03:53 +0000 (21:03 +0100)
youtube_dl/extractor/appletrailers.py
youtube_dl/extractor/clipsyndicate.py
youtube_dl/extractor/metacritic.py
youtube_dl/utils.py

index a527f10de250596e42f19f0957433e2a72fe5bbf..ef5644aa54fe28002dc4d8c76308941c264252e3 100644 (file)
@@ -1,5 +1,4 @@
 import re
-import xml.etree.ElementTree
 import json
 
 from .common import InfoExtractor
@@ -65,18 +64,18 @@ class AppleTrailersIE(InfoExtractor):
         uploader_id = mobj.group('company')
 
         playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc')
-        playlist_snippet = self._download_webpage(playlist_url, movie)
-        playlist_cleaned = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', playlist_snippet)
-        playlist_cleaned = re.sub(r'<img ([^<]*?)>', r'<img \1/>', playlist_cleaned)
-        # The ' in the onClick attributes are not escaped, it couldn't be parsed
-        # with xml.etree.ElementTree.fromstring
-        # like: http://trailers.apple.com/trailers/wb/gravity/
-        def _clean_json(m):
-            return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
-        playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned)
-        playlist_html = u'<html>' + playlist_cleaned + u'</html>'
+        def fix_html(s):
+            s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s)
+            s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s)
+            # The ' in the onClick attributes are not escaped, so the page could not
+            # be parsed as XML, e.g. http://trailers.apple.com/trailers/wb/gravity/
+            def _clean_json(m):
+                return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
+            s = re.sub(self._JSON_RE, _clean_json, s)
+            s = u'<html>' + s + u'</html>'
+            return s
+        doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
 
-        doc = xml.etree.ElementTree.fromstring(playlist_html)
         playlist = []
         for li in doc.findall('./div/ul/li'):
             on_click = li.find('.//a').attrib['onClick']
index d4fc869732a8ae15e60f0963ef3418abac1a9201..c60089ad353274adaa380671cee9d4e3ce2e2718 100644 (file)
@@ -1,9 +1,9 @@
 import re
-import xml.etree.ElementTree
 
 from .common import InfoExtractor
 from ..utils import (
     find_xpath_attr,
+    fix_xml_all_ampersand,
 )
 
 
@@ -30,12 +30,10 @@ class ClipsyndicateIE(InfoExtractor):
         # it includes a required token
         flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars')
 
-        playlist_page = self._download_webpage(
+        pdoc = self._download_xml(
             'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
-            video_id, u'Downloading video info') 
-        # Fix broken xml
-        playlist_page = re.sub('&', '&amp;', playlist_page)
-        pdoc = xml.etree.ElementTree.fromstring(playlist_page.encode('utf-8'))
+            video_id, u'Downloading video info',
+            transform_source=fix_xml_all_ampersand) 
 
         track_doc = pdoc.find('trackList/track')
         def find_param(name):
index 6b95b4998852ac61d1061e0dcf6c3f442772fee2..e560c1d354d8b03a05133bf1458ce8d28b84b7bc 100644 (file)
@@ -1,8 +1,10 @@
 import re
-import xml.etree.ElementTree
 import operator
 
 from .common import InfoExtractor
+from ..utils import (
+    fix_xml_all_ampersand,
+)
 
 
 class MetacriticIE(InfoExtractor):
@@ -23,9 +25,8 @@ class MetacriticIE(InfoExtractor):
         video_id = mobj.group('id')
         webpage = self._download_webpage(url, video_id)
         # The xml is not well formatted, there are raw '&'
-        info_xml = self._download_webpage('http://www.metacritic.com/video_data?video=' + video_id,
-            video_id, u'Downloading info xml').replace('&', '&amp;')
-        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+        info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id,
+            video_id, u'Downloading info xml', transform_source=fix_xml_all_ampersand)
 
         clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)
         formats = []
index 0dab9fcc5d7ad8b5d0fcae42b38bf5ac26acdb64..4593488ce5e30bb69a19c56cc4c84a581d3d17f6 100644 (file)
@@ -1057,3 +1057,8 @@ def month_by_name(name):
         return ENGLISH_NAMES.index(name) + 1
     except ValueError:
         return None
+
+
+def fix_xml_all_ampersand(xml_str):
+    """Replace every '&' with '&amp;' in the XML string"""
+    return xml_str.replace(u'&', u'&amp;')