[collegehumor] Encode the xml before calling xml.etree.ElementTree.fromstring (fixes...
author Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Sun, 24 Nov 2013 13:59:19 +0000 (14:59 +0100)
committer Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Sun, 24 Nov 2013 13:59:19 +0000 (14:59 +0100)
Uses a new helper method in InfoExtractor: _download_xml

youtube_dl/extractor/collegehumor.py
youtube_dl/extractor/common.py

index 0c29acfb13eafebe226e23f603f9f18d13a33304..b27c1dfc52401f3c148d48d2b2897d2b06db3834 100644 (file)
@@ -1,5 +1,4 @@
 import re
-import xml.etree.ElementTree
 
 from .common import InfoExtractor
 from ..utils import (
@@ -46,11 +45,10 @@ class CollegeHumorIE(InfoExtractor):
 
         self.report_extraction(video_id)
         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
-        metaXml = self._download_webpage(xmlUrl, video_id,
+        mdoc = self._download_xml(xmlUrl, video_id,
                                          u'Downloading info XML',
                                          u'Unable to download video info XML')
 
-        mdoc = xml.etree.ElementTree.fromstring(metaXml)
         try:
             videoNode = mdoc.findall('./video')[0]
             youtubeIdNode = videoNode.find('./youtubeID')
@@ -65,11 +63,10 @@ class CollegeHumorIE(InfoExtractor):
 
         if next_url.endswith(u'manifest.f4m'):
             manifest_url = next_url + '?hdcore=2.10.3'
-            manifestXml = self._download_webpage(manifest_url, video_id,
+            adoc = self._download_xml(manifest_url, video_id,
                                          u'Downloading XML manifest',
                                          u'Unable to download video info XML')
 
-            adoc = xml.etree.ElementTree.fromstring(manifestXml)
             try:
                 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
             except IndexError:
index 3cebeaf29883875ca30e45c32234d83148b45caa..482a231ec0dc4632daabb4526253dae89bea64f8 100644 (file)
@@ -4,6 +4,7 @@ import re
 import socket
 import sys
 import netrc
+import xml.etree.ElementTree
 
 from ..utils import (
     compat_http_client,
@@ -208,6 +209,11 @@ class InfoExtractor(object):
         """ Returns the data of the page as a string """
         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 
+    def _download_xml(self, url_or_request, video_id, note=u'Downloading XML', errnote=u'Unable to download XML'):
+        """Download url_or_request via _download_webpage and return it parsed as an xml.etree.ElementTree.Element"""
+        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
+        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))  # parse UTF-8 bytes, not the text string from _download_webpage
+
     def to_screen(self, msg):
         """Print msg to screen, prefixing it with '[ie_name]'"""
         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))