Merge pull request #7296 from jaimeMF/xml_attrib_unicode

author Sergey M <dstftw@gmail.com>

Sat, 31 Oct 2015 18:15:21 +0000 (18:15 +0000)

committer Sergey M <dstftw@gmail.com>

Sat, 31 Oct 2015 18:15:21 +0000 (18:15 +0000)
author Sergey M <dstftw@gmail.com>
Sat, 31 Oct 2015 18:15:21 +0000 (18:15 +0000)
committer Sergey M <dstftw@gmail.com>
Sat, 31 Oct 2015 18:15:21 +0000 (18:15 +0000)
diff --git a/test/test_compat.py b/test/test_compat.py

index 4ee0dc99d0791095d11cdfbdde94f30b52feca64..b6bfad05e3c85c07854cc00c337a12caf493e849 100644 (file)
--- a/test/test_compat.py
+++ b/test/test_compat.py
@@ -13,8 +13,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  from youtube_dl.utils import get_filesystem_encoding
  from youtube_dl.compat import (
      compat_getenv,
+    compat_etree_fromstring,
      compat_expanduser,
      compat_shlex_split,
+    compat_str,
      compat_urllib_parse_unquote,
      compat_urllib_parse_unquote_plus,
  )
@@ -71,5 +73,20 @@ class TestCompat(unittest.TestCase):
      def test_compat_shlex_split(self):
          self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two'])
  
+    def test_compat_etree_fromstring(self):
+        xml = '''
+            <root foo="bar" spam="中文">
+                <normal>foo</normal>
+                <chinese>中文</chinese>
+                <foo><bar>spam</bar></foo>
+            </root>
+        '''
+        doc = compat_etree_fromstring(xml.encode('utf-8'))
+        self.assertTrue(isinstance(doc.attrib['foo'], compat_str))
+        self.assertTrue(isinstance(doc.attrib['spam'], compat_str))
+        self.assertTrue(isinstance(doc.find('normal').text, compat_str))
+        self.assertTrue(isinstance(doc.find('chinese').text, compat_str))
+        self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str))
+
  if __name__ == '__main__':
      unittest.main()
diff --git a/test/test_utils.py b/test/test_utils.py

index 5a56ad7767898bc7f69dcd5fa7e74c582886e6c3..3298315d2458903da67187b2d2bddeccd5e125a7 100644 (file)
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -68,6 +68,9 @@ from youtube_dl.utils import (
      cli_valueless_option,
      cli_bool_option,
  )
+from youtube_dl.compat import (
+    compat_etree_fromstring,
+)
  
  
  class TestUtil(unittest.TestCase):
@@ -242,7 +245,7 @@ class TestUtil(unittest.TestCase):
              <node x="b" y="d" />
              <node x="" />
          </root>'''
-        doc = xml.etree.ElementTree.fromstring(testxml)
+        doc = compat_etree_fromstring(testxml)
  
          self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n'), None)
          self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None)
@@ -263,7 +266,7 @@ class TestUtil(unittest.TestCase):
                  <url>http://server.com/download.mp3</url>
              </media:song>
          </root>'''
-        doc = xml.etree.ElementTree.fromstring(testxml)
+        doc = compat_etree_fromstring(testxml)
          find = lambda p: doc.find(xpath_with_ns(p, {'media': 'http://example.com/'}))
          self.assertTrue(find('media:song') is not None)
          self.assertEqual(find('media:song/media:author').text, 'The Author')
@@ -292,7 +295,7 @@ class TestUtil(unittest.TestCase):
                  <p>Foo</p>
              </div>
          </root>'''
-        doc = xml.etree.ElementTree.fromstring(testxml)
+        doc = compat_etree_fromstring(testxml)
          self.assertEqual(xpath_text(doc, 'div/p'), 'Foo')
          self.assertEqual(xpath_text(doc, 'div/bar', default='default'), 'default')
          self.assertTrue(xpath_text(doc, 'div/bar') is None)
@@ -304,7 +307,7 @@ class TestUtil(unittest.TestCase):
                  <p x="a">Foo</p>
              </div>
          </root>'''
-        doc = xml.etree.ElementTree.fromstring(testxml)
+        doc = compat_etree_fromstring(testxml)
          self.assertEqual(xpath_attr(doc, 'div/p', 'x'), 'a')
          self.assertEqual(xpath_attr(doc, 'div/bar', 'x'), None)
          self.assertEqual(xpath_attr(doc, 'div/p', 'y'), None)
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py

index d103ab9adf73ee664a0639e33191ef2ff89431ce..a3e85264acda8dbefe0883ea50c901302d01d29f 100644 (file)
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -14,6 +14,7 @@ import socket
  import subprocess
  import sys
  import itertools
+import xml.etree.ElementTree
  
  
  try:
@@ -212,6 +213,43 @@ try:
  except ImportError:  # Python 2.6
      from xml.parsers.expat import ExpatError as compat_xml_parse_error
  
+if sys.version_info[0] >= 3:
+    compat_etree_fromstring = xml.etree.ElementTree.fromstring
+else:
+    # Python 2.x tries to encode unicode strings as ASCII (see the
+    # XMLParser._fixtext method), so byte strings are decoded back below
+    etree = xml.etree.ElementTree
+
+    try:
+        _etree_iter = etree.Element.iter
+    except AttributeError:  # Python <=2.6
+        def _etree_iter(root):
+            for el in root.findall('*'):
+                yield el
+                for sub in _etree_iter(el):
+                    yield sub
+
+    # On Python 2.6, XML() has no parser argument; this function is copied
+    # from the CPython 2.7 source
+    def _XML(text, parser=None):
+        if not parser:
+            parser = etree.XMLParser(target=etree.TreeBuilder())
+        parser.feed(text)
+        return parser.close()
+
+    def _element_factory(*args, **kwargs):
+        el = etree.Element(*args, **kwargs)
+        for k, v in el.items():
+            if isinstance(v, bytes):
+                el.set(k, v.decode('utf-8'))
+        return el
+
+    def compat_etree_fromstring(text):
+        doc = _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory)))
+        for el in _etree_iter(doc):
+            if el.text is not None and isinstance(el.text, bytes):
+                el.text = el.text.decode('utf-8')
+        return doc
  
  try:
      from urllib.parse import parse_qs as compat_parse_qs
@@ -507,6 +545,7 @@ __all__ = [
      'compat_chr',
      'compat_cookiejar',
      'compat_cookies',
+    'compat_etree_fromstring',
      'compat_expanduser',
      'compat_get_terminal_size',
      'compat_getenv',
diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py

index 7f6143954d3d4fdd633eefd3b85a75ea5c07dda8..6170cc1552194104ea1f029ec9b54c2b051e7a9c 100644 (file)
--- a/youtube_dl/downloader/f4m.py
+++ b/youtube_dl/downloader/f4m.py
@@ -5,10 +5,10 @@ import io
  import itertools
  import os
  import time
-import xml.etree.ElementTree as etree
  
  from .fragment import FragmentFD
  from ..compat import (
+    compat_etree_fromstring,
      compat_urlparse,
      compat_urllib_error,
      compat_urllib_parse_urlparse,
@@ -290,7 +290,7 @@ class F4mFD(FragmentFD):
          man_url = urlh.geturl()
          manifest = urlh.read()
  
-        doc = etree.fromstring(manifest)
+        doc = compat_etree_fromstring(manifest)
          formats = [(int(f.attrib.get('bitrate', -1)), f)
                     for f in self._get_unencrypted_media(doc)]
          if requested_bitrate is None:
diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py

index 6f465789b497a6625776c383ff699a64b0b5c346..73be6d2040b7197a94939ddbe5f3d7f81a92b750 100644 (file)
--- a/youtube_dl/extractor/ard.py
+++ b/youtube_dl/extractor/ard.py
@@ -14,8 +14,8 @@ from ..utils import (
      parse_duration,
      unified_strdate,
      xpath_text,
-    parse_xml,
  )
+from ..compat import compat_etree_fromstring
  
  
  class ARDMediathekIE(InfoExtractor):
@@ -161,7 +161,7 @@ class ARDMediathekIE(InfoExtractor):
              raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True)
  
          if re.search(r'[\?&]rss($|[=&])', url):
-            doc = parse_xml(webpage)
+            doc = compat_etree_fromstring(webpage.encode('utf-8'))
              if doc.tag == 'rss':
                  return GenericIE()._extract_rss(url, video_id, doc)
  
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py

index 2cdce1eb9568923f91dce28313b61acd7abcd9bb..a55a6dbc9dc8e89dda6213650f46e719978cf8ca 100644 (file)
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -2,7 +2,6 @@
  from __future__ import unicode_literals
  
  import re
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import (
@@ -14,7 +13,10 @@ from ..utils import (
      remove_end,
      unescapeHTML,
  )
-from ..compat import compat_HTTPError
+from ..compat import (
+    compat_etree_fromstring,
+    compat_HTTPError,
+)
  
  
  class BBCCoUkIE(InfoExtractor):
@@ -344,7 +346,7 @@ class BBCCoUkIE(InfoExtractor):
                  url, programme_id, 'Downloading media selection XML')
          except ExtractorError as ee:
              if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
-                media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8'))
+                media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))
              else:
                  raise
          return self._process_media_selector(media_selection, programme_id)
diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py

index ecc17ebebca9e1819fc804f37d48dcceb80c44c5..6c66a12368ea0a963d89ef5922c9d83f3019ddfc 100644 (file)
--- a/youtube_dl/extractor/bilibili.py
+++ b/youtube_dl/extractor/bilibili.py
@@ -4,9 +4,11 @@ from __future__ import unicode_literals
  import re
  import itertools
  import json
-import xml.etree.ElementTree as ET
  
  from .common import InfoExtractor
+from ..compat import (
+    compat_etree_fromstring,
+)
  from ..utils import (
      int_or_none,
      unified_strdate,
@@ -88,7 +90,7 @@ class BiliBiliIE(InfoExtractor):
          except ValueError:
              pass
  
-        lq_doc = ET.fromstring(lq_page)
+        lq_doc = compat_etree_fromstring(lq_page)
          lq_durls = lq_doc.findall('./durl')
  
          hq_doc = self._download_xml(
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py

index 4721c22930f15cb51d0daaac294eeeca3a329092..1686cdde14fcc7383f91bf52a6723d4831d6311d 100644 (file)
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -3,10 +3,10 @@ from __future__ import unicode_literals
  
  import re
  import json
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..compat import (
+    compat_etree_fromstring,
      compat_parse_qs,
      compat_str,
      compat_urllib_parse,
@@ -119,7 +119,7 @@ class BrightcoveIE(InfoExtractor):
          object_str = fix_xml_ampersands(object_str)
  
          try:
-            object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
+            object_doc = compat_etree_fromstring(object_str.encode('utf-8'))
          except compat_xml_parse_error:
              return
  
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index 1f09fbb47d892cc6dc5fe1353fd0b166603d2c1a..5e263f8b5a2cf46fbb26e928f5df85c87c42dfde 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -10,7 +10,6 @@ import re
  import socket
  import sys
  import time
-import xml.etree.ElementTree
  
  from ..compat import (
      compat_cookiejar,
@@ -23,6 +22,7 @@ from ..compat import (
      compat_urllib_request,
      compat_urlparse,
      compat_str,
+    compat_etree_fromstring,
  )
  from ..utils import (
      NO_DEFAULT,
@@ -461,7 +461,7 @@ class InfoExtractor(object):
              return xml_string
          if transform_source:
              xml_string = transform_source(xml_string)
-        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
+        return compat_etree_fromstring(xml_string.encode('utf-8'))
  
      def _download_json(self, url_or_request, video_id,
                         note='Downloading JSON metadata',
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py

index f8ce10111df8078638d9c56b837f354b2345ad73..0c9b8ca024cb6022b0e0227dcad2065c64ba69eb 100644 (file)
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -5,12 +5,12 @@ import re
  import json
  import base64
  import zlib
-import xml.etree.ElementTree
  
  from hashlib import sha1
  from math import pow, sqrt, floor
  from .common import InfoExtractor
  from ..compat import (
+    compat_etree_fromstring,
      compat_urllib_parse,
      compat_urllib_parse_unquote,
      compat_urllib_request,
@@ -234,7 +234,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
          return output
  
      def _extract_subtitles(self, subtitle):
-        sub_root = xml.etree.ElementTree.fromstring(subtitle)
+        sub_root = compat_etree_fromstring(subtitle)
          return [{
              'ext': 'srt',
              'data': self._convert_subtitles_to_srt(sub_root),
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index a8413503293399d40a95f63d16af06f052ff9f13..ee5419f51017a5f3a5ce01018eaaa65f4ab979ab 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -9,6 +9,7 @@ import sys
  from .common import InfoExtractor
  from .youtube import YoutubeIE
  from ..compat import (
+    compat_etree_fromstring,
      compat_urllib_parse_unquote,
      compat_urllib_request,
      compat_urlparse,
@@ -21,7 +22,6 @@ from ..utils import (
      HEADRequest,
      is_html,
      orderedSet,
-    parse_xml,
      smuggle_url,
      unescapeHTML,
      unified_strdate,
@@ -1238,7 +1238,7 @@ class GenericIE(InfoExtractor):
  
          # Is it an RSS feed, a SMIL file or a XSPF playlist?
          try:
-            doc = parse_xml(webpage)
+            doc = compat_etree_fromstring(webpage.encode('utf-8'))
              if doc.tag == 'rss':
                  return self._extract_rss(url, video_id, doc)
              elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py

index c17094f8193f7678cc3d0a912c3d970f38e6bf7c..4c0de354f7d8ac467f0f027b201d785a38c77ffa 100644 (file)
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -1,10 +1,10 @@
  from __future__ import unicode_literals
  
  import re
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..compat import (
+    compat_etree_fromstring,
      compat_urllib_request,
  )
  from ..utils import (
@@ -97,7 +97,7 @@ class VevoIE(InfoExtractor):
          if last_version['version'] == -1:
              raise ExtractorError('Unable to extract last version of the video')
  
-        renditions = xml.etree.ElementTree.fromstring(last_version['data'])
+        renditions = compat_etree_fromstring(last_version['data'])
          formats = []
          # Already sorted from worst to best quality
          for rend in renditions.findall('rendition'):
@@ -114,7 +114,7 @@ class VevoIE(InfoExtractor):
  
      def _formats_from_smil(self, smil_xml):
          formats = []
-        smil_doc = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8'))
+        smil_doc = compat_etree_fromstring(smil_xml.encode('utf-8'))
          els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video')
          for el in els:
              src = el.attrib['src']
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 764a89ccaccc34e9236d84b445f9f2d8342e0244..efd5f4ae111448ca9933136b151ca1bf6a0505a8 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -36,6 +36,7 @@ import zlib
  from .compat import (
      compat_basestring,
      compat_chr,
+    compat_etree_fromstring,
      compat_html_entities,
      compat_http_client,
      compat_kwargs,
@@ -1665,29 +1666,6 @@ def encode_dict(d, encoding='utf-8'):
      return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items())
  
  
-try:
-    etree_iter = xml.etree.ElementTree.Element.iter
-except AttributeError:  # Python <=2.6
-    etree_iter = lambda n: n.findall('.//*')
-
-
-def parse_xml(s):
-    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
-        def doctype(self, name, pubid, system):
-            pass  # Ignore doctypes
-
-    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
-    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
-    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
-    # Fix up XML parser in Python 2.x
-    if sys.version_info < (3, 0):
-        for n in etree_iter(tree):
-            if n.text is not None:
-                if not isinstance(n.text, compat_str):
-                    n.text = n.text.decode('utf-8')
-    return tree
-
-
  US_RATINGS = {
      'G': 0,
      'PG': 10,
@@ -1988,7 +1966,7 @@ def dfxp2srt(dfxp_data):
  
          return out
  
-    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
+    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
      out = []
      paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
author	Sergey M <dstftw@gmail.com>
	Sat, 31 Oct 2015 18:15:21 +0000 (18:15 +0000)
committer	Sergey M <dstftw@gmail.com>
	Sat, 31 Oct 2015 18:15:21 +0000 (18:15 +0000)
test/test_compat.py		patch | blob | history
test/test_utils.py		patch | blob | history
youtube_dl/compat.py		patch | blob | history
youtube_dl/downloader/f4m.py		patch | blob | history
youtube_dl/extractor/ard.py		patch | blob | history
youtube_dl/extractor/bbc.py		patch | blob | history
youtube_dl/extractor/bilibili.py		patch | blob | history
youtube_dl/extractor/brightcove.py		patch | blob | history
youtube_dl/extractor/common.py		patch | blob | history
youtube_dl/extractor/crunchyroll.py		patch | blob | history
youtube_dl/extractor/generic.py		patch | blob | history
youtube_dl/extractor/vevo.py		patch | blob | history
youtube_dl/utils.py		patch | blob | history