[utils] fix dfxp2srt text extraction (fixes #8055)

author remitamine <remitamine@gmail.com>

Thu, 28 Jan 2016 11:38:34 +0000 (12:38 +0100)

committer remitamine <remitamine@gmail.com>

Thu, 28 Jan 2016 11:38:34 +0000 (12:38 +0100)
author remitamine <remitamine@gmail.com>
Thu, 28 Jan 2016 11:38:34 +0000 (12:38 +0100)
committer remitamine <remitamine@gmail.com>
Thu, 28 Jan 2016 11:38:34 +0000 (12:38 +0100)
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index c63b61598ac421c99f86928093c8a38f0f57ca59..18dbe28bb5ecbcdb922f292490a43ccf6662ac9b 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -2017,20 +2017,27 @@ def dfxp2srt(dfxp_data):
          'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
      })
  
-    def parse_node(node):
-        str_or_empty = functools.partial(str_or_none, default='')
+    class TTMLPElementParser:
+        out = ''
  
-        out = str_or_empty(node.text)
+        def start(self, tag, attrib):
+            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
+                self.out += '\n'
  
-        for child in node:
-            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
-                out += '\n' + str_or_empty(child.tail)
-            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
-                out += str_or_empty(parse_node(child))
-            else:
-                out += str_or_empty(xml.etree.ElementTree.tostring(child))
+        def end(self, tag):
+            pass
  
-        return out
+        def data(self, data):
+            self.out += data
+
+        def close(self):
+            return self.out.strip()
+
+    def parse_node(node):
+        target = TTMLPElementParser()
+        parser = xml.etree.ElementTree.XMLParser(target=target)
+        parser.feed(xml.etree.ElementTree.tostring(node))
+        return parser.close()
  
      dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
      out = []
author	remitamine <remitamine@gmail.com>
	Thu, 28 Jan 2016 11:38:34 +0000 (12:38 +0100)
committer	remitamine <remitamine@gmail.com>
	Thu, 28 Jan 2016 11:38:34 +0000 (12:38 +0100)