[utils] fix dfxp2srt text extraction (fixes #8055)
author remitamine <remitamine@gmail.com>
Thu, 28 Jan 2016 11:38:34 +0000 (12:38 +0100)
committer remitamine <remitamine@gmail.com>
Thu, 28 Jan 2016 11:38:34 +0000 (12:38 +0100)
youtube_dl/utils.py

index c63b61598ac421c99f86928093c8a38f0f57ca59..18dbe28bb5ecbcdb922f292490a43ccf6662ac9b 100644 (file)
@@ -2017,20 +2017,27 @@ def dfxp2srt(dfxp_data):
         'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
     })
 
-    def parse_node(node):
-        str_or_empty = functools.partial(str_or_none, default='')
+    class TTMLPElementParser:
+        out = ''
 
-        out = str_or_empty(node.text)
+        def start(self, tag, attrib):
+            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
+                self.out += '\n'
 
-        for child in node:
-            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
-                out += '\n' + str_or_empty(child.tail)
-            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
-                out += str_or_empty(parse_node(child))
-            else:
-                out += str_or_empty(xml.etree.ElementTree.tostring(child))
+        def end(self, tag):
+            pass
 
-        return out
+        def data(self, data):
+            self.out += data
+
+        def close(self):
+            return self.out.strip()
+
+    def parse_node(node):
+        target = TTMLPElementParser()
+        parser = xml.etree.ElementTree.XMLParser(target=target)
+        parser.feed(xml.etree.ElementTree.tostring(node))
+        return parser.close()
 
     dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
     out = []