projects
/
youtube-dl
/ commitdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
|
inline
| side by side (parent:
ed7cd1e
)
[utils] fix dfxp2srt text extraction(fixes #8055)
author
remitamine
<remitamine@gmail.com>
Thu, 28 Jan 2016 11:38:34 +0000
(12:38 +0100)
committer
remitamine
<remitamine@gmail.com>
Thu, 28 Jan 2016 11:38:34 +0000
(12:38 +0100)
youtube_dl/utils.py
patch
|
blob
|
history
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index c63b61598ac421c99f86928093c8a38f0f57ca59..18dbe28bb5ecbcdb922f292490a43ccf6662ac9b 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -2017,20 +2017,27 @@ def dfxp2srt(dfxp_data):
'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
})
'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
})
- def parse_node(node):
- str_or_empty = functools.partial(str_or_none, default='')
+ class TTMLPElementParser:
+ out = ''
- out = str_or_empty(node.text)
+ def start(self, tag, attrib):
+ if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
+ self.out += '\n'
- for child in node:
- if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
- out += '\n' + str_or_empty(child.tail)
- elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
- out += str_or_empty(parse_node(child))
- else:
- out += str_or_empty(xml.etree.ElementTree.tostring(child))
+ def end(self, tag):
+ pass
- return out
+ def data(self, data):
+ self.out += data
+
+ def close(self):
+ return self.out.strip()
+
+ def parse_node(node):
+ target = TTMLPElementParser()
+ parser = xml.etree.ElementTree.XMLParser(target=target)
+ parser.feed(xml.etree.ElementTree.tostring(node))
+ return parser.close()
dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
out = []
dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
out = []