diff options
| author | remitamine <remitamine@gmail.com> | 2016-02-02 18:36:26 +0100 | 
|---|---|---|
| committer | remitamine <remitamine@gmail.com> | 2016-02-02 18:36:26 +0100 | 
| commit | 4e0cff2a50f4c297fc25dae01c460596d8f5badb (patch) | |
| tree | 7a3ed19f8182e6454c8d10ea63a88bd70eb6fd89 | |
| parent | 0436157b95ddd6d7c415549ba48b7c0305b2c0df (diff) | |
| parent | 2b14cb566fde3e5482ce9a63b2be7103cec939e0 (diff) | |
Merge pull request #8348 from remitamine/dfxp2srt-text
[utils] fix dfxp2srt text extraction (fixes #8055)
| -rw-r--r-- | youtube_dl/utils.py | 29 | 
1 files changed, 18 insertions, 11 deletions
```diff
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index c63b61598..18dbe28bb 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -2017,20 +2017,27 @@ def dfxp2srt(dfxp_data):
         'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
     })
 
-    def parse_node(node):
-        str_or_empty = functools.partial(str_or_none, default='')
+    class TTMLPElementParser:
+        out = ''
 
-        out = str_or_empty(node.text)
+        def start(self, tag, attrib):
+            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
+                self.out += '\n'
 
-        for child in node:
-            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
-                out += '\n' + str_or_empty(child.tail)
-            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
-                out += str_or_empty(parse_node(child))
-            else:
-                out += str_or_empty(xml.etree.ElementTree.tostring(child))
+        def end(self, tag):
+            pass
 
-        return out
+        def data(self, data):
+            self.out += data
+
+        def close(self):
+            return self.out.strip()
+
+    def parse_node(node):
+        target = TTMLPElementParser()
+        parser = xml.etree.ElementTree.XMLParser(target=target)
+        parser.feed(xml.etree.ElementTree.tostring(node))
+        return parser.close()
 
     dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
     out = []
```
