[ffmpeg] Add dfxp (TTML) subtitles support (#3432, #5146)

author: Yen Chi Hsuan <yan12125@gmail.com> 2015-04-25 23:15:05 +0800
committer: Yen Chi Hsuan <yan12125@gmail.com> 2015-04-25 23:18:27 +0800
commit: bf6427d2fbcbd95cd1cb640e8b894c18782a2a12 (patch)
tree: 0b84e18ba8a76b71c169ede701e21947e6eccacf /youtube_dl/utils.py
parent: 672f1bd8497f43179dcd01f8b4831564f0b42356 (diff)
download: youtube-dl-bf6427d2fbcbd95cd1cb640e8b894c18782a2a12.tar.xz
1 file changed, 53 insertions, 0 deletions
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index edeee1853..5e1c4525d 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1800,6 +1800,59 @@ def match_filter_func(filter_str):
     return _match_func
 
 
+def parse_dfxp_time_expr(time_expr):  # Parse a DFXP time expression string into seconds (float); falls through and returns None if no form matches
+    if not time_expr:  # None or empty string (e.g. a missing attribute) is treated as time zero
+        return 0.0
+
+    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)  # offset form: digits, optional fraction, optional trailing "s"
+    if mobj:
+        return float(mobj.group('time_offset'))
+
+    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)  # clock form: hours:MM:SS with optional fractional seconds
+    if mobj:
+        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))  # hours and minutes converted to seconds, then summed
+
+
+def format_srt_time(seconds):  # Format a number of seconds as an "HH:MM:SS,mmm" timestamp string
+    (mins, secs) = divmod(seconds, 60)  # whole minutes and the leftover (possibly fractional) seconds
+    (hours, mins) = divmod(mins, 60)  # whole hours and the leftover minutes
+    millisecs = (secs - int(secs)) * 1000  # fractional part of the seconds scaled to milliseconds
+    secs = int(secs)  # keep only the whole seconds for the SS field
+    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millisecs)  # %d truncates float operands instead of rounding them
+
+
+def dfxp2srt(dfxp_data):  # Convert DFXP (TTML) subtitle markup given as a text string into numbered "begin --> end" cue text
+    _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})  # binds the TTML namespace map for xpath_with_ns (defined outside this hunk)
+
+    def parse_node(node):  # Flatten one element's content into a single text string
+        str_or_empty = functools.partial(str_or_none, default='')  # presumably maps None to '' - str_or_none is defined outside this hunk, confirm
+
+        out = str_or_empty(node.text)  # text before the first child element
+
+        for child in node:
+            if child.tag == _x('ttml:br'):  # a br element becomes a newline followed by the text after it
+                out += '\n' + str_or_empty(child.tail)
+            elif child.tag == _x('ttml:span'):  # a span is flattened recursively; NOTE(review): child.tail (text after the span) is not appended here
+                out += str_or_empty(parse_node(child))
+            else:  # any other element is kept as serialized XML; NOTE(review): tostring may return bytes on Python 3 - confirm str_or_none handles that
+                out += str_or_empty(xml.etree.ElementTree.tostring(child))
+
+        return out
+
+    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))  # the .encode() call requires dfxp_data to be text rather than bytes
+    out = []  # one formatted cue string per paragraph
+    paras = dfxp.findall(_x('.//ttml:p'))  # every ttml:p element anywhere below the root
+
+    for para, index in zip(paras, itertools.count(1)):  # cues are numbered starting from 1
+        out.append('%d\n%s --> %s\n%s\n\n' % (
+            index,
+            format_srt_time(parse_dfxp_time_expr(para.attrib.get('begin'))),  # NOTE(review): parse_dfxp_time_expr returns None for an unrecognised expression, and divmod(None, 60) raises TypeError
+            format_srt_time(parse_dfxp_time_expr(para.attrib.get('end'))),  # a missing attribute yields None from .get(), which parses as 0.0
+            parse_node(para)))
+
+    return ''.join(out)  # cues are already terminated by a blank line, so plain concatenation suffices
+
+
 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
     def __init__(self, proxies=None):
         # Set default handlers
author	Yen Chi Hsuan <yan12125@gmail.com>	2015-04-25 23:15:05 +0800
committer	Yen Chi Hsuan <yan12125@gmail.com>	2015-04-25 23:18:27 +0800
commit	bf6427d2fbcbd95cd1cb640e8b894c18782a2a12 (patch)
tree	0b84e18ba8a76b71c169ede701e21947e6eccacf /youtube_dl/utils.py
parent	672f1bd8497f43179dcd01f8b4831564f0b42356 (diff)
download	youtube-dl-bf6427d2fbcbd95cd1cb640e8b894c18782a2a12.tar.xz