aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl
diff options
context:
space:
mode:
authorYen Chi Hsuan <yan12125@gmail.com>2015-04-25 23:15:05 +0800
committerYen Chi Hsuan <yan12125@gmail.com>2015-04-25 23:18:27 +0800
commitbf6427d2fbcbd95cd1cb640e8b894c18782a2a12 (patch)
tree0b84e18ba8a76b71c169ede701e21947e6eccacf /youtube_dl
parent672f1bd8497f43179dcd01f8b4831564f0b42356 (diff)
downloadyoutube-dl-bf6427d2fbcbd95cd1cb640e8b894c18782a2a12.tar.xz
[ffmpeg] Add dfxp (TTML) subtitles support (#3432, #5146)
Diffstat (limited to 'youtube_dl')
-rw-r--r--youtube_dl/postprocessor/ffmpeg.py25
-rw-r--r--youtube_dl/utils.py53
2 files changed, 78 insertions, 0 deletions
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index 7a952963e..1765f4969 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -20,6 +20,7 @@ from ..utils import (
prepend_extension,
shell_quote,
subtitles_filename,
+ dfxp2srt,
)
@@ -651,6 +652,30 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
'format' % new_ext)
continue
new_file = subtitles_filename(filename, lang, new_ext)
+
+ if ext == 'dfxp' or ext == 'ttml':
+ self._downloader.report_warning(
+ 'You have requested to convert dfxp (TTML) subtitles into another format, '
+ 'which results in style information loss')
+
+ dfxp_file = subtitles_filename(filename, lang, ext)
+ srt_file = subtitles_filename(filename, lang, 'srt')
+
+ with io.open(dfxp_file, 'rt', encoding='utf-8') as f:
+ srt_data = dfxp2srt(f.read())
+
+ with io.open(srt_file, 'wt', encoding='utf-8') as f:
+ f.write(srt_data)
+
+ ext = 'srt'
+ subs[lang] = {
+ 'ext': 'srt',
+ 'data': srt_data
+ }
+
+ if new_ext == 'srt':
+ continue
+
self.run_ffmpeg(
subtitles_filename(filename, lang, ext),
new_file, ['-f', new_format])
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index edeee1853..5e1c4525d 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1800,6 +1800,59 @@ def match_filter_func(filter_str):
return _match_func
+def parse_dfxp_time_expr(time_expr):
+ if not time_expr:
+ return 0.0
+
+ mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
+ if mobj:
+ return float(mobj.group('time_offset'))
+
+ mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
+ if mobj:
+ return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
+
+
+def format_srt_time(seconds):
+ (mins, secs) = divmod(seconds, 60)
+ (hours, mins) = divmod(mins, 60)
+ millisecs = (secs - int(secs)) * 1000
+ secs = int(secs)
+ return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millisecs)
+
+
+def dfxp2srt(dfxp_data):
+ _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})
+
+ def parse_node(node):
+ str_or_empty = functools.partial(str_or_none, default='')
+
+ out = str_or_empty(node.text)
+
+ for child in node:
+ if child.tag == _x('ttml:br'):
+ out += '\n' + str_or_empty(child.tail)
+ elif child.tag == _x('ttml:span'):
+ out += str_or_empty(parse_node(child))
+ else:
+ out += str_or_empty(xml.etree.ElementTree.tostring(child))
+
+ return out
+
+ dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
+ out = []
+ paras = dfxp.findall(_x('.//ttml:p'))
+
+ for para, index in zip(paras, itertools.count(1)):
+ out.append('%d\n%s --> %s\n%s\n\n' % (
+ index,
+ format_srt_time(parse_dfxp_time_expr(para.attrib.get('begin'))),
+ format_srt_time(parse_dfxp_time_expr(para.attrib.get('end'))),
+ parse_node(para)))
+
+ return ''.join(out)
+
+
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
def __init__(self, proxies=None):
# Set default handlers