diff options
-rw-r--r-- | test/test_utils.py | 38 | ||||
-rw-r--r-- | youtube_dl/postprocessor/ffmpeg.py | 25 | ||||
-rw-r--r-- | youtube_dl/utils.py | 81 |
3 files changed, 144 insertions, 0 deletions
diff --git a/test/test_utils.py b/test/test_utils.py
index 2e3a6480c..17017a8c0 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -58,6 +58,8 @@ from youtube_dl.utils import (
     xpath_text,
     render_table,
     match_str,
+    parse_dfxp_time_expr,
+    dfxp2srt,
 )
 
 
@@ -581,6 +583,42 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')
             'like_count > 100 & dislike_count <? 50 & description',
             {'like_count': 190, 'dislike_count': 10}))
 
+    def test_parse_dfxp_time_expr(self):
+        self.assertEqual(parse_dfxp_time_expr(None), 0.0)
+        self.assertEqual(parse_dfxp_time_expr(''), 0.0)
+        self.assertEqual(parse_dfxp_time_expr('0.1'), 0.1)
+        self.assertEqual(parse_dfxp_time_expr('0.1s'), 0.1)
+        self.assertEqual(parse_dfxp_time_expr('00:00:01'), 1.0)
+        self.assertEqual(parse_dfxp_time_expr('00:00:01.100'), 1.1)
+
+    def test_dfxp2srt(self):
+        dfxp_data = '''<?xml version="1.0" encoding="UTF-8"?>
+            <tt xmlns="http://www.w3.org/ns/ttml" xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter">
+            <body>
+                <div xml:lang="en">
+                    <p begin="0" end="1">The following line contains Chinese characters and special symbols</p>
+                    <p begin="1" end="2">第二行<br/>♪♪</p>
+                    <p begin="2" end="3"><span>Third<br/>Line</span></p>
+                </div>
+            </body>
+            </tt>'''
+        srt_data = '''1
+00:00:00,000 --> 00:00:01,000
+The following line contains Chinese characters and special symbols
+
+2
+00:00:01,000 --> 00:00:02,000
+第二行
+♪♪
+
+3
+00:00:02,000 --> 00:00:03,000
+Third
+Line
+
+'''
+        self.assertEqual(dfxp2srt(dfxp_data), srt_data)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index 7a952963e..1765f4969 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -20,6 +20,7 @@ from ..utils import (
     prepend_extension,
     shell_quote,
     subtitles_filename,
+    dfxp2srt,
 )
 
 
@@ -651,6 +652,30 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
                     'format' % new_ext)
                 continue
             new_file = subtitles_filename(filename, lang, new_ext)
+
+            if ext == 'dfxp' or ext == 'ttml':
+                self._downloader.report_warning(
+                    'You have requested to convert dfxp (TTML) subtitles into another format, '
+                    'which results in style information loss')
+
+                dfxp_file = subtitles_filename(filename, lang, ext)
+                srt_file = subtitles_filename(filename, lang, 'srt')
+
+                with io.open(dfxp_file, 'rt', encoding='utf-8') as f:
+                    srt_data = dfxp2srt(f.read())
+
+                with io.open(srt_file, 'wt', encoding='utf-8') as f:
+                    f.write(srt_data)
+
+                ext = 'srt'
+                subs[lang] = {
+                    'ext': 'srt',
+                    'data': srt_data
+                }
+
+                if new_ext == 'srt':
+                    continue
+
             self.run_ffmpeg(
                 subtitles_filename(filename, lang, ext),
                 new_file, ['-f', new_format])
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index edeee1853..5e1c4525d 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1800,6 +1800,87 @@ def match_filter_func(filter_str):
     return _match_func
 
 
+def parse_dfxp_time_expr(time_expr):
+    """Convert a DFXP/TTML time expression to seconds.
+
+    Accepts an offset in seconds with an optional ``s`` suffix ('0.1',
+    '0.1s') or a clock time 'H:MM:SS[.fraction]'.  A missing or empty
+    expression yields 0.0; any other form yields None.
+    """
+    if not time_expr:
+        return 0.0
+
+    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
+    if mobj:
+        return float(mobj.group('time_offset'))
+
+    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
+    if mobj:
+        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
+
+    # Neither pattern matched: tell the caller explicitly.
+    return None
+
+
+def format_srt_time(seconds):
+    """Format a time in seconds as an SRT timestamp 'HH:MM:SS,mmm'."""
+    # Round to whole milliseconds before splitting: truncating the float
+    # fraction turned e.g. 2.3 s into '00:00:02,299'.
+    total_millisecs = int(round(seconds * 1000))
+    (secs, millisecs) = divmod(total_millisecs, 1000)
+    (mins, secs) = divmod(secs, 60)
+    (hours, mins) = divmod(mins, 60)
+    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millisecs)
+
+
+def dfxp2srt(dfxp_data):
+    """Convert the text of a DFXP/TTML subtitle document to SRT text.
+
+    Every <p> element becomes one numbered cue.  <br/> becomes a newline,
+    <span> is flattened to its text and any other child element is kept
+    as serialized XML.  Paragraphs whose begin or end time cannot be
+    parsed are skipped.
+    """
+    _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})
+    str_or_empty = functools.partial(str_or_none, default='')
+    br_tag = _x('ttml:br')
+    span_tag = _x('ttml:span')
+
+    def parse_node(node):
+        out = str_or_empty(node.text)
+
+        for child in node:
+            if child.tag == br_tag:
+                out += '\n'
+            elif child.tag == span_tag:
+                out += parse_node(child)
+            else:
+                # tostring() returns bytes and already includes the tail
+                out += xml.etree.ElementTree.tostring(child, encoding='utf-8').decode('utf-8')
+                continue
+            # Text following <br/> or </span> still belongs to the paragraph
+            out += str_or_empty(child.tail)
+
+        return out
+
+    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
+    out = []
+
+    for para in dfxp.findall(_x('.//ttml:p')):
+        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
+        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
+        if begin_time is None or end_time is None:
+            # Unsupported time expression: drop the cue instead of crashing
+            continue
+        out.append('%d\n%s --> %s\n%s\n\n' % (
+            len(out) + 1,
+            format_srt_time(begin_time),
+            format_srt_time(end_time),
+            parse_node(para)))
+
+    return ''.join(out)
+
+
 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
     def __init__(self, proxies=None):
         # Set default handlers