diff options
| author | Yen Chi Hsuan <yan12125@gmail.com> | 2015-04-25 23:15:05 +0800 | 
|---|---|---|
| committer | Yen Chi Hsuan <yan12125@gmail.com> | 2015-04-25 23:18:27 +0800 | 
| commit | bf6427d2fbcbd95cd1cb640e8b894c18782a2a12 (patch) | |
| tree | 0b84e18ba8a76b71c169ede701e21947e6eccacf /youtube_dl/utils.py | |
| parent | 672f1bd8497f43179dcd01f8b4831564f0b42356 (diff) | |
[ffmpeg] Add dfxp (TTML) subtitles support (#3432, #5146)
Diffstat (limited to 'youtube_dl/utils.py')
| -rw-r--r-- | youtube_dl/utils.py | 53 | 
1 files changed, 53 insertions, 0 deletions
# diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
# index edeee1853..5e1c4525d 100644
# @@ -1800,6 +1800,59 @@ def match_filter_func(filter_str):
# The definitions below are inserted right after the end of
# match_filter_func (``return _match_func``) and before
# ``class PerRequestProxyHandler(compat_urllib_request.ProxyHandler)``.


def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP (TTML) time expression to a number of seconds.

    Recognized forms:
      * offset time: a decimal count followed by an optional metric
        ('h', 'm', 's' or 'ms'); a bare number is taken as seconds
      * clock time: ``H+:MM:SS`` with an optional fractional part

    Returns a float, 0.0 for an empty or missing expression, or None when
    the expression matches none of the recognized forms (for instance
    frame- or tick-based expressions).
    """
    if not time_expr:
        return 0.0

    # 'ms' is listed before 'm' so the longer metric is tried first.
    mobj = re.match(
        r'^(?P<time_offset>\d+(?:\.\d+)?)(?P<metric>h|ms|m|s)?$', time_expr)
    if mobj:
        offset = float(mobj.group('time_offset'))
        metric = mobj.group('metric')
        if metric == 'ms':
            # Divide rather than multiply by 0.001 to keep exact results
            # for values such as '1500ms'.
            return offset / 1000
        return offset * {'h': 3600, 'm': 60}.get(metric, 1)

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))

    # Unsupported expression; callers must be prepared for None.
    return None


def format_srt_time(seconds):
    """Format a number of seconds as an SRT timestamp ``HH:MM:SS,mmm``."""
    # Round to whole milliseconds first.  Taking the fractional part of a
    # float and truncating it loses a millisecond for values such as 2.3
    # (0.2999999999999998 * 1000 -> 299).
    total_millisecs = int(round(seconds * 1000))
    (total_secs, millisecs) = divmod(total_millisecs, 1000)
    (total_mins, secs) = divmod(total_secs, 60)
    (hours, mins) = divmod(total_mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millisecs)


def dfxp2srt(dfxp_data):
    """Convert a DFXP (TTML) subtitle document to SRT text.

    dfxp_data is the document as a text string; it is encoded to UTF-8
    before being parsed.  Every ``<p>`` element in the
    ``http://www.w3.org/ns/ttml`` namespace becomes one SRT cue, numbered
    from 1 in document order.  The cue ends at ``end`` or, when ``end`` is
    absent, at ``begin`` + ``dur``.  Cues whose timing cannot be parsed are
    skipped instead of aborting the whole conversion.

    Relies on xpath_with_ns and str_or_none, which are defined elsewhere in
    this module (str_or_none is assumed to return ``default`` for None --
    confirm against its definition).
    """
    _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})
    str_or_empty = functools.partial(str_or_none, default='')

    # Qualified tag names are loop invariants; resolve them once.
    br_tag = _x('ttml:br')
    span_tag = _x('ttml:span')

    def parse_node(node):
        """Flatten an element's content to text, with <br/> as a newline."""
        out = str_or_empty(node.text)

        for child in node:
            if child.tag == br_tag:
                out += '\n' + str_or_empty(child.tail)
            elif child.tag == span_tag:
                # The text following </span> lives in child.tail and is
                # not part of the recursive result, so append it here.
                out += str_or_empty(parse_node(child)) + str_or_empty(child.tail)
            else:
                # tostring() yields bytes on Python 3; decode so that the
                # markup (tail included) is appended as text.
                out += xml.etree.ElementTree.tostring(
                    child, encoding='utf-8').decode('utf-8')

        return out

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
    out = []

    for para in dfxp.findall(_x('.//ttml:p')):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))

        if not para.attrib.get('end') and para.attrib.get('dur'):
            # No explicit end: derive it from the duration.
            duration = parse_dfxp_time_expr(para.attrib.get('dur'))
            if begin_time is None or duration is None:
                end_time = None
            else:
                end_time = begin_time + duration

        if begin_time is None or end_time is None:
            # Unsupported time expression; drop this cue only.
            continue

        out.append('%d\n%s --> %s\n%s\n\n' % (
            len(out) + 1,
            format_srt_time(begin_time),
            format_srt_time(end_time),
            parse_node(para)))

    return ''.join(out)
