From c646d76f6717a646dd35f6efad6b396435f9fa55 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Wed, 22 Jun 2022 03:46:54 +0530 Subject: [webvtt, extractor/youtube] Extract auto-subs from livestream VODs Closes #4130 Authored by: pukkandan, fstirlitz --- yt_dlp/webvtt.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'yt_dlp/webvtt.py') diff --git a/yt_dlp/webvtt.py b/yt_dlp/webvtt.py index cc2353436..23d67a897 100644 --- a/yt_dlp/webvtt.py +++ b/yt_dlp/webvtt.py @@ -161,6 +161,12 @@ class Magic(HeaderBlock): _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)') _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*') + # This was removed from the spec in the 2017 revision; + # the last spec draft to describe this syntax element is + # https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header. + # Nevertheless, YouTube keeps serving those + _REGEX_META = re.compile(r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])') + @classmethod def __parse_tsmap(cls, parser): parser = parser.child() @@ -200,13 +206,18 @@ class Magic(HeaderBlock): raise ParseError(parser) extra = m.group(1) - local, mpegts = None, None - if parser.consume(cls._REGEX_TSMAP): - local, mpegts = cls.__parse_tsmap(parser) - if not parser.consume(_REGEX_NL): + local, mpegts, meta = None, None, '' + while not parser.consume(_REGEX_NL): + if parser.consume(cls._REGEX_TSMAP): + local, mpegts = cls.__parse_tsmap(parser) + continue + m = parser.consume(cls._REGEX_META) + if m: + meta += m.group(0) + continue raise ParseError(parser) parser.commit() - return cls(extra=extra, mpegts=mpegts, local=local) + return cls(extra=extra, mpegts=mpegts, local=local, meta=meta) def write_into(self, stream): stream.write('WEBVTT') @@ -219,6 +230,8 @@ class Magic(HeaderBlock): stream.write(',MPEGTS:') stream.write(str(self.mpegts if self.mpegts is not None else 0)) stream.write('\n') + if self.meta: + stream.write(self.meta) stream.write('\n') -- cgit v1.2.3