From 25a3f4f5d6de2bd5fb60cd11a46f0ac232882a56 Mon Sep 17 00:00:00 2001
From: Felix S
Date: Mon, 9 Aug 2021 20:22:30 +0000
Subject: [webvtt] Merge daisy-chained duplicate cues (#638)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes: https://github.com/yt-dlp/yt-dlp/issues/631#issuecomment-893338552

Previous deduplication algorithm only removed duplicate cues with
identical text, styles and timestamps. This change also merges cues
that come in ‘daisy chains’, where sequences of cues with identical
text and styles appear in which the ending timestamp of one equals
the starting timestamp of the next.

This deduplication algorithm has the somewhat unfortunate side effect
that NOTE blocks between cues, if found, will be emitted in a different
order relative to their original cues. This may be unwanted if perfect
fidelity is desired, but then so is daisy-chain deduplication itself.
NOTE blocks ought to be ignored by WebVTT players in any case.

Authored by: fstirlitz
---
 yt_dlp/webvtt.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'yt_dlp/webvtt.py')

diff --git a/yt_dlp/webvtt.py b/yt_dlp/webvtt.py
index ef55e6459..eee2a4a2d 100644
--- a/yt_dlp/webvtt.py
+++ b/yt_dlp/webvtt.py
@@ -331,6 +331,26 @@ class CueBlock(Block):
             'settings': self.settings,
         }
 
+    def __eq__(self, other):
+        return self.as_json == other.as_json
+
+    @classmethod
+    def from_json(cls, json):
+        return cls(
+            id=json['id'],
+            start=json['start'],
+            end=json['end'],
+            text=json['text'],
+            settings=json['settings']
+        )
+
+    def hinges(self, other):
+        if self.text != other.text:
+            return False
+        if self.settings != other.settings:
+            return False
+        return self.start <= self.end == other.start <= other.end
+
 
 def parse_fragment(frag_content):
     """
-- 
cgit v1.2.3