diff options
author | pukkandan <pukkandan.ytdlp@gmail.com> | 2021-04-28 19:02:43 +0530 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-04-28 19:02:43 +0530 |
commit | be6202f12b97858b9d716e608394b51065d0419f (patch) | |
tree | 71f920777c24d9d81c990f0bf57d66e9d5bbaff7 /yt_dlp/downloader/hls.py | |
parent | db9a564b6a5c31472f8298969584eead0b59fa1c (diff) | |
parent | e8f834cd8dfc07011d1080321e42bc130e7201bb (diff) |
Subtitle extraction from streaming media manifests #247
Authored by fstirlitz
Modified from: https://github.com/ytdl-org/youtube-dl/pull/6144
Closes: #73
Fixes:
https://github.com/ytdl-org/youtube-dl/issues/6106
https://github.com/ytdl-org/youtube-dl/issues/14977
https://github.com/ytdl-org/youtube-dl/issues/21438
https://github.com/ytdl-org/youtube-dl/issues/23609
https://github.com/ytdl-org/youtube-dl/issues/28132
Might also fix (untested):
https://github.com/ytdl-org/youtube-dl/issues/15424
https://github.com/ytdl-org/youtube-dl/issues/18267
https://github.com/ytdl-org/youtube-dl/issues/23899
https://github.com/ytdl-org/youtube-dl/issues/24375
https://github.com/ytdl-org/youtube-dl/issues/24595
https://github.com/ytdl-org/youtube-dl/issues/27899
Related:
https://github.com/ytdl-org/youtube-dl/issues/22379
https://github.com/ytdl-org/youtube-dl/pull/24517
https://github.com/ytdl-org/youtube-dl/pull/24886
https://github.com/ytdl-org/youtube-dl/pull/27215
Notes:
* The functions `extractor.common._extract_..._formats` are still kept for compatibility
* Only some extractors have currently been moved to using `_extract_..._formats_and_subtitles`
* Direct subtitle manifests (without a master) are not supported and are wrongly identified as containing video formats
* AES support is untested
* The fragmented TTML subtitles extracted from DASH/ISM are valid, but are unsupported by `ffmpeg` and most video players
* Their XML fragments can be dumped using `ffmpeg -i in.mp4 -f data -map 0 -c copy out.ttml`.
Once the unnecessary headers are stripped out of this, it becomes a valid self-contained ttml file
* The ttml subs downloaded from DASH manifests can also be directly opened with <https://github.com/SubtitleEdit>
* Fragmented WebVTT files extracted from DASH/ISM are also unsupported by most tools
* Unlike the ttml files, the XML fragments of these cannot be dumped using `ffmpeg`
* The webtt subs extracted from DASH can be parsed by <https://github.com/gpac/gpac>
* But validity of the those extracted from ISM are untested
Diffstat (limited to 'yt_dlp/downloader/hls.py')
-rw-r--r-- | yt_dlp/downloader/hls.py | 78 |
1 files changed, 78 insertions, 0 deletions
diff --git a/yt_dlp/downloader/hls.py b/yt_dlp/downloader/hls.py index f4e41a6c7..270b33b22 100644 --- a/yt_dlp/downloader/hls.py +++ b/yt_dlp/downloader/hls.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import errno import re +import io import binascii try: from Crypto.Cipher import AES @@ -27,7 +28,9 @@ from ..utils import ( parse_m3u8_attributes, sanitize_open, update_url_query, + bug_reports_message, ) +from .. import webvtt class HlsFD(FragmentFD): @@ -78,6 +81,8 @@ class HlsFD(FragmentFD): man_url = info_dict['url'] self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) + is_webvtt = info_dict['ext'] == 'vtt' + urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) man_url = urlh.geturl() s = urlh.read().decode('utf-8', 'ignore') @@ -142,6 +147,8 @@ class HlsFD(FragmentFD): else: self._prepare_and_start_frag_download(ctx) + extra_state = ctx.setdefault('extra_state', {}) + fragment_retries = self.params.get('fragment_retries', 0) skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) test = self.params.get('test', False) @@ -308,6 +315,76 @@ class HlsFD(FragmentFD): return frag_content, frag_index + pack_fragment = lambda frag_content, _: frag_content + + if is_webvtt: + def pack_fragment(frag_content, frag_index): + output = io.StringIO() + adjust = 0 + for block in webvtt.parse_fragment(frag_content): + if isinstance(block, webvtt.CueBlock): + block.start += adjust + block.end += adjust + + dedup_window = extra_state.setdefault('webvtt_dedup_window', []) + cue = block.as_json + + # skip the cue if an identical one appears + # in the window of potential duplicates + # and prune the window of unviable candidates + i = 0 + skip = True + while i < len(dedup_window): + window_cue = dedup_window[i] + if window_cue == cue: + break + if window_cue['end'] >= cue['start']: + i += 1 + continue + del dedup_window[i] + else: + skip = False + + if skip: + continue + + # add the cue to the window + dedup_window.append(cue) + elif isinstance(block, webvtt.Magic): + # take care of MPEG PES timestamp overflow + if block.mpegts is None: + block.mpegts = 0 + extra_state.setdefault('webvtt_mpegts_adjust', 0) + block.mpegts += extra_state['webvtt_mpegts_adjust'] << 33 + if block.mpegts < extra_state.get('webvtt_mpegts_last', 0): + extra_state['webvtt_mpegts_adjust'] += 1 + block.mpegts += 1 << 33 + extra_state['webvtt_mpegts_last'] = block.mpegts + + if frag_index == 1: + extra_state['webvtt_mpegts'] = block.mpegts or 0 + extra_state['webvtt_local'] = block.local or 0 + # XXX: block.local = block.mpegts = None ? + else: + if block.mpegts is not None and block.local is not None: + adjust = ( + (block.mpegts - extra_state.get('webvtt_mpegts', 0)) + - (block.local - extra_state.get('webvtt_local', 0)) + ) + continue + elif isinstance(block, webvtt.HeaderBlock): + if frag_index != 1: + # XXX: this should probably be silent as well + # or verify that all segments contain the same data + self.report_warning(bug_reports_message( + 'Discarding a %s block found in the middle of the stream; ' + 'if the subtitles display incorrectly,' + % (type(block).__name__))) + continue + block.write_into(output) + + return output.getvalue().encode('utf-8') + def append_fragment(frag_content, frag_index): if frag_content: fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], frag_index) @@ -315,6 +392,7 @@ class HlsFD(FragmentFD): file, frag_sanitized = sanitize_open(fragment_filename, 'rb') ctx['fragment_filename_sanitized'] = frag_sanitized file.close() + frag_content = pack_fragment(frag_content, frag_index) self._append_fragment(ctx, frag_content) return True except EnvironmentError as ose: |