aboutsummaryrefslogtreecommitdiff
path: root/yt_dlp/extractor/common.py
diff options
context:
space:
mode:
Diffstat (limited to 'yt_dlp/extractor/common.py')
-rw-r--r--yt_dlp/extractor/common.py65
1 files changed, 52 insertions, 13 deletions
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index af534775f..f56ccaf7e 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -247,6 +247,8 @@ class InfoExtractor:
(For internal use only)
* http_chunk_size Chunk size for HTTP downloads
* ffmpeg_args Extra arguments for ffmpeg downloader
+ * is_dash_periods Whether the format is a result of merging
+ multiple DASH periods.
RTMP formats can also have the additional fields: page_url,
app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
rtmp_protocol, rtmp_real_time
@@ -2530,7 +2532,11 @@ class InfoExtractor:
self._report_ignoring_subs('DASH')
return fmts
- def _extract_mpd_formats_and_subtitles(
+ def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
+ periods = self._extract_mpd_periods(*args, **kwargs)
+ return self._merge_mpd_periods(periods)
+
+ def _extract_mpd_periods(
self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
fatal=True, data=None, headers={}, query={}):
@@ -2543,17 +2549,16 @@ class InfoExtractor:
errnote='Failed to download MPD manifest' if errnote is None else errnote,
fatal=fatal, data=data, headers=headers, query=query)
if res is False:
- return [], {}
+ return []
mpd_doc, urlh = res
if mpd_doc is None:
- return [], {}
+ return []
# We could have been redirected to a new url when we retrieved our mpd file.
mpd_url = urlh.url
mpd_base_url = base_url(mpd_url)
- return self._parse_mpd_formats_and_subtitles(
- mpd_doc, mpd_id, mpd_base_url, mpd_url)
+ return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url)
def _parse_mpd_formats(self, *args, **kwargs):
fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
@@ -2561,8 +2566,39 @@ class InfoExtractor:
self._report_ignoring_subs('DASH')
return fmts
- def _parse_mpd_formats_and_subtitles(
- self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
+ def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
+ periods = self._parse_mpd_periods(*args, **kwargs)
+ return self._merge_mpd_periods(periods)
+
+ def _merge_mpd_periods(self, periods):
+ """
+ Combine all formats and subtitles from an MPD manifest into a single list,
+ by concatenating streams with similar formats.
+ """
+ formats, subtitles = {}, {}
+ for period in periods:
+ for f in period['formats']:
+ assert 'is_dash_periods' not in f, 'format already processed'
+ f['is_dash_periods'] = True
+ format_key = tuple(v for k, v in f.items() if k not in (
+ ('format_id', 'fragments', 'manifest_stream_number')))
+ if format_key not in formats:
+ formats[format_key] = f
+ elif 'fragments' in f:
+ formats[format_key].setdefault('fragments', []).extend(f['fragments'])
+
+ if subtitles and period['subtitles']:
+ self.report_warning(bug_reports_message(
+ 'Found subtitles in multiple periods in the DASH manifest; '
+ 'if part of the subtitles are missing,'
+ ), only_once=True)
+
+ for sub_lang, sub_info in period['subtitles'].items():
+ subtitles.setdefault(sub_lang, []).extend(sub_info)
+
+ return list(formats.values()), subtitles
+
+ def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
"""
Parse formats from MPD manifest.
References:
@@ -2641,9 +2677,13 @@ class InfoExtractor:
return ms_info
mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
- formats, subtitles = [], {}
stream_numbers = collections.defaultdict(int)
- for period in mpd_doc.findall(_add_ns('Period')):
+ for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
+ period_entry = {
+ 'id': period.get('id', f'period-{period_idx}'),
+ 'formats': [],
+ 'subtitles': collections.defaultdict(list),
+ }
period_duration = parse_duration(period.get('duration')) or mpd_duration
period_ms_info = extract_multisegment_info(period, {
'start_number': 1,
@@ -2893,11 +2933,10 @@ class InfoExtractor:
if content_type in ('video', 'audio', 'image/jpeg'):
f['manifest_stream_number'] = stream_numbers[f['url']]
stream_numbers[f['url']] += 1
- formats.append(f)
+ period_entry['formats'].append(f)
elif content_type == 'text':
- subtitles.setdefault(lang or 'und', []).append(f)
-
- return formats, subtitles
+ period_entry['subtitles'][lang or 'und'].append(f)
+ yield period_entry
def _extract_ism_formats(self, *args, **kwargs):
fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)