aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorbashonly <88596187+bashonly@users.noreply.github.com>2025-02-18 18:19:02 -0600
committerGitHub <noreply@github.com>2025-02-19 00:19:02 +0000
commite7882b682b959e476d8454911655b3e9b14c79b2 (patch)
treed83cb9c4ca48bb22943c937b9f829ec48f365d89
parent6ca23ffaa4663cb552f937f0b1e9769b66db11bd (diff)
[ie/3sat] Fix extractor (#12403)
Fix 241ace4f104d50fdf7638f9203927aefcf57a1f7 Closes #12391 Authored by: bashonly
-rw-r--r--yt_dlp/extractor/dreisat.py31
-rw-r--r--yt_dlp/extractor/zdf.py230
2 files changed, 142 insertions, 119 deletions
diff --git a/yt_dlp/extractor/dreisat.py b/yt_dlp/extractor/dreisat.py
index 4b0a269b9..376ff672d 100644
--- a/yt_dlp/extractor/dreisat.py
+++ b/yt_dlp/extractor/dreisat.py
@@ -1,10 +1,24 @@
-from .zdf import ZDFIE
+from .zdf import ZDFBaseIE
-class DreiSatIE(ZDFIE): # XXX: Do not subclass from concrete IE
+class DreiSatIE(ZDFBaseIE):
IE_NAME = '3sat'
_VALID_URL = r'https?://(?:www\.)?3sat\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html'
_TESTS = [{
+ 'url': 'https://www.3sat.de/dokumentation/reise/traumziele-suedostasiens-die-philippinen-und-vietnam-102.html',
+ 'info_dict': {
+ 'id': '231124_traumziele_philippinen_und_vietnam_dokreise',
+ 'ext': 'mp4',
+ 'title': 'Traumziele Südostasiens (1/2): Die Philippinen und Vietnam',
+ 'description': 'md5:26329ce5197775b596773b939354079d',
+ 'duration': 2625.0,
+ 'thumbnail': 'https://www.3sat.de/assets/traumziele-suedostasiens-die-philippinen-und-vietnam-100~2400x1350?cb=1699870351148',
+ 'episode': 'Traumziele Südostasiens (1/2): Die Philippinen und Vietnam',
+ 'episode_id': 'POS_cc7ff51c-98cf-4d12-b99d-f7a551de1c95',
+ 'timestamp': 1738593000,
+ 'upload_date': '20250203',
+ },
+ }, {
# Same as https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html
'url': 'https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html',
'md5': '0aff3e7bc72c8813f5e0fae333316a1d',
@@ -17,6 +31,7 @@ class DreiSatIE(ZDFIE): # XXX: Do not subclass from concrete IE
'timestamp': 1608604200,
'upload_date': '20201222',
},
+ 'skip': '410 Gone',
}, {
'url': 'https://www.3sat.de/gesellschaft/schweizweit/waidmannsheil-100.html',
'info_dict': {
@@ -30,6 +45,7 @@ class DreiSatIE(ZDFIE): # XXX: Do not subclass from concrete IE
'params': {
'skip_download': True,
},
+ 'skip': '404 Not Found',
}, {
# Same as https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html
'url': 'https://www.3sat.de/film/spielfilm/der-hauptmann-100.html',
@@ -39,3 +55,14 @@ class DreiSatIE(ZDFIE): # XXX: Do not subclass from concrete IE
'url': 'https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html',
'only_matching': True,
}]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id, fatal=False)
+ if webpage:
+ player = self._extract_player(webpage, url, fatal=False)
+ if player:
+ return self._extract_regular(url, player, video_id)
+
+ return self._extract_mobile(video_id)
diff --git a/yt_dlp/extractor/zdf.py b/yt_dlp/extractor/zdf.py
index b64a88f6c..4bef1017c 100644
--- a/yt_dlp/extractor/zdf.py
+++ b/yt_dlp/extractor/zdf.py
@@ -137,6 +137,116 @@ class ZDFBaseIE(InfoExtractor):
group='json'),
video_id)
+ def _extract_entry(self, url, player, content, video_id):
+ title = content.get('title') or content['teaserHeadline']
+
+ t = content['mainVideoContent']['http://zdf.de/rels/target']
+ ptmd_path = traverse_obj(t, (
+ (('streams', 'default'), None),
+ ('http://zdf.de/rels/streams/ptmd', 'http://zdf.de/rels/streams/ptmd-template'),
+ ), get_all=False)
+ if not ptmd_path:
+ raise ExtractorError('Could not extract ptmd_path')
+
+ info = self._extract_ptmd(
+ urljoin(url, ptmd_path.replace('{playerId}', 'android_native_5')), video_id, player['apiToken'], url)
+
+ thumbnails = []
+ layouts = try_get(
+ content, lambda x: x['teaserImageRef']['layouts'], dict)
+ if layouts:
+ for layout_key, layout_url in layouts.items():
+ layout_url = url_or_none(layout_url)
+ if not layout_url:
+ continue
+ thumbnail = {
+ 'url': layout_url,
+ 'format_id': layout_key,
+ }
+ mobj = re.search(r'(?P<width>\d+)x(?P<height>\d+)', layout_key)
+ if mobj:
+ thumbnail.update({
+ 'width': int(mobj.group('width')),
+ 'height': int(mobj.group('height')),
+ })
+ thumbnails.append(thumbnail)
+
+ chapter_marks = t.get('streamAnchorTag') or []
+ chapter_marks.append({'anchorOffset': int_or_none(t.get('duration'))})
+ chapters = [{
+ 'start_time': chap.get('anchorOffset'),
+ 'end_time': next_chap.get('anchorOffset'),
+ 'title': chap.get('anchorLabel'),
+ } for chap, next_chap in zip(chapter_marks, chapter_marks[1:])]
+
+ return merge_dicts(info, {
+ 'title': title,
+ 'description': content.get('leadParagraph') or content.get('teasertext'),
+ 'duration': int_or_none(t.get('duration')),
+ 'timestamp': unified_timestamp(content.get('editorialDate')),
+ 'thumbnails': thumbnails,
+ 'chapters': chapters or None,
+ 'episode': title,
+ **traverse_obj(content, ('programmeItem', 0, 'http://zdf.de/rels/target', {
+ 'series_id': ('http://zdf.de/rels/cmdm/series', 'seriesUuid', {str}),
+ 'series': ('http://zdf.de/rels/cmdm/series', 'seriesTitle', {str}),
+ 'season': ('http://zdf.de/rels/cmdm/season', 'seasonTitle', {str}),
+ 'season_number': ('http://zdf.de/rels/cmdm/season', 'seasonNumber', {int_or_none}),
+ 'season_id': ('http://zdf.de/rels/cmdm/season', 'seasonUuid', {str}),
+ 'episode_number': ('episodeNumber', {int_or_none}),
+ 'episode_id': ('contentId', {str}),
+ })),
+ })
+
+ def _extract_regular(self, url, player, video_id, query=None):
+ player_url = player['content']
+
+ content = self._call_api(
+ update_url_query(player_url, query),
+ video_id, 'content', player['apiToken'], url)
+
+ return self._extract_entry(player_url, player, content, video_id)
+
+ def _extract_mobile(self, video_id):
+ video = self._download_v2_doc(video_id)
+
+ formats = []
+ formitaeten = try_get(video, lambda x: x['document']['formitaeten'], list)
+ document = formitaeten and video['document']
+ if formitaeten:
+ title = document['titel']
+ content_id = document['basename']
+
+ format_urls = set()
+ for f in formitaeten or []:
+ self._extract_format(content_id, formats, format_urls, f)
+
+ thumbnails = []
+ teaser_bild = document.get('teaserBild')
+ if isinstance(teaser_bild, dict):
+ for thumbnail_key, thumbnail in teaser_bild.items():
+ thumbnail_url = try_get(
+ thumbnail, lambda x: x['url'], str)
+ if thumbnail_url:
+ thumbnails.append({
+ 'url': thumbnail_url,
+ 'id': thumbnail_key,
+ 'width': int_or_none(thumbnail.get('width')),
+ 'height': int_or_none(thumbnail.get('height')),
+ })
+
+ return {
+ 'id': content_id,
+ 'title': title,
+ 'description': document.get('beschreibung'),
+ 'duration': int_or_none(document.get('length')),
+ 'timestamp': unified_timestamp(document.get('date')) or unified_timestamp(
+ try_get(video, lambda x: x['meta']['editorialDate'], str)),
+ 'thumbnails': thumbnails,
+ 'subtitles': self._extract_subtitles(document),
+ 'formats': formats,
+ }
+
class ZDFIE(ZDFBaseIE):
_VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html'
@@ -306,121 +416,6 @@ class ZDFIE(ZDFBaseIE):
},
}]
- def _extract_entry(self, url, player, content, video_id):
- title = content.get('title') or content['teaserHeadline']
-
- t = content['mainVideoContent']['http://zdf.de/rels/target']
- ptmd_path = traverse_obj(t, (
- (('streams', 'default'), None),
- ('http://zdf.de/rels/streams/ptmd', 'http://zdf.de/rels/streams/ptmd-template'),
- ), get_all=False)
- if not ptmd_path:
- raise ExtractorError('Could not extract ptmd_path')
-
- info = self._extract_ptmd(
- urljoin(url, ptmd_path.replace('{playerId}', 'android_native_5')), video_id, player['apiToken'], url)
-
- thumbnails = []
- layouts = try_get(
- content, lambda x: x['teaserImageRef']['layouts'], dict)
- if layouts:
- for layout_key, layout_url in layouts.items():
- layout_url = url_or_none(layout_url)
- if not layout_url:
- continue
- thumbnail = {
- 'url': layout_url,
- 'format_id': layout_key,
- }
- mobj = re.search(r'(?P<width>\d+)x(?P<height>\d+)', layout_key)
- if mobj:
- thumbnail.update({
- 'width': int(mobj.group('width')),
- 'height': int(mobj.group('height')),
- })
- thumbnails.append(thumbnail)
-
- chapter_marks = t.get('streamAnchorTag') or []
- chapter_marks.append({'anchorOffset': int_or_none(t.get('duration'))})
- chapters = [{
- 'start_time': chap.get('anchorOffset'),
- 'end_time': next_chap.get('anchorOffset'),
- 'title': chap.get('anchorLabel'),
- } for chap, next_chap in zip(chapter_marks, chapter_marks[1:])]
-
- return merge_dicts(info, {
- 'title': title,
- 'description': content.get('leadParagraph') or content.get('teasertext'),
- 'duration': int_or_none(t.get('duration')),
- 'timestamp': unified_timestamp(content.get('editorialDate')),
- 'thumbnails': thumbnails,
- 'chapters': chapters or None,
- 'episode': title,
- **traverse_obj(content, ('programmeItem', 0, 'http://zdf.de/rels/target', {
- 'series_id': ('http://zdf.de/rels/cmdm/series', 'seriesUuid', {str}),
- 'series': ('http://zdf.de/rels/cmdm/series', 'seriesTitle', {str}),
- 'season': ('http://zdf.de/rels/cmdm/season', 'seasonTitle', {str}),
- 'season_number': ('http://zdf.de/rels/cmdm/season', 'seasonNumber', {int_or_none}),
- 'season_id': ('http://zdf.de/rels/cmdm/season', 'seasonUuid', {str}),
- 'episode_number': ('episodeNumber', {int_or_none}),
- 'episode_id': ('contentId', {str}),
- })),
- })
-
- def _extract_regular(self, url, player, video_id):
- player_url = player['content']
-
- try:
- content = self._call_api(
- update_url_query(player_url, {'profile': 'player-3'}),
- video_id, 'content', player['apiToken'], url)
- except ExtractorError as e:
- self.report_warning(f'{video_id}: {e.orig_msg}; retrying with v2 profile')
- content = self._call_api(
- player_url, video_id, 'content', player['apiToken'], url)
-
- return self._extract_entry(player_url, player, content, video_id)
-
- def _extract_mobile(self, video_id):
- video = self._download_v2_doc(video_id)
-
- formats = []
- formitaeten = try_get(video, lambda x: x['document']['formitaeten'], list)
- document = formitaeten and video['document']
- if formitaeten:
- title = document['titel']
- content_id = document['basename']
-
- format_urls = set()
- for f in formitaeten or []:
- self._extract_format(content_id, formats, format_urls, f)
-
- thumbnails = []
- teaser_bild = document.get('teaserBild')
- if isinstance(teaser_bild, dict):
- for thumbnail_key, thumbnail in teaser_bild.items():
- thumbnail_url = try_get(
- thumbnail, lambda x: x['url'], str)
- if thumbnail_url:
- thumbnails.append({
- 'url': thumbnail_url,
- 'id': thumbnail_key,
- 'width': int_or_none(thumbnail.get('width')),
- 'height': int_or_none(thumbnail.get('height')),
- })
-
- return {
- 'id': content_id,
- 'title': title,
- 'description': document.get('beschreibung'),
- 'duration': int_or_none(document.get('length')),
- 'timestamp': unified_timestamp(document.get('date')) or unified_timestamp(
- try_get(video, lambda x: x['meta']['editorialDate'], str)),
- 'thumbnails': thumbnails,
- 'subtitles': self._extract_subtitles(document),
- 'formats': formats,
- }
-
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -428,7 +423,7 @@ class ZDFIE(ZDFBaseIE):
if webpage:
player = self._extract_player(webpage, url, fatal=False)
if player:
- return self._extract_regular(url, player, video_id)
+ return self._extract_regular(url, player, video_id, query={'profile': 'player-3'})
return self._extract_mobile(video_id)
@@ -474,7 +469,8 @@ class ZDFChannelIE(ZDFBaseIE):
'title': ('titel', {str}),
'description': ('beschreibung', {str}),
'duration': ('length', {float_or_none}),
- # TODO: seasonNumber and episodeNumber can be extracted but need to also be in ZDFIE
+ 'season_number': ('seasonNumber', {int_or_none}),
+ 'episode_number': ('episodeNumber', {int_or_none}),
}))
def _entries(self, data, document_id):