aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordirkf <fieldhouse@gmx.net>2023-05-03 10:07:35 +0100
committerdirkf <fieldhouse@gmx.net>2023-07-19 22:14:50 +0100
commit4339910df3fe97a054069cb98da594dd1b50c13a (patch)
tree275c0a50ad98f8221b2fa10a892f175d4925de99
parenteaaf4c6736b98e20a1923162ae05952c8cb51ee1 (diff)
[DLF] Add site extractors back-ported from yt-dlp
* from https://github.com/yt-dlp/yt-dlp/pull/6697, thanks nick-cd
-rw-r--r--youtube_dl/extractor/dlf.py204
-rw-r--r--youtube_dl/extractor/extractors.py4
2 files changed, 208 insertions, 0 deletions
diff --git a/youtube_dl/extractor/dlf.py b/youtube_dl/extractor/dlf.py
new file mode 100644
index 000000000..cc3de4582
--- /dev/null
+++ b/youtube_dl/extractor/dlf.py
@@ -0,0 +1,204 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+)
+from ..utils import (
+ determine_ext,
+ extract_attributes,
+ int_or_none,
+ merge_dicts,
+ traverse_obj,
+ url_or_none,
+ variadic,
+)
+
+
+class DLFBaseIE(InfoExtractor):
+ _VALID_URL_BASE = r'https?://(?:www\.)?deutschlandfunk\.de/'
+ _BUTTON_REGEX = r'(<button[^>]+alt="Anhören"[^>]+data-audio-diraid[^>]*>)'
+
+ def _parse_button_attrs(self, button, audio_id=None):
+ attrs = extract_attributes(button)
+ audio_id = audio_id or attrs['data-audio-diraid']
+
+ url = traverse_obj(
+ attrs, 'data-audio-download-src', 'data-audio', 'data-audioreference',
+ 'data-audio-src', expected_type=url_or_none)
+ ext = determine_ext(url)
+ formats = (self._extract_m3u8_formats(url, audio_id, fatal=False)
+ if ext == 'm3u8' else [{'url': url, 'ext': ext, 'vcodec': 'none'}])
+ self._sort_formats(formats)
+
+ def traverse_attrs(path):
+ path = list(variadic(path))
+ t = path.pop() if callable(path[-1]) else None
+ return traverse_obj(attrs, path, expected_type=t, get_all=False)
+
+ def txt_or_none(v, default=None):
+ return default if v is None else (compat_str(v).strip() or default)
+
+ return merge_dicts(*reversed([{
+ 'id': audio_id,
+ # 'extractor_key': DLFIE.ie_key(),
+ # 'extractor': DLFIE.IE_NAME,
+ 'formats': formats,
+ }, dict((k, traverse_attrs(v)) for k, v in {
+ 'title': (('data-audiotitle', 'data-audio-title', 'data-audio-download-tracking-title'), txt_or_none),
+ 'duration': (('data-audioduration', 'data-audio-duration'), int_or_none),
+ 'thumbnail': ('data-audioimage', url_or_none),
+ 'uploader': 'data-audio-producer',
+ 'series': 'data-audio-series',
+ 'channel': 'data-audio-origin-site-name',
+ 'webpage_url': ('data-audio-download-tracking-path', url_or_none),
+ }.items())]))
+
+
+class DLFIE(DLFBaseIE):
+ IE_NAME = 'dlf'
+ _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'[\w-]+-dlf-(?P<id>[\da-f]{8})-100\.html'
+ _TESTS = [
+ # Audio as an HLS stream
+ {
+ 'url': 'https://www.deutschlandfunk.de/tanz-der-saiteninstrumente-das-wild-strings-trio-aus-slowenien-dlf-03a3eb19-100.html',
+ 'info_dict': {
+ 'id': '03a3eb19',
+ 'title': r're:Tanz der Saiteninstrumente [-/] Das Wild Strings Trio aus Slowenien',
+ 'ext': 'm4a',
+ 'duration': 3298,
+ 'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673',
+ 'uploader': 'Deutschlandfunk',
+ 'series': 'On Stage',
+ 'channel': 'deutschlandfunk'
+ },
+ 'params': {
+ 'skip_download': 'm3u8'
+ },
+ 'skip': 'This webpage no longer exists'
+ }, {
+ 'url': 'https://www.deutschlandfunk.de/russische-athleten-kehren-zurueck-auf-die-sportbuehne-ein-gefaehrlicher-tueroeffner-dlf-d9cc1856-100.html',
+ 'info_dict': {
+ 'id': 'd9cc1856',
+ 'title': 'Russische Athleten kehren zurück auf die Sportbühne: Ein gefährlicher Türöffner',
+ 'ext': 'mp3',
+ 'duration': 291,
+ 'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673',
+ 'uploader': 'Deutschlandfunk',
+ 'series': 'Kommentare und Themen der Woche',
+ 'channel': 'deutschlandfunk'
+ }
+ },
+ ]
+
+ def _real_extract(self, url):
+ audio_id = self._match_id(url)
+ webpage = self._download_webpage(url, audio_id)
+
+ return self._parse_button_attrs(
+ self._search_regex(self._BUTTON_REGEX, webpage, 'button'), audio_id)
+
+
+class DLFCorpusIE(DLFBaseIE):
+ IE_NAME = 'dlf:corpus'
+ IE_DESC = 'DLF Multi-feed Archives'
+ _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'(?P<id>(?![\w-]+-dlf-[\da-f]{8})[\w-]+-\d+)\.html'
+ _TESTS = [
+ # Recorded news broadcast with referrals to related broadcasts
+ {
+ 'url': 'https://www.deutschlandfunk.de/fechten-russland-belarus-ukraine-protest-100.html',
+ 'info_dict': {
+ 'id': 'fechten-russland-belarus-ukraine-protest-100',
+ 'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet',
+ 'description': 'md5:91340aab29c71aa7518ad5be13d1e8ad'
+ },
+ 'playlist_mincount': 5,
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '1fc5d64a',
+ 'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet',
+ 'ext': 'mp3',
+ 'duration': 252,
+ 'thumbnail': 'https://assets.deutschlandfunk.de/aad16241-6b76-4a09-958b-96d0ee1d6f57/512x512.jpg?t=1679480020313',
+ 'uploader': 'Deutschlandfunk',
+ 'series': 'Sport',
+ 'channel': 'deutschlandfunk'
+ }
+ }, {
+ 'info_dict': {
+ 'id': '2ada145f',
+ 'title': r're:(?:Sportpolitik / )?Fechtverband votiert für Rückkehr russischer Athleten',
+ 'ext': 'mp3',
+ 'duration': 336,
+ 'thumbnail': 'https://assets.deutschlandfunk.de/FILE_93982766f7317df30409b8a184ac044a/512x512.jpg?t=1678547581005',
+ 'uploader': 'Deutschlandfunk',
+ 'series': 'Deutschlandfunk Nova',
+ 'channel': 'deutschlandfunk-nova'
+ }
+ }, {
+ 'info_dict': {
+ 'id': '5e55e8c9',
+ 'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis',
+ 'ext': 'mp3',
+ 'duration': 187,
+ 'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412',
+ 'uploader': 'Deutschlandfunk',
+ 'series': 'Sport am Samstag',
+ 'channel': 'deutschlandfunk'
+ }
+ }, {
+ 'info_dict': {
+ 'id': '47e1a096',
+ 'title': r're:Rückkehr Russlands im Fechten [-/] "Fassungslos, dass es einfach so passiert ist"',
+ 'ext': 'mp3',
+ 'duration': 602,
+ 'thumbnail': 'https://assets.deutschlandfunk.de/da4c494a-21cc-48b4-9cc7-40e09fd442c2/512x512.jpg?t=1678562155770',
+ 'uploader': 'Deutschlandfunk',
+ 'series': 'Sport am Samstag',
+ 'channel': 'deutschlandfunk'
+ }
+ }, {
+ 'info_dict': {
+ 'id': '5e55e8c9',
+ 'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis',
+ 'ext': 'mp3',
+ 'duration': 187,
+ 'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412',
+ 'uploader': 'Deutschlandfunk',
+ 'series': 'Sport am Samstag',
+ 'channel': 'deutschlandfunk'
+ }
+ }]
+ },
+ # Podcast feed with tag buttons, playlist count fluctuates
+ {
+ 'url': 'https://www.deutschlandfunk.de/kommentare-und-themen-der-woche-100.html',
+ 'info_dict': {
+ 'id': 'kommentare-und-themen-der-woche-100',
+ 'title': 'Meinung - Kommentare und Themen der Woche',
+ 'description': 'md5:2901bbd65cd2d45e116d399a099ce5d5',
+ },
+ 'playlist_mincount': 10,
+ },
+ # Podcast feed with no description
+ {
+ 'url': 'https://www.deutschlandfunk.de/podcast-tolle-idee-100.html',
+ 'info_dict': {
+ 'id': 'podcast-tolle-idee-100',
+ 'title': 'Wissenschaftspodcast - Tolle Idee! - Was wurde daraus?',
+ },
+ 'playlist_mincount': 11,
+ },
+ ]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+ webpage = self._download_webpage(url, playlist_id)
+
+ return self.playlist_result(
+ map(self._parse_button_attrs, re.findall(self._BUTTON_REGEX, webpage)),
+ playlist_id, self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None),
+ self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage, default=None))
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 9f247dbbf..be73c0665 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -295,6 +295,10 @@ from .dbtv import DBTVIE
from .dctp import DctpTvIE
from .deezer import DeezerPlaylistIE
from .democracynow import DemocracynowIE
+from .dlf import (
+ DLFCorpusIE,
+ DLFIE,
+)
from .dfb import DFBIE
from .dhm import DHMIE
from .digg import DiggIE