diff options
| -rw-r--r-- | youtube_dl/extractor/dlf.py | 204 | ||||
| -rw-r--r-- | youtube_dl/extractor/extractors.py | 4 | 
2 files changed, 208 insertions, 0 deletions
# coding: utf-8
# youtube_dl/extractor/dlf.py -- extractors for deutschlandfunk.de audio pages
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import (
    compat_str,
)
from ..utils import (
    determine_ext,
    ExtractorError,
    extract_attributes,
    int_or_none,
    merge_dicts,
    traverse_obj,
    url_or_none,
    variadic,
)


class DLFBaseIE(InfoExtractor):
    """Common code for the deutschlandfunk.de extractors.

    Both extractors read all of their metadata from the ``data-audio*``
    attributes of the page's play ("Anhören") ``<button>`` tags.
    """

    _VALID_URL_BASE = r'https?://(?:www\.)?deutschlandfunk\.de/'
    # Captures one complete play-button opening tag carrying an audio id
    _BUTTON_REGEX = r'(<button[^>]+alt="Anhören"[^>]+data-audio-diraid[^>]*>)'

    def _parse_button_attrs(self, button, audio_id=None):
        """Build an info dict from the HTML of one play button.

        @param button    the ``<button ...>`` tag as matched by _BUTTON_REGEX
        @param audio_id  id to use for the result; when empty, the value of
                         the button's ``data-audio-diraid`` attribute is used
        @returns         dict with 'id', 'formats' and the metadata fields
                         read from the button's attributes
        @raises ExtractorError  if none of the known URL attributes holds
                         a usable URL
        """
        attrs = extract_attributes(button)
        audio_id = audio_id or attrs['data-audio-diraid']

        # Candidate attributes for the media URL; only values accepted by
        # url_or_none are considered
        url = traverse_obj(
            attrs, 'data-audio-download-src', 'data-audio', 'data-audioreference',
            'data-audio-src', expected_type=url_or_none)
        if not url:
            # Without this guard a format entry with 'url': None would be
            # built and handed on to _sort_formats()
            raise ExtractorError('Unable to find an audio URL', video_id=audio_id)

        ext = determine_ext(url)
        if ext == 'm3u8':
            formats = self._extract_m3u8_formats(url, audio_id, fatal=False)
        else:
            formats = [{'url': url, 'ext': ext, 'vcodec': 'none'}]
        self._sort_formats(formats)

        def traverse_attrs(path):
            # A trailing callable in the path spec is not a key: it is split
            # off and passed to traverse_obj as the value filter/converter
            path = list(variadic(path))
            t = path.pop() if callable(path[-1]) else None
            return traverse_obj(attrs, path, expected_type=t, get_all=False)

        def txt_or_none(v, default=None):
            # Stringify and strip; empty results collapse to `default`
            return default if v is None else (compat_str(v).strip() or default)

        # info dict field -> attribute name(s), optionally followed by a
        # converter that is applied to the attribute value
        fields = {
            'title': (('data-audiotitle', 'data-audio-title', 'data-audio-download-tracking-title'), txt_or_none),
            'duration': (('data-audioduration', 'data-audio-duration'), int_or_none),
            'thumbnail': ('data-audioimage', url_or_none),
            'uploader': 'data-audio-producer',
            'series': 'data-audio-series',
            'channel': 'data-audio-origin-site-name',
            'webpage_url': ('data-audio-download-tracking-path', url_or_none),
        }
        metadata = dict((k, traverse_attrs(v)) for k, v in fields.items())

        # Same argument order as the original merge_dicts(*reversed([...])):
        # attribute metadata first, then id and formats
        return merge_dicts(metadata, {
            'id': audio_id,
            'formats': formats,
        })


class DLFIE(DLFBaseIE):
    """Extractor for a single deutschlandfunk.de audio page."""

    IE_NAME = 'dlf'
    _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'[\w-]+-dlf-(?P<id>[\da-f]{8})-100\.html'
    _TESTS = [
        # Audio as an HLS stream
        {
            'url': 'https://www.deutschlandfunk.de/tanz-der-saiteninstrumente-das-wild-strings-trio-aus-slowenien-dlf-03a3eb19-100.html',
            'info_dict': {
                'id': '03a3eb19',
                'title': r're:Tanz der Saiteninstrumente [-/] Das Wild Strings Trio aus Slowenien',
                'ext': 'm4a',
                'duration': 3298,
                'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673',
                'uploader': 'Deutschlandfunk',
                'series': 'On Stage',
                'channel': 'deutschlandfunk'
            },
            'params': {
                'skip_download': 'm3u8'
            },
            'skip': 'This webpage no longer exists'
        }, {
            'url': 'https://www.deutschlandfunk.de/russische-athleten-kehren-zurueck-auf-die-sportbuehne-ein-gefaehrlicher-tueroeffner-dlf-d9cc1856-100.html',
            'info_dict': {
                'id': 'd9cc1856',
                'title': 'Russische Athleten kehren zurück auf die Sportbühne: Ein gefährlicher Türöffner',
                'ext': 'mp3',
                'duration': 291,
                'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673',
                'uploader': 'Deutschlandfunk',
                'series': 'Kommentare und Themen der Woche',
                'channel': 'deutschlandfunk'
            }
        },
    ]

    def _real_extract(self, url):
        """Return the info dict for the first play button on the page."""
        audio_id = self._match_id(url)
        webpage = self._download_webpage(url, audio_id)

        # The id from the URL takes precedence over the button's own id
        button = self._search_regex(self._BUTTON_REGEX, webpage, 'button')
        return self._parse_button_attrs(button, audio_id)


class DLFCorpusIE(DLFBaseIE):
    """Playlist extractor: one entry per play button found on the page."""

    IE_NAME = 'dlf:corpus'
    IE_DESC = 'DLF Multi-feed Archives'
    # The negative lookahead excludes URLs of the single-audio form
    # handled by DLFIE
    _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'(?P<id>(?![\w-]+-dlf-[\da-f]{8})[\w-]+-\d+)\.html'
    _TESTS = [
        # Recorded news broadcast with referrals to related broadcasts
        {
            'url': 'https://www.deutschlandfunk.de/fechten-russland-belarus-ukraine-protest-100.html',
            'info_dict': {
                'id': 'fechten-russland-belarus-ukraine-protest-100',
                'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet',
                'description': 'md5:91340aab29c71aa7518ad5be13d1e8ad'
            },
            'playlist_mincount': 5,
            'playlist': [{
                'info_dict': {
                    'id': '1fc5d64a',
                    'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet',
                    'ext': 'mp3',
                    'duration': 252,
                    'thumbnail': 'https://assets.deutschlandfunk.de/aad16241-6b76-4a09-958b-96d0ee1d6f57/512x512.jpg?t=1679480020313',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Sport',
                    'channel': 'deutschlandfunk'
                }
            }, {
                'info_dict': {
                    'id': '2ada145f',
                    'title': r're:(?:Sportpolitik / )?Fechtverband votiert für Rückkehr russischer Athleten',
                    'ext': 'mp3',
                    'duration': 336,
                    'thumbnail': 'https://assets.deutschlandfunk.de/FILE_93982766f7317df30409b8a184ac044a/512x512.jpg?t=1678547581005',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Deutschlandfunk Nova',
                    'channel': 'deutschlandfunk-nova'
                }
            }, {
                'info_dict': {
                    'id': '5e55e8c9',
                    'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis',
                    'ext': 'mp3',
                    'duration': 187,
                    'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Sport am Samstag',
                    'channel': 'deutschlandfunk'
                }
            }, {
                'info_dict': {
                    'id': '47e1a096',
                    'title': r're:Rückkehr Russlands im Fechten [-/] "Fassungslos, dass es einfach so passiert ist"',
                    'ext': 'mp3',
                    'duration': 602,
                    'thumbnail': 'https://assets.deutschlandfunk.de/da4c494a-21cc-48b4-9cc7-40e09fd442c2/512x512.jpg?t=1678562155770',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Sport am Samstag',
                    'channel': 'deutschlandfunk'
                }
            }, {
                'info_dict': {
                    'id': '5e55e8c9',
                    'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis',
                    'ext': 'mp3',
                    'duration': 187,
                    'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Sport am Samstag',
                    'channel': 'deutschlandfunk'
                }
            }]
        },
        # Podcast feed with tag buttons, playlist count fluctuates
        {
            'url': 'https://www.deutschlandfunk.de/kommentare-und-themen-der-woche-100.html',
            'info_dict': {
                'id': 'kommentare-und-themen-der-woche-100',
                'title': 'Meinung - Kommentare und Themen der Woche',
                'description': 'md5:2901bbd65cd2d45e116d399a099ce5d5',
            },
            'playlist_mincount': 10,
        },
        # Podcast feed with no description
        {
            'url': 'https://www.deutschlandfunk.de/podcast-tolle-idee-100.html',
            'info_dict': {
                'id': 'podcast-tolle-idee-100',
                'title': 'Wissenschaftspodcast - Tolle Idee! - Was wurde daraus?',
            },
            'playlist_mincount': 11,
        },
    ]

    def _real_extract(self, url):
        """Return a playlist built from every play button on the page."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        # Title and description are optional: a missing meta tag gives None
        title = self._html_search_meta(
            ['og:title', 'twitter:title'], webpage, default=None)
        description = self._html_search_meta(
            ['description', 'og:description', 'twitter:description'], webpage, default=None)

        # Each button carries its own id, so no audio_id is passed here
        entries = map(self._parse_button_attrs, re.findall(self._BUTTON_REGEX, webpage))
        return self.playlist_result(entries, playlist_id, title, description)


# NOTE: the same change registers these extractors in
# youtube_dl/extractor/extractors.py (inserted after `.democracynow`):
#
#     from .dlf import (
#         DLFCorpusIE,
#         DLFIE,
#     )
