diff options
| author | dirkf <fieldhouse@gmx.net> | 2023-05-03 10:02:25 +0100 | 
|---|---|---|
| committer | dirkf <fieldhouse@gmx.net> | 2023-07-19 22:14:50 +0100 | 
| commit | 4566e6e53ebd87c6c548a8414ab5bd742c14c2b0 (patch) | |
| tree | fe53fa4d88bb91c77d8041713cdb42f55e48c0d4 /youtube_dl/extractor/globalplayer.py | |
| parent | 1e8ccdd2eb77901e18feb8a9d48e62d11651cd1e (diff) | |
[GlobalPlayer] Add site extractors back-ported from yt-dlp
* from https://github.com/yt-dlp/yt-dlp/pull/6903, thanks garret1317
Diffstat (limited to 'youtube_dl/extractor/globalplayer.py')
| -rw-r--r-- | youtube_dl/extractor/globalplayer.py | 285 | 
1 files changed, 285 insertions, 0 deletions
# coding: utf-8
# Source: diff --git a/youtube_dl/extractor/globalplayer.py b/youtube_dl/extractor/globalplayer.py
# (new file mode 100644, index 000000000..cceab9e6a, hunk @@ -0,0 +1,285 @@)
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    clean_html,
    join_nonempty,
    merge_dicts,
    parse_duration,
    str_or_none,
    T,
    traverse_obj,
    unified_strdate,
    unified_timestamp,
    urlhandle_detect_ext,
)


class GlobalPlayerBaseIE(InfoExtractor):
    """Shared helpers for the Global Player extractors defined in this module."""

    # Imported in the class body so that _match_valid_url() can reach the
    # module as `cls.re` without a module-level import.
    import re

    @classmethod
    def _match_valid_url(cls, url):
        """Return the match object of `url` against `cls._VALID_URL`, or None."""
        return cls.re.match(cls._VALID_URL, url)

    def _search_nextjs_data(self, webpage, video_id, **kw):
        """Find the `__NEXT_DATA__` <script> element in `webpage` and parse its body as JSON.

        NOTE(review): the same `kw` is forwarded to both _search_regex() and
        _parse_json(), so presumably only keywords accepted by both helpers
        (e.g. `fatal`) can be passed -- confirm against common.py.
        """
        return self._parse_json(
            self._search_regex(
                r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
                webpage, 'next.js data', **kw),
            video_id, **kw)

    def _get_page_props(self, url, video_id):
        """Download `url` and return the `props.pageProps` object of its Next.js data.

        Plain subscription is used, so a missing `props` or `pageProps` key is
        not handled here.
        """
        webpage = self._download_webpage(url, video_id)
        return self._search_nextjs_data(webpage, video_id)['props']['pageProps']

    def _request_ext(self, url, video_id):
        """Return the extension detected from the response to a request for `url`.

        A normal request is made because the server rejects HEAD requests.
        Only the response handle is inspected here, so it is closed explicitly
        afterwards instead of leaving the connection to the garbage collector.
        """
        urlh = self._request_webpage(  # Server rejects HEAD requests
            url, video_id, note='Determining source extension')
        try:
            return urlhandle_detect_ext(urlh)
        finally:
            urlh.close()

    def _extract_audio(self, episode, series):
        """Build the info dict for one audio `episode` belonging to `series`.

        Both arguments are the JSON objects taken from the page props; fields
        are read with traverse_obj(), so absent keys are simply left out.
        """

        def clean_desc(x):
            # Strip HTML, then turn non-breaking spaces into ordinary spaces
            x = clean_html(x)
            if x:
                x = x.replace('\xa0', ' ')
            return x

        # NOTE(review): rev=True presumably gives later dicts (the episode
        # data) precedence over earlier ones -- confirm in utils.merge_dicts
        return merge_dicts({
            'vcodec': 'none',
        }, traverse_obj(series, {
            'series': 'title',
            'series_id': 'id',
            'thumbnail': 'imageUrl',
            'uploader': 'itunesAuthor',  # podcasts only
        }), traverse_obj(episode, {
            'id': 'id',
            'description': ('description', T(clean_desc)),
            'duration': ('duration', T(parse_duration)),
            'thumbnail': 'imageUrl',
            'url': 'streamUrl',
            'timestamp': (('pubDate', 'startDate'), T(unified_timestamp)),
            'title': 'title',
        }, get_all=False), rev=True)


class GlobalPlayerLiveIE(GlobalPlayerBaseIE):
    """Extractor for live station pages (`/live/<brand>/<station>`)."""

    _VALID_URL = r'https?://www\.globalplayer\.com/live/(?P<id>\w+)/\w+'
    _TESTS = [{
        'url': 'https://www.globalplayer.com/live/smoothchill/uk/',
        'info_dict': {
            'id': '2mx1E',
            'ext': 'aac',
            'display_id': 'smoothchill-uk',
            'title': 're:^Smooth Chill.+$',
            'thumbnail': 'https://herald.musicradio.com/media/f296ade8-50c9-4f60-911f-924e96873620.png',
            'description': 'Music To Chill To',
            # 'live_status': 'is_live',
            'is_live': True,
        },
    }, {
        # national station
        'url': 'https://www.globalplayer.com/live/heart/uk/',
        'info_dict': {
            'id': '2mwx4',
            'ext': 'aac',
            'description': 'turn up the feel good!',
            'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png',
            # 'live_status': 'is_live',
            'is_live': True,
            'title': 're:^Heart UK.+$',
            'display_id': 'heart-uk',
        },
    }, {
        # regional variation
        'url': 'https://www.globalplayer.com/live/heart/london/',
        'info_dict': {
            'id': 'AMqg',
            'ext': 'aac',
            'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png',
            'title': 're:^Heart London.+$',
            # 'live_status': 'is_live',
            'is_live': True,
            'display_id': 'heart-london',
            'description': 'turn up the feel good!',
        },
    }]

    def _real_extract(self, url):
        """Return the info dict for the live station at `url`."""
        video_id = self._match_id(url)
        station = self._get_page_props(url, video_id)['station']
        stream_url = station['streamUrl']

        return merge_dicts({
            'id': station['id'],
            'display_id': (
                join_nonempty('brandSlug', 'slug', from_dict=station)
                or station.get('legacyStationPrefix')),
            'url': stream_url,
            'ext': self._request_ext(stream_url, video_id),
            'vcodec': 'none',
            'is_live': True,
        }, {
            # first usable value of `name`, then `brandName`
            'title': self._live_title(traverse_obj(
                station, (('name', 'brandName'), T(str_or_none)),
                get_all=False)),
        }, traverse_obj(station, {
            'description': 'tagline',
            'thumbnail': 'brandLogo',
        }), rev=True)


class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE):
    """Extractor for "live playlist" pages (`/playlists/<id>`)."""

    _VALID_URL = r'https?://www\.globalplayer\.com/playlists/(?P<id>\w+)'
    _TESTS = [{
        # "live playlist"
        'url': 'https://www.globalplayer.com/playlists/8bLk/',
        'info_dict': {
            'id': '8bLk',
            'ext': 'aac',
            # 'live_status': 'is_live',
            'is_live': True,
            'description': 'md5:e10f5e10b01a7f2c14ba815509fbb38d',
            'thumbnail': 'https://images.globalplayer.com/images/551379?width=450&signature=oMLPZIoi5_dBSHnTMREW0Xg76mA=',
            'title': 're:^Classic FM Hall of Fame.+$'
        },
    }]

    def _real_extract(self, url):
        """Return the info dict for the live playlist at `url`."""
        video_id = self._match_id(url)
        station = self._get_page_props(url, video_id)['playlistData']
        stream_url = station['streamUrl']

        return merge_dicts({
            'id': video_id,
            'url': stream_url,
            'ext': self._request_ext(stream_url, video_id),
            'vcodec': 'none',
            'is_live': True,
        }, traverse_obj(station, {
            'title': 'title',
            'description': 'description',
            'thumbnail': 'image',
        }), rev=True)


class GlobalPlayerAudioIE(GlobalPlayerBaseIE):
    """Extractor for podcast and radio catch-up series pages (returns a playlist)."""

    _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)/|catchup/\w+/\w+/)(?P<id>\w+)/?(?:$|[?#])'
    _TESTS = [{
        # podcast
        'url': 'https://www.globalplayer.com/podcasts/42KuaM/',
        'playlist_mincount': 5,
        'info_dict': {
            'id': '42KuaM',
            'title': 'Filthy Ritual',
            'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e',
            'categories': ['Society & Culture', 'True Crime'],
            'uploader': 'Global',
            'description': 'md5:da5b918eac9ae319454a10a563afacf9',
        },
    }, {
        # radio catchup
        'url': 'https://www.globalplayer.com/catchup/lbc/uk/46vyD7z/',
        'playlist_mincount': 2,
        'info_dict': {
            'id': '46vyD7z',
            'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.',
            'title': 'Nick Ferrari',
            'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf',
        },
    }]

    def _real_extract(self, url):
        """Return a playlist of the episodes listed on the series page at `url`."""
        video_id, podcast = self._match_valid_url(url).group('id', 'podcast')
        props = self._get_page_props(url, video_id)
        # the `podcast` group only matches for /podcasts/ URLs
        series = props['podcastInfo'] if podcast else props['catchupInfo']

        return merge_dicts({
            '_type': 'playlist',
            'id': video_id,
            # only episodes that have both an id and a stream URL are used
            'entries': [self._extract_audio(ep, series) for ep in traverse_obj(
                        series, ('episodes', lambda _, v: v['id'] and v['streamUrl']))],
            'categories': traverse_obj(series, ('categories', Ellipsis, 'name')) or None,
        }, traverse_obj(series, {
            'description': 'description',
            'thumbnail': 'imageUrl',
            'title': 'title',
            'uploader': 'itunesAuthor',  # podcasts only
        }), rev=True)


class GlobalPlayerAudioEpisodeIE(GlobalPlayerBaseIE):
    """Extractor for single podcast or radio catch-up episode pages."""

    _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)|catchup/\w+/\w+)/episodes/(?P<id>\w+)/?(?:$|[?#])'
    _TESTS = [{
        # podcast
        'url': 'https://www.globalplayer.com/podcasts/episodes/7DrfNnE/',
        'info_dict': {
            'id': '7DrfNnE',
            'ext': 'mp3',
            'title': 'Filthy Ritual - Trailer',
            'description': 'md5:1f1562fd0f01b4773b590984f94223e0',
            'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e',
            'duration': 225.0,
            'timestamp': 1681254900,
            'series': 'Filthy Ritual',
            'series_id': '42KuaM',
            'upload_date': '20230411',
            'uploader': 'Global',
        },
    }, {
        # radio catchup
        'url': 'https://www.globalplayer.com/catchup/lbc/uk/episodes/2zGq26Vcv1fCWhddC4JAwETXWe/',
        'only_matching': True,
        # expired: refresh the details with a current show for a full test
        'info_dict': {
            'id': '2zGq26Vcv1fCWhddC4JAwETXWe',
            'ext': 'm4a',
            'timestamp': 1682056800,
            'series': 'Nick Ferrari',
            'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf',
            'upload_date': '20230421',
            'series_id': '46vyD7z',
            'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.',
            'title': 'Nick Ferrari',
            'duration': 10800.0,
        },
    }]

    def _real_extract(self, url):
        """Return the info dict for the single episode at `url`."""
        video_id, podcast = self._match_valid_url(url).group('id', 'podcast')
        props = self._get_page_props(url, video_id)
        episode = props['podcastEpisode'] if podcast else props['catchupEpisode']

        # the parent series is looked up under `podcast`, then `show`
        return self._extract_audio(
            episode, traverse_obj(episode, 'podcast', 'show', expected_type=dict) or {})


class GlobalPlayerVideoIE(GlobalPlayerBaseIE):
    """Extractor for video pages (`/videos/<id>`)."""

    _VALID_URL = r'https?://www\.globalplayer\.com/videos/(?P<id>\w+)'
    _TESTS = [{
        'url': 'https://www.globalplayer.com/videos/2JsSZ7Gm2uP/',
        'info_dict': {
            'id': '2JsSZ7Gm2uP',
            'ext': 'mp4',
            'description': 'md5:6a9f063c67c42f218e42eee7d0298bfd',
            'thumbnail': 'md5:d4498af48e15aae4839ce77b97d39550',
            'upload_date': '20230420',
            'title': 'Treble Malakai Bayoh sings a sublime Handel aria at Classic FM Live',
        },
    }]

    def _real_extract(self, url):
        """Return the info dict for the video at `url`."""
        video_id = self._match_id(url)
        meta = self._get_page_props(url, video_id)['videoData']

        return merge_dicts({
            'id': video_id,
        }, traverse_obj(meta, {
            'url': 'url',
            'thumbnail': ('image', 'url'),
            'title': 'title',
            'upload_date': ('publish_date', T(unified_strdate)),
            'description': 'description',
        }), rev=True)
