diff options
| author | dirkf <fieldhouse@gmx.net> | 2024-06-01 13:43:37 +0100 | 
|---|---|---|
| committer | dirkf <fieldhouse@gmx.net> | 2024-06-11 12:52:13 +0100 | 
| commit | e20ca543f037bd3a8e38507b870ed3a3de3c32e7 (patch) | |
| tree | dcc1cd84a30cec34d27ecc0f601367622b14c0ed | |
| parent | e39466051f01411944bd657fe826b658a0df5af1 (diff) | |
[ORF] Re-factor and update `ORFFM4StoryIE`
* fix getting media via DASH instead of inaccessible mp4
* also get in-page YT media
| -rw-r--r-- | youtube_dl/extractor/orf.py | 255 | 
1 file changed, 127 insertions, 128 deletions
| diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 25c16c84d..f03aa40dc 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -6,6 +6,7 @@ import functools  import re  from .common import InfoExtractor +from .youtube import YoutubeIE  from ..utils import (      clean_html,      determine_ext, @@ -14,10 +15,8 @@ from ..utils import (      int_or_none,      merge_dicts,      mimetype2ext, -    orderedSet,      parse_age_limit,      parse_iso8601, -    remove_end,      strip_jsonp,      txt_or_none,      unified_strdate, @@ -305,11 +304,90 @@ class ORFPodcastIE(ORFRadioBase):          }, self._extract_podcast_upload(data), rev=True) -class ORFIPTVIE(InfoExtractor): +class ORFIPTVBase(InfoExtractor): +    _TITLE_STRIP_RE = '' + +    def _extract_video(self, video_id, webpage, fatal=False): + +        data = self._download_json( +            'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, +            video_id)[0] + +        video = traverse_obj(data, ( +            'sources', ('default', 'q8c'), +            T(lambda x: x if x['loadBalancerUrl'] else None), +            any)) + +        load_balancer_url = video['loadBalancerUrl'] + +        try: +            rendition = self._download_json( +                load_balancer_url, video_id, transform_source=strip_jsonp) +        except ExtractorError: +            rendition = None + +        if not rendition: +            rendition = { +                'redirect': { +                    'smil': re.sub( +                        r'(/)jsonp(/.+\.)mp4$', r'\1dash\2smil/manifest.mpd', +                        load_balancer_url), +                }, +            } + +        f = traverse_obj(video, { +            'abr': ('audioBitrate', T(int_or_none)), +            'vbr': ('bitrate', T(int_or_none)), +            'fps': ('videoFps', T(int_or_none)), +            'width': ('videoWidth', T(int_or_none)), +            'height': 
('videoHeight', T(int_or_none)), +        }) + +        formats = [] +        for format_id, format_url in traverse_obj(rendition, ( +                'redirect', T(dict.items), Ellipsis)): +            if format_id == 'rtmp': +                ff = f.copy() +                ff.update({ +                    'url': format_url, +                    'format_id': format_id, +                }) +                formats.append(ff) +            elif determine_ext(format_url) == 'f4m': +                formats.extend(self._extract_f4m_formats( +                    format_url, video_id, f4m_id=format_id)) +            elif determine_ext(format_url) == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    format_url, video_id, 'mp4', m3u8_id=format_id, +                    entry_protocol='m3u8_native')) +            elif determine_ext(format_url) == 'mpd': +                formats.extend(self._extract_mpd_formats( +                    format_url, video_id, mpd_id=format_id)) + +        if formats or fatal: +            self._sort_formats(formats) +        else: +            return + +        return merge_dicts({ +            'id': video_id, +            'title': re.sub(self._TITLE_STRIP_RE, '', self._og_search_title(webpage)), +            'description': self._og_search_description(webpage), +            'upload_date': unified_strdate(self._html_search_meta( +                'dc.date', webpage, 'upload date', fatal=False)), +            'formats': formats, +        }, traverse_obj(data, { +            'duration': ('duration', T(k_float_or_none)), +            'thumbnail': ('sources', 'default', 'preview', T(url_or_none)), +        }), rev=True) + + +class ORFIPTVIE(ORFIPTVBase):      IE_NAME = 'orf:iptv'      IE_DESC = 'iptv.ORF.at'      _WORKING = False  # URLs redirect to orf.at/      _VALID_URL = r'https?://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)' +    _TITLE_STRIP_RE = r'\s+-\s+iptv\.ORF\.at\S*$'      _TEST = {          'url': 
'http://iptv.orf.at/stories/2275236/', @@ -334,74 +412,32 @@ class ORFIPTVIE(InfoExtractor):          video_id = self._search_regex(              r'data-video(?:id)?="(\d+)"', webpage, 'video id') -        data = self._download_json( -            'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, -            video_id)[0] - -        duration = float_or_none(data['duration'], 1000) - -        video = data['sources']['default'] -        load_balancer_url = video['loadBalancerUrl'] -        abr = int_or_none(video.get('audioBitrate')) -        vbr = int_or_none(video.get('bitrate')) -        fps = int_or_none(video.get('videoFps')) -        width = int_or_none(video.get('videoWidth')) -        height = int_or_none(video.get('videoHeight')) -        thumbnail = video.get('preview') - -        rendition = self._download_json( -            load_balancer_url, video_id, transform_source=strip_jsonp) - -        f = { -            'abr': abr, -            'vbr': vbr, -            'fps': fps, -            'width': width, -            'height': height, -        } - -        formats = [] -        for format_id, format_url in rendition['redirect'].items(): -            if format_id == 'rtmp': -                ff = f.copy() -                ff.update({ -                    'url': format_url, -                    'format_id': format_id, -                }) -                formats.append(ff) -            elif determine_ext(format_url) == 'f4m': -                formats.extend(self._extract_f4m_formats( -                    format_url, video_id, f4m_id=format_id)) -            elif determine_ext(format_url) == 'm3u8': -                formats.extend(self._extract_m3u8_formats( -                    format_url, video_id, 'mp4', m3u8_id=format_id)) -            else: -                continue -        self._sort_formats(formats) +        return self._extract_video(video_id, webpage) -        title = remove_end(self._og_search_title(webpage), ' - 
iptv.ORF.at') -        description = self._og_search_description(webpage) -        upload_date = unified_strdate(self._html_search_meta( -            'dc.date', webpage, 'upload date')) - -        return { -            'id': video_id, -            'title': title, -            'description': description, -            'duration': duration, -            'thumbnail': thumbnail, -            'upload_date': upload_date, -            'formats': formats, -        } - -class ORFFM4StoryIE(InfoExtractor): +class ORFFM4StoryIE(ORFIPTVBase):      IE_NAME = 'orf:fm4:story'      IE_DESC = 'fm4.orf.at stories'      _VALID_URL = r'https?://fm4\.orf\.at/stories/(?P<id>\d+)' +    _TITLE_STRIP_RE = r'\s+-\s+fm4\.ORF\.at\s*$' -    _TEST = { +    _TESTS = [{ +        'url': 'https://fm4.orf.at/stories/3041554/', +        'add_ie': ['Youtube'], +        'info_dict': { +            'id': '3041554', +            'title': 'Is The EU Green Deal In Mortal Danger?', +        }, +        'playlist_count': 4, +        'params': { +            'format': 'bestvideo', +        }, +    }, {          'url': 'http://fm4.orf.at/stories/2865738/', +        'info_dict': { +            'id': '2865738', +            'title': 'Manu Delago und Inner Tongue live', +        },          'playlist': [{              'md5': 'e1c2c706c45c7b34cf478bbf409907ca',              'info_dict': { @@ -418,86 +454,49 @@ class ORFFM4StoryIE(InfoExtractor):              'info_dict': {                  'id': '547798',                  'ext': 'flv', -                'title': 'Manu Delago und Inner Tongue live (2)', +                'title': 'Manu Delago und Inner Tongue https://vod-ww.mdn.ors.at/cms-worldwide_episodes_nas/_definst_/nas/cms-worldwide_episodes/online/14228823_0005.smil/chunklist_b992000_vo.m3u8live (2)',                  'duration': 1504.08,                  'thumbnail': r're:^https?://.*\.jpg$',                  'upload_date': '20170913',                  'description': 'Manu Delago und Inner Tongue haben bei der 
FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.',              },          }], -    } +        'skip': 'Videos gone', +    }]      def _real_extract(self, url):          story_id = self._match_id(url)          webpage = self._download_webpage(url, story_id)          entries = [] -        all_ids = orderedSet(re.findall(r'data-video(?:id)?="(\d+)"', webpage)) -        for idx, video_id in enumerate(all_ids): -            data = self._download_json( -                'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, -                video_id)[0] - -            duration = float_or_none(data['duration'], 1000) - -            video = data['sources']['q8c'] -            load_balancer_url = video['loadBalancerUrl'] -            abr = int_or_none(video.get('audioBitrate')) -            vbr = int_or_none(video.get('bitrate')) -            fps = int_or_none(video.get('videoFps')) -            width = int_or_none(video.get('videoWidth')) -            height = int_or_none(video.get('videoHeight')) -            thumbnail = video.get('preview') +        seen_ids = set() +        for idx, video_id in enumerate(re.findall(r'data-video(?:id)?="(\d+)"', webpage)): +            if video_id in seen_ids: +                continue +            seen_ids.add(video_id) +            entry = self._extract_video(video_id, webpage, fatal=False) +            if not entry: +                continue -            rendition = self._download_json( -                load_balancer_url, video_id, transform_source=strip_jsonp) +            if idx >= 1: +                # Titles are duplicates, make them unique +                entry['title'] = '%s (%d)' % (entry['title'], idx) -            f = { -                'abr': abr, -                'vbr': vbr, -                'fps': fps, -                'width': width, -                'height': height, -            } +            entries.append(entry) -            formats = [] 
-            for format_id, format_url in rendition['redirect'].items(): -                if format_id == 'rtmp': -                    ff = f.copy() -                    ff.update({ -                        'url': format_url, -                        'format_id': format_id, -                    }) -                    formats.append(ff) -                elif determine_ext(format_url) == 'f4m': -                    formats.extend(self._extract_f4m_formats( -                        format_url, video_id, f4m_id=format_id)) -                elif determine_ext(format_url) == 'm3u8': -                    formats.extend(self._extract_m3u8_formats( -                        format_url, video_id, 'mp4', m3u8_id=format_id)) -                else: -                    continue -            self._sort_formats(formats) +        seen_ids = set() +        for yt_id in re.findall( +                r'data-id\s*=\s*["\']([\w-]+)[^>]+\bclass\s*=\s*["\']youtube\b', +                webpage): +            if yt_id in seen_ids: +                continue +            seen_ids.add(yt_id) +            if YoutubeIE.suitable(yt_id): +                entries.append(self.url_result(yt_id, ie='Youtube', video_id=yt_id)) -            title = remove_end(self._og_search_title(webpage), ' - fm4.ORF.at') -            if idx >= 1: -                # Titles are duplicates, make them unique -                title += ' (' + str(idx + 1) + ')' -            description = self._og_search_description(webpage) -            upload_date = unified_strdate(self._html_search_meta( -                'dc.date', webpage, 'upload date')) - -            entries.append({ -                'id': video_id, -                'title': title, -                'description': description, -                'duration': duration, -                'thumbnail': thumbnail, -                'upload_date': upload_date, -                'formats': formats, -            }) - -        return self.playlist_result(entries) +        return 
self.playlist_result( +            entries, story_id, +            re.sub(self._TITLE_STRIP_RE, '', self._og_search_title(webpage, default='') or None))  class ORFONBase(InfoExtractor): | 
