diff options
Diffstat (limited to 'youtube_dl/extractor/orf.py')
| -rw-r--r-- | youtube_dl/extractor/orf.py | 434 | 
1 files changed, 272 insertions, 162 deletions
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 8d537d7ae..3854911bd 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -1,184 +1,30 @@  # coding: utf-8  from __future__ import unicode_literals +import base64 +import functools  import re  from .common import InfoExtractor -from ..compat import compat_str  from ..utils import (      clean_html,      determine_ext,      float_or_none, -    HEADRequest,      int_or_none, +    merge_dicts,      orderedSet, +    parse_age_limit, +    parse_iso8601,      remove_end,      str_or_none,      strip_jsonp, -    unescapeHTML, +    txt_or_none,      unified_strdate,      url_or_none,  ) +from ..traversal import T, traverse_obj - -class ORFTVthekIE(InfoExtractor): -    IE_NAME = 'orf:tvthek' -    IE_DESC = 'ORF TVthek' -    _VALID_URL = r'https?://tvthek\.orf\.at/(?:[^/]+/)+(?P<id>\d+)' - -    _TESTS = [{ -        'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389', -        'playlist': [{ -            'md5': '2942210346ed779588f428a92db88712', -            'info_dict': { -                'id': '8896777', -                'ext': 'mp4', -                'title': 'Aufgetischt: Mit der Steirischen Tafelrunde', -                'description': 'md5:c1272f0245537812d4e36419c207b67d', -                'duration': 2668, -                'upload_date': '20141208', -            }, -        }], -        'skip': 'Blocked outside of Austria / Germany', -    }, { -        'url': 'http://tvthek.orf.at/topic/Im-Wandel-der-Zeit/8002126/Best-of-Ingrid-Thurnher/7982256', -        'info_dict': { -            'id': '7982259', -            'ext': 'mp4', -            'title': 'Best of Ingrid Thurnher', -            'upload_date': '20140527', -            'description': 'Viele Jahre war Ingrid Thurnher das "Gesicht" der ZIB 2. 
Vor ihrem Wechsel zur ZIB 2 im Jahr 1995 moderierte sie unter anderem "Land und Leute", "Österreich-Bild" und "Niederösterreich heute".', -        }, -        'params': { -            'skip_download': True,  # rtsp downloads -        }, -        'skip': 'Blocked outside of Austria / Germany', -    }, { -        'url': 'http://tvthek.orf.at/topic/Fluechtlingskrise/10463081/Heimat-Fremde-Heimat/13879132/Senioren-betreuen-Migrantenkinder/13879141', -        'only_matching': True, -    }, { -        'url': 'http://tvthek.orf.at/profile/Universum/35429', -        'only_matching': True, -    }] - -    def _real_extract(self, url): -        playlist_id = self._match_id(url) -        webpage = self._download_webpage(url, playlist_id) - -        data_jsb = self._parse_json( -            self._search_regex( -                r'<div[^>]+class=(["\']).*?VideoPlaylist.*?\1[^>]+data-jsb=(["\'])(?P<json>.+?)\2', -                webpage, 'playlist', group='json'), -            playlist_id, transform_source=unescapeHTML)['playlist']['videos'] - -        entries = [] -        for sd in data_jsb: -            video_id, title = sd.get('id'), sd.get('title') -            if not video_id or not title: -                continue -            video_id = compat_str(video_id) -            formats = [] -            for fd in sd['sources']: -                src = url_or_none(fd.get('src')) -                if not src: -                    continue -                format_id_list = [] -                for key in ('delivery', 'quality', 'quality_string'): -                    value = fd.get(key) -                    if value: -                        format_id_list.append(value) -                format_id = '-'.join(format_id_list) -                ext = determine_ext(src) -                if ext == 'm3u8': -                    m3u8_formats = self._extract_m3u8_formats( -                        src, video_id, 'mp4', m3u8_id=format_id, fatal=False) -                    if any('/geoprotection' in 
f['url'] for f in m3u8_formats): -                        self.raise_geo_restricted() -                    formats.extend(m3u8_formats) -                elif ext == 'f4m': -                    formats.extend(self._extract_f4m_formats( -                        src, video_id, f4m_id=format_id, fatal=False)) -                elif ext == 'mpd': -                    formats.extend(self._extract_mpd_formats( -                        src, video_id, mpd_id=format_id, fatal=False)) -                else: -                    formats.append({ -                        'format_id': format_id, -                        'url': src, -                        'protocol': fd.get('protocol'), -                    }) - -            # Check for geoblocking. -            # There is a property is_geoprotection, but that's always false -            geo_str = sd.get('geoprotection_string') -            if geo_str: -                try: -                    http_url = next( -                        f['url'] -                        for f in formats -                        if re.match(r'^https?://.*\.mp4$', f['url'])) -                except StopIteration: -                    pass -                else: -                    req = HEADRequest(http_url) -                    self._request_webpage( -                        req, video_id, -                        note='Testing for geoblocking', -                        errnote=(( -                            'This video seems to be blocked outside of %s. 
' -                            'You may want to try the streaming-* formats.') -                            % geo_str), -                        fatal=False) - -            self._check_formats(formats, video_id) -            self._sort_formats(formats) - -            subtitles = {} -            for sub in sd.get('subtitles', []): -                sub_src = sub.get('src') -                if not sub_src: -                    continue -                subtitles.setdefault(sub.get('lang', 'de-AT'), []).append({ -                    'url': sub_src, -                }) - -            upload_date = unified_strdate(sd.get('created_date')) - -            thumbnails = [] -            preview = sd.get('preview_image_url') -            if preview: -                thumbnails.append({ -                    'id': 'preview', -                    'url': preview, -                    'preference': 0, -                }) -            image = sd.get('image_full_url') -            if not image and len(data_jsb) == 1: -                image = self._og_search_thumbnail(webpage) -            if image: -                thumbnails.append({ -                    'id': 'full', -                    'url': image, -                    'preference': 1, -                }) - -            entries.append({ -                '_type': 'video', -                'id': video_id, -                'title': title, -                'formats': formats, -                'subtitles': subtitles, -                'description': sd.get('description'), -                'duration': int_or_none(sd.get('duration_in_seconds')), -                'upload_date': upload_date, -                'thumbnails': thumbnails, -            }) - -        return { -            '_type': 'playlist', -            'entries': entries, -            'id': playlist_id, -        } +k_float_or_none = functools.partial(float_or_none, scale=1000)  class ORFRadioIE(InfoExtractor): @@ -401,6 +247,7 @@ class ORFOE1IE(ORFRadioIE):  class 
class ORFONBase(InfoExtractor):
    """Shared logic for the on.orf.at extractors.

    Subclasses supply `_VALID_URL` (with an `id` group and an optional
    `segment` group) and may override `_ENC_PFX` / `_API_PATH` to target a
    different API endpoint.
    """

    # Prefix prepended to the video id before base64-encoding it for the API
    _ENC_PFX = '3dSlfek03nsLKdj4Jsd'
    # Path component of the API endpoint queried by _call_api()
    _API_PATH = 'episode'

    def _call_api(self, video_id, **kwargs):
        """Download the API JSON for `video_id`.

        The id is prefixed with `_ENC_PFX`, base64-encoded and appended to
        the `.../public/<_API_PATH>/encrypted/` endpoint. Extra keyword
        arguments are passed through to `_download_json()`.
        """
        encrypted_id = base64.b64encode('{0}{1}'.format(
            self._ENC_PFX, video_id).encode('utf-8')).decode('ascii')
        return self._download_json(
            'https://api-tvthek.orf.at/api/v4.3/public/{0}/encrypted/{1}'.format(
                self._API_PATH, encrypted_id),
            video_id, **kwargs)

    @classmethod
    def _parse_metadata(cls, api_json):
        """Map an API object (episode or segment) to info-dict fields.

        Where several source keys are listed for one field, they are
        alternatives; the traversal is run with `get_all=False`.
        """
        return traverse_obj(api_json, {
            'id': ('id', T(int), T(txt_or_none)),
            'age_limit': ('age_classification', T(parse_age_limit)),
            # exact_duration is divided by 1000 (presumably milliseconds -
            # TODO confirm); duration_second is taken as is
            'duration': ((('exact_duration', T(k_float_or_none)),
                          ('duration_second', T(float_or_none))),),
            'title': (('title', 'headline'), T(txt_or_none)),
            'description': (('description', 'teaser_text'), T(txt_or_none)),
            # 'media_type': ('video_type', T(txt_or_none)),
            'thumbnail': ('_embedded', 'image', 'public_urls', 'highlight_teaser', 'url', T(url_or_none)),
            'timestamp': (('date', 'episode_date'), T(parse_iso8601)),
            'release_timestamp': ('release_date', T(parse_iso8601)),
            # 'modified_timestamp': ('updated_at', T(parse_iso8601)),
        }, get_all=False)

    def _extract_video(self, video_id, segment_id):
        """Return the info dict (or playlist) for an episode.

        Selection rules:
        * Not a segmented episode: return single video
        * Segmented episode without valid segment id: return entire playlist
        * Segmented episode with valid segment id and yes-playlist: return
          entire playlist
        * Segmented episode with valid segment id and no-playlist: return
          single video corresponding to segment id
        * If a multi_video playlist would be returned, but an unsegmented
          source exists, that source is chosen instead.

        `segment_id` is the optional segment id from the URL (a string) or
        None.
        """
        api_json = self._call_api(video_id)

        if traverse_obj(api_json, 'is_drm_protected'):
            self.report_drm(video_id)

        def extract_sources(src_json, src_id):
            # Collect the HLS/DASH formats and subtitles listed under
            # src_json['sources']; other manifest types are ignored
            src_formats, src_subtitles = [], {}
            for manifest_type in traverse_obj(src_json, ('sources', T(dict.keys), Ellipsis)):
                for manifest_url in traverse_obj(src_json, ('sources', manifest_type, Ellipsis, 'src', T(url_or_none))):
                    if manifest_type == 'hls':
                        fmts, subs = self._extract_m3u8_formats(
                            manifest_url, src_id, fatal=False, m3u8_id='hls',
                            ext='mp4', entry_protocol='m3u8_native'), {}
                        for f in fmts:
                            # '_vo.' in the URL is treated as a video-only rendition
                            if '_vo.' in f['url']:
                                f['acodec'] = 'none'
                    elif manifest_type == 'dash':
                        fmts, subs = self._extract_mpd_formats_and_subtitles(
                            manifest_url, src_id, fatal=False, mpd_id='dash')
                    else:
                        continue
                    src_formats.extend(fmts)
                    self._merge_subtitles(subs, target=src_subtitles)
            return src_formats, src_subtitles

        formats, subtitles = [], {}
        if segment_id is None:
            # No particular segment requested: prefer the unsegmented source
            formats, subtitles = extract_sources(api_json, video_id)

        if not formats:
            segments = traverse_obj(api_json, (
                '_embedded', 'segments', lambda _, v: v['id']))
            if len(segments) > 1 and segment_id is not None:
                # An id that matches no segment falls back to the whole
                # playlist rather than raising StopIteration
                selected = next(
                    (s for s in segments if txt_or_none(s['id']) == segment_id),
                    None)
                if selected is not None and not self._yes_playlist(
                        video_id, segment_id, playlist_label='collection', video_label='segment'):
                    segments = [selected]

            entries = []
            for seg in segments:
                # Pass each segment's own id to the manifest helpers; the
                # URL's segment_id may be None or refer to another segment
                seg_formats, seg_subtitles = extract_sources(
                    seg, txt_or_none(seg['id']) or video_id)
                self._sort_formats(seg_formats)
                entries.append(merge_dicts({
                    'formats': seg_formats,
                    'subtitles': seg_subtitles,
                }, self._parse_metadata(seg), rev=True))
            result = merge_dicts(
                {'_type': 'multi_video' if len(entries) > 1 else 'playlist'},
                self._parse_metadata(api_json),
                self.playlist_result(entries, video_id))
            # not yet processed in core for playlist/multi
            self._downloader._fill_common_fields(result)
            return result

        self._sort_formats(formats)

        # Stand-alone subtitle files attached to the episode; all are
        # filed under 'de'
        for sub_url in traverse_obj(api_json, (
                '_embedded', 'subtitle',
                ('xml_url', 'sami_url', 'stl_url', 'ttml_url', 'srt_url', 'vtt_url'),
                T(url_or_none))):
            self._merge_subtitles({'de': [{'url': sub_url}]}, target=subtitles)

        return merge_dicts({
            'id': video_id,
            'formats': formats,
            'subtitles': subtitles,
            # '_old_archive_ids': [self._downloader._make_archive_id({'ie_key': 'ORFTVthek', 'id': video_id})],
        }, self._parse_metadata(api_json), rev=True)

    def _real_extract(self, url):
        """Extract the video or playlist at `url`.

        Combines metadata from the page's meta tags and JSON-LD with the
        result of `_extract_video()`.

        Raises ExtractorError (expected) when the page carries the
        "Nicht mehr verfügbar" notice.
        """
        # ExtractorError is not among this module's top-level imports
        from ..utils import ExtractorError

        video_id, segment_id = self._match_valid_url(url).group('id', 'segment')
        webpage = self._download_webpage(url, video_id)

        # ORF doesn't like 410 or 404
        if self._search_regex(r'<div\b[^>]*>\s*(Nicht mehr verfügbar)\s*</div>', webpage, 'Availability', default=False):
            raise ExtractorError('Content is no longer available', expected=True, video_id=video_id)

        return merge_dicts({
            'id': video_id,
            'title': self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None),
            'description': self._html_search_meta(
                ['description', 'og:description', 'twitter:description'], webpage, default=None),
        }, self._search_json_ld(webpage, video_id, default={}),
            self._extract_video(video_id, segment_id),
            rev=True)


class ORFONIE(ORFONBase):
    """Extractor for on.orf.at/video/<id>[/<segment>] pages."""

    IE_NAME = 'orf:on'
    _VALID_URL = r'https?://on\.orf\.at/video/(?P<id>\d+)(?:/(?P<segment>\d+))?'
    _TESTS = [{
        'url': 'https://on.orf.at/video/14210000/school-of-champions-48',
        'info_dict': {
            'id': '14210000',
            'ext': 'mp4',
            'duration': 2651.08,
            'thumbnail': 'https://api-tvthek.orf.at/assets/segments/0167/98/thumb_16697671_segments_highlight_teaser.jpeg',
            'title': 'School of Champions (4/8)',
            'description': r're:(?s)Luca hat sein ganzes Leben in den Bergen Südtirols verbracht und ist bei seiner Mutter aufgewachsen, .{1029} Leo$',
            # 'media_type': 'episode',
            'timestamp': 1706558922,
            'upload_date': '20240129',
            'release_timestamp': 1706472362,
            'release_date': '20240128',
            # 'modified_timestamp': 1712756663,
            # 'modified_date': '20240410',
            # '_old_archive_ids': ['orftvthek 14210000'],
        },
        'params': {
            'format': 'bestvideo',
        },
    }, {
        'url': 'https://on.orf.at/video/3220355',
        'md5': '925a93b2b9a37da5c9b979d7cf71aa2e',
        'info_dict': {
            'id': '3220355',
            'ext': 'mp4',
            'duration': 445.04,
            'thumbnail': 'https://api-tvthek.orf.at/assets/segments/0002/60/thumb_159573_segments_highlight_teaser.png',
            'title': '50 Jahre Burgenland: Der Festumzug',
            'description': r're:(?s)Aus allen Landesteilen zogen festlich geschmückte Wagen und Musikkapellen .{270} Jenakowitsch$',
            # 'media_type': 'episode',
            'timestamp': 52916400,
            'upload_date': '19710905',
            'release_timestamp': 52916400,
            'release_date': '19710905',
            # 'modified_timestamp': 1498536049,
            # 'modified_date': '20170627',
            # '_old_archive_ids': ['orftvthek 3220355'],
        },
    }, {
        # Video with multiple segments selecting the second segment
        'url': 'https://on.orf.at/video/14226549/15639808/jugendbande-einbrueche-aus-langeweile',
        'md5': 'fc151bba8c05ea77ab5693617e4a33d3',
        'info_dict': {
            'id': '15639808',
            'ext': 'mp4',
            'duration': 97.707,
            'thumbnail': 'https://api-tvthek.orf.at/assets/segments/0175/43/thumb_17442704_segments_highlight_teaser.jpg',
            'title': 'Jugendbande: Einbrüche aus Langeweile',
            'description': r're:Jugendbande: Einbrüche aus Langeweile \| Neuer Kinder- und .{259} Wanda$',
            # 'media_type': 'segment',
            'timestamp': 1715792400,
            'upload_date': '20240515',
            # 'modified_timestamp': 1715794394,
            # 'modified_date': '20240515',
            # '_old_archive_ids': ['orftvthek 15639808'],
        },
        'params': {
            'noplaylist': True,
            'format': 'bestvideo',
        },
    }, {
        # Video with multiple segments and no combined version
        'url': 'https://on.orf.at/video/14227864/formel-1-grosser-preis-von-monaco-2024',
        'info_dict': {
            '_type': 'multi_video',
            'id': '14227864',
            'duration': 18410.52,
            'thumbnail': 'https://api-tvthek.orf.at/assets/segments/0176/04/thumb_17503881_segments_highlight_teaser.jpg',
            'title': 'Formel 1: Großer Preis von Monaco 2024',
            'description': 'md5:aeeb010710ccf70ce28ccb4482243d4f',
            # 'media_type': 'episode',
            'timestamp': 1716721200,
            'upload_date': '20240526',
            'release_timestamp': 1716721802,
            'release_date': '20240526',
            # 'modified_timestamp': 1716884702,
            # 'modified_date': '20240528',
        },
        'playlist_count': 42,
        'skip': 'Gone: Nicht mehr verfügbar',
    }, {
        # Video with multiple segments, but with combined version
        'url': 'https://on.orf.at/video/14228172',
        'info_dict': {
            'id': '14228172',
            'ext': 'mp4',
            'duration': 3294.878,
            'thumbnail': 'https://api-tvthek.orf.at/assets/segments/0176/29/thumb_17528242_segments_highlight_teaser.jpg',
            'title': 'Willkommen Österreich mit Stermann & Grissemann',
            'description': r're:Zum Saisonfinale freuen sich die urlaubsreifen Gastgeber Stermann und .{1863} Geschichten\.$',
            # 'media_type': 'episode',
            'timestamp': 1716926584,
            'upload_date': '20240528',
            'release_timestamp': 1716919202,
            'release_date': '20240528',
            # 'modified_timestamp': 1716968045,
            # 'modified_date': '20240529',
            # '_old_archive_ids': ['orftvthek 14228172'],
        },
        'params': {
            'format': 'bestvideo',
        },
        'skip': 'Gone: Nicht mehr verfügbar',
    }]


class ORFONLiveIE(ORFONBase):
    """Extractor for on.orf.at/livestream/<id>[/<segment>] pages.

    Uses the `livestream` API endpoint with its own id prefix.
    """

    _ENC_PFX = '8876324jshjd7293ktd'
    _API_PATH = 'livestream'
    _VALID_URL = r'https?://on\.orf\.at/livestream/(?P<id>\d+)(?:/(?P<segment>\d+))?'
    _TESTS = [{
        'url': 'https://on.orf.at/livestream/14320204/pressekonferenz-neos-zu-aktuellen-entwicklungen',
        'info_dict': {
            'id': '14320204',
            'ext': 'mp4',
            'title': 'Pressekonferenz: Neos zu aktuellen Entwicklungen',
            'description': r're:(?s)Neos-Chefin Beate Meinl-Reisinger informi.{598}ng\."',
            'timestamp': 1716886335,
            'upload_date': '20240528',
            # 'modified_timestamp': 1712756663,
            # 'modified_date': '20240410',
            # '_old_archive_ids': ['orftvthek 14210000'],
        },
        'params': {
            'format': 'bestvideo',
        },
    }]

    @classmethod
    def _parse_metadata(cls, api_json):
        """Extend the base metadata with livestream-specific fields.

        The base class result is passed first to `merge_dicts()`, followed
        by the livestream fields (`updated_at` -> timestamp, `start` ->
        release_timestamp).
        """
        return merge_dicts(
            super(ORFONLiveIE, cls)._parse_metadata(api_json),
            traverse_obj(api_json, {
                'timestamp': ('updated_at', T(parse_iso8601)),
                'release_timestamp': ('start', T(parse_iso8601)),
                # NOTE(review): confirm that traverse_obj yields the constant
                # True here rather than looking up a key named True
                'is_live': True,
            }))
