about summary refs log tree commit diff
diff options
context:
space:
mode:
author dirkf <fieldhouse@gmx.net> 2024-06-01 13:43:37 +0100
committer dirkf <fieldhouse@gmx.net> 2024-06-11 12:52:13 +0100
commit e20ca543f037bd3a8e38507b870ed3a3de3c32e7 (patch)
tree dcc1cd84a30cec34d27ecc0f601367622b14c0ed
parent e39466051f01411944bd657fe826b658a0df5af1 (diff)
[ORF] Re-factor and update `ORFFM4StoryIE`
* fix getting media via DASH instead of inaccessible mp4
* also get in-page YT media
-rw-r--r-- youtube_dl/extractor/orf.py 255
1 file changed, 127 insertions, 128 deletions
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
index 25c16c84d..f03aa40dc 100644
--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
@@ -6,6 +6,7 @@ import functools
import re
from .common import InfoExtractor
+from .youtube import YoutubeIE
from ..utils import (
clean_html,
determine_ext,
@@ -14,10 +15,8 @@ from ..utils import (
int_or_none,
merge_dicts,
mimetype2ext,
- orderedSet,
parse_age_limit,
parse_iso8601,
- remove_end,
strip_jsonp,
txt_or_none,
unified_strdate,
@@ -305,11 +304,90 @@ class ORFPodcastIE(ORFRadioBase):
}, self._extract_podcast_upload(data), rev=True)
-class ORFIPTVIE(InfoExtractor):
+class ORFIPTVBase(InfoExtractor):
+ _TITLE_STRIP_RE = ''
+
+ def _extract_video(self, video_id, webpage, fatal=False):
+
+ data = self._download_json(
+ 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id,
+ video_id)[0]
+
+ video = traverse_obj(data, (
+ 'sources', ('default', 'q8c'),
+ T(lambda x: x if x['loadBalancerUrl'] else None),
+ any))
+
+ load_balancer_url = video['loadBalancerUrl']
+
+ try:
+ rendition = self._download_json(
+ load_balancer_url, video_id, transform_source=strip_jsonp)
+ except ExtractorError:
+ rendition = None
+
+ if not rendition:
+ rendition = {
+ 'redirect': {
+ 'smil': re.sub(
+ r'(/)jsonp(/.+\.)mp4$', r'\1dash\2smil/manifest.mpd',
+ load_balancer_url),
+ },
+ }
+
+ f = traverse_obj(video, {
+ 'abr': ('audioBitrate', T(int_or_none)),
+ 'vbr': ('bitrate', T(int_or_none)),
+ 'fps': ('videoFps', T(int_or_none)),
+ 'width': ('videoWidth', T(int_or_none)),
+ 'height': ('videoHeight', T(int_or_none)),
+ })
+
+ formats = []
+ for format_id, format_url in traverse_obj(rendition, (
+ 'redirect', T(dict.items), Ellipsis)):
+ if format_id == 'rtmp':
+ ff = f.copy()
+ ff.update({
+ 'url': format_url,
+ 'format_id': format_id,
+ })
+ formats.append(ff)
+ elif determine_ext(format_url) == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ format_url, video_id, f4m_id=format_id))
+ elif determine_ext(format_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', m3u8_id=format_id,
+ entry_protocol='m3u8_native'))
+ elif determine_ext(format_url) == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ format_url, video_id, mpd_id=format_id))
+
+ if formats or fatal:
+ self._sort_formats(formats)
+ else:
+ return
+
+ return merge_dicts({
+ 'id': video_id,
+ 'title': re.sub(self._TITLE_STRIP_RE, '', self._og_search_title(webpage)),
+ 'description': self._og_search_description(webpage),
+ 'upload_date': unified_strdate(self._html_search_meta(
+ 'dc.date', webpage, 'upload date', fatal=False)),
+ 'formats': formats,
+ }, traverse_obj(data, {
+ 'duration': ('duration', T(k_float_or_none)),
+ 'thumbnail': ('sources', 'default', 'preview', T(url_or_none)),
+ }), rev=True)
+
+
+class ORFIPTVIE(ORFIPTVBase):
IE_NAME = 'orf:iptv'
IE_DESC = 'iptv.ORF.at'
_WORKING = False # URLs redirect to orf.at/
_VALID_URL = r'https?://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)'
+ _TITLE_STRIP_RE = r'\s+-\s+iptv\.ORF\.at\S*$'
_TEST = {
'url': 'http://iptv.orf.at/stories/2275236/',
@@ -334,74 +412,32 @@ class ORFIPTVIE(InfoExtractor):
video_id = self._search_regex(
r'data-video(?:id)?="(\d+)"', webpage, 'video id')
- data = self._download_json(
- 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id,
- video_id)[0]
-
- duration = float_or_none(data['duration'], 1000)
-
- video = data['sources']['default']
- load_balancer_url = video['loadBalancerUrl']
- abr = int_or_none(video.get('audioBitrate'))
- vbr = int_or_none(video.get('bitrate'))
- fps = int_or_none(video.get('videoFps'))
- width = int_or_none(video.get('videoWidth'))
- height = int_or_none(video.get('videoHeight'))
- thumbnail = video.get('preview')
-
- rendition = self._download_json(
- load_balancer_url, video_id, transform_source=strip_jsonp)
-
- f = {
- 'abr': abr,
- 'vbr': vbr,
- 'fps': fps,
- 'width': width,
- 'height': height,
- }
-
- formats = []
- for format_id, format_url in rendition['redirect'].items():
- if format_id == 'rtmp':
- ff = f.copy()
- ff.update({
- 'url': format_url,
- 'format_id': format_id,
- })
- formats.append(ff)
- elif determine_ext(format_url) == 'f4m':
- formats.extend(self._extract_f4m_formats(
- format_url, video_id, f4m_id=format_id))
- elif determine_ext(format_url) == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- format_url, video_id, 'mp4', m3u8_id=format_id))
- else:
- continue
- self._sort_formats(formats)
+ return self._extract_video(video_id, webpage)
- title = remove_end(self._og_search_title(webpage), ' - iptv.ORF.at')
- description = self._og_search_description(webpage)
- upload_date = unified_strdate(self._html_search_meta(
- 'dc.date', webpage, 'upload date'))
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'duration': duration,
- 'thumbnail': thumbnail,
- 'upload_date': upload_date,
- 'formats': formats,
- }
-
-class ORFFM4StoryIE(InfoExtractor):
+class ORFFM4StoryIE(ORFIPTVBase):
IE_NAME = 'orf:fm4:story'
IE_DESC = 'fm4.orf.at stories'
_VALID_URL = r'https?://fm4\.orf\.at/stories/(?P<id>\d+)'
+ _TITLE_STRIP_RE = r'\s+-\s+fm4\.ORF\.at\s*$'
- _TEST = {
+ _TESTS = [{
+ 'url': 'https://fm4.orf.at/stories/3041554/',
+ 'add_ie': ['Youtube'],
+ 'info_dict': {
+ 'id': '3041554',
+ 'title': 'Is The EU Green Deal In Mortal Danger?',
+ },
+ 'playlist_count': 4,
+ 'params': {
+ 'format': 'bestvideo',
+ },
+ }, {
'url': 'http://fm4.orf.at/stories/2865738/',
+ 'info_dict': {
+ 'id': '2865738',
+ 'title': 'Manu Delago und Inner Tongue live',
+ },
'playlist': [{
'md5': 'e1c2c706c45c7b34cf478bbf409907ca',
'info_dict': {
@@ -418,86 +454,49 @@ class ORFFM4StoryIE(InfoExtractor):
'info_dict': {
'id': '547798',
'ext': 'flv',
- 'title': 'Manu Delago und Inner Tongue live (2)',
+ 'title': 'Manu Delago und Inner Tongue https://vod-ww.mdn.ors.at/cms-worldwide_episodes_nas/_definst_/nas/cms-worldwide_episodes/online/14228823_0005.smil/chunklist_b992000_vo.m3u8live (2)',
'duration': 1504.08,
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20170913',
'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.',
},
}],
- }
+ 'skip': 'Videos gone',
+ }]
def _real_extract(self, url):
story_id = self._match_id(url)
webpage = self._download_webpage(url, story_id)
entries = []
- all_ids = orderedSet(re.findall(r'data-video(?:id)?="(\d+)"', webpage))
- for idx, video_id in enumerate(all_ids):
- data = self._download_json(
- 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id,
- video_id)[0]
-
- duration = float_or_none(data['duration'], 1000)
-
- video = data['sources']['q8c']
- load_balancer_url = video['loadBalancerUrl']
- abr = int_or_none(video.get('audioBitrate'))
- vbr = int_or_none(video.get('bitrate'))
- fps = int_or_none(video.get('videoFps'))
- width = int_or_none(video.get('videoWidth'))
- height = int_or_none(video.get('videoHeight'))
- thumbnail = video.get('preview')
+ seen_ids = set()
+ for idx, video_id in enumerate(re.findall(r'data-video(?:id)?="(\d+)"', webpage)):
+ if video_id in seen_ids:
+ continue
+ seen_ids.add(video_id)
+ entry = self._extract_video(video_id, webpage, fatal=False)
+ if not entry:
+ continue
- rendition = self._download_json(
- load_balancer_url, video_id, transform_source=strip_jsonp)
+ if idx >= 1:
+ # Titles are duplicates, make them unique
+ entry['title'] = '%s (%d)' % (entry['title'], idx)
- f = {
- 'abr': abr,
- 'vbr': vbr,
- 'fps': fps,
- 'width': width,
- 'height': height,
- }
+ entries.append(entry)
- formats = []
- for format_id, format_url in rendition['redirect'].items():
- if format_id == 'rtmp':
- ff = f.copy()
- ff.update({
- 'url': format_url,
- 'format_id': format_id,
- })
- formats.append(ff)
- elif determine_ext(format_url) == 'f4m':
- formats.extend(self._extract_f4m_formats(
- format_url, video_id, f4m_id=format_id))
- elif determine_ext(format_url) == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- format_url, video_id, 'mp4', m3u8_id=format_id))
- else:
- continue
- self._sort_formats(formats)
+ seen_ids = set()
+ for yt_id in re.findall(
+ r'data-id\s*=\s*["\']([\w-]+)[^>]+\bclass\s*=\s*["\']youtube\b',
+ webpage):
+ if yt_id in seen_ids:
+ continue
+ seen_ids.add(yt_id)
+ if YoutubeIE.suitable(yt_id):
+ entries.append(self.url_result(yt_id, ie='Youtube', video_id=yt_id))
- title = remove_end(self._og_search_title(webpage), ' - fm4.ORF.at')
- if idx >= 1:
- # Titles are duplicates, make them unique
- title += ' (' + str(idx + 1) + ')'
- description = self._og_search_description(webpage)
- upload_date = unified_strdate(self._html_search_meta(
- 'dc.date', webpage, 'upload date'))
-
- entries.append({
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'duration': duration,
- 'thumbnail': thumbnail,
- 'upload_date': upload_date,
- 'formats': formats,
- })
-
- return self.playlist_result(entries)
+ return self.playlist_result(
+ entries, story_id,
+ re.sub(self._TITLE_STRIP_RE, '', self._og_search_title(webpage, default='') or None))
class ORFONBase(InfoExtractor):