From 3fb14cd214fdadfae195745b26498e012f78be8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 2 Mar 2021 06:03:17 +0700 Subject: [zdf] Rework extractors (closes #11606, closes #13473, closes #17354, closes #21185, closes #26711, closes #27068, closes #27930, closes #28198, closes #28199, closes #28274) * Generalize unique video ids for zdf based extractors * Improve extraction * Fix 3sat and phoenix --- youtube_dl/extractor/phoenix.py | 149 +++++++++++++++++++++++++++++++--------- 1 file changed, 116 insertions(+), 33 deletions(-) (limited to 'youtube_dl/extractor/phoenix.py') diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py index e435c28e1..dbbfce983 100644 --- a/youtube_dl/extractor/phoenix.py +++ b/youtube_dl/extractor/phoenix.py @@ -1,45 +1,128 @@ +# coding: utf-8 from __future__ import unicode_literals -from .dreisat import DreiSatIE +import re +from .youtube import YoutubeIE +from .zdf import ZDFBaseIE +from ..compat import compat_str +from ..utils import ( + int_or_none, + merge_dicts, + unified_timestamp, + xpath_text, +) -class PhoenixIE(DreiSatIE): + +class PhoenixIE(ZDFBaseIE): IE_NAME = 'phoenix.de' - _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/ - (?: - phoenix/die_sendungen/(?:[^/]+/)? - )? - (?P[0-9]+)''' - _TESTS = [ - { - 'url': 'http://www.phoenix.de/content/884301', - 'md5': 'ed249f045256150c92e72dbb70eadec6', - 'info_dict': { - 'id': '884301', - 'ext': 'mp4', - 'title': 'Michael Krons mit Hans-Werner Sinn', - 'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr', - 'upload_date': '20141025', - 'uploader': 'Im Dialog', - } + _VALID_URL = r'https?://(?:www\.)?phoenix\.de/(?:[^/]+/)*[^/?#&]*-a-(?P\d+)\.html' + _TESTS = [{ + # Same as https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html + 'url': 'https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html', + 'md5': '34ec321e7eb34231fd88616c65c92db0', + 'info_dict': { + 'id': '210222_phx_nachgehakt_corona_protest', + 'ext': 'mp4', + 'title': 'Wohin führt der Protest in der Pandemie?', + 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', + 'duration': 1691, + 'timestamp': 1613906100, + 'upload_date': '20210221', + 'uploader': 'Phoenix', + 'channel': 'corona nachgehakt', }, - { - 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/869815', - 'only_matching': True, + }, { + # Youtube embed + 'url': 'https://www.phoenix.de/sendungen/gespraeche/phoenix-streitgut-brennglas-corona-a-1965505.html', + 'info_dict': { + 'id': 'hMQtqFYjomk', + 'ext': 'mp4', + 'title': 'phoenix streitgut: Brennglas Corona - Wie gerecht ist unsere Gesellschaft?', + 'description': 'md5:ac7a02e2eb3cb17600bc372e4ab28fdd', + 'duration': 3509, + 'upload_date': '20201219', + 'uploader': 'phoenix', + 'uploader_id': 'phoenix', }, - { - 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/diskussionen/928234', - 'only_matching': True, + 'params': { + 'skip_download': True, }, - ] + }, { + 'url': 'https://www.phoenix.de/entwicklungen-in-russland-a-2044720.html', + 'only_matching': True, + }, { + # no media + 'url': 'https://www.phoenix.de/sendungen/dokumentationen/mit-dem-jumbo-durch-die-nacht-a-89625.html', + 'only_matching': True, + }, { + # Same as https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html + 'url': 'https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + article_id = self._match_id(url) + + article = self._download_json( + 'https://www.phoenix.de/response/id/%s' % article_id, article_id, + 'Downloading article JSON') + + video = article['absaetze'][0] + title = video.get('titel') or article.get('subtitel') + + if video.get('typ') == 'video-youtube': + video_id = video['id'] + return self.url_result( + video_id, ie=YoutubeIE.ie_key(), video_id=video_id, + video_title=title) + + video_id = compat_str(video.get('basename') or video.get('content')) - internal_id = self._search_regex( - r'
Date: Sun, 16 May 2021 21:21:14 +0700 Subject: [phoenix] Fix extraction (closes #29057) --- youtube_dl/extractor/phoenix.py | 51 ++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 23 deletions(-) (limited to 'youtube_dl/extractor/phoenix.py') diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py index dbbfce983..e3ea01443 100644 --- a/youtube_dl/extractor/phoenix.py +++ b/youtube_dl/extractor/phoenix.py @@ -9,8 +9,9 @@ from ..compat import compat_str from ..utils import ( int_or_none, merge_dicts, + try_get, unified_timestamp, - xpath_text, + urljoin, ) @@ -27,10 +28,11 @@ class PhoenixIE(ZDFBaseIE): 'title': 'Wohin führt der Protest in der Pandemie?', 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', 'duration': 1691, - 'timestamp': 1613906100, + 'timestamp': 1613902500, 'upload_date': '20210221', 'uploader': 'Phoenix', - 'channel': 'corona nachgehakt', + 'series': 'corona nachgehakt', + 'episode': 'Wohin führt der Protest in der Pandemie?', }, }, { # Youtube embed @@ -79,50 +81,53 @@ class PhoenixIE(ZDFBaseIE): video_id = compat_str(video.get('basename') or video.get('content')) - details = self._download_xml( + details = self._download_json( 'https://www.phoenix.de/php/mediaplayer/data/beitrags_details.php', - video_id, 'Downloading details XML', query={ + video_id, 'Downloading details JSON', query={ 'ak': 'web', 'ptmd': 'true', 'id': video_id, 'profile': 'player2', }) - title = title or xpath_text( - details, './/information/title', 'title', fatal=True) - content_id = xpath_text( - details, './/video/details/basename', 'content id', fatal=True) + title = title or details['title'] + content_id = details['tracking']['nielsen']['content']['assetid'] info = self._extract_ptmd( 'https://tmd.phoenix.de/tmd/2/ngplayer_2_3/vod/ptmd/phoenix/%s' % content_id, content_id, None, url) - timestamp = unified_timestamp(xpath_text(details, './/details/airtime')) + duration = int_or_none(try_get( + details, lambda x: x['tracking']['nielsen']['content']['length'])) + timestamp = unified_timestamp(details.get('editorialDate')) + series = try_get( + details, lambda x: x['tracking']['nielsen']['content']['program'], + compat_str) + episode = title if details.get('contentType') == 'episode' else None thumbnails = [] - for node in details.findall('.//teaserimages/teaserimage'): - thumbnail_url = node.text + teaser_images = try_get(details, lambda x: x['teaserImageRef']['layouts'], dict) or {} + for thumbnail_key, thumbnail_url in teaser_images.items(): + thumbnail_url = urljoin(url, thumbnail_url) if not thumbnail_url: continue thumbnail = { 'url': thumbnail_url, } - thumbnail_key = node.get('key') - if thumbnail_key: - m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) + m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) + if m: + thumbnail['width'] = int(m.group(1)) + thumbnail['height'] = int(m.group(2)) thumbnails.append(thumbnail) return merge_dicts(info, { 'id': content_id, 'title': title, - 'description': xpath_text(details, './/information/detail'), - 'duration': int_or_none(xpath_text(details, './/details/lengthSec')), + 'description': details.get('leadParagraph'), + 'duration': duration, 'thumbnails': thumbnails, 'timestamp': timestamp, - 'uploader': xpath_text(details, './/details/channel'), - 'uploader_id': xpath_text(details, './/details/originChannelId'), - 'channel': xpath_text(details, './/details/originChannelTitle'), + 'uploader': details.get('tvService'), + 'series': series, + 'episode': episode, }) -- cgit v1.2.3