diff options
author | Remita Amine <remitamine@gmail.com> | 2021-01-05 21:17:08 +0100 |
---|---|---|
committer | Remita Amine <remitamine@gmail.com> | 2021-01-05 21:17:39 +0100 |
commit | 1ae7ae0b969d378ea41e6b90b9c5d44358d3e36b (patch) | |
tree | ed8f69a72a6a651a758f26c4ca764d4a9c023459 | |
parent | ccc71122915e630d99e8266c73a2eba26707f199 (diff) |
[canvas] Fix VRT NU extraction(closes #26957)(closes #27053)
-rw-r--r-- | youtube_dl/extractor/canvas.py | 118 |
1 files changed, 42 insertions, 76 deletions
diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index 8667a0d04..65d65d52e 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -7,12 +7,12 @@ from .common import InfoExtractor from .gigya import GigyaBaseIE from ..compat import compat_HTTPError from ..utils import ( + extract_attributes, ExtractorError, strip_or_none, float_or_none, int_or_none, merge_dicts, - parse_iso8601, str_or_none, url_or_none, ) @@ -37,6 +37,7 @@ class CanvasIE(InfoExtractor): 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', 'only_matching': True, }] + _GEO_BYPASS = False _HLS_ENTRY_PROTOCOLS_MAP = { 'HLS': 'm3u8_native', 'HLS_AES': 'm3u8', @@ -47,29 +48,34 @@ class CanvasIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) site_id, video_id = mobj.group('site_id'), mobj.group('id') - # Old API endpoint, serves more formats but may fail for some videos - data = self._download_json( - 'https://mediazone.vrt.be/api/v1/%s/assets/%s' - % (site_id, video_id), video_id, 'Downloading asset JSON', - 'Unable to download asset JSON', fatal=False) + data = None + if site_id != 'vrtvideo': + # Old API endpoint, serves more formats but may fail for some videos + data = self._download_json( + 'https://mediazone.vrt.be/api/v1/%s/assets/%s' + % (site_id, video_id), video_id, 'Downloading asset JSON', + 'Unable to download asset JSON', fatal=False) # New API endpoint if not data: + headers = self.geo_verification_headers() + headers.update({'Content-Type': 'application/json'}) token = self._download_json( '%s/tokens' % self._REST_API_BASE, video_id, - 'Downloading token', data=b'', - headers={'Content-Type': 'application/json'})['vrtPlayerToken'] + 'Downloading token', data=b'', headers=headers)['vrtPlayerToken'] data = self._download_json( '%s/videos/%s' % (self._REST_API_BASE, video_id), - video_id, 'Downloading video JSON', fatal=False, query={ + video_id, 'Downloading video JSON', query={ 'vrtPlayerToken': token, 'client': '%s@PROD' % site_id, }, expected_status=400) - message = data.get('message') - if message and not data.get('title'): - if data.get('code') == 'AUTHENTICATION_REQUIRED': - self.raise_login_required(message) - raise ExtractorError(message, expected=True) + if not data.get('title'): + code = data.get('code') + if code == 'AUTHENTICATION_REQUIRED': + self.raise_login_required() + elif code == 'INVALID_LOCATION': + self.raise_geo_restricted(countries=['BE']) + raise ExtractorError(data.get('message') or code, expected=True) title = data['title'] description = data.get('description') @@ -208,17 +214,21 @@ class VrtNUIE(GigyaBaseIE): _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?P<site_id>vrtnu)/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ # Available via old API endpoint - 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1/postbus-x-s1a1/', + 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/', 'info_dict': { - 'id': 'pbs-pub-2e2d8c27-df26-45c9-9dc6-90c78153044d$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', + 'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', 'ext': 'mp4', - 'title': 'De zwarte weduwe', - 'description': 'md5:db1227b0f318c849ba5eab1fef895ee4', + 'title': 'Postbus X - Aflevering 1 (Seizoen 1989)', + 'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7', 'duration': 1457.04, 'thumbnail': r're:^https?://.*\.jpg$', - 'season': 'Season 1', - 'season_number': 1, + 'series': 'Postbus X', + 'season': 'Seizoen 1989', + 'season_number': 1989, + 'episode': 'De zwarte weduwe', 'episode_number': 1, + 'timestamp': 1595822400, + 'upload_date': '20200727', }, 'skip': 'This video is only available for registered users', 'params': { @@ -300,69 +310,25 @@ class VrtNUIE(GigyaBaseIE): def _real_extract(self, url): display_id = self._match_id(url) - webpage, urlh = self._download_webpage_handle(url, display_id) - - info = self._search_json_ld(webpage, display_id, default={}) - - # title is optional here since it may be extracted by extractor - # that is delegated from here - title = strip_or_none(self._html_search_regex( - r'(?ms)<h1 class="content__heading">(.+?)</h1>', - webpage, 'title', default=None)) - - description = self._html_search_regex( - r'(?ms)<div class="content__description">(.+?)</div>', - webpage, 'description', default=None) - - season = self._html_search_regex( - [r'''(?xms)<div\ class="tabs__tab\ tabs__tab--active">\s* - <span>seizoen\ (.+?)</span>\s* - </div>''', - r'<option value="seizoen (\d{1,3})" data-href="[^"]+?" selected>'], - webpage, 'season', default=None) - - season_number = int_or_none(season) - - episode_number = int_or_none(self._html_search_regex( - r'''(?xms)<div\ class="content__episode">\s* - <abbr\ title="aflevering">afl</abbr>\s*<span>(\d+)</span> - </div>''', - webpage, 'episode_number', default=None)) - - release_date = parse_iso8601(self._html_search_regex( - r'(?ms)<div class="content__broadcastdate">\s*<time\ datetime="(.+?)"', - webpage, 'release_date', default=None)) - - # If there's a ? or a # in the URL, remove them and everything after - clean_url = urlh.geturl().split('?')[0].split('#')[0].strip('/') - securevideo_url = clean_url + '.mssecurevideo.json' - - try: - video = self._download_json(securevideo_url, display_id) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - self.raise_login_required() - raise + webpage = self._download_webpage(url, display_id) - # We are dealing with a '../<show>.relevant' URL - redirect_url = video.get('url') - if redirect_url: - return self.url_result(self._proto_relative_url(redirect_url, 'https:')) + attrs = extract_attributes(self._search_regex( + r'(<nui-media[^>]+>)', webpage, 'media element')) + video_id = attrs['videoid'] + publication_id = attrs.get('publicationid') + if publication_id: + video_id = publication_id + '$' + video_id - # There is only one entry, but with an unknown key, so just get - # the first one - video_id = list(video.values())[0].get('videoid') + page = (self._parse_json(self._search_regex( + r'digitalData\s*=\s*({.+?});', webpage, 'digial data', + default='{}'), video_id, fatal=False) or {}).get('page') or {} + info = self._search_json_ld(webpage, display_id, default={}) return merge_dicts(info, { '_type': 'url_transparent', 'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id, 'ie_key': CanvasIE.ie_key(), 'id': video_id, 'display_id': display_id, - 'title': title, - 'description': description, - 'season': season, - 'season_number': season_number, - 'episode_number': episode_number, - 'release_date': release_date, + 'season_number': int_or_none(page.get('episode_season')), }) |