aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Skrundz <david@skrundz.ca>2024-10-21 12:56:43 -0600
committerGitHub <noreply@github.com>2024-10-21 18:56:43 +0000
commit40054cb4a7ebbea30d335d444e6f58b298a3baa0 (patch)
tree0e4a0a61ba540f0399d0e5c9d2f3cf7ce178e0fc
parentfed53d70bdb7d3e37ef63dd7fcf0ef74356167fd (diff)
[ie/gem.cbc.ca] Fix formats extraction (#11196)
Also extracts `timestamp` and `release_timestamp` as seconds instead of milliseconds Authored by: DavidSkrundz
-rw-r--r--yt_dlp/extractor/cbc.py82
1 files changed, 26 insertions, 56 deletions
diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py
index 40224f63f..b44c23fa1 100644
--- a/yt_dlp/extractor/cbc.py
+++ b/yt_dlp/extractor/cbc.py
@@ -4,7 +4,6 @@ import json
import re
import time
import urllib.parse
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..networking import HEADRequest
@@ -12,7 +11,6 @@ from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
- join_nonempty,
js_to_json,
mimetype2ext,
orderedSet,
@@ -524,14 +522,13 @@ class CBCGemIE(InfoExtractor):
_TESTS = [{
# This is a normal, public, TV show video
'url': 'https://gem.cbc.ca/media/schitts-creek/s06e01',
- 'md5': '93dbb31c74a8e45b378cf13bd3f6f11e',
'info_dict': {
'id': 'schitts-creek/s06e01',
'ext': 'mp4',
'title': 'Smoke Signals',
'description': 'md5:929868d20021c924020641769eb3e7f1',
- 'thumbnail': 'https://images.radio-canada.ca/v1/synps-cbc/episode/perso/cbc_schitts_creek_season_06e01_thumbnail_v01.jpg?im=Resize=(Size)',
- 'duration': 1314,
+ 'thumbnail': r're:https://images\.radio-canada\.ca/[^#?]+/cbc_schitts_creek_season_06e01_thumbnail_v01\.jpg',
+ 'duration': 1324,
'categories': ['comedy'],
'series': 'Schitt\'s Creek',
'season': 'Season 6',
@@ -539,19 +536,21 @@ class CBCGemIE(InfoExtractor):
'episode': 'Smoke Signals',
'episode_number': 1,
'episode_id': 'schitts-creek/s06e01',
+ 'upload_date': '20210618',
+ 'timestamp': 1623988800,
+ 'release_date': '20200107',
+ 'release_timestamp': 1578427200,
},
'params': {'format': 'bv'},
- 'skip': 'Geo-restricted to Canada',
}, {
# This video requires an account in the browser, but works fine in yt-dlp
'url': 'https://gem.cbc.ca/media/schitts-creek/s01e01',
- 'md5': '297a9600f554f2258aed01514226a697',
'info_dict': {
'id': 'schitts-creek/s01e01',
'ext': 'mp4',
'title': 'The Cup Runneth Over',
'description': 'md5:9bca14ea49ab808097530eb05a29e797',
- 'thumbnail': 'https://images.radio-canada.ca/v1/synps-cbc/episode/perso/cbc_schitts_creek_season_01e01_thumbnail_v01.jpg?im=Resize=(Size)',
+ 'thumbnail': r're:https://images\.radio-canada\.ca/[^#?]+/cbc_schitts_creek_season_01e01_thumbnail_v01\.jpg',
'series': 'Schitt\'s Creek',
'season_number': 1,
'season': 'Season 1',
@@ -560,9 +559,12 @@ class CBCGemIE(InfoExtractor):
'episode_id': 'schitts-creek/s01e01',
'duration': 1309,
'categories': ['comedy'],
+ 'upload_date': '20210617',
+ 'timestamp': 1623902400,
+ 'release_date': '20151124',
+ 'release_timestamp': 1448323200,
},
'params': {'format': 'bv'},
- 'skip': 'Geo-restricted to Canada',
}, {
'url': 'https://gem.cbc.ca/nadiyas-family-favourites/s01e01',
'only_matching': True,
@@ -631,38 +633,6 @@ class CBCGemIE(InfoExtractor):
return
self._claims_token = self.cache.load(self._NETRC_MACHINE, 'claims_token')
- def _find_secret_formats(self, formats, video_id):
- """ Find a valid video url and convert it to the secret variant """
- base_format = next((f for f in formats if f.get('vcodec') != 'none'), None)
- if not base_format:
- return
-
- base_url = re.sub(r'(Manifest\(.*?),filter=[\w-]+(.*?\))', r'\1\2', base_format['url'])
- url = re.sub(r'(Manifest\(.*?),format=[\w-]+(.*?\))', r'\1\2', base_url)
-
- secret_xml = self._download_xml(url, video_id, note='Downloading secret XML', fatal=False)
- if not isinstance(secret_xml, xml.etree.ElementTree.Element):
- return
-
- for child in secret_xml:
- if child.attrib.get('Type') != 'video':
- continue
- for video_quality in child:
- bitrate = int_or_none(video_quality.attrib.get('Bitrate'))
- if not bitrate or 'Index' not in video_quality.attrib:
- continue
- height = int_or_none(video_quality.attrib.get('MaxHeight'))
-
- yield {
- **base_format,
- 'format_id': join_nonempty('sec', height),
- # Note: \g<1> is necessary instead of \1 since bitrate is a number
- 'url': re.sub(r'(QualityLevels\()\d+(\))', fr'\g<1>{bitrate}\2', base_url),
- 'width': int_or_none(video_quality.attrib.get('MaxWidth')),
- 'tbr': bitrate / 1000.0,
- 'height': height,
- }
-
def _real_extract(self, url):
video_id = self._match_id(url)
video_info = self._download_json(
@@ -676,7 +646,6 @@ class CBCGemIE(InfoExtractor):
else:
headers = {}
m3u8_info = self._download_json(video_info['playSession']['url'], video_id, headers=headers)
- m3u8_url = m3u8_info.get('url')
if m3u8_info.get('errorCode') == 1:
self.raise_geo_restricted(countries=['CA'])
@@ -685,9 +654,9 @@ class CBCGemIE(InfoExtractor):
elif m3u8_info.get('errorCode') != 0:
raise ExtractorError(f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}')
- formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls')
+ formats = self._extract_m3u8_formats(
+ m3u8_info['url'], video_id, 'mp4', m3u8_id='hls', query={'manifestType': ''})
self._remove_duplicate_formats(formats)
- formats.extend(self._find_secret_formats(formats, video_id))
for fmt in formats:
if fmt.get('vcodec') == 'none':
@@ -703,20 +672,21 @@ class CBCGemIE(InfoExtractor):
return {
'id': video_id,
- 'title': video_info['title'],
- 'description': video_info.get('description'),
- 'thumbnail': video_info.get('image'),
- 'series': video_info.get('series'),
- 'season_number': video_info.get('season'),
- 'season': f'Season {video_info.get("season")}',
- 'episode_number': video_info.get('episode'),
- 'episode': video_info.get('title'),
'episode_id': video_id,
- 'duration': video_info.get('duration'),
- 'categories': [video_info.get('category')],
'formats': formats,
- 'release_timestamp': video_info.get('airDate'),
- 'timestamp': video_info.get('availableDate'),
+ **traverse_obj(video_info, {
+ 'title': ('title', {str}),
+ 'episode': ('title', {str}),
+ 'description': ('description', {str}),
+ 'thumbnail': ('image', {url_or_none}),
+ 'series': ('series', {str}),
+ 'season_number': ('season', {int_or_none}),
+ 'episode_number': ('episode', {int_or_none}),
+ 'duration': ('duration', {int_or_none}),
+ 'categories': ('category', {str}, all),
+ 'release_timestamp': ('airDate', {int_or_none(scale=1000)}),
+ 'timestamp': ('availableDate', {int_or_none(scale=1000)}),
+ }),
}