about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- README.md 3
-rw-r--r-- yt_dlp/extractor/youtube.py 257
2 files changed, 182 insertions, 78 deletions
diff --git a/README.md b/README.md
index ca32e09bf..428eb9f47 100644
--- a/README.md
+++ b/README.md
@@ -1777,6 +1777,9 @@ The following extractors use this feature:
* `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others
* `innertube_key`: Innertube API key to use for all API requests. By default, no API key is used
* `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning
+* `data_sync_id`: Overrides the account Data Sync ID used in Innertube API requests. This may be needed if you are using an account with `youtube:player_skip=webpage,configs` or `youtubetab:skip=webpage`
+* `visitor_data`: Overrides the Visitor Data used in Innertube API requests. This should be used with `player_skip=webpage,configs` and without cookies. Note: this may have adverse effects if used improperly. If a session from a browser is wanted, you should pass cookies instead (which contain the Visitor ID)
+* `po_token`: Proof of Origin (PO) Token(s) to use for requesting video playback. Comma-separated list of PO Tokens in the format `CLIENT+PO_TOKEN`, e.g. `youtube:po_token=web+XXX,android+YYY`
#### youtubetab (YouTube playlists, channels, feeds, etc.)
* `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details)
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index 2501398ba..343d103f6 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -69,6 +69,8 @@ from ..utils import (
)
STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client'
+STREAMING_DATA_PO_TOKEN = '__yt_dlp_po_token'
+
# any clients starting with _ cannot be explicitly requested by the user
INNERTUBE_CLIENTS = {
'web': {
@@ -79,6 +81,7 @@ INNERTUBE_CLIENTS = {
},
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 1,
+ 'REQUIRE_PO_TOKEN': True,
},
# Safari UA returns pre-merged video+audio 144p/240p/360p/720p/1080p HLS formats
'web_safari': {
@@ -90,6 +93,7 @@ INNERTUBE_CLIENTS = {
},
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 1,
+ 'REQUIRE_PO_TOKEN': True,
},
'web_embedded': {
'INNERTUBE_CONTEXT': {
@@ -132,6 +136,7 @@ INNERTUBE_CLIENTS = {
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
'REQUIRE_JS_PLAYER': False,
+ 'REQUIRE_PO_TOKEN': True,
},
'android_music': {
'INNERTUBE_CONTEXT': {
@@ -146,6 +151,7 @@ INNERTUBE_CLIENTS = {
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 21,
'REQUIRE_JS_PLAYER': False,
+ 'REQUIRE_PO_TOKEN': True,
},
'android_creator': {
'INNERTUBE_CONTEXT': {
@@ -160,6 +166,7 @@ INNERTUBE_CLIENTS = {
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 14,
'REQUIRE_JS_PLAYER': False,
+ 'REQUIRE_PO_TOKEN': True,
},
# YouTube Kids videos aren't returned on this client for some reason
'android_vr': {
@@ -323,6 +330,7 @@ def build_innertube_clients():
for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()):
ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com')
ytcfg.setdefault('REQUIRE_JS_PLAYER', True)
+ ytcfg.setdefault('REQUIRE_PO_TOKEN', False)
ytcfg.setdefault('PLAYER_PARAMS', None)
ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en')
@@ -688,31 +696,46 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
'identity token', default=None, fatal=False)
- @staticmethod
- def _extract_account_syncid(*args):
+ def _data_sync_id_to_delegated_session_id(self, data_sync_id):
+ if not data_sync_id:
+ return
+ # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
+ # and just "user_syncid||" for primary channel. We only want the channel_syncid
+ channel_syncid, _, user_syncid = data_sync_id.partition('||')
+ if user_syncid:
+ return channel_syncid
+
+ def _extract_account_syncid(self, *args):
"""
- Extract syncId required to download private playlists of secondary channels
+ Extract current session ID required to download private playlists of secondary channels
@params response and/or ytcfg
"""
- for data in args:
- # ytcfg includes channel_syncid if on secondary channel
- delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], str)
- if delegated_sid:
- return delegated_sid
- sync_ids = (try_get(
- data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'],
- lambda x: x['DATASYNC_ID']), str) or '').split('||')
- if len(sync_ids) >= 2 and sync_ids[1]:
- # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
- # and just "user_syncid||" for primary channel. We only want the channel_syncid
- return sync_ids[0]
+ # ytcfg includes channel_syncid if on secondary channel
+ if delegated_sid := traverse_obj(args, (..., 'DELEGATED_SESSION_ID', {str}, any)):
+ return delegated_sid
- @staticmethod
- def _extract_visitor_data(*args):
+ data_sync_id = self._extract_data_sync_id(*args)
+ return self._data_sync_id_to_delegated_session_id(data_sync_id)
+
+ def _extract_data_sync_id(self, *args):
+ """
+ Extract current account dataSyncId.
+ In the format DELEGATED_SESSION_ID||USER_SESSION_ID or USER_SESSION_ID||
+ @params response and/or ytcfg
+ """
+ if data_sync_id := self._configuration_arg('data_sync_id', [None], ie_key=YoutubeIE, casesense=True)[0]:
+ return data_sync_id
+
+ return traverse_obj(
+ args, (..., ('DATASYNC_ID', ('responseContext', 'mainAppWebResponseContext', 'datasyncId')), {str}, any))
+
+ def _extract_visitor_data(self, *args):
"""
Extracts visitorData from an API response or ytcfg
Appears to be used to track session state
"""
+ if visitor_data := self._configuration_arg('visitor_data', [None], ie_key=YoutubeIE, casesense=True)[0]:
+ return visitor_data
return get_first(
args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))],
expected_type=str)
@@ -1334,11 +1357,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'401': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'},
}
_SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
- _POTOKEN_EXPERIMENTS = ('51217476', '51217102')
- _BROKEN_CLIENTS = {
- short_client_name(client): client
- for client in ('android', 'android_creator', 'android_music')
- }
_DEFAULT_CLIENTS = ('ios', 'web_creator')
_GEO_BYPASS = False
@@ -3701,6 +3719,54 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
**cls._get_checkok_params(),
}
+ def _get_config_po_token(self, client):
+ po_token_strs = self._configuration_arg('po_token', [], ie_key=YoutubeIE, casesense=True)
+ for token_str in po_token_strs:
+ po_token_client, sep, po_token = token_str.partition('+')
+ if not sep:
+ self.report_warning(
+ f'Invalid po_token configuration format. Expected "client+po_token", got "{token_str}"', only_once=True)
+ continue
+ if po_token_client == client:
+ return po_token
+
+ def fetch_po_token(self, client='web', visitor_data=None, data_sync_id=None, player_url=None, **kwargs):
+ # PO Token is bound to visitor_data / Visitor ID when logged out. Must have visitor_data for it to function.
+ if not visitor_data and not self.is_authenticated and player_url:
+ self.report_warning(
+ f'Unable to fetch PO Token for {client} client: Missing required Visitor Data. '
+ f'You may need to pass Visitor Data with --extractor-args "youtube:visitor_data=XXX"')
+ return
+
+ config_po_token = self._get_config_po_token(client)
+ if config_po_token:
+ # PO token is bound to data_sync_id / account Session ID when logged in. However, for the config po_token,
+ # if using first channel in an account then we don't need the data_sync_id anymore...
+ if not data_sync_id and self.is_authenticated and player_url:
+ self.report_warning(
+ f'Got a PO Token for {client} client, but missing Data Sync ID for account. Formats may not work. '
+ f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"')
+
+ return config_po_token
+
+ # Require Data Sync ID if logged in for external fetching (the PO Token is bound to it)
+ if not data_sync_id and self.is_authenticated and player_url:
+ self.report_warning(
+ f'Unable to fetch PO Token for {client} client: Missing required Data Sync ID for account. '
+ f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"')
+ return
+
+ return self._fetch_po_token(
+ client=client,
+ visitor_data=visitor_data,
+ data_sync_id=data_sync_id,
+ player_url=player_url,
+ **kwargs,
+ )
+
+ def _fetch_po_token(self, client, visitor_data=None, data_sync_id=None, player_url=None, **kwargs):
+ """External PO Token fetch stub"""
+
@staticmethod
def _is_agegated(player_response):
if traverse_obj(player_response, ('playabilityStatus', 'desktopLegacyAgeGateReason')):
@@ -3717,13 +3783,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _is_unplayable(player_response):
return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE'
- def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, smuggled_data):
-
- session_index = self._extract_session_index(player_ytcfg, master_ytcfg)
- syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr)
- sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None
+ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, visitor_data, data_sync_id, po_token):
headers = self.generate_api_headers(
- ytcfg=player_ytcfg, account_syncid=syncid, session_index=session_index, default_client=client)
+ ytcfg=player_ytcfg,
+ default_client=client,
+ visitor_data=visitor_data,
+ session_index=self._extract_session_index(master_ytcfg, player_ytcfg),
+ account_syncid=(
+ self._data_sync_id_to_delegated_session_id(data_sync_id)
+ or self._extract_account_syncid(master_ytcfg, initial_pr, player_ytcfg)
+ ),
+ )
yt_query = {
'videoId': video_id,
@@ -3734,6 +3804,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if player_params := self._configuration_arg('player_params', [default_pp], casesense=True)[0]:
yt_query['params'] = player_params
+ if po_token:
+ yt_query['serviceIntegrityDimensions'] = {'poToken': po_token}
+
+ sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None
yt_query.update(self._generate_player_context(sts))
return self._extract_response(
item_id=video_id, ep='player', query=yt_query,
@@ -3744,7 +3818,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _get_requested_clients(self, url, smuggled_data):
requested_clients = []
- broken_clients = []
excluded_clients = []
allowed_clients = sorted(
(client for client in INNERTUBE_CLIENTS if client[:1] != '_'),
@@ -3758,12 +3831,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
excluded_clients.append(client[1:])
elif client not in allowed_clients:
self.report_warning(f'Skipping unsupported client "{client}"')
- elif client in self._BROKEN_CLIENTS.values():
- broken_clients.append(client)
else:
requested_clients.append(client)
- # Force deprioritization of _BROKEN_CLIENTS for format de-duplication
- requested_clients.extend(broken_clients)
if not requested_clients:
requested_clients.extend(self._DEFAULT_CLIENTS)
for excluded_client in excluded_clients:
@@ -3788,19 +3857,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return pr_id
def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, smuggled_data):
- initial_pr = ignore_initial_response = None
+ initial_pr = None
if webpage:
- if 'web' in clients:
- experiments = traverse_obj(master_ytcfg, (
- 'WEB_PLAYER_CONTEXT_CONFIGS', ..., 'serializedExperimentIds', {lambda x: x.split(',')}, ...))
- if all(x in experiments for x in self._POTOKEN_EXPERIMENTS):
- self.report_warning(
- 'Webpage contains broken formats (poToken experiment detected). Ignoring initial player response')
- ignore_initial_response = True
initial_pr = self._search_json(
self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', video_id, fatal=False)
prs = []
+ deprioritized_prs = []
+
if initial_pr and not self._invalid_player_response(initial_pr, video_id):
# Android player_response does not have microFormats which are needed for
# extraction of some data. So we return the initial_pr with formats
@@ -3822,14 +3886,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return
tried_iframe_fallback = False
- player_url = None
+ player_url = visitor_data = data_sync_id = None
skipped_clients = {}
while clients:
+ deprioritize_pr = False
client, base_client, variant = _split_innertube_client(clients.pop())
- player_ytcfg = {}
- if client == 'web':
- player_ytcfg = self._get_default_ytcfg() if ignore_initial_response else master_ytcfg
- elif 'configs' not in self._configuration_arg('player_skip'):
+ player_ytcfg = master_ytcfg if client == 'web' else {}
+ if 'configs' not in self._configuration_arg('player_skip') and client != 'web':
player_ytcfg = self._download_ytcfg(client, video_id) or player_ytcfg
player_url = player_url or self._extract_player_url(master_ytcfg, player_ytcfg, webpage=webpage)
@@ -3842,34 +3905,53 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
player_url = self._download_player_url(video_id)
tried_iframe_fallback = True
- pr = initial_pr if client == 'web' and not ignore_initial_response else None
- for retry in self.RetryManager(fatal=False):
- try:
- pr = pr or self._extract_player_response(
- client, video_id, player_ytcfg or master_ytcfg, player_ytcfg,
- player_url if require_js_player else None, initial_pr, smuggled_data)
- except ExtractorError as e:
- self.report_warning(e)
- break
- experiments = traverse_obj(pr, (
- 'responseContext', 'serviceTrackingParams', lambda _, v: v['service'] == 'GFEEDBACK',
- 'params', lambda _, v: v['key'] == 'e', 'value', {lambda x: x.split(',')}, ...))
- if all(x in experiments for x in self._POTOKEN_EXPERIMENTS):
- pr = None
- retry.error = ExtractorError('API returned broken formats (poToken experiment detected)', expected=True)
- if not pr:
+ visitor_data = visitor_data or self._extract_visitor_data(master_ytcfg, initial_pr, player_ytcfg)
+ data_sync_id = data_sync_id or self._extract_data_sync_id(master_ytcfg, initial_pr, player_ytcfg)
+ po_token = self.fetch_po_token(
+ client=client, visitor_data=visitor_data,
+ data_sync_id=data_sync_id if self.is_authenticated else None,
+ player_url=player_url if require_js_player else None,
+ )
+
+ require_po_token = self._get_default_ytcfg(client).get('REQUIRE_PO_TOKEN')
+ if not po_token and require_po_token:
+ self.report_warning(
+ f'No PO Token provided for {client} client, '
+ f'which is required for working {client} formats. '
+ f'You can manually pass a PO Token for this client with '
+ f'--extractor-args "youtube:po_token={client}+XXX"',
+ only_once=True)
+ deprioritize_pr = True
+
+ pr = initial_pr if client == 'web' else None
+ try:
+ pr = pr or self._extract_player_response(
+ client, video_id,
+ master_ytcfg=player_ytcfg or master_ytcfg,
+ player_ytcfg=player_ytcfg,
+ player_url=player_url,
+ initial_pr=initial_pr,
+ visitor_data=visitor_data,
+ data_sync_id=data_sync_id,
+ po_token=po_token)
+ except ExtractorError as e:
+ self.report_warning(e)
continue
if pr_id := self._invalid_player_response(pr, video_id):
skipped_clients[client] = pr_id
elif pr:
# Save client name for introspection later
- name = short_client_name(client)
sd = traverse_obj(pr, ('streamingData', {dict})) or {}
- sd[STREAMING_DATA_CLIENT_NAME] = name
+ sd[STREAMING_DATA_CLIENT_NAME] = client
+ sd[STREAMING_DATA_PO_TOKEN] = po_token
for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})):
- f[STREAMING_DATA_CLIENT_NAME] = name
- prs.append(pr)
+ f[STREAMING_DATA_CLIENT_NAME] = client
+ f[STREAMING_DATA_PO_TOKEN] = po_token
+ if deprioritize_pr:
+ deprioritized_prs.append(pr)
+ else:
+ prs.append(pr)
# tv_embedded can work around age-gate and age-verification IF the video is embeddable
if self._is_agegated(pr) and variant != 'tv_embedded':
@@ -3893,6 +3975,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# _producer, _testsuite, & _vr variants can also work around age-verification
append_client('web_creator', 'mediaconnect')
+ prs.extend(deprioritized_prs)
+
if skipped_clients:
self.report_warning(
f'Skipping player responses from {"/".join(skipped_clients)} clients '
@@ -4027,13 +4111,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True)
client_name = fmt.get(STREAMING_DATA_CLIENT_NAME)
- # _BROKEN_CLIENTS return videoplayback URLs that expire after 30 seconds
- # Ref: https://github.com/yt-dlp/yt-dlp/issues/9554
- is_broken = client_name in self._BROKEN_CLIENTS
+ po_token = fmt.get(STREAMING_DATA_PO_TOKEN)
+
+ if po_token:
+ fmt_url = update_url_query(fmt_url, {'pot': po_token})
+
+ # Clients that require PO Token return videoplayback URLs that may return 403
+ is_broken = (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN'))
if is_broken:
self.report_warning(
- f'{video_id}: {self._BROKEN_CLIENTS[client_name]} client formats are broken '
- 'and may yield HTTP Error 403. They will be deprioritized', only_once=True)
+ f'{video_id}: {client_name} client formats require a PO Token which was not provided. '
+ 'They will be deprioritized as they may yield HTTP Error 403', only_once=True)
name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or ''
fps = int_or_none(fmt.get('fps')) or 0
@@ -4109,12 +4197,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
elif skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live':
skip_manifests.add('dash')
- def process_manifest_format(f, proto, client_name, itag):
+ def process_manifest_format(f, proto, client_name, itag, po_token):
key = (proto, f.get('language'))
if not all_formats and key in itags[itag]:
return False
itags[itag].add(key)
+ if f.get('source_preference') is None:
+ f['source_preference'] = -1
+
+ # Clients that require PO Token return videoplayback URLs that may return 403
+ # hls does not currently require PO Token
+ if (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) and proto != 'hls':
+ self.report_warning(
+ f'{video_id}: {client_name} client {proto} formats require a PO Token which was not provided. '
+ 'They will be deprioritized as they may yield HTTP Error 403', only_once=True)
+ f['format_note'] = join_nonempty(f.get('format_note'), 'BROKEN', delim=' ')
+ f['source_preference'] -= 20
+
if itag and all_formats:
f['format_id'] = f'{itag}-{proto}'
elif any(p != proto for p, _ in itags[itag]):
@@ -4126,9 +4226,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
f['format_note'] = join_nonempty(f.get('format_note'), '(default)', delim=' ')
f['language_preference'] = PREFERRED_LANG_VALUE
- if f.get('source_preference') is None:
- f['source_preference'] = -1
-
if itag in ('616', '235'):
f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ')
f['source_preference'] += 100
@@ -4149,23 +4246,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
subtitles = {}
for sd in streaming_data:
client_name = sd.get(STREAMING_DATA_CLIENT_NAME)
-
+ po_token = sd.get(STREAMING_DATA_PO_TOKEN)
hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl')
if hls_manifest_url:
+ if po_token:
+ hls_manifest_url = hls_manifest_url.rstrip('/') + f'/pot/{po_token}'
fmts, subs = self._extract_m3u8_formats_and_subtitles(
hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live')
subtitles = self._merge_subtitles(subs, subtitles)
for f in fmts:
if process_manifest_format(f, 'hls', client_name, self._search_regex(
- r'/itag/(\d+)', f['url'], 'itag', default=None)):
+ r'/itag/(\d+)', f['url'], 'itag', default=None), po_token):
yield f
dash_manifest_url = 'dash' not in skip_manifests and sd.get('dashManifestUrl')
if dash_manifest_url:
+ if po_token:
+ dash_manifest_url = dash_manifest_url.rstrip('/') + f'/pot/{po_token}'
formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False)
subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH
for f in formats:
- if process_manifest_format(f, 'dash', client_name, f['format_id']):
+ if process_manifest_format(f, 'dash', client_name, f['format_id'], po_token):
f['filesize'] = int_or_none(self._search_regex(
r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None))
if needs_live_processing: