aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/youtube.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/youtube.py')
-rw-r--r--youtube_dl/extractor/youtube.py115
1 files changed, 79 insertions, 36 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index ce97fd75b..54073ef86 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -342,14 +342,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
if not self._login():
return
- _DEFAULT_API_DATA = {
- 'context': {
- 'client': {
- 'clientName': 'WEB',
- 'clientVersion': '2.20201021.03.00',
- },
- },
- }
+ _DEFAULT_API_DATA = {'context': _INNERTUBE_CLIENTS['web']['INNERTUBE_CONTEXT']}
_YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
_YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
@@ -497,11 +490,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
data['params'] = params
for page_num in itertools.count(1):
search = self._download_json(
- 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'https://www.youtube.com/youtubei/v1/search',
video_id='query "%s"' % query,
note='Downloading page %s' % page_num,
errnote='Unable to download API page', fatal=False,
data=json.dumps(data).encode('utf8'),
+ query={
+ # 'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'prettyPrint': 'false',
+ },
headers={'content-type': 'application/json'})
if not search:
break
@@ -1655,7 +1652,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
assert os.path.basename(func_id) == func_id
self.write_debug('Extracting signature function {0}'.format(func_id))
- cache_spec, code = self.cache.load('youtube-sigfuncs', func_id), None
+ cache_spec, code = self.cache.load('youtube-sigfuncs', func_id, min_ver='2025.04.07'), None
if not cache_spec:
code = self._load_player(video_id, player_url, player_id)
@@ -1816,6 +1813,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return ret
def _extract_n_function_name(self, jscode):
+ func_name, idx = None, None
+ # these special cases are redundant and probably obsolete (2025-04):
+ # they make the tests run ~10% faster without fallback warnings
+ r"""
func_name, idx = self._search_regex(
# (y=NuD(),Mw(k),q=k.Z[y]||null)&&(q=narray[idx](q),k.set(y,q),k.V||NuD(''))}};
# (R="nn"[+J.Z],mW(J),N=J.K[R]||null)&&(N=narray[idx](N),J.set(R,N))}};
@@ -1842,9 +1843,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
\(\s*[\w$]+\s*\)
''', jscode, 'Initial JS player n function name', group=('nfunc', 'idx'),
default=(None, None))
+ """
+
+ if not func_name:
+ # nfunc=function(x){...}|function nfunc(x); ...
+ # ... var y=[nfunc]|y[idx]=nfunc);
+ # obvious REs hang, so use a two-stage tactic
+ for m in re.finditer(r'''(?x)
+ [\n;]var\s(?:(?:(?!,).)+,|\s)*?(?!\d)[\w$]+(?:\[(?P<idx>\d+)\])?\s*=\s*
+ (?(idx)|\[\s*)(?P<nfunc>(?!\d)[\w$]+)(?(idx)|\s*\])
+ \s*?[;\n]
+ ''', jscode):
+ func_name = self._search_regex(
+ r'[;,]\s*(function\s+)?({0})(?(1)|\s*=\s*function)\s*\((?!\d)[\w$]+\)\s*\{1}(?!\s*return\s)'.format(
+ re.escape(m.group('nfunc')), '{'),
+ jscode, 'Initial JS player n function name (2)', group=2, default=None)
+ if func_name:
+ idx = m.group('idx')
+ break
+
# thx bashonly: yt-dlp/yt-dlp/pull/10611
if not func_name:
- self.report_warning('Falling back to generic n function search')
+ self.report_warning('Falling back to generic n function search', only_once=True)
return self._search_regex(
r'''(?xs)
(?:(?<=[^\w$])|^) # instead of \b, which ignores $
@@ -1858,14 +1878,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return func_name
return self._search_json(
- r'var\s+{0}\s*='.format(re.escape(func_name)), jscode,
+ r'(?<![\w-])var\s(?:(?:(?!,).)+,|\s)*?{0}\s*='.format(re.escape(func_name)), jscode,
'Initial JS player n function list ({0}.{1})'.format(func_name, idx),
- func_name, contains_pattern=r'\[[\s\S]+\]', end_pattern='[,;]',
+ func_name, contains_pattern=r'\[.+\]', end_pattern='[,;]',
transform_source=js_to_json)[int(idx)]
def _extract_n_function_code(self, video_id, player_url):
player_id = self._extract_player_info(player_url)
- func_code = self.cache.load('youtube-nsig', player_id)
+ func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.04.07')
jscode = func_code or self._load_player(video_id, player_url)
jsi = JSInterpreter(jscode)
@@ -3339,6 +3359,20 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'thumbnailViewModel', 'image'), final_key='sources'),
})
+ def _extract_shorts_lockup_view_model(self, view_model):
+ content_id = traverse_obj(view_model, (
+ 'onTap', 'innertubeCommand', 'reelWatchEndpoint', 'videoId',
+ T(lambda v: v if YoutubeIE.suitable(v) else None)))
+ if not content_id:
+ return
+ return merge_dicts(self.url_result(
+ content_id, ie=YoutubeIE.ie_key(), video_id=content_id), {
+ 'title': traverse_obj(view_model, (
+ 'overlayMetadata', 'primaryText', 'content', T(compat_str))),
+ 'thumbnails': self._extract_thumbnails(
+ view_model, 'thumbnail', final_key='sources'),
+ })
+
def _video_entry(self, video_renderer):
video_id = video_renderer.get('videoId')
if video_id:
@@ -3385,10 +3419,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
yield entry
def _rich_grid_entries(self, contents):
- for content in contents:
- content = traverse_obj(
- content, ('richItemRenderer', 'content'),
- expected_type=dict) or {}
+ for content in traverse_obj(
+ contents, (Ellipsis, 'richItemRenderer', 'content'),
+ expected_type=dict):
video_renderer = traverse_obj(
content, 'videoRenderer', 'reelItemRenderer',
expected_type=dict)
@@ -3396,6 +3429,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
entry = self._video_entry(video_renderer)
if entry:
yield entry
+ # shorts item
+ shorts_lockup_view_model = content.get('shortsLockupViewModel')
+ if shorts_lockup_view_model:
+ entry = self._extract_shorts_lockup_view_model(shorts_lockup_view_model)
+ if entry:
+ yield entry
# playlist
renderer = traverse_obj(
content, 'playlistRenderer', expected_type=dict) or {}
@@ -3434,23 +3473,15 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
next_continuation = cls._extract_next_continuation_data(renderer)
if next_continuation:
return next_continuation
- contents = []
- for key in ('contents', 'items'):
- contents.extend(try_get(renderer, lambda x: x[key], list) or [])
- for content in contents:
- if not isinstance(content, dict):
- continue
- continuation_ep = try_get(
- content, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
- dict)
- if not continuation_ep:
- continue
- continuation = try_get(
- continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
+ for command in traverse_obj(renderer, (
+ ('contents', 'items', 'rows'), Ellipsis, 'continuationItemRenderer',
+ ('continuationEndpoint', ('button', 'buttonRenderer', 'command')),
+ (('commandExecutorCommand', 'commands', Ellipsis), None), T(dict))):
+ continuation = traverse_obj(command, ('continuationCommand', 'token', T(compat_str)))
if not continuation:
continue
- ctp = continuation_ep.get('clickTrackingParams')
- return YoutubeTabIE._build_continuation_query(continuation, ctp)
+ ctp = command.get('clickTrackingParams')
+ return cls._build_continuation_query(continuation, ctp)
def _entries(self, tab, item_id, webpage):
tab_content = try_get(tab, lambda x: x['content'], dict)
@@ -3499,6 +3530,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
entry = self._video_entry(renderer)
if entry:
yield entry
+ renderer = isr_content.get('richGridRenderer')
+ if renderer:
+ for from_ in self._rich_grid_entries(
+ traverse_obj(renderer, ('contents', Ellipsis, T(dict)))):
+ yield from_
+ continuation = self._extract_continuation(renderer)
+ continue
if not continuation:
continuation = self._extract_continuation(is_renderer)
@@ -3508,8 +3546,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
rich_grid_renderer = tab_content.get('richGridRenderer')
if not rich_grid_renderer:
return
- for entry in self._rich_grid_entries(rich_grid_renderer.get('contents') or []):
- yield entry
+ for from_ in self._rich_grid_entries(
+ traverse_obj(rich_grid_renderer, ('contents', Ellipsis, T(dict)))):
+ yield from_
continuation = self._extract_continuation(rich_grid_renderer)
@@ -3555,8 +3594,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
# Downloading page may result in intermittent 5xx HTTP error
# that is usually worked around with a retry
response = self._download_json(
- 'https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'https://www.youtube.com/youtubei/v1/browse',
None, 'Downloading page %d%s' % (page_num, ' (retry #%d)' % count if count else ''),
+ query={
+ # 'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'prettyPrint': 'false',
+ },
headers=headers, data=json.dumps(data).encode('utf8'))
break
except ExtractorError as e: