diff options
Diffstat (limited to 'youtube_dl/extractor/youtube.py')
-rw-r--r-- | youtube_dl/extractor/youtube.py | 115 |
1 files changed, 79 insertions, 36 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ce97fd75b..54073ef86 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -342,14 +342,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if not self._login(): return - _DEFAULT_API_DATA = { - 'context': { - 'client': { - 'clientName': 'WEB', - 'clientVersion': '2.20201021.03.00', - }, - }, - } + _DEFAULT_API_DATA = {'context': _INNERTUBE_CLIENTS['web']['INNERTUBE_CONTEXT']} _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;' _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' @@ -497,11 +490,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor): data['params'] = params for page_num in itertools.count(1): search = self._download_json( - 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'https://www.youtube.com/youtubei/v1/search', video_id='query "%s"' % query, note='Downloading page %s' % page_num, errnote='Unable to download API page', fatal=False, data=json.dumps(data).encode('utf8'), + query={ + # 'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'prettyPrint': 'false', + }, headers={'content-type': 'application/json'}) if not search: break @@ -1655,7 +1652,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): assert os.path.basename(func_id) == func_id self.write_debug('Extracting signature function {0}'.format(func_id)) - cache_spec, code = self.cache.load('youtube-sigfuncs', func_id), None + cache_spec, code = self.cache.load('youtube-sigfuncs', func_id, min_ver='2025.04.07'), None if not cache_spec: code = self._load_player(video_id, player_url, player_id) @@ -1816,6 +1813,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return ret def _extract_n_function_name(self, jscode): + func_name, idx = None, None + # these special cases are redundant and probably obsolete (2025-04): + # they make the tests run ~10% faster without fallback warnings + r""" func_name, idx = self._search_regex( # (y=NuD(),Mw(k),q=k.Z[y]||null)&&(q=narray[idx](q),k.set(y,q),k.V||NuD(''))}}; # (R="nn"[+J.Z],mW(J),N=J.K[R]||null)&&(N=narray[idx](N),J.set(R,N))}}; @@ -1842,9 +1843,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor): \(\s*[\w$]+\s*\) ''', jscode, 'Initial JS player n function name', group=('nfunc', 'idx'), default=(None, None)) + """ + + if not func_name: + # nfunc=function(x){...}|function nfunc(x); ... + # ... var y=[nfunc]|y[idx]=nfunc); + # obvious REs hang, so use a two-stage tactic + for m in re.finditer(r'''(?x) + [\n;]var\s(?:(?:(?!,).)+,|\s)*?(?!\d)[\w$]+(?:\[(?P<idx>\d+)\])?\s*=\s* + (?(idx)|\[\s*)(?P<nfunc>(?!\d)[\w$]+)(?(idx)|\s*\]) + \s*?[;\n] + ''', jscode): + func_name = self._search_regex( + r'[;,]\s*(function\s+)?({0})(?(1)|\s*=\s*function)\s*\((?!\d)[\w$]+\)\s*\{1}(?!\s*return\s)'.format( + re.escape(m.group('nfunc')), '{'), + jscode, 'Initial JS player n function name (2)', group=2, default=None) + if func_name: + idx = m.group('idx') + break + # thx bashonly: yt-dlp/yt-dlp/pull/10611 if not func_name: - self.report_warning('Falling back to generic n function search') + self.report_warning('Falling back to generic n function search', only_once=True) return self._search_regex( r'''(?xs) (?:(?<=[^\w$])|^) # instead of \b, which ignores $ @@ -1858,14 +1878,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return func_name return self._search_json( - r'var\s+{0}\s*='.format(re.escape(func_name)), jscode, + r'(?<![\w-])var\s(?:(?:(?!,).)+,|\s)*?{0}\s*='.format(re.escape(func_name)), jscode, 'Initial JS player n function list ({0}.{1})'.format(func_name, idx), - func_name, contains_pattern=r'\[[\s\S]+\]', end_pattern='[,;]', + func_name, contains_pattern=r'\[.+\]', end_pattern='[,;]', transform_source=js_to_json)[int(idx)] def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) - func_code = self.cache.load('youtube-nsig', player_id) + func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.04.07') jscode = func_code or self._load_player(video_id, player_url) jsi = JSInterpreter(jscode) @@ -3339,6 +3359,20 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'thumbnailViewModel', 'image'), final_key='sources'), }) + def _extract_shorts_lockup_view_model(self, view_model): + content_id = traverse_obj(view_model, ( + 'onTap', 'innertubeCommand', 'reelWatchEndpoint', 'videoId', + T(lambda v: v if YoutubeIE.suitable(v) else None))) + if not content_id: + return + return merge_dicts(self.url_result( + content_id, ie=YoutubeIE.ie_key(), video_id=content_id), { + 'title': traverse_obj(view_model, ( + 'overlayMetadata', 'primaryText', 'content', T(compat_str))), + 'thumbnails': self._extract_thumbnails( + view_model, 'thumbnail', final_key='sources'), + }) + def _video_entry(self, video_renderer): video_id = video_renderer.get('videoId') if video_id: @@ -3385,10 +3419,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield entry def _rich_grid_entries(self, contents): - for content in contents: - content = traverse_obj( - content, ('richItemRenderer', 'content'), - expected_type=dict) or {} + for content in traverse_obj( + contents, (Ellipsis, 'richItemRenderer', 'content'), + expected_type=dict): video_renderer = traverse_obj( content, 'videoRenderer', 'reelItemRenderer', expected_type=dict) @@ -3396,6 +3429,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): entry = self._video_entry(video_renderer) if entry: yield entry + # shorts item + shorts_lockup_view_model = content.get('shortsLockupViewModel') + if shorts_lockup_view_model: + entry = self._extract_shorts_lockup_view_model(shorts_lockup_view_model) + if entry: + yield entry # playlist renderer = traverse_obj( content, 'playlistRenderer', expected_type=dict) or {} @@ -3434,23 +3473,15 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): next_continuation = cls._extract_next_continuation_data(renderer) if next_continuation: return next_continuation - contents = [] - for key in ('contents', 'items'): - contents.extend(try_get(renderer, lambda x: x[key], list) or []) - for content in contents: - if not isinstance(content, dict): - continue - continuation_ep = try_get( - content, lambda x: x['continuationItemRenderer']['continuationEndpoint'], - dict) - if not continuation_ep: - continue - continuation = try_get( - continuation_ep, lambda x: x['continuationCommand']['token'], compat_str) + for command in traverse_obj(renderer, ( + ('contents', 'items', 'rows'), Ellipsis, 'continuationItemRenderer', + ('continuationEndpoint', ('button', 'buttonRenderer', 'command')), + (('commandExecutorCommand', 'commands', Ellipsis), None), T(dict))): + continuation = traverse_obj(command, ('continuationCommand', 'token', T(compat_str))) if not continuation: continue - ctp = continuation_ep.get('clickTrackingParams') - return YoutubeTabIE._build_continuation_query(continuation, ctp) + ctp = command.get('clickTrackingParams') + return cls._build_continuation_query(continuation, ctp) def _entries(self, tab, item_id, webpage): tab_content = try_get(tab, lambda x: x['content'], dict) @@ -3499,6 +3530,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): entry = self._video_entry(renderer) if entry: yield entry + renderer = isr_content.get('richGridRenderer') + if renderer: + for from_ in self._rich_grid_entries( + traverse_obj(renderer, ('contents', Ellipsis, T(dict)))): + yield from_ + continuation = self._extract_continuation(renderer) + continue if not continuation: continuation = self._extract_continuation(is_renderer) @@ -3508,8 +3546,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): rich_grid_renderer = tab_content.get('richGridRenderer') if not rich_grid_renderer: return - for entry in self._rich_grid_entries(rich_grid_renderer.get('contents') or []): - yield entry + for from_ in self._rich_grid_entries( + traverse_obj(rich_grid_renderer, ('contents', Ellipsis, T(dict)))): + yield from_ continuation = self._extract_continuation(rich_grid_renderer) @@ -3555,8 +3594,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): # Downloading page may result in intermittent 5xx HTTP error # that is usually worked around with a retry response = self._download_json( - 'https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'https://www.youtube.com/youtubei/v1/browse', None, 'Downloading page %d%s' % (page_num, ' (retry #%d)' % count if count else ''), + query={ + # 'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'prettyPrint': 'false', + }, headers=headers, data=json.dumps(data).encode('utf8')) break except ExtractorError as e: |