aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--youtube_dl/extractor/common.py90
-rw-r--r--youtube_dl/extractor/youtube.py115
2 files changed, 121 insertions, 84 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index cb67b976d..a64fcfccc 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -505,7 +505,7 @@ class InfoExtractor(object):
if not self._x_forwarded_for_ip:
# Geo bypass mechanism is explicitly disabled by user
- if not self._downloader.params.get('geo_bypass', True):
+ if not self.get_param('geo_bypass', True):
return
if not geo_bypass_context:
@@ -527,7 +527,7 @@ class InfoExtractor(object):
# Explicit IP block specified by user, use it right away
# regardless of whether extractor is geo bypassable or not
- ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
+ ip_block = self.get_param('geo_bypass_ip_block', None)
# Otherwise use random IP block from geo bypass context but only
# if extractor is known as geo bypassable
@@ -538,8 +538,8 @@ class InfoExtractor(object):
if ip_block:
self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
- if self._downloader.params.get('verbose', False):
- self._downloader.to_screen(
+ if self.get_param('verbose', False):
+ self.to_screen(
'[debug] Using fake IP %s as X-Forwarded-For.'
% self._x_forwarded_for_ip)
return
@@ -548,7 +548,7 @@ class InfoExtractor(object):
# Explicit country code specified by user, use it right away
# regardless of whether extractor is geo bypassable or not
- country = self._downloader.params.get('geo_bypass_country', None)
+ country = self.get_param('geo_bypass_country', None)
# Otherwise use random country code from geo bypass context but
# only if extractor is known as geo bypassable
@@ -559,8 +559,8 @@ class InfoExtractor(object):
if country:
self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
- if self._downloader.params.get('verbose', False):
- self._downloader.to_screen(
+ if self.get_param('verbose', False):
+ self.to_screen(
'[debug] Using fake IP %s (%s) as X-Forwarded-For.'
% (self._x_forwarded_for_ip, country.upper()))
@@ -586,9 +586,9 @@ class InfoExtractor(object):
raise ExtractorError('An extractor error has occurred.', cause=e)
def __maybe_fake_ip_and_retry(self, countries):
- if (not self._downloader.params.get('geo_bypass_country', None)
+ if (not self.get_param('geo_bypass_country', None)
and self._GEO_BYPASS
- and self._downloader.params.get('geo_bypass', True)
+ and self.get_param('geo_bypass', True)
and not self._x_forwarded_for_ip
and countries):
country_code = random.choice(countries)
@@ -698,7 +698,7 @@ class InfoExtractor(object):
if fatal:
raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
else:
- self._downloader.report_warning(errmsg)
+ self.report_warning(errmsg)
return False
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
@@ -770,11 +770,11 @@ class InfoExtractor(object):
webpage_bytes = prefix + webpage_bytes
if not encoding:
encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
- if self._downloader.params.get('dump_intermediate_pages', False):
+ if self.get_param('dump_intermediate_pages', False):
self.to_screen('Dumping request to ' + urlh.geturl())
dump = base64.b64encode(webpage_bytes).decode('ascii')
- self._downloader.to_screen(dump)
- if self._downloader.params.get('write_pages', False):
+ self.to_screen(dump)
+ if self.get_param('write_pages', False):
basen = '%s_%s' % (video_id, urlh.geturl())
if len(basen) > 240:
h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
@@ -976,19 +976,9 @@ class InfoExtractor(object):
"""Print msg to screen, prefixing it with '[ie_name]'"""
self._downloader.to_screen(self.__ie_msg(msg))
- def write_debug(self, msg, only_once=False, _cache=[]):
+ def write_debug(self, msg, only_once=False):
'''Log debug message or Print message to stderr'''
- if not self.get_param('verbose', False):
- return
- message = '[debug] ' + self.__ie_msg(msg)
- logger = self.get_param('logger')
- if logger:
- logger.debug(message)
- else:
- if only_once and hash(message) in _cache:
- return
- self._downloader.to_stderr(message)
- _cache.append(hash(message))
+ self._downloader.write_debug(self.__ie_msg(msg), only_once=only_once)
# name, default=None, *args, **kwargs
def get_param(self, name, *args, **kwargs):
@@ -1084,7 +1074,7 @@ class InfoExtractor(object):
if mobj:
break
- if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
+ if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
_name = '\033[0;34m%s\033[0m' % name
else:
_name = name
@@ -1102,7 +1092,7 @@ class InfoExtractor(object):
elif fatal:
raise RegexNotFoundError('Unable to extract %s' % _name)
else:
- self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
+ self.report_warning('unable to extract %s' % _name + bug_reports_message())
return None
def _search_json(self, start_pattern, string, name, video_id, **kwargs):
@@ -1172,7 +1162,7 @@ class InfoExtractor(object):
username = None
password = None
- if self._downloader.params.get('usenetrc', False):
+ if self.get_param('usenetrc', False):
try:
netrc_machine = netrc_machine or self._NETRC_MACHINE
info = netrc.netrc().authenticators(netrc_machine)
@@ -1183,7 +1173,7 @@ class InfoExtractor(object):
raise netrc.NetrcParseError(
'No authenticators for %s' % netrc_machine)
except (AttributeError, IOError, netrc.NetrcParseError) as err:
- self._downloader.report_warning(
+ self.report_warning(
'parsing .netrc: %s' % error_to_compat_str(err))
return username, password
@@ -1220,10 +1210,10 @@ class InfoExtractor(object):
"""
if self._downloader is None:
return None
- downloader_params = self._downloader.params
- if downloader_params.get('twofactor') is not None:
- return downloader_params['twofactor']
+ twofactor = self.get_param('twofactor')
+ if twofactor is not None:
+ return twofactor
return compat_getpass('Type %s and press [Return]: ' % note)
@@ -1358,7 +1348,7 @@ class InfoExtractor(object):
elif fatal:
raise RegexNotFoundError('Unable to extract JSON-LD')
else:
- self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
+ self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
return {}
def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
@@ -1589,7 +1579,7 @@ class InfoExtractor(object):
if f.get('vcodec') == 'none': # audio only
preference -= 50
- if self._downloader.params.get('prefer_free_formats'):
+ if self.get_param('prefer_free_formats'):
ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
else:
ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
@@ -1601,7 +1591,7 @@ class InfoExtractor(object):
else:
if f.get('acodec') == 'none': # video only
preference -= 40
- if self._downloader.params.get('prefer_free_formats'):
+ if self.get_param('prefer_free_formats'):
ORDER = ['flv', 'mp4', 'webm']
else:
ORDER = ['webm', 'flv', 'mp4']
@@ -1667,7 +1657,7 @@ class InfoExtractor(object):
""" Either "http:" or "https:", depending on the user's preferences """
return (
'http:'
- if self._downloader.params.get('prefer_insecure', False)
+ if self.get_param('prefer_insecure', False)
else 'https:')
def _proto_relative_url(self, url, scheme=None):
@@ -3199,7 +3189,7 @@ class InfoExtractor(object):
if fatal:
raise ExtractorError(msg)
else:
- self._downloader.report_warning(msg)
+ self.report_warning(msg)
return res
def _float(self, v, name, fatal=False, **kwargs):
@@ -3209,7 +3199,7 @@ class InfoExtractor(object):
if fatal:
raise ExtractorError(msg)
else:
- self._downloader.report_warning(msg)
+ self.report_warning(msg)
return res
def _set_cookie(self, domain, name, value, expire_time=None, port=None,
@@ -3218,12 +3208,12 @@ class InfoExtractor(object):
0, name, value, port, port is not None, domain, True,
domain.startswith('.'), path, True, secure, expire_time,
discard, None, None, rest)
- self._downloader.cookiejar.set_cookie(cookie)
+ self.cookiejar.set_cookie(cookie)
def _get_cookies(self, url):
""" Return a compat_cookies_SimpleCookie with the cookies for the url """
req = sanitized_Request(url)
- self._downloader.cookiejar.add_cookie_header(req)
+ self.cookiejar.add_cookie_header(req)
return compat_cookies_SimpleCookie(req.get_header('Cookie'))
def _apply_first_set_cookie_header(self, url_handle, cookie):
@@ -3283,8 +3273,8 @@ class InfoExtractor(object):
return not any_restricted
def extract_subtitles(self, *args, **kwargs):
- if (self._downloader.params.get('writesubtitles', False)
- or self._downloader.params.get('listsubtitles')):
+ if (self.get_param('writesubtitles', False)
+ or self.get_param('listsubtitles')):
return self._get_subtitles(*args, **kwargs)
return {}
@@ -3305,7 +3295,11 @@ class InfoExtractor(object):
""" Merge subtitle dictionaries, language by language. """
# ..., * , target=None
- target = kwargs.get('target') or dict(subtitle_dict1)
+ target = kwargs.get('target')
+ if target is None:
+ target = dict(subtitle_dict1)
+ else:
+ subtitle_dicts = (subtitle_dict1,) + subtitle_dicts
for subtitle_dict in subtitle_dicts:
for lang in subtitle_dict:
@@ -3313,8 +3307,8 @@ class InfoExtractor(object):
return target
def extract_automatic_captions(self, *args, **kwargs):
- if (self._downloader.params.get('writeautomaticsub', False)
- or self._downloader.params.get('listsubtitles')):
+ if (self.get_param('writeautomaticsub', False)
+ or self.get_param('listsubtitles')):
return self._get_automatic_captions(*args, **kwargs)
return {}
@@ -3322,9 +3316,9 @@ class InfoExtractor(object):
raise NotImplementedError('This method must be implemented by subclasses')
def mark_watched(self, *args, **kwargs):
- if (self._downloader.params.get('mark_watched', False)
+ if (self.get_param('mark_watched', False)
and (self._get_login_info()[0] is not None
- or self._downloader.params.get('cookiefile') is not None)):
+ or self.get_param('cookiefile') is not None)):
self._mark_watched(*args, **kwargs)
def _mark_watched(self, *args, **kwargs):
@@ -3332,7 +3326,7 @@ class InfoExtractor(object):
def geo_verification_headers(self):
headers = {}
- geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
+ geo_verification_proxy = self.get_param('geo_verification_proxy')
if geo_verification_proxy:
headers['Ytdl-request-proxy'] = geo_verification_proxy
return headers
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index ce97fd75b..54073ef86 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -342,14 +342,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
if not self._login():
return
- _DEFAULT_API_DATA = {
- 'context': {
- 'client': {
- 'clientName': 'WEB',
- 'clientVersion': '2.20201021.03.00',
- },
- },
- }
+ _DEFAULT_API_DATA = {'context': _INNERTUBE_CLIENTS['web']['INNERTUBE_CONTEXT']}
_YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
_YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
@@ -497,11 +490,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
data['params'] = params
for page_num in itertools.count(1):
search = self._download_json(
- 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'https://www.youtube.com/youtubei/v1/search',
video_id='query "%s"' % query,
note='Downloading page %s' % page_num,
errnote='Unable to download API page', fatal=False,
data=json.dumps(data).encode('utf8'),
+ query={
+ # 'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'prettyPrint': 'false',
+ },
headers={'content-type': 'application/json'})
if not search:
break
@@ -1655,7 +1652,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
assert os.path.basename(func_id) == func_id
self.write_debug('Extracting signature function {0}'.format(func_id))
- cache_spec, code = self.cache.load('youtube-sigfuncs', func_id), None
+ cache_spec, code = self.cache.load('youtube-sigfuncs', func_id, min_ver='2025.04.07'), None
if not cache_spec:
code = self._load_player(video_id, player_url, player_id)
@@ -1816,6 +1813,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return ret
def _extract_n_function_name(self, jscode):
+ func_name, idx = None, None
+ # these special cases are redundant and probably obsolete (2025-04):
+ # they make the tests run ~10% faster without fallback warnings
+ r"""
func_name, idx = self._search_regex(
# (y=NuD(),Mw(k),q=k.Z[y]||null)&&(q=narray[idx](q),k.set(y,q),k.V||NuD(''))}};
# (R="nn"[+J.Z],mW(J),N=J.K[R]||null)&&(N=narray[idx](N),J.set(R,N))}};
@@ -1842,9 +1843,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
\(\s*[\w$]+\s*\)
''', jscode, 'Initial JS player n function name', group=('nfunc', 'idx'),
default=(None, None))
+ """
+
+ if not func_name:
+ # nfunc=function(x){...}|function nfunc(x); ...
+ # ... var y=[nfunc]|y[idx]=nfunc);
+ # obvious REs hang, so use a two-stage tactic
+ for m in re.finditer(r'''(?x)
+ [\n;]var\s(?:(?:(?!,).)+,|\s)*?(?!\d)[\w$]+(?:\[(?P<idx>\d+)\])?\s*=\s*
+ (?(idx)|\[\s*)(?P<nfunc>(?!\d)[\w$]+)(?(idx)|\s*\])
+ \s*?[;\n]
+ ''', jscode):
+ func_name = self._search_regex(
+ r'[;,]\s*(function\s+)?({0})(?(1)|\s*=\s*function)\s*\((?!\d)[\w$]+\)\s*\{1}(?!\s*return\s)'.format(
+ re.escape(m.group('nfunc')), '{'),
+ jscode, 'Initial JS player n function name (2)', group=2, default=None)
+ if func_name:
+ idx = m.group('idx')
+ break
+
# thx bashonly: yt-dlp/yt-dlp/pull/10611
if not func_name:
- self.report_warning('Falling back to generic n function search')
+ self.report_warning('Falling back to generic n function search', only_once=True)
return self._search_regex(
r'''(?xs)
(?:(?<=[^\w$])|^) # instead of \b, which ignores $
@@ -1858,14 +1878,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return func_name
return self._search_json(
- r'var\s+{0}\s*='.format(re.escape(func_name)), jscode,
+ r'(?<![\w-])var\s(?:(?:(?!,).)+,|\s)*?{0}\s*='.format(re.escape(func_name)), jscode,
'Initial JS player n function list ({0}.{1})'.format(func_name, idx),
- func_name, contains_pattern=r'\[[\s\S]+\]', end_pattern='[,;]',
+ func_name, contains_pattern=r'\[.+\]', end_pattern='[,;]',
transform_source=js_to_json)[int(idx)]
def _extract_n_function_code(self, video_id, player_url):
player_id = self._extract_player_info(player_url)
- func_code = self.cache.load('youtube-nsig', player_id)
+ func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.04.07')
jscode = func_code or self._load_player(video_id, player_url)
jsi = JSInterpreter(jscode)
@@ -3339,6 +3359,20 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'thumbnailViewModel', 'image'), final_key='sources'),
})
+ def _extract_shorts_lockup_view_model(self, view_model):
+ content_id = traverse_obj(view_model, (
+ 'onTap', 'innertubeCommand', 'reelWatchEndpoint', 'videoId',
+ T(lambda v: v if YoutubeIE.suitable(v) else None)))
+ if not content_id:
+ return
+ return merge_dicts(self.url_result(
+ content_id, ie=YoutubeIE.ie_key(), video_id=content_id), {
+ 'title': traverse_obj(view_model, (
+ 'overlayMetadata', 'primaryText', 'content', T(compat_str))),
+ 'thumbnails': self._extract_thumbnails(
+ view_model, 'thumbnail', final_key='sources'),
+ })
+
def _video_entry(self, video_renderer):
video_id = video_renderer.get('videoId')
if video_id:
@@ -3385,10 +3419,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
yield entry
def _rich_grid_entries(self, contents):
- for content in contents:
- content = traverse_obj(
- content, ('richItemRenderer', 'content'),
- expected_type=dict) or {}
+ for content in traverse_obj(
+ contents, (Ellipsis, 'richItemRenderer', 'content'),
+ expected_type=dict):
video_renderer = traverse_obj(
content, 'videoRenderer', 'reelItemRenderer',
expected_type=dict)
@@ -3396,6 +3429,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
entry = self._video_entry(video_renderer)
if entry:
yield entry
+ # shorts item
+ shorts_lockup_view_model = content.get('shortsLockupViewModel')
+ if shorts_lockup_view_model:
+ entry = self._extract_shorts_lockup_view_model(shorts_lockup_view_model)
+ if entry:
+ yield entry
# playlist
renderer = traverse_obj(
content, 'playlistRenderer', expected_type=dict) or {}
@@ -3434,23 +3473,15 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
next_continuation = cls._extract_next_continuation_data(renderer)
if next_continuation:
return next_continuation
- contents = []
- for key in ('contents', 'items'):
- contents.extend(try_get(renderer, lambda x: x[key], list) or [])
- for content in contents:
- if not isinstance(content, dict):
- continue
- continuation_ep = try_get(
- content, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
- dict)
- if not continuation_ep:
- continue
- continuation = try_get(
- continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
+ for command in traverse_obj(renderer, (
+ ('contents', 'items', 'rows'), Ellipsis, 'continuationItemRenderer',
+ ('continuationEndpoint', ('button', 'buttonRenderer', 'command')),
+ (('commandExecutorCommand', 'commands', Ellipsis), None), T(dict))):
+ continuation = traverse_obj(command, ('continuationCommand', 'token', T(compat_str)))
if not continuation:
continue
- ctp = continuation_ep.get('clickTrackingParams')
- return YoutubeTabIE._build_continuation_query(continuation, ctp)
+ ctp = command.get('clickTrackingParams')
+ return cls._build_continuation_query(continuation, ctp)
def _entries(self, tab, item_id, webpage):
tab_content = try_get(tab, lambda x: x['content'], dict)
@@ -3499,6 +3530,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
entry = self._video_entry(renderer)
if entry:
yield entry
+ renderer = isr_content.get('richGridRenderer')
+ if renderer:
+ for from_ in self._rich_grid_entries(
+ traverse_obj(renderer, ('contents', Ellipsis, T(dict)))):
+ yield from_
+ continuation = self._extract_continuation(renderer)
+ continue
if not continuation:
continuation = self._extract_continuation(is_renderer)
@@ -3508,8 +3546,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
rich_grid_renderer = tab_content.get('richGridRenderer')
if not rich_grid_renderer:
return
- for entry in self._rich_grid_entries(rich_grid_renderer.get('contents') or []):
- yield entry
+ for from_ in self._rich_grid_entries(
+ traverse_obj(rich_grid_renderer, ('contents', Ellipsis, T(dict)))):
+ yield from_
continuation = self._extract_continuation(rich_grid_renderer)
@@ -3555,8 +3594,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
# Downloading page may result in intermittent 5xx HTTP error
# that is usually worked around with a retry
response = self._download_json(
- 'https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'https://www.youtube.com/youtubei/v1/browse',
None, 'Downloading page %d%s' % (page_num, ' (retry #%d)' % count if count else ''),
+ query={
+ # 'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'prettyPrint': 'false',
+ },
headers=headers, data=json.dumps(data).encode('utf8'))
break
except ExtractorError as e: