diff options
author | dirkf <fieldhouse@gmx.net> | 2024-02-21 00:03:17 +0000 |
---|---|---|
committer | dirkf <fieldhouse@gmx.net> | 2024-03-08 13:03:42 +0000 |
commit | 7216fa2ac4706e099ea2ad9a04fe7bf4300bc745 (patch) | |
tree | 7f211eb58745f1cc7171bc483cdcc752a24421e1 | |
parent | acc383b9e3c2d454121c22570c901dd2c689dc26 (diff) |
[InfoExtractor] Add `_search_json()`
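* uses the character offset reported in the `json.loads()` error diagnostic to truncate the JSON string
* may be confused by non-CPython implementations, whose diagnostics may be worded differently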
-rw-r--r-- | youtube_dl/extractor/common.py | 55 |
1 file changed, 55 insertions, 0 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 0e5dfd8fa..b5e95a318 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -25,6 +25,7 @@ from ..compat import (
 compat_getpass,
 compat_integer_types,
 compat_http_client,
+ compat_kwargs,
 compat_map as map,
 compat_open as open,
 compat_os_name,
@@ -1102,6 +1103,60 @@ class InfoExtractor(object):
 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 return None

+ def _search_json(self, start_pattern, string, name, video_id, **kwargs):
+ """Searches string for the JSON object specified by start_pattern"""
+
+ # self, start_pattern, string, name, video_id, *, end_pattern='',
+ # contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT
+ # NB: end_pattern is only used to reduce the size of the initial match
+ end_pattern = kwargs.pop('end_pattern', '')
+ # (?:[\s\S]) simulates (?(s):.) (eg)
+ contains_pattern = kwargs.pop('contains_pattern', r'{[\s\S]+}')
+ fatal = kwargs.pop('fatal', True)
+ default = kwargs.pop('default', NO_DEFAULT)
+
+ if default is NO_DEFAULT:
+ default, has_default = {}, False
+ else:
+ fatal, has_default = False, True
+
+ json_string = self._search_regex(
+ r'(?:{0})\s*(?P<json>{1})\s*(?:{2})'.format(
+ start_pattern, contains_pattern, end_pattern),
+ string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
+ if not json_string:
+ return default
+
+ # yt-dlp has a special JSON parser that allows trailing text.
+ # Until that arrives here, the diagnostic from the exception
+ # raised by json.loads() is used to extract the wanted text.
+ # Either way, it's a problem if a transform_source() can't
+ # handle the trailing text.
+
+ # force an exception
+ kwargs['fatal'] = True
+
+ # self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
+ for _ in range(2):
+ try:
+ # return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
+ transform_source = kwargs.pop('transform_source', None)
+ if transform_source:
+ json_string = transform_source(json_string)
+ return self._parse_json(json_string, video_id, **compat_kwargs(kwargs))
+ except ExtractorError as e:
+ end = int_or_none(self._search_regex(r'\(char\s+(\d+)', error_to_compat_str(e), 'end', default=None))
+ if end is not None:
+ json_string = json_string[:end]
+ continue
+ msg = 'Unable to extract {0} - Failed to parse JSON'.format(name)
+ if fatal:
+ raise ExtractorError(msg, cause=e.cause, video_id=video_id)
+ elif not has_default:
+ self.report_warning(
+ '{0}: {1}'.format(msg, error_to_compat_str(e)), video_id=video_id)
+ return default
+
 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 """
 Like _search_regex, but strips HTML tags and unescapes entities. |