[InfoExtractor] Add `_search_json()`

* uses the error diagnostic to truncate the JSON string * may be confused by non-C-Pythons
author: dirkf <fieldhouse@gmx.net> 2024-02-21 00:03:17 +0000
committer: dirkf <fieldhouse@gmx.net> 2024-03-08 13:03:42 +0000
commit: 7216fa2ac4706e099ea2ad9a04fe7bf4300bc745 (patch)
tree: 7f211eb58745f1cc7171bc483cdcc752a24421e1
parent: acc383b9e3c2d454121c22570c901dd2c689dc26 (diff)
1 files changed, 55 insertions, 0 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 0e5dfd8fa..b5e95a318 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -25,6 +25,7 @@ from ..compat import (
     compat_getpass,
     compat_integer_types,
     compat_http_client,
+    compat_kwargs,
     compat_map as map,
     compat_open as open,
     compat_os_name,
@@ -1102,6 +1103,60 @@ class InfoExtractor(object):
             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
             return None
 
+    def _search_json(self, start_pattern, string, name, video_id, **kwargs):
+        """Searches string for the JSON object specified by start_pattern"""
+
+        # self, start_pattern, string, name, video_id, *, end_pattern='',
+        # contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT
+        # NB: end_pattern is only used to reduce the size of the initial match
+        end_pattern = kwargs.pop('end_pattern', '')
+        # (?:[\s\S]) simulates (?(s):.) (eg)
+        contains_pattern = kwargs.pop('contains_pattern', r'{[\s\S]+}')
+        fatal = kwargs.pop('fatal', True)
+        default = kwargs.pop('default', NO_DEFAULT)
+
+        if default is NO_DEFAULT:
+            default, has_default = {}, False
+        else:
+            fatal, has_default = False, True
+
+        json_string = self._search_regex(
+            r'(?:{0})\s*(?P<json>{1})\s*(?:{2})'.format(
+                start_pattern, contains_pattern, end_pattern),
+            string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
+        if not json_string:
+            return default
+
+        # yt-dlp has a special JSON parser that allows trailing text.
+        # Until that arrives here, the diagnostic from the exception
+        # raised by json.loads() is used to extract the wanted text.
+        # Either way, it's a problem if a transform_source() can't
+        # handle the trailing text.
+
+        # force an exception
+        kwargs['fatal'] = True
+
+        # self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
+        for _ in range(2):
+            try:
+                # return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
+                transform_source = kwargs.pop('transform_source', None)
+                if transform_source:
+                    json_string = transform_source(json_string)
+                return self._parse_json(json_string, video_id, **compat_kwargs(kwargs))
+            except ExtractorError as e:
+                end = int_or_none(self._search_regex(r'\(char\s+(\d+)', error_to_compat_str(e), 'end', default=None))
+                if end is not None:
+                    json_string = json_string[:end]
+                    continue
+                msg = 'Unable to extract {0} - Failed to parse JSON'.format(name)
+                if fatal:
+                    raise ExtractorError(msg, cause=e.cause, video_id=video_id)
+                elif not has_default:
+                    self.report_warning(
+                        '{0}: {1}'.format(msg, error_to_compat_str(e)), video_id=video_id)
+            return default
+
     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
         """
         Like _search_regex, but strips HTML tags and unescapes entities.
author	dirkf <fieldhouse@gmx.net>	2024-02-21 00:03:17 +0000
committer	dirkf <fieldhouse@gmx.net>	2024-03-08 13:03:42 +0000
commit	7216fa2ac4706e099ea2ad9a04fe7bf4300bc745 (patch)
tree	7f211eb58745f1cc7171bc483cdcc752a24421e1
parent	acc383b9e3c2d454121c22570c901dd2c689dc26 (diff)