aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Sergey M․ <dstftw@gmail.com> 2020-05-02 23:40:30 +0700
committer Sergey M․ <dstftw@gmail.com> 2020-05-02 23:40:30 +0700
commit 4433bb02457f961dc2a753b3d4350a4a8cae138f (patch)
tree 5b1619f8578759aa2126a5479f35f2b35a284a05
parent e40c758c2a8f049cd254ad73f0e6ade00bd004d9 (diff)
download youtube-dl-4433bb02457f961dc2a753b3d4350a4a8cae138f.tar.xz
[extractor/common] Extract multiple JSON-LD entries
-rw-r--r-- youtube_dl/extractor/common.py 41
1 files changed, 32 insertions, 9 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index c51a3a07d..e9306d806 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1182,16 +1182,33 @@ class InfoExtractor(object):
'twitter card player')
def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
- json_ld = self._search_regex(
- JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs)
+ json_ld_list = list(re.finditer(JSON_LD_RE, html))
default = kwargs.get('default', NO_DEFAULT)
- if not json_ld:
- return default if default is not NO_DEFAULT else {}
# JSON-LD may be malformed and thus `fatal` should be respected.
# At the same time `default` may be passed that assumes `fatal=False`
# for _search_regex. Let's simulate the same behavior here as well.
fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
- return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
+ json_ld = []
+ for mobj in json_ld_list:
+ json_ld_item = self._parse_json(
+ mobj.group('json_ld'), video_id, fatal=fatal)
+ if not json_ld_item:
+ continue
+ if isinstance(json_ld_item, dict):
+ json_ld.append(json_ld_item)
+ elif isinstance(json_ld_item, (list, tuple)):
+ json_ld.extend(json_ld_item)
+ if json_ld:
+ json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
+ if json_ld:
+ return json_ld
+ if default is not NO_DEFAULT:
+ return default
+ elif fatal:
+ raise RegexNotFoundError('Unable to extract JSON-LD')
+ else:
+ self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
+ return {}
def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
if isinstance(json_ld, compat_str):
@@ -1256,10 +1273,10 @@ class InfoExtractor(object):
extract_interaction_statistic(e)
for e in json_ld:
- if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
+ if '@context' in e:
item_type = e.get('@type')
if expected_type is not None and expected_type != item_type:
- return info
+ continue
if item_type in ('TVEpisode', 'Episode'):
episode_name = unescapeHTML(e.get('name'))
info.update({
@@ -1293,11 +1310,17 @@ class InfoExtractor(object):
})
elif item_type == 'VideoObject':
extract_video_object(e)
- continue
+ if expected_type is None:
+ continue
+ else:
+ break
video = e.get('video')
if isinstance(video, dict) and video.get('@type') == 'VideoObject':
extract_video_object(video)
- break
+ if expected_type is None:
+ continue
+ else:
+ break
return dict((k, v) for k, v in info.items() if v is not None)
@staticmethod