diff options
author | dirkf <fieldhouse@gmx.net> | 2023-05-05 19:25:42 +0100 |
---|---|---|
committer | dirkf <fieldhouse@gmx.net> | 2023-07-19 22:14:50 +0100 |
commit | b2741f2654e6ddfebc1771b5d5fadb5fd6fe3863 (patch) | |
tree | caf46c5f7dd2af308ba0a69797097c8cd8ce77ac /youtube_dl/extractor/common.py | |
parent | 846522204104e3078c597fa1872465024a684ad6 (diff) |
[InfoExtractor] Add search methods for Next/Nuxt.js from yt-dlp
* add _search_nextjs_data(), from https://github.com/yt-dlp/yt-dlp/pull/1386,
thanks selfisekai
* add _search_nuxt_data(), from https://github.com/yt-dlp/yt-dlp/pull/1921,
thanks Lesmiscore, pukkandan
* add tests for the above
* also fix HTML5 type recognition and tests, from
https://github.com/yt-dlp/yt-dlp/commit/222a230871fe4fe63f35c49590379c9a77116819,
thanks Lesmiscore
* update extractors in PR using above, fix tests.
Diffstat (limited to 'youtube_dl/extractor/common.py')
-rw-r--r-- | youtube_dl/extractor/common.py | 51 |
1 file changed, 48 insertions, 3 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index dbdf456f5..549781186 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import base64 import datetime +import functools import hashlib import json import netrc @@ -23,6 +24,7 @@ from ..compat import ( compat_getpass, compat_integer_types, compat_http_client, + compat_map as map, compat_os_name, compat_str, compat_urllib_error, @@ -31,6 +33,7 @@ from ..compat import ( compat_urllib_request, compat_urlparse, compat_xml_parse_error, + compat_zip as zip, ) from ..downloader.f4m import ( get_base_url, @@ -70,6 +73,7 @@ from ..utils import ( str_or_none, str_to_int, strip_or_none, + traverse_obj, try_get, unescapeHTML, unified_strdate, @@ -1349,6 +1353,44 @@ class InfoExtractor(object): break return dict((k, v) for k, v in info.items() if v is not None) + def _search_nextjs_data(self, webpage, video_id, **kw): + nkw = dict((k, v) for k, v in kw.items() if k in ('transform_source', 'fatal')) + kw.pop('transform_source', None) + next_data = self._search_regex( + r'''<script[^>]+\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>(?P<nd>[^<]+)</script>''', + webpage, 'next.js data', group='nd', **kw) + if not next_data: + return {} + return self._parse_json(next_data, video_id, **nkw) + + def _search_nuxt_data(self, webpage, video_id, *args, **kwargs): + """Parses Nuxt.js metadata. 
This works as long as the function __NUXT__ invokes is a pure function""" + + # self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0) + context_name = args[0] if len(args) > 0 else kwargs.get('context_name', '__NUXT__') + fatal = kwargs.get('fatal', True) + traverse = kwargs.get('traverse', ('data', 0)) + + re_ctx = re.escape(context_name) + + FUNCTION_RE = (r'\(\s*function\s*\((?P<arg_keys>[\s\S]*?)\)\s*\{\s*' + r'return\s+(?P<js>\{[\s\S]*?})\s*;?\s*}\s*\((?P<arg_vals>[\s\S]*?)\)') + + js, arg_keys, arg_vals = self._search_regex( + (p.format(re_ctx, FUNCTION_RE) for p in + (r'<script>\s*window\s*\.\s*{0}\s*=\s*{1}\s*\)\s*;?\s*</script>', + r'{0}\s*\([\s\S]*?{1}')), + webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), + default=NO_DEFAULT if fatal else (None, None, None)) + if js is None: + return {} + + args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json( + '[{0}]'.format(arg_vals), video_id, transform_source=js_to_json, fatal=fatal) or ()))) + + ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal) + return traverse_obj(ret, traverse) or {} + @staticmethod def _hidden_inputs(html): html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html) @@ -2496,7 +2538,8 @@ class InfoExtractor(object): return f return {} - def _media_formats(src, cur_media_type, type_info={}): + def _media_formats(src, cur_media_type, type_info=None): + type_info = type_info or {} full_url = absolute_url(src) ext = type_info.get('ext') or determine_ext(full_url) if ext == 'm3u8': @@ -2514,6 +2557,7 @@ class InfoExtractor(object): formats = [{ 'url': full_url, 'vcodec': 'none' if cur_media_type == 'audio' else None, + 'ext': ext, }] return is_plain_url, formats @@ -2522,7 +2566,7 @@ class InfoExtractor(object): # so we wll include them right here (see # https://www.ampproject.org/docs/reference/components/amp-video) # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/ - 
_MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)' + _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video(?:-js)?|audio)' media_tags = [(media_tag, media_tag_name, media_type, '') for media_tag, media_tag_name, media_type in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)] @@ -2540,7 +2584,8 @@ class InfoExtractor(object): media_attributes = extract_attributes(media_tag) src = strip_or_none(media_attributes.get('src')) if src: - _, formats = _media_formats(src, media_type) + f = parse_content_type(media_attributes.get('type')) + _, formats = _media_formats(src, media_type, f) media_info['formats'].extend(formats) media_info['thumbnail'] = absolute_url(media_attributes.get('poster')) if media_content: |