diff options
author | dirkf <fieldhouse@gmx.net> | 2023-05-05 19:25:42 +0100 |
---|---|---|
committer | dirkf <fieldhouse@gmx.net> | 2023-07-19 22:14:50 +0100 |
commit | b2741f2654e6ddfebc1771b5d5fadb5fd6fe3863 (patch) | |
tree | caf46c5f7dd2af308ba0a69797097c8cd8ce77ac /test | |
parent | 846522204104e3078c597fa1872465024a684ad6 (diff) |
[InfoExtractor] Add search methods for Next/Nuxt.js from yt-dlp
* add _search_nextjs_data(), from https://github.com/yt-dlp/yt-dlp/pull/1386,
thanks selfisekai
* add _search_nuxt_data(), from https://github.com/yt-dlp/yt-dlp/pull/1921,
thanks Lesmiscore, pukkandan
* add tests for the above
* also fix HTML5 type recognition and tests, from
https://github.com/yt-dlp/yt-dlp/commit/222a230871fe4fe63f35c49590379c9a77116819,
thanks Lesmiscore
* update extractors in PR using above, fix tests.
Diffstat (limited to 'test')
-rw-r--r-- | test/test_InfoExtractor.py | 111 |
1 file changed, 106 insertions, 5 deletions
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 6d25441db..34773fbd0 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -7,15 +7,33 @@ import io import os import sys import unittest + sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, expect_dict, expect_value, http_server_port -from youtube_dl.compat import compat_etree_fromstring, compat_http_server -from youtube_dl.extractor.common import InfoExtractor -from youtube_dl.extractor import YoutubeIE, get_info_extractor -from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError import threading +from test.helper import ( + expect_dict, + expect_value, + FakeYDL, + http_server_port, +) +from youtube_dl.compat import ( + compat_etree_fromstring, + compat_http_server, +) +from youtube_dl.extractor.common import InfoExtractor +from youtube_dl.extractor import ( + get_info_extractor, + YoutubeIE, +) +from youtube_dl.utils import ( + encode_data_uri, + ExtractorError, + RegexNotFoundError, + strip_jsonp, +) + TEAPOT_RESPONSE_STATUS = 418 TEAPOT_RESPONSE_BODY = "<h1>418 I'm a teapot</h1>" @@ -100,6 +118,71 @@ class TestInfoExtractor(unittest.TestCase): self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True) self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True) + def test_search_nextjs_data(self): + html = ''' +<!DOCTYPE html> +<html> +<head> + <meta http-equiv="content-type" content= + "text/html; charset=utf-8"> + <meta name="viewport" content="width=device-width"> + <title>Test _search_nextjs_data()</title> +</head> +<body> + <div id="__next"> + <div style="background-color:#17171E" class="FU" dir="ltr"> + <div class="sc-93de261d-0 dyzzYE"> + <div> + <header class="HD"></header> + <main class="MN"> + <div style="height:0" class="HT0"> + <div style="width:NaN%" data-testid= + "stream-container" 
class="WDN"></div> + </div> + </main> + </div> + <footer class="sc-6e5faf91-0 dEGaHS"></footer> + </div> + </div> + </div> + <script id="__NEXT_DATA__" type="application/json"> + {"props":{"pageProps":{"video":{"id":"testid"}}}} + </script> +</body> +</html> +''' + search = self.ie._search_nextjs_data(html, 'testID') + self.assertEqual(search['props']['pageProps']['video']['id'], 'testid') + + def test_search_nuxt_data(self): + html = ''' +<!DOCTYPE html> +<html> +<head> + <meta http-equiv="content-type" content= + "text/html; charset=utf-8"> + <title>Nuxt.js Test Page</title> + <meta name="viewport" content= + "width=device-width, initial-scale=1"> + <meta data-hid="robots" name="robots" content="all"> +</head> +<body class="BD"> + <div id="__layout"> + <h1 class="H1">Example heading</h1> + <div class="IN"> + <p>Decoy text</p> + </div> + </div> + <script> + window.__NUXT__=(function(a,b,c,d,e,f,g,h){return {decoy:" default",data:[{track:{id:f,title:g}}]}}(null,null,"c",null,null,"testid","Nuxt.js title",null)); + </script> + <script src="/_nuxt/a12345b.js" defer="defer"></script> +</body> +</html> +''' + search = self.ie._search_nuxt_data(html, 'testID') + self.assertEqual(search['track']['id'], 'testid') + def test_search_json_ld_realworld(self): # https://github.com/ytdl-org/youtube-dl/issues/23306 expect_dict( @@ -348,6 +431,24 @@ class TestInfoExtractor(unittest.TestCase): }], }) + # from https://0000.studio/ + # with type attribute but without extension in URL + expect_dict( + self, + self.ie._parse_html5_media_entries( + 'https://0000.studio', + r''' + <video src="https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92" + controls="controls" type="video/mp4" preload="metadata" autoplay="autoplay" playsinline class="object-contain"> + </video> + ''', None)[0], + { + 'formats': [{ + 'url': 
'https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92', + 'ext': 'mp4', + }], + }) + def test_extract_jwplayer_data_realworld(self): # from http://www.suffolk.edu/sjc/ expect_dict( |