diff options
| -rw-r--r-- | test/test_InfoExtractor.py | 111 | ||||
| -rw-r--r-- | youtube_dl/extractor/clipchamp.py | 7 | ||||
| -rw-r--r-- | youtube_dl/extractor/common.py | 51 | ||||
| -rw-r--r-- | youtube_dl/extractor/globalplayer.py | 32 | ||||
| -rw-r--r-- | youtube_dl/extractor/whyp.py | 25 | 
5 files changed, 168 insertions, 58 deletions
| diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 6d25441db..34773fbd0 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -7,15 +7,33 @@ import io  import os  import sys  import unittest +  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, expect_dict, expect_value, http_server_port -from youtube_dl.compat import compat_etree_fromstring, compat_http_server -from youtube_dl.extractor.common import InfoExtractor -from youtube_dl.extractor import YoutubeIE, get_info_extractor -from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError  import threading +from test.helper import ( +    expect_dict, +    expect_value, +    FakeYDL, +    http_server_port, +) +from youtube_dl.compat import ( +    compat_etree_fromstring, +    compat_http_server, +) +from youtube_dl.extractor.common import InfoExtractor +from youtube_dl.extractor import ( +    get_info_extractor, +    YoutubeIE, +) +from youtube_dl.utils import ( +    encode_data_uri, +    ExtractorError, +    RegexNotFoundError, +    strip_jsonp, +) +  TEAPOT_RESPONSE_STATUS = 418  TEAPOT_RESPONSE_BODY = "<h1>418 I'm a teapot</h1>" @@ -100,6 +118,71 @@ class TestInfoExtractor(unittest.TestCase):          self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True)          self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True) +    def test_search_nextjs_data(self): +        html = ''' +<!DOCTYPE html> +<html> +<head> +  <meta http-equiv="content-type" content= +  "text/html; charset=utf-8"> +  <meta name="viewport" content="width=device-width"> +  <title>Test _search_nextjs_data()</title> +</head> +<body> +  <div id="__next"> +    <div style="background-color:#17171E" class="FU" dir="ltr"> +      <div class="sc-93de261d-0 dyzzYE"> +        <div> +          <header class="HD"></header> +          <main class="MN"> +            <div style="height:0" class="HT0"> +              <div style="width:NaN%" data-testid= +              "stream-container" class="WDN"></div> +            </div> +          </main> +        </div> +        <footer class="sc-6e5faf91-0 dEGaHS"></footer> +      </div> +    </div> +  </div> +  <script id="__NEXT_DATA__" type="application/json"> +  {"props":{"pageProps":{"video":{"id":"testid"}}}} +  </script> +</body> +</html> +''' +        search = self.ie._search_nextjs_data(html, 'testID') +        self.assertEqual(search['props']['pageProps']['video']['id'], 'testid') + +    def test_search_nuxt_data(self): +        html = ''' +<!DOCTYPE html> +<html> +<head> +  <meta http-equiv="content-type" content= +  "text/html; charset=utf-8"> +  <title>Nuxt.js Test Page</title> +  <meta name="viewport" content= +  "width=device-width, initial-scale=1"> +  <meta data-hid="robots" name="robots" content="all"> +</head> +<body class="BD"> +  <div id="__layout"> +    <h1 class="H1">Example heading</h1> +    <div class="IN"> +      <p>Decoy text</p> +    </div> +  </div> +  <script> +  window.__NUXT__=(function(a,b,c,d,e,f,g,h){return {decoy:" default",data:[{track:{id:f,title:g}}]}}(null,null,"c",null,null,"testid","Nuxt.js title",null)); +  </script> +  <script src="/_nuxt/a12345b.js" defer="defer"></script> +</body> +</html> +''' +        search = self.ie._search_nuxt_data(html, 'testID') +        self.assertEqual(search['track']['id'], 'testid') +      def test_search_json_ld_realworld(self):          # https://github.com/ytdl-org/youtube-dl/issues/23306          expect_dict( @@ -348,6 +431,24 @@ class TestInfoExtractor(unittest.TestCase):                  }],              }) +        # from https://0000.studio/ +        # with type attribute but without extension in URL +        expect_dict( +            self, +            self.ie._parse_html5_media_entries( +                'https://0000.studio', +                r''' +                <video src="https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92" +                    controls="controls" type="video/mp4" preload="metadata" autoplay="autoplay" playsinline class="object-contain"> +                </video> +                ''', None)[0], +            { +                'formats': [{ +                    'url': 'https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92', +                    'ext': 'mp4', +                }], +            }) +      def test_extract_jwplayer_data_realworld(self):          # from http://www.suffolk.edu/sjc/          expect_dict( diff --git a/youtube_dl/extractor/clipchamp.py b/youtube_dl/extractor/clipchamp.py index 5a732e808..3b485eaab 100644 --- a/youtube_dl/extractor/clipchamp.py +++ b/youtube_dl/extractor/clipchamp.py @@ -35,13 +35,6 @@ class ClipchampIE(InfoExtractor):      _STREAM_URL_TMPL = 'https://%s.cloudflarestream.com/%s/manifest/video.%s'      _STREAM_URL_QUERY = {'parentOrigin': 'https://clipchamp.com'} -    def _search_nextjs_data(self, webpage, video_id, **kw): -        return self._parse_json( -            self._search_regex( -                r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>', -                webpage, 'next.js data', **kw), -            video_id, **kw) -      def _real_extract(self, url):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index dbdf456f5..549781186 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals  import base64  import datetime +import functools  import hashlib  import json  import netrc @@ -23,6 +24,7 @@ from ..compat import (      compat_getpass,      compat_integer_types,      compat_http_client, +    compat_map as map,      compat_os_name,      compat_str,      compat_urllib_error, @@ -31,6 +33,7 @@ from ..compat import (      compat_urllib_request,      compat_urlparse,      compat_xml_parse_error, +    compat_zip as zip,  )  from ..downloader.f4m import (      get_base_url, @@ -70,6 +73,7 @@ from ..utils import (      str_or_none,      str_to_int,      strip_or_none, +    traverse_obj,      try_get,      unescapeHTML,      unified_strdate, @@ -1349,6 +1353,44 @@ class InfoExtractor(object):                      break          return dict((k, v) for k, v in info.items() if v is not None) +    def _search_nextjs_data(self, webpage, video_id, **kw): +        nkw = dict((k, v) for k, v in kw.items() if k in ('transform_source', 'fatal')) +        kw.pop('transform_source', None) +        next_data = self._search_regex( +            r'''<script[^>]+\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>(?P<nd>[^<]+)</script>''', +            webpage, 'next.js data', group='nd', **kw) +        if not next_data: +            return {} +        return self._parse_json(next_data, video_id, **nkw) + +    def _search_nuxt_data(self, webpage, video_id, *args, **kwargs): +        """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function""" + +        # self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0) +        context_name = args[0] if len(args) > 0 else kwargs.get('context_name', '__NUXT__') +        fatal = kwargs.get('fatal', True) +        traverse = kwargs.get('traverse', ('data', 0)) + +        re_ctx = re.escape(context_name) + +        FUNCTION_RE = (r'\(\s*function\s*\((?P<arg_keys>[\s\S]*?)\)\s*\{\s*' +                       r'return\s+(?P<js>\{[\s\S]*?})\s*;?\s*}\s*\((?P<arg_vals>[\s\S]*?)\)') + +        js, arg_keys, arg_vals = self._search_regex( +            (p.format(re_ctx, FUNCTION_RE) for p in +             (r'<script>\s*window\s*\.\s*{0}\s*=\s*{1}\s*\)\s*;?\s*</script>', +              r'{0}\s*\([\s\S]*?{1}')), +            webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), +            default=NO_DEFAULT if fatal else (None, None, None)) +        if js is None: +            return {} + +        args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json( +            '[{0}]'.format(arg_vals), video_id, transform_source=js_to_json, fatal=fatal) or ()))) + +        ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal) +        return traverse_obj(ret, traverse) or {} +      @staticmethod      def _hidden_inputs(html):          html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html) @@ -2496,7 +2538,8 @@ class InfoExtractor(object):                  return f              return {} -        def _media_formats(src, cur_media_type, type_info={}): +        def _media_formats(src, cur_media_type, type_info=None): +            type_info = type_info or {}              full_url = absolute_url(src)              ext = type_info.get('ext') or determine_ext(full_url)              if ext == 'm3u8': @@ -2514,6 +2557,7 @@ class InfoExtractor(object):                  formats = [{                      'url': full_url,                      'vcodec': 'none' if cur_media_type == 'audio' else None, +                    'ext': ext,                  }]              return is_plain_url, formats @@ -2522,7 +2566,7 @@ class InfoExtractor(object):          # so we wll include them right here (see          # https://www.ampproject.org/docs/reference/components/amp-video)          # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/ -        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)' +        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video(?:-js)?|audio)'          media_tags = [(media_tag, media_tag_name, media_type, '')                        for media_tag, media_tag_name, media_type                        in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)] @@ -2540,7 +2584,8 @@ class InfoExtractor(object):              media_attributes = extract_attributes(media_tag)              src = strip_or_none(media_attributes.get('src'))              if src: -                _, formats = _media_formats(src, media_type) +                f = parse_content_type(media_attributes.get('type')) +                _, formats = _media_formats(src, media_type, f)                  media_info['formats'].extend(formats)              media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))              if media_content: diff --git a/youtube_dl/extractor/globalplayer.py b/youtube_dl/extractor/globalplayer.py index cceab9e6a..db490b141 100644 --- a/youtube_dl/extractor/globalplayer.py +++ b/youtube_dl/extractor/globalplayer.py @@ -24,13 +24,6 @@ class GlobalPlayerBaseIE(InfoExtractor):      def _match_valid_url(cls, url):          return cls.re.match(cls._VALID_URL, url) -    def _search_nextjs_data(self, webpage, video_id, **kw): -        return self._parse_json( -            self._search_regex( -                r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>', -                webpage, 'next.js data', **kw), -            video_id, **kw) -      def _get_page_props(self, url, video_id):          webpage = self._download_webpage(url, video_id)          return self._search_nextjs_data(webpage, video_id)['props']['pageProps'] @@ -39,13 +32,14 @@ class GlobalPlayerBaseIE(InfoExtractor):          return urlhandle_detect_ext(self._request_webpage(  # Server rejects HEAD requests              url, video_id, note='Determining source extension')) -    def _extract_audio(self, episode, series): +    @staticmethod +    def _clean_desc(x): +        x = clean_html(x) +        if x: +            x = x.replace('\xa0', ' ') +        return x -        def clean_desc(x): -            x = clean_html(x) -            if x: -                x = x.replace('\xa0', ' ') -            return x +    def _extract_audio(self, episode, series):          return merge_dicts({              'vcodec': 'none', @@ -56,7 +50,7 @@ class GlobalPlayerBaseIE(InfoExtractor):              'uploader': 'itunesAuthor',  # podcasts only          }), traverse_obj(episode, {              'id': 'id', -            'description': ('description', T(clean_desc)), +            'description': ('description', T(self._clean_desc)),              'duration': ('duration', T(parse_duration)),              'thumbnail': 'imageUrl',              'url': 'streamUrl', @@ -141,9 +135,9 @@ class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE):              'ext': 'aac',              # 'live_status': 'is_live',              'is_live': True, -            'description': 'md5:e10f5e10b01a7f2c14ba815509fbb38d', +            'description': r're:(?s).+\bclassical\b.+\bClassic FM Hall [oO]f Fame\b',              'thumbnail': 'https://images.globalplayer.com/images/551379?width=450&signature=oMLPZIoi5_dBSHnTMREW0Xg76mA=', -            'title': 're:^Classic FM Hall of Fame.+$' +            'title': 're:Classic FM Hall of Fame.+$'          },      }] @@ -160,7 +154,7 @@ class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE):              'is_live': True,          }, traverse_obj(station, {              'title': 'title', -            'description': 'description', +            'description': ('description', T(self._clean_desc)),              'thumbnail': 'image',          }), rev=True) @@ -177,7 +171,7 @@ class GlobalPlayerAudioIE(GlobalPlayerBaseIE):              'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e',              'categories': ['Society & Culture', 'True Crime'],              'uploader': 'Global', -            'description': 'md5:da5b918eac9ae319454a10a563afacf9', +            'description': r're:(?s).+\bscam\b.+?\bseries available now\b',          },      }, {          # radio catchup @@ -203,7 +197,7 @@ class GlobalPlayerAudioIE(GlobalPlayerBaseIE):                          series, ('episodes', lambda _, v: v['id'] and v['streamUrl']))],              'categories': traverse_obj(series, ('categories', Ellipsis, 'name')) or None,          }, traverse_obj(series, { -            'description': 'description', +            'description': ('description', T(self._clean_desc)),              'thumbnail': 'imageUrl',              'title': 'title',              'uploader': 'itunesAuthor',  # podcasts only diff --git a/youtube_dl/extractor/whyp.py b/youtube_dl/extractor/whyp.py index 16f9154ad..644eb4617 100644 --- a/youtube_dl/extractor/whyp.py +++ b/youtube_dl/extractor/whyp.py @@ -21,7 +21,7 @@ class WhypIE(InfoExtractor):              'url': 'https://cdn.whyp.it/50eb17cc-e9ff-4e18-b89b-dc9206a95cb1.mp3',              'id': '18337',              'title': 'Home Page Example Track', -            'description': 'md5:bd758000fb93f3159339c852b5b9133c', +            'description': r're:(?s).+\bexample track\b',              'ext': 'mp3',              'duration': 52.82,              'uploader': 'Brad', @@ -33,29 +33,6 @@ class WhypIE(InfoExtractor):          'only_matching': True,      }] -    def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', fatal=True, traverse=('data', 0)): -        """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function""" - -        import functools -        import json -        import re -        from ..utils import (js_to_json, NO_DEFAULT) - -        re_ctx = re.escape(context_name) -        FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)' -        js, arg_keys, arg_vals = self._search_regex( -            (p.format(re_ctx, FUNCTION_RE) for p in (r'<script>\s*window\.{0}={1}\s*\)\s*;?\s*</script>', r'{0}\(.*?{1}')), -            webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), -            default=NO_DEFAULT if fatal else (None, None, None)) -        if js is None: -            return {} - -        args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json( -            '[{0}]'.format(arg_vals), video_id, transform_source=js_to_json, fatal=fatal) or ()))) - -        ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal) -        return traverse_obj(ret, traverse) or {} -      def _real_extract(self, url):          unique_id = self._match_id(url)          webpage = self._download_webpage(url, unique_id) | 
