diff options
| author | dirkf <fieldhouse@gmx.net> | 2024-04-26 18:57:44 +0100 | 
|---|---|---|
| committer | dirkf <fieldhouse@gmx.net> | 2024-05-30 15:46:36 +0100 | 
| commit | 21924742f79ccbd62d16ef4120518c6a5da8614e (patch) | |
| tree | 89264cf78caf3a04808b1b03e9364e000c3d8e1c | |
| parent | 768ccccd9b18bc48d129b12d14eace4ebb3655d8 (diff) | |
[InfoExtractor] Misc yt-dlp back-ports, etc
* add _yes_playlist() method
* avoid crash using _NETRC_MACHINE
* use _search_json() in _search_nextjs_data()
* _search_nextjs_data() default is JSON, not text
* test for above
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | test/test_InfoExtractor.py | 3 |
| -rw-r--r-- | youtube_dl/extractor/common.py | 63 |

2 files changed, 50 insertions, 16 deletions
```diff
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index d55d6ad54..09100a1d6 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -153,6 +153,9 @@ class TestInfoExtractor(unittest.TestCase):
 '''
         search = self.ie._search_nextjs_data(html, 'testID')
         self.assertEqual(search['props']['pageProps']['video']['id'], 'testid')
+        search = self.ie._search_nextjs_data(
+            'no next.js data here, move along', 'testID', default={'status': 0})
+        self.assertEqual(search['status'], 0)
 
     def test_search_nuxt_data(self):
         html = '''
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 7fae9e57b..b10e84416 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1169,10 +1169,10 @@ class InfoExtractor(object):
     def _get_netrc_login_info(self, netrc_machine=None):
         username = None
         password = None
-        netrc_machine = netrc_machine or self._NETRC_MACHINE
 
         if self._downloader.params.get('usenetrc', False):
             try:
+                netrc_machine = netrc_machine or self._NETRC_MACHINE
                 info = netrc.netrc().authenticators(netrc_machine)
                 if info is not None:
                     username = info[0]
@@ -1180,7 +1180,7 @@ class InfoExtractor(object):
                 else:
                     raise netrc.NetrcParseError(
                         'No authenticators for %s' % netrc_machine)
-            except (IOError, netrc.NetrcParseError) as err:
+            except (AttributeError, IOError, netrc.NetrcParseError) as err:
                 self._downloader.report_warning(
                     'parsing .netrc: %s' % error_to_compat_str(err))
 
@@ -1490,14 +1490,18 @@ class InfoExtractor(object):
         return dict((k, v) for k, v in info.items() if v is not None)
 
     def _search_nextjs_data(self, webpage, video_id, **kw):
-        nkw = dict((k, v) for k, v in kw.items() if k in ('transform_source', 'fatal'))
-        kw.pop('transform_source', None)
-        next_data = self._search_regex(
-            r'''<script[^>]+\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>(?P<nd>[^<]+)</script>''',
-            webpage, 'next.js data', group='nd', **kw)
-        if not next_data:
-            return {}
-        return self._parse_json(next_data, video_id, **nkw)
+        # ..., *, transform_source=None, fatal=True, default=NO_DEFAULT
+
+        # TODO: remove this backward compat
+        default = kw.get('default', NO_DEFAULT)
+        if default == '{}':
+            kw['default'] = {}
+            kw = compat_kwargs(kw)
+
+        return self._search_json(
+            r'''<script\s[^>]*?\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>''',
+            webpage, 'next.js data', video_id, end_pattern='</script>',
+            **kw)
 
     def _search_nuxt_data(self, webpage, video_id, *args, **kwargs):
         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
@@ -3296,12 +3300,16 @@ class InfoExtractor(object):
         return ret
 
     @classmethod
-    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
-        """ Merge two subtitle dictionaries, language by language. """
-        ret = dict(subtitle_dict1)
-        for lang in subtitle_dict2:
-            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
-        return ret
+    def _merge_subtitles(cls, subtitle_dict1, *subtitle_dicts, **kwargs):
+        """ Merge subtitle dictionaries, language by language. """
+
+        # ..., * , target=None
+        target = kwargs.get('target') or dict(subtitle_dict1)
+
+        for subtitle_dict in subtitle_dicts:
+            for lang in subtitle_dict:
+                target[lang] = cls._merge_subtitle_items(target.get(lang, []), subtitle_dict[lang])
+        return target
 
     def extract_automatic_captions(self, *args, **kwargs):
         if (self._downloader.params.get('writeautomaticsub', False)
@@ -3334,6 +3342,29 @@ class InfoExtractor(object):
     def _generic_title(self, url):
         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
 
+    def _yes_playlist(self, playlist_id, video_id, *args, **kwargs):
+        # smuggled_data=None, *, playlist_label='playlist', video_label='video'
+        smuggled_data = args[0] if len(args) == 1 else kwargs.get('smuggled_data')
+        playlist_label = kwargs.get('playlist_label', 'playlist')
+        video_label = kwargs.get('video_label', 'video')
+
+        if not playlist_id or not video_id:
+            return not video_id
+
+        no_playlist = (smuggled_data or {}).get('force_noplaylist')
+        if no_playlist is not None:
+            return not no_playlist
+
+        video_id = '' if video_id is True else ' ' + video_id
+        noplaylist = self.get_param('noplaylist')
+        self.to_screen(
+            'Downloading just the {0}{1} because of --no-playlist'.format(video_label, video_id)
+            if noplaylist else
+            'Downloading {0}{1} - add --no-playlist to download just the {2}{3}'.format(
+                playlist_label, '' if playlist_id is True else ' ' + playlist_id,
+                video_label, video_id))
+        return not noplaylist
+
 
 class SearchInfoExtractor(InfoExtractor):
     """
```
