diff options
| field | value | date |
|---|---|---|
| author | dirkf <fieldhouse@gmx.net> | 2024-02-21 00:09:48 +0000 |
| committer | dirkf <fieldhouse@gmx.net> | 2024-03-08 13:03:42 +0000 |
| commit | f66372403fd9e1661199fea100ba2600fa9697b2 (patch) | |
| tree | 4e99da07e86adb54cca8d33c0aa4f85b581c3b44 /youtube_dl/extractor/common.py | |
| parent | 7216fa2ac4706e099ea2ad9a04fe7bf4300bc745 (diff) | |
[InfoExtractor] Rework and improve JWPlayer extraction
* use traverse_obj() and _search_json()
* support playlist `.load({**video1},{**video2}, ...)`
* support transform_source=... for _extract_jwplayer_data()
Diffstat (limited to 'youtube_dl/extractor/common.py')
| -rw-r--r-- | youtube_dl/extractor/common.py | 55 | 
1 file changed, 22 insertions, 33 deletions
```diff
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index b5e95a318..7fae9e57b 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -3021,25 +3021,22 @@ class InfoExtractor(object):
         return formats
 
     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
-        mobj = re.search(
-            r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
-            webpage)
-        if mobj:
-            try:
-                jwplayer_data = self._parse_json(mobj.group('options'),
-                                                 video_id=video_id,
-                                                 transform_source=transform_source)
-            except ExtractorError:
-                pass
-            else:
-                if isinstance(jwplayer_data, dict):
-                    return jwplayer_data
+        return self._search_json(
+            r'''(?<!-)\bjwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?:(?!</script>).)*?\.\s*(?:setup\s*\(|(?P<load>load)\s*\(\s*\[)''',
+            webpage, 'JWPlayer data', video_id,
+            # must be a {...} or sequence, ending
+            contains_pattern=r'\{[\s\S]*}(?(load)(?:\s*,\s*\{[\s\S]*})*)', end_pattern=r'(?(load)\]|\))',
+            transform_source=transform_source, default=None)
 
     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
-        jwplayer_data = self._find_jwplayer_data(
-            webpage, video_id, transform_source=js_to_json)
-        return self._parse_jwplayer_data(
-            jwplayer_data, video_id, *args, **kwargs)
+
+        # allow passing `transform_source` through to _find_jwplayer_data()
+        transform_source = kwargs.pop('transform_source', None)
+        kwfind = compat_kwargs({'transform_source': transform_source}) if transform_source else {}
+
+        jwplayer_data = self._find_jwplayer_data(webpage, video_id, **kwfind)
+
+        return self._parse_jwplayer_data(jwplayer_data, video_id, *args, **kwargs)
 
     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
@@ -3073,22 +3070,14 @@ class InfoExtractor(object):
                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
 
             subtitles = {}
-            tracks = video_data.get('tracks')
-            if tracks and isinstance(tracks, list):
-                for track in tracks:
-                    if not isinstance(track, dict):
-                        continue
-                    track_kind = track.get('kind')
-                    if not track_kind or not isinstance(track_kind, compat_str):
-                        continue
-                    if track_kind.lower() not in ('captions', 'subtitles'):
-                        continue
-                    track_url = urljoin(base_url, track.get('file'))
-                    if not track_url:
-                        continue
-                    subtitles.setdefault(track.get('label') or 'en', []).append({
-                        'url': self._proto_relative_url(track_url)
-                    })
+            for track in traverse_obj(video_data, (
+                    'tracks', lambda _, t: t.get('kind').lower() in ('captions', 'subtitles'))):
+                track_url = urljoin(base_url, track.get('file'))
+                if not track_url:
+                    continue
+                subtitles.setdefault(track.get('label') or 'en', []).append({
+                    'url': self._proto_relative_url(track_url)
+                })
 
             entry = {
                 'id': this_video_id,
```
