diff options
| author | Leonardo Taccari <iamleot@gmail.com> | 2020-11-20 10:00:05 +0100 | 
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-11-20 10:00:05 +0100 | 
| commit | dd9e0f58f3482204491007e06a134c69788b1c82 (patch) | |
| tree | 58207194e150eb323439f26511398dace2b08444 /youtube_dl/extractor/rai.py | |
| parent | 59e583f7e8530ca92776c866897d895c072e2a82 (diff) | |
[rai] Fix extraction for recent raiplay.it updates (#27077)
- Remove first test of RaiPlayIE: it is no longer available
- Make RaiPlayIE extension-agnostic (passing possible `.json' URLs is now
  supported too)
- Adjust RaiPlayLiveIE to recent raiplay.it updates.  Passing it as
  `url_transparent' is no longer supported (there is no longer an accessible
  ContentItem)
- Adjust RaiPlayPlaylistIE to recent raiplay.it updates and instruct it about
  ContentSet-s.
- Update a RaiIE test and remove two tests that are no longer available
Thanks to @remitamine for the review!
Diffstat (limited to 'youtube_dl/extractor/rai.py')
| -rw-r--r-- | youtube_dl/extractor/rai.py | 126 | 
1 files changed, 52 insertions, 74 deletions
| diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index bee2d53f5..dae7800d2 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -17,7 +17,6 @@ from ..utils import (      int_or_none,      parse_duration,      strip_or_none, -    unescapeHTML,      unified_strdate,      unified_timestamp,      update_url_query, @@ -122,27 +121,8 @@ class RaiBaseIE(InfoExtractor):  class RaiPlayIE(RaiBaseIE): -    _VALID_URL = r'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.html)' % RaiBaseIE._UUID_RE +    _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-)(?P<id>%s)\.(?:html|json)' % RaiBaseIE._UUID_RE      _TESTS = [{ -        'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter', -        'md5': '340aa3b7afb54bfd14a8c11786450d76', -        'info_dict': { -            'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66', -            'ext': 'mp4', -            'title': 'La Casa Bianca', -            'alt_title': 'S2016 - Puntata del 23/10/2016', -            'description': 'md5:a09d45890850458077d1f68bb036e0a5', -            'thumbnail': r're:^https?://.*\.jpg$', -            'uploader': 'Rai 3', -            'creator': 'Rai 3', -            'duration': 3278, -            'timestamp': 1477764300, -            'upload_date': '20161029', -            'series': 'La Casa Bianca', -            'season': '2016', -        }, -        'skip': 'This content is not available', -    }, {          'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html',          'md5': '8970abf8caf8aef4696e7b1f2adfc696',          'info_dict': { @@ -166,10 +146,11 @@ class RaiPlayIE(RaiBaseIE):      }]      def _real_extract(self, url): -        url, video_id = re.match(self._VALID_URL, url).groups() +        mobj = re.match(self._VALID_URL, url) +        base, video_id, = mobj.group('base', 'id')          media = self._download_json( -     
       url.replace('.html', '.json'), video_id, 'Downloading video JSON') +            '%s%s.json' % (base, video_id), video_id, 'Downloading video JSON')          title = media['name'] @@ -219,7 +200,7 @@ class RaiPlayIE(RaiBaseIE):  class RaiPlayLiveIE(RaiBaseIE): -    _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+)' +    _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+))'      _TEST = {          'url': 'http://www.raiplay.it/dirette/rainews24',          'info_dict': { @@ -227,7 +208,7 @@ class RaiPlayLiveIE(RaiBaseIE):              'display_id': 'rainews24',              'ext': 'mp4',              'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', -            'description': 'md5:6eca31500550f9376819f174e5644754', +            'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497',              'uploader': 'Rai News 24',              'creator': 'Rai News 24',              'is_live': True, @@ -238,53 +219,75 @@ class RaiPlayLiveIE(RaiBaseIE):      }      def _real_extract(self, url): -        display_id = self._match_id(url) +        mobj = re.match(self._VALID_URL, url) +        base, display_id, = mobj.group('base', 'id') + +        media = self._download_json( +            '%s.json' % base, +            display_id, 'Downloading channel JSON') -        webpage = self._download_webpage(url, display_id) +        title = media['name'] +        video = media['video'] +        video_id = media['id'].replace('ContentItem-', '') -        video_id = self._search_regex( -            r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE, -            webpage, 'content id') +        relinker_info = self._extract_relinker_info(video['content_url'], video_id) +        self._sort_formats(relinker_info['formats']) -        return { -            '_type': 'url_transparent', -            'ie_key': RaiPlayIE.ie_key(), -            'url': 
'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id, +        info = {              'id': video_id,              'display_id': display_id, +            'title': self._live_title(title) if relinker_info.get( +                'is_live') else title, +            'description': media.get('description'), +            'uploader': strip_or_none(media.get('channel')), +            'creator': strip_or_none(media.get('editor')),          } +        info.update(relinker_info) +        return info +  class RaiPlayPlaylistIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+)' +    _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))'      _TESTS = [{          'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/',          'info_dict': {              'id': 'nondirloalmiocapo',              'title': 'Non dirlo al mio capo', -            'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86', +            'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b',          },          'playlist_mincount': 12,      }]      def _real_extract(self, url): -        playlist_id = self._match_id(url) +        mobj = re.match(self._VALID_URL, url) +        base, playlist_id, = mobj.group('base', 'id') -        webpage = self._download_webpage(url, playlist_id) +        media = self._download_json( +            '%s.json' % base, +            playlist_id, 'Downloading program JSON') -        title = self._html_search_meta( -            ('programma', 'nomeProgramma'), webpage, 'title') -        description = unescapeHTML(self._html_search_meta( -            ('description', 'og:description'), webpage, 'description')) +        title = media.get('name') +        description = None +        if media.get('program_info') and media['program_info'].get('description'): +            description = media['program_info']['description']          entries = [] -        for mobj in re.finditer( -                
r'<a\b[^>]+\bhref=(["\'])(?P<path>/raiplay/video/.+?)\1', -                webpage): -            video_url = urljoin(url, mobj.group('path')) -            entries.append(self.url_result( -                video_url, ie=RaiPlayIE.ie_key(), -                video_id=RaiPlayIE._match_id(video_url))) +        for b in media.get('blocks', []): +            for s in b.get('sets', []): +                cs = s.get('id') +                if not cs: +                    continue +                medias = self._download_json( +                    '%s/%s.json' % (base, cs), +                    cs, 'Downloading content set JSON', fatal=False) +                if not medias: +                    continue +                for m in medias['items']: +                    video_url = urljoin(url, m['path_id']) +                    entries.append(self.url_result( +                        video_url, ie=RaiPlayIE.ie_key(), +                        video_id=RaiPlayIE._match_id(video_url)))          return self.playlist_result(entries, playlist_id, title, description) @@ -330,19 +333,6 @@ class RaiIE(RaiBaseIE):              'upload_date': '20161103',          }      }, { -        # drawMediaRaiTV(...) -        'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', -        'md5': '2dd727e61114e1ee9c47f0da6914e178', -        'info_dict': { -            'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', -            'ext': 'mp4', -            'title': 'Il pacco', -            'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', -            'thumbnail': r're:^https?://.*\.jpg$', -            'upload_date': '20141221', -        }, -        'skip': 'This content is not available', -    }, {          # initEdizione('ContentItem-...'          
'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined',          'info_dict': { @@ -354,18 +344,6 @@ class RaiIE(RaiBaseIE):          },          'skip': 'Changes daily',      }, { -        # HDS live stream with only relinker URL -        'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', -        'info_dict': { -            'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', -            'ext': 'flv', -            'title': 'EuroNews', -        }, -        'params': { -            'skip_download': True, -        }, -        'skip': 'This content is available only in Italy', -    }, {          # HLS live stream with ContentItem in og:url          'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html',          'info_dict': { | 
