diff options
Diffstat (limited to 'youtube_dl/extractor/rai.py')
| -rw-r--r-- | youtube_dl/extractor/rai.py | 286 | 
1 files changed, 173 insertions, 113 deletions
diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index e36ce1aa1..dc640b1bc 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -1,47 +1,141 @@  from __future__ import unicode_literals -import re -  from .common import InfoExtractor -from ..compat import ( -    compat_urllib_parse, -    compat_urlparse, -) +from ..compat import compat_urlparse  from ..utils import ( -    ExtractorError,      determine_ext, +    ExtractorError, +    find_xpath_attr, +    fix_xml_ampersands, +    int_or_none,      parse_duration,      unified_strdate, -    int_or_none, +    update_url_query,      xpath_text,  ) -class RaiTVIE(InfoExtractor): -    _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' +class RaiBaseIE(InfoExtractor): +    def _extract_relinker_formats(self, relinker_url, video_id): +        formats = [] + +        for platform in ('mon', 'flash', 'native'): +            relinker = self._download_xml( +                relinker_url, video_id, +                note='Downloading XML metadata for platform %s' % platform, +                transform_source=fix_xml_ampersands, +                query={'output': 45, 'pl': platform}, +                headers=self.geo_verification_headers()) + +            media_url = find_xpath_attr(relinker, './url', 'type', 'content').text +            if media_url == 'http://download.rai.it/video_no_available.mp4': +                self.raise_geo_restricted() + +            ext = determine_ext(media_url) +            if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'): +                continue + +            if ext == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    media_url, video_id, 'mp4', 'm3u8_native', +                    m3u8_id='hls', fatal=False)) +            elif ext == 'f4m': +                manifest_url = update_url_query( +                    media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'), +                    {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'}) +                formats.extend(self._extract_f4m_formats( +                    manifest_url, video_id, f4m_id='hds', fatal=False)) +            else: +                bitrate = int_or_none(xpath_text(relinker, 'bitrate')) +                formats.append({ +                    'url': media_url, +                    'tbr': bitrate if bitrate > 0 else None, +                    'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', +                }) + +        return formats + +    def _extract_from_content_id(self, content_id, base_url): +        media = self._download_json( +            'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id, +            content_id, 'Downloading video JSON') + +        thumbnails = [] +        for image_type in ('image', 'image_medium', 'image_300'): +            thumbnail_url = media.get(image_type) +            if thumbnail_url: +                thumbnails.append({ +                    'url': compat_urlparse.urljoin(base_url, thumbnail_url), +                }) + +        formats = [] +        media_type = media['type'] +        if 'Audio' in media_type: +            formats.append({ +                'format_id': media.get('formatoAudio'), +                'url': media['audioUrl'], +                'ext': media.get('formatoAudio'), +            }) +        elif 'Video' in media_type: +            formats.extend(self._extract_relinker_formats(media['mediaUri'], content_id)) +            self._sort_formats(formats) +        else: +            raise ExtractorError('not a media file') + +        subtitles = {} +        captions = media.get('subtitlesUrl') +        if captions: +            STL_EXT = '.stl' +            SRT_EXT = '.srt' +            if captions.endswith(STL_EXT): +                captions = captions[:-len(STL_EXT)] + SRT_EXT +            subtitles['it'] = [{ +                'ext': 'srt', +                'url': captions, +            }] + +        return { +            'id': content_id, +            'title': media['name'], +            'description': media.get('desc'), +            'thumbnails': thumbnails, +            'uploader': media.get('author'), +            'upload_date': unified_strdate(media.get('date')), +            'duration': parse_duration(media.get('length')), +            'formats': formats, +            'subtitles': subtitles, +        } + + +class RaiTVIE(RaiBaseIE): +    _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+(?:media|ondemand)/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'      _TESTS = [          {              'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', -            'md5': '96382709b61dd64a6b88e0f791e6df4c', +            'md5': '8970abf8caf8aef4696e7b1f2adfc696',              'info_dict': {                  'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', -                'ext': 'flv', +                'ext': 'mp4',                  'title': 'Report del 07/04/2014',                  'description': 'md5:f27c544694cacb46a078db84ec35d2d9',                  'upload_date': '20140407',                  'duration': 6160, +                'thumbnail': 're:^https?://.*\.jpg$',              }          },          { +            # no m3u8 stream              'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', -            'md5': 'd9751b78eac9710d62c2447b224dea39', +            # HDS download, MD5 is unstable              'info_dict': {                  'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9',                  'ext': 'flv',                  'title': 'TG PRIMO TEMPO',                  'upload_date': '20140612',                  'duration': 1758, +                'thumbnail': 're:^https?://.*\.jpg$',              }, +            'skip': 'Geo-restricted to Italy',          },          {              'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html', @@ -67,127 +161,70 @@ class RaiTVIE(InfoExtractor):          },          {              'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html', -            'md5': '496ab63e420574447f70d02578333437', +            'md5': 'e57493e1cb8bc7c564663f363b171847',              'info_dict': {                  'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6', -                'ext': 'flv', +                'ext': 'mp4',                  'title': 'Il Candidato - Primo episodio: "Le Primarie"',                  'description': 'md5:364b604f7db50594678f483353164fb8',                  'upload_date': '20140923',                  'duration': 386, +                'thumbnail': 're:^https?://.*\.jpg$',              }          },      ]      def _real_extract(self, url):          video_id = self._match_id(url) -        media = self._download_json( -            'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % video_id, -            video_id, 'Downloading video JSON') - -        thumbnails = [] -        for image_type in ('image', 'image_medium', 'image_300'): -            thumbnail_url = media.get(image_type) -            if thumbnail_url: -                thumbnails.append({ -                    'url': thumbnail_url, -                }) - -        subtitles = [] -        formats = [] -        media_type = media['type'] -        if 'Audio' in media_type: -            formats.append({ -                'format_id': media.get('formatoAudio'), -                'url': media['audioUrl'], -                'ext': media.get('formatoAudio'), -            }) -        elif 'Video' in media_type: -            def fix_xml(xml): -                return xml.replace(' tag elementi', '').replace('>/', '</') - -            relinker = self._download_xml( -                media['mediaUri'] + '&output=43', -                video_id, transform_source=fix_xml) - -            has_subtitle = False - -            for element in relinker.findall('element'): -                media_url = xpath_text(element, 'url') -                ext = determine_ext(media_url) -                content_type = xpath_text(element, 'content-type') -                if ext == 'm3u8': -                    formats.extend(self._extract_m3u8_formats( -                        media_url, video_id, 'mp4', 'm3u8_native', -                        m3u8_id='hls', fatal=False)) -                elif ext == 'f4m': -                    formats.extend(self._extract_f4m_formats( -                        media_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', -                        video_id, f4m_id='hds', fatal=False)) -                elif ext == 'stl': -                    has_subtitle = True -                elif content_type.startswith('video/'): -                    bitrate = int_or_none(xpath_text(element, 'bitrate')) -                    formats.append({ -                        'url': media_url, -                        'tbr': bitrate if bitrate > 0 else None, -                        'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', -                    }) -                elif content_type.startswith('image/'): -                    thumbnails.append({ -                        'url': media_url, -                    }) - -            self._sort_formats(formats) -            if has_subtitle: -                webpage = self._download_webpage(url, video_id) -                subtitles = self._get_subtitles(video_id, webpage) -        else: -            raise ExtractorError('not a media file') +        return self._extract_from_content_id(video_id, url) -        return { -            'id': video_id, -            'title': media['name'], -            'description': media.get('desc'), -            'thumbnails': thumbnails, -            'uploader': media.get('author'), -            'upload_date': unified_strdate(media.get('date')), -            'duration': parse_duration(media.get('length')), -            'formats': formats, -            'subtitles': subtitles, -        } -    def _get_subtitles(self, video_id, webpage): -        subtitles = {} -        m = re.search(r'<meta name="closedcaption" content="(?P<captions>[^"]+)"', webpage) -        if m: -            captions = m.group('captions') -            STL_EXT = '.stl' -            SRT_EXT = '.srt' -            if captions.endswith(STL_EXT): -                captions = captions[:-len(STL_EXT)] + SRT_EXT -            subtitles['it'] = [{ -                'ext': 'srt', -                'url': 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions), -            }] -        return subtitles - - -class RaiIE(InfoExtractor): +class RaiIE(RaiBaseIE):      _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'      _TESTS = [          {              'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', -            'md5': 'e0e7a8a131e249d1aa0ebf270d1d8db7', +            'md5': '2dd727e61114e1ee9c47f0da6914e178',              'info_dict': {                  'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', -                'ext': 'flv', +                'ext': 'mp4',                  'title': 'Il pacco',                  'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a',                  'upload_date': '20141221',              }, -        } +        }, +        { +            # Direct relinker URL +            'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', +            # HDS live stream, MD5 is unstable +            'info_dict': { +                'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', +                'ext': 'flv', +                'title': 'EuroNews', +            }, +            'skip': 'Geo-restricted to Italy', +        }, +        { +            # Embedded content item ID +            'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', +            'md5': '84c1135ce960e8822ae63cec34441d63', +            'info_dict': { +                'id': '0960e765-62c8-474a-ac4b-7eb3e2be39c8', +                'ext': 'mp4', +                'title': 'TG1 ore 20:00 del 02/07/2016', +                'upload_date': '20160702', +            }, +        }, +        { +            'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', +            # HDS live stream, MD5 is unstable +            'info_dict': { +                'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', +                'ext': 'flv', +                'title': 'La diretta di Rainews24', +            }, +        },      ]      @classmethod @@ -201,7 +238,30 @@ class RaiIE(InfoExtractor):          iframe_url = self._search_regex(              [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"',               r'drawMediaRaiTV\(["\'](.+?)["\']'], -            webpage, 'iframe') -        if not iframe_url.startswith('http'): -            iframe_url = compat_urlparse.urljoin(url, iframe_url) -        return self.url_result(iframe_url) +            webpage, 'iframe', default=None) +        if iframe_url: +            if not iframe_url.startswith('http'): +                iframe_url = compat_urlparse.urljoin(url, iframe_url) +            return self.url_result(iframe_url) + +        content_item_id = self._search_regex( +            r'initEdizione\((?P<q1>[\'"])ContentItem-(?P<content_id>[^\'"]+)(?P=q1)', +            webpage, 'content item ID', group='content_id', default=None) +        if content_item_id: +            return self._extract_from_content_id(content_item_id, url) + +        relinker_url = compat_urlparse.urljoin(url, self._search_regex( +            r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P<q1>[\'"])(?P<url>(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)', +            webpage, 'relinker URL', group='url')) +        formats = self._extract_relinker_formats(relinker_url, video_id) +        self._sort_formats(formats) + +        title = self._search_regex( +            r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', +            webpage, 'title', group='title', default=None) or self._og_search_title(webpage) + +        return { +            'id': video_id, +            'title': title, +            'formats': formats, +        }  | 
