diff options
Diffstat (limited to 'youtube_dl')
| -rw-r--r-- | youtube_dl/extractor/rai.py | 359 | 
1 files changed, 233 insertions, 126 deletions
diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index b67e94f88..b77b0a08e 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -1,23 +1,40 @@  from __future__ import unicode_literals +import re +  from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( +    compat_urlparse, +    compat_str, +)  from ..utils import (      ExtractorError,      determine_ext,      find_xpath_attr,      fix_xml_ampersands, +    GeoRestrictedError,      int_or_none,      parse_duration, +    strip_or_none, +    try_get,      unified_strdate, +    unified_timestamp,      update_url_query, +    urljoin,      xpath_text,  )  class RaiBaseIE(InfoExtractor): -    def _extract_relinker_formats(self, relinker_url, video_id): +    _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' +    _GEO_COUNTRIES = ['IT'] +    _GEO_BYPASS = False + +    def _extract_relinker_info(self, relinker_url, video_id):          formats = [] +        geoprotection = None +        is_live = None +        duration = None          for platform in ('mon', 'flash', 'native'):              relinker = self._download_xml( @@ -27,9 +44,27 @@ class RaiBaseIE(InfoExtractor):                  query={'output': 45, 'pl': platform},                  headers=self.geo_verification_headers()) -            media_url = find_xpath_attr(relinker, './url', 'type', 'content').text +            if not geoprotection: +                geoprotection = xpath_text( +                    relinker, './geoprotection', default=None) == 'Y' + +            if not is_live: +                is_live = xpath_text( +                    relinker, './is_live', default=None) == 'Y' +            if not duration: +                duration = parse_duration(xpath_text( +                    relinker, './duration', default=None)) + +            url_elem = find_xpath_attr(relinker, './url', 'type', 'content') +            if url_elem is None: +                continue + +            media_url = url_elem.text + +            # This does not imply geo restriction (e.g. +            # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html)              if media_url == 'http://download.rai.it/video_no_available.mp4': -                self.raise_geo_restricted() +                continue              ext = determine_ext(media_url)              if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'): @@ -53,11 +88,18 @@ class RaiBaseIE(InfoExtractor):                      'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http',                  }) -        return formats +        if not formats and geoprotection is True: +            self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + +        return dict((k, v) for k, v in { +            'is_live': is_live, +            'duration': duration, +            'formats': formats, +        }.items() if v is not None)  class RaiPlayIE(RaiBaseIE): -    _VALID_URL = r'https?://(?:www\.)?raiplay\.it/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.html' +    _VALID_URL = r'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.html)' % RaiBaseIE._UUID_RE      _TESTS = [{          'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter',          'md5': '340aa3b7afb54bfd14a8c11786450d76', @@ -65,110 +107,130 @@ class RaiPlayIE(RaiBaseIE):              'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66',              'ext': 'mp4',              'title': 'La Casa Bianca', +            'alt_title': 'S2016 - Puntata del 23/10/2016', +            'description': 'md5:a09d45890850458077d1f68bb036e0a5',              'thumbnail': r're:^https?://.*\.jpg$', -            'uploader': r're:^Rai.+', -            'description': 're:^[A-Za-z]+' -        } -    }, { -        'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', -        'md5': 'ed4da3d70ccf8129a33ab16b34d20ab8', -        'info_dict': { -            'id': 'efebe701-969c-4593-92f3-285f0d1ce750', -            'ext': 'mp4', -            'title': 'Gazebo - #gazebotraindesi', -            'thumbnail': r're:^https?://.*\.png$', -            'uploader': r're:^Rai.+', -            'description': r're:^[A-Za-z]+' -        } +            'uploader': 'Rai 3', +            'creator': 'Rai 3', +            'duration': 3278, +            'timestamp': 1477764300, +            'upload_date': '20161029', +            'series': 'La Casa Bianca', +            'season': '2016', +        },      }, {          'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html',          'md5': '8970abf8caf8aef4696e7b1f2adfc696',          'info_dict': {              'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',              'ext': 'mp4', -            'title': 'Report - Report del 07/04/2014', +            'title': 'Report del 07/04/2014', +            'alt_title': 'S2013/14 - Puntata del 07/04/2014', +            'description': 'md5:f27c544694cacb46a078db84ec35d2d9',              'thumbnail': r're:^https?://.*\.jpg$', -            'uploader': r're:^Rai.+', -            'description': r're:^[A-Za-z]+' -        } +            'uploader': 'Rai 5', +            'creator': 'Rai 5', +            'duration': 6160, +            'series': 'Report', +            'season_number': 5, +            'season': '2013/14', +        }, +        'params': { +            'skip_download': True, +        }, +    }, { +        'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', +        'only_matching': True,      }] -    _RESOLUTION = '600x400'      def _real_extract(self, url): -        video_id = self._match_id(url) +        mobj = re.match(self._VALID_URL, url) +        url, video_id = mobj.group('url', 'id') -        # remove query and fragment part from url -        canonical_url = compat_urlparse.urljoin(url, compat_urlparse.urlparse(url).path) -        webpage = self._download_webpage(canonical_url, video_id) +        media = self._download_json( +            '%s?json' % url, video_id, 'Downloading video JSON') -        media = self._download_json('%s?json' % canonical_url, -                                    video_id, 'Downloading video JSON') +        title = media['name'] + +        video = media['video'] + +        relinker_info = self._extract_relinker_info(video['contentUrl'], video_id) +        self._sort_formats(relinker_info['formats'])          thumbnails = []          if 'images' in media:              for _, value in media.get('images').items():                  if value:                      thumbnails.append({ -                        'url': value.replace('[RESOLUTION]', self._RESOLUTION) +                        'url': value.replace('[RESOLUTION]', '600x400')                      }) -        if 'video' not in media: -            raise ExtractorError('No video found') +        timestamp = unified_timestamp(try_get( +            media, lambda x: x['availabilities'][0]['start'], compat_str)) -        video = media.get('video') -        duration = parse_duration(video.get('duration')), -        formats = self._extract_relinker_formats(video.get('contentUrl'), video_id) -        self._sort_formats(formats) - -        return { +        info = {              'id': video_id, -            'title': self._og_search_title(webpage).replace(' - video - RaiPlay', ''), -            'description': self._og_search_description(webpage), +            'title': title, +            'alt_title': media.get('subtitle'), +            'description': media.get('description'),              'uploader': media.get('channel'), -            'duration': duration, +            'creator': media.get('editor'), +            'duration': parse_duration(video.get('duration')), +            'timestamp': timestamp,              'thumbnails': thumbnails, -            'formats': formats +            'series': try_get( +                media, lambda x: x['isPartOf']['name'], compat_str), +            'season_number': int_or_none(try_get( +                media, lambda x: x['isPartOf']['numeroStagioni'])), +            'season': media.get('stagione') or None,          } +        info.update(relinker_info) + +        return info +  class RaiIE(RaiBaseIE): -    _VALID_URL = r'https?://.+\.(?:rai|rainews)\.it/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' +    _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE      _TESTS = [{ -        # subdomain test case +        # var uniquename = "ContentItem-..." +        # data-id="ContentItem-..."          'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html',          'info_dict': {              'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9',              'ext': 'mp4',              'title': 'TG PRIMO TEMPO', -            'upload_date': '20140612', +            'thumbnail': r're:^https?://.*\.jpg$',              'duration': 1758, -            'thumbnail': r're:^https?://.*\.jpg$' +            'upload_date': '20140612',          }      }, { -        # rainews test case +        # with ContentItem in many metas          'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html',          'info_dict': {              'id': '1632c009-c843-4836-bb65-80c33084a64b',              'ext': 'mp4', -            'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor \"La ragazza del treno\" ', -            'upload_date': '20161103', +            'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor "La ragazza del treno"', +            'description': 'I film in uscita questa settimana.',              'thumbnail': r're:^https?://.*\.png$', -            'description': r're:^[A-Za-z]+' +            'duration': 833, +            'upload_date': '20161103',          }      }, { -        # with media information +        # with ContentItem in og:url          'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html',          'md5': '11959b4e44fa74de47011b5799490adf',          'info_dict': {              'id': 'efb17665-691c-45d5-a60c-5301333cbb0c',              'ext': 'mp4',              'title': 'TG1 ore 20:00 del 03/11/2016', +            'description': 'TG1 edizione integrale ore 20:00 del giorno 03/11/2016',              'thumbnail': r're:^https?://.*\.jpg$', +            'duration': 2214,              'upload_date': '20161103', -            'description': r're:^[A-Za-z]+'          }      }, { -        # drawMediaRaiTV test case +        # drawMediaRaiTV(...)          'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html',          'md5': '2dd727e61114e1ee9c47f0da6914e178',          'info_dict': { @@ -176,83 +238,67 @@ class RaiIE(RaiBaseIE):              'ext': 'mp4',              'title': 'Il pacco',              'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', +            'thumbnail': r're:^https?://.*\.jpg$',              'upload_date': '20141221',          },      }, { -        # Direct relinker URL +        # initEdizione('ContentItem-...' +        'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', +        'info_dict': { +            'id': 'c2187016-8484-4e3a-8ac8-35e475b07303', +            'ext': 'mp4', +            'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}', +            'duration': 2274, +            'upload_date': '20170401', +        }, +        'skip': 'Changes daily', +    }, { +        # HDS live stream with only relinker URL          'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', -        # HDS live stream, MD5 is unstable          'info_dict': {              'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc',              'ext': 'flv',              'title': 'EuroNews',          }, -    }, { -        # Embedded content item ID -        'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', -        'info_dict': { -            'id': 'd80d4b70-3812-4501-a888-92edec729f00', -            'ext': 'mp4', -            'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}', -            'upload_date': r're:\d{8}', -            'description': r're:.+', +        'params': { +            'skip_download': True,          },      }, { +        # HLS live stream with ContentItem in og:url          'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', -        # HDS live stream, MD5 is unstable          'info_dict': {              'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9',              'ext': 'mp4',              'title': 'La diretta di Rainews24',          }, +        'params': { +            'skip_download': True, +        },      }] -    def _real_extract(self, url): -        video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) - -        iframe_url = self._search_regex( -            [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', -             r'drawMediaRaiTV\(["\'](.+?)["\']'], -            webpage, 'iframe', default=None) -        if iframe_url: -            if not iframe_url.startswith('http'): -                iframe_url = compat_urlparse.urljoin(url, iframe_url) -            return self.url_result(iframe_url) - -        content_item_id = self._search_regex( -            r'initEdizione\((?P<q1>[\'"])ContentItem-(?P<content_id>[^\'"]+)(?P=q1)', -            webpage, 'content item ID', group='content_id', default=None) -        if content_item_id: -            return self._extract_from_content_id(content_item_id, url) - -        try: -            return self._extract_from_content_id(video_id, url) -        except ExtractorError: -            # no media data, only direct relinker -            pass - -        relinker_url = compat_urlparse.urljoin(url, self._search_regex( -            r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P<q1>[\'"])(?P<url>(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)', -            webpage, 'relinker URL', group='url')) -        formats = self._extract_relinker_formats(relinker_url, video_id) -        self._sort_formats(formats) - -        title = self._search_regex( -            r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', -            webpage, 'title', group='title', default=None) or self._og_search_title(webpage) - -        return { -            'id': video_id, -            'title': title, -            'formats': formats, -        } -      def _extract_from_content_id(self, content_id, url):          media = self._download_json(              'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id,              content_id, 'Downloading video JSON') +        title = media['name'].strip() + +        media_type = media['type'] +        if 'Audio' in media_type: +            relinker_info = { +                'formats': { +                    'format_id': media.get('formatoAudio'), +                    'url': media['audioUrl'], +                    'ext': media.get('formatoAudio'), +                } +            } +        elif 'Video' in media_type: +            relinker_info = self._extract_relinker_info(media['mediaUri'], content_id) +        else: +            raise ExtractorError('not a media file') + +        self._sort_formats(relinker_info['formats']) +          thumbnails = []          for image_type in ('image', 'image_medium', 'image_300'):              thumbnail_url = media.get(image_type) @@ -261,20 +307,6 @@ class RaiIE(RaiBaseIE):                      'url': compat_urlparse.urljoin(url, thumbnail_url),                  }) -        formats = [] -        media_type = media['type'] -        if 'Audio' in media_type: -            formats.append({ -                'format_id': media.get('formatoAudio'), -                'url': media['audioUrl'], -                'ext': media.get('formatoAudio'), -            }) -        elif 'Video' in media_type: -            formats.extend(self._extract_relinker_formats(media['mediaUri'], content_id)) -            self._sort_formats(formats) -        else: -            raise ExtractorError('not a media file') -          subtitles = {}          captions = media.get('subtitlesUrl')          if captions: @@ -287,14 +319,89 @@ class RaiIE(RaiBaseIE):                  'url': captions,              }] -        return { +        info = {              'id': content_id, -            'title': media['name'], -            'description': media.get('desc'), +            'title': title, +            'description': strip_or_none(media.get('desc')),              'thumbnails': thumbnails,              'uploader': media.get('author'),              'upload_date': unified_strdate(media.get('date')),              'duration': parse_duration(media.get('length')), -            'formats': formats,              'subtitles': subtitles,          } + +        info.update(relinker_info) + +        return info + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        content_item_id = None + +        content_item_url = self._html_search_meta( +            ('og:url', 'og:video', 'og:video:secure_url', 'twitter:url', +             'twitter:player', 'jsonlink'), webpage, default=None) +        if content_item_url: +            content_item_id = self._search_regex( +                r'ContentItem-(%s)' % self._UUID_RE, content_item_url, +                'content item id', default=None) + +        if not content_item_id: +            content_item_id = self._search_regex( +                r'''(?x) +                    (?: +                        (?:initEdizione|drawMediaRaiTV)\(| +                        <(?:[^>]+\bdata-id|var\s+uniquename)= +                    ) +                    (["\']) +                    (?:(?!\1).)*\bContentItem-(?P<id>%s) +                ''' % self._UUID_RE, +                webpage, 'content item id', default=None, group='id') + +        content_item_ids = set() +        content_item_ids.add(content_item_id) +        if video_id not in content_item_ids: +            content_item_ids.add(video_id) + +        for content_item_id in content_item_ids: +            try: +                return self._extract_from_content_id(content_item_id, url) +            except GeoRestrictedError: +                raise +            except ExtractorError: +                pass + +        relinker_url = self._search_regex( +            r'''(?x) +                (?: +                    var\s+videoURL| +                    mediaInfo\.mediaUri +                )\s*=\s* +                ([\'"]) +                (?P<url> +                    (?:https?:)? +                    //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? +                    (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 +            ''', +            webpage, 'relinker URL', group='url') + +        relinker_info = self._extract_relinker_info( +            urljoin(url, relinker_url), video_id) +        self._sort_formats(relinker_info['formats']) + +        title = self._search_regex( +            r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', +            webpage, 'title', group='title', +            default=None) or self._og_search_title(webpage) + +        info = { +            'id': video_id, +            'title': title, +        } + +        info.update(relinker_info) + +        return info  | 
