diff options
Diffstat (limited to 'youtube_dl/extractor/arte.py')
| -rw-r--r-- | youtube_dl/extractor/arte.py | 231 | 
1 files changed, 152 insertions, 79 deletions
| diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 881cacfab..f40532929 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -61,10 +61,7 @@ class ArteTvIE(InfoExtractor):          } -class ArteTVPlus7IE(InfoExtractor): -    IE_NAME = 'arte.tv:+7' -    _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)' - +class ArteTVBaseIE(InfoExtractor):      @classmethod      def _extract_url_info(cls, url):          mobj = re.match(cls._VALID_URL, url) @@ -78,60 +75,6 @@ class ArteTVPlus7IE(InfoExtractor):              video_id = mobj.group('id')          return video_id, lang -    def _real_extract(self, url): -        video_id, lang = self._extract_url_info(url) -        webpage = self._download_webpage(url, video_id) -        return self._extract_from_webpage(webpage, video_id, lang) - -    def _extract_from_webpage(self, webpage, video_id, lang): -        patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') -        ids = (video_id, '') -        # some pages contain multiple videos (like -        # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), -        # so we first try to look for json URLs that contain the video id from -        # the 'vid' parameter. -        patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates] -        json_url = self._html_search_regex( -            patterns, webpage, 'json vp url', default=None) -        if not json_url: -            def find_iframe_url(webpage, default=NO_DEFAULT): -                return self._html_search_regex( -                    r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', -                    webpage, 'iframe url', group='url', default=default) - -            iframe_url = find_iframe_url(webpage, None) -            if not iframe_url: -                embed_url = self._html_search_regex( -                    r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None) -                if embed_url: -                    player = self._download_json( -                        embed_url, video_id, 'Downloading player page') -                    iframe_url = find_iframe_url(player['html']) -            # en and es URLs produce react-based pages with different layout (e.g. -            # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world) -            if not iframe_url: -                program = self._search_regex( -                    r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n', -                    webpage, 'program', default=None) -                if program: -                    embed_html = self._parse_json(program, video_id) -                    if embed_html: -                        iframe_url = find_iframe_url(embed_html['embed_html']) -            if iframe_url: -                json_url = compat_parse_qs( -                    compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] -        if json_url: -            title = self._search_regex( -                r'<h3[^>]+title=(["\'])(?P<title>.+?)\1', -                webpage, 'title', default=None, group='title') -            return self._extract_from_json_url(json_url, video_id, lang, title=title) -        # Different kind of embed URL (e.g. -        # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) -        embed_url = self._search_regex( -            r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', -            webpage, 'embed url', group='url') -        return self.url_result(embed_url) -      def _extract_from_json_url(self, json_url, video_id, lang, title=None):          info = self._download_json(json_url, video_id)          player_info = info['videoJsonPlayer'] @@ -161,24 +104,53 @@ class ArteTVPlus7IE(InfoExtractor):              'es': 'E[ESP]',          } +        langcode = LANGS.get(lang, lang) +          formats = []          for format_id, format_dict in player_info['VSR'].items():              f = dict(format_dict)              versionCode = f.get('versionCode') -            langcode = LANGS.get(lang, lang) -            lang_rexs = [r'VO?%s-' % re.escape(langcode), r'VO?.-ST%s$' % re.escape(langcode)] -            lang_pref = None -            if versionCode: -                matched_lang_rexs = [r for r in lang_rexs if re.match(r, versionCode)] -                lang_pref = -10 if not matched_lang_rexs else 10 * len(matched_lang_rexs) -            source_pref = 0 -            if versionCode is not None: -                # The original version with subtitles has lower relevance -                if re.match(r'VO-ST(F|A|E)', versionCode): -                    source_pref -= 10 -                # The version with sourds/mal subtitles has also lower relevance -                elif re.match(r'VO?(F|A|E)-STM\1', versionCode): -                    source_pref -= 9 +            l = re.escape(langcode) + +            # Language preference from most to least priority +            # Reference: section 5.6.3 of +            # http://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-05.pdf +            PREFERENCES = ( +                # original version in requested language, without subtitles +                r'VO{0}$'.format(l), +                # original version in requested language, with partial subtitles in requested language +                r'VO{0}-ST{0}$'.format(l), +                # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language +                r'VO{0}-STM{0}$'.format(l), +                # non-original (dubbed) version in requested language, without subtitles +                r'V{0}$'.format(l), +                # non-original (dubbed) version in requested language, with subtitles partial subtitles in requested language +                r'V{0}-ST{0}$'.format(l), +                # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language +                r'V{0}-STM{0}$'.format(l), +                # original version in requested language, with partial subtitles in different language +                r'VO{0}-ST(?!{0}).+?$'.format(l), +                # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language +                r'VO{0}-STM(?!{0}).+?$'.format(l), +                # original version in different language, with partial subtitles in requested language +                r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l), +                # original version in different language, with subtitles for the deaf and hard-of-hearing in requested language +                r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l), +                # original version in different language, without subtitles +                r'VO(?:(?!{0}))?$'.format(l), +                # original version in different language, with partial subtitles in different language +                r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l), +                # original version in different language, with subtitles for the deaf and hard-of-hearing in different language +                r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l), +            ) + +            for pref, p in enumerate(PREFERENCES): +                if re.match(p, versionCode): +                    lang_pref = len(PREFERENCES) - pref +                    break +            else: +                lang_pref = -1 +              format = {                  'format_id': format_id,                  'preference': -10 if f.get('videoFormat') == 'M3U8' else None, @@ -188,7 +160,6 @@ class ArteTVPlus7IE(InfoExtractor):                  'height': int_or_none(f.get('height')),                  'tbr': int_or_none(f.get('bitrate')),                  'quality': qfunc(f.get('quality')), -                'source_preference': source_pref,              }              if f.get('mediaType') == 'rtmp': @@ -207,6 +178,74 @@ class ArteTVPlus7IE(InfoExtractor):          return info_dict +class ArteTVPlus7IE(ArteTVBaseIE): +    IE_NAME = 'arte.tv:+7' +    _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)' + +    _TESTS = [{ +        'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D', +        'only_matching': True, +    }] + +    @classmethod +    def suitable(cls, url): +        return False if ArteTVPlaylistIE.suitable(url) else super(ArteTVPlus7IE, cls).suitable(url) + +    def _real_extract(self, url): +        video_id, lang = self._extract_url_info(url) +        webpage = self._download_webpage(url, video_id) +        return self._extract_from_webpage(webpage, video_id, lang) + +    def _extract_from_webpage(self, webpage, video_id, lang): +        patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') +        ids = (video_id, '') +        # some pages contain multiple videos (like +        # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), +        # so we first try to look for json URLs that contain the video id from +        # the 'vid' parameter. +        patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates] +        json_url = self._html_search_regex( +            patterns, webpage, 'json vp url', default=None) +        if not json_url: +            def find_iframe_url(webpage, default=NO_DEFAULT): +                return self._html_search_regex( +                    r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', +                    webpage, 'iframe url', group='url', default=default) + +            iframe_url = find_iframe_url(webpage, None) +            if not iframe_url: +                embed_url = self._html_search_regex( +                    r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None) +                if embed_url: +                    player = self._download_json( +                        embed_url, video_id, 'Downloading player page') +                    iframe_url = find_iframe_url(player['html']) +            # en and es URLs produce react-based pages with different layout (e.g. +            # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world) +            if not iframe_url: +                program = self._search_regex( +                    r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n', +                    webpage, 'program', default=None) +                if program: +                    embed_html = self._parse_json(program, video_id) +                    if embed_html: +                        iframe_url = find_iframe_url(embed_html['embed_html']) +            if iframe_url: +                json_url = compat_parse_qs( +                    compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] +        if json_url: +            title = self._search_regex( +                r'<h3[^>]+title=(["\'])(?P<title>.+?)\1', +                webpage, 'title', default=None, group='title') +            return self._extract_from_json_url(json_url, video_id, lang, title=title) +        # Different kind of embed URL (e.g. +        # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) +        embed_url = self._search_regex( +            r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', +            webpage, 'embed url', group='url') +        return self.url_result(embed_url) + +  # It also uses the arte_vp_url url from the webpage to extract the information  class ArteTVCreativeIE(ArteTVPlus7IE):      IE_NAME = 'arte.tv:creative' @@ -239,7 +278,7 @@ class ArteTVInfoIE(ArteTVPlus7IE):      IE_NAME = 'arte.tv:info'      _VALID_URL = r'https?://info\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)' -    _TEST = { +    _TESTS = [{          'url': 'http://info.arte.tv/fr/service-civique-un-cache-misere',          'info_dict': {              'id': '067528-000-A', @@ -247,7 +286,7 @@ class ArteTVInfoIE(ArteTVPlus7IE):              'title': 'Service civique, un cache misère ?',              'upload_date': '20160403',          }, -    } +    }]  class ArteTVFutureIE(ArteTVPlus7IE): @@ -272,6 +311,8 @@ class ArteTVDDCIE(ArteTVPlus7IE):      IE_NAME = 'arte.tv:ddc'      _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>[^/?#&]+)' +    _TESTS = [] +      def _real_extract(self, url):          video_id, lang = self._extract_url_info(url)          if lang == 'folge': @@ -290,7 +331,7 @@ class ArteTVConcertIE(ArteTVPlus7IE):      IE_NAME = 'arte.tv:concert'      _VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)' -    _TEST = { +    _TESTS = [{          'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde',          'md5': '9ea035b7bd69696b67aa2ccaaa218161',          'info_dict': { @@ -300,14 +341,14 @@ class ArteTVConcertIE(ArteTVPlus7IE):              'upload_date': '20140128',              'description': 'md5:486eb08f991552ade77439fe6d82c305',          }, -    } +    }]  class ArteTVCinemaIE(ArteTVPlus7IE):      IE_NAME = 'arte.tv:cinema'      _VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>.+)' -    _TEST = { +    _TESTS = [{          'url': 'http://cinema.arte.tv/de/node/38291',          'md5': '6b275511a5107c60bacbeeda368c3aa1',          'info_dict': { @@ -317,7 +358,7 @@ class ArteTVCinemaIE(ArteTVPlus7IE):              'upload_date': '20160122',              'description': 'md5:7f749bbb77d800ef2be11d54529b96bc',          }, -    } +    }]  class ArteTVMagazineIE(ArteTVPlus7IE): @@ -362,9 +403,41 @@ class ArteTVEmbedIE(ArteTVPlus7IE):          )      ''' +    _TESTS = [] +      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id')          lang = mobj.group('lang')          json_url = mobj.group('json_url')          return self._extract_from_json_url(json_url, video_id, lang) + + +class ArteTVPlaylistIE(ArteTVBaseIE): +    IE_NAME = 'arte.tv:playlist' +    _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/[^#]*#collection/(?P<id>PL-\d+)' + +    _TESTS = [{ +        'url': 'http://www.arte.tv/guide/de/plus7/?country=DE#collection/PL-013263/ARTETV', +        'info_dict': { +            'id': 'PL-013263', +            'title': 'Areva & Uramin', +        }, +        'playlist_mincount': 6, +    }, { +        'url': 'http://www.arte.tv/guide/de/playlists?country=DE#collection/PL-013190/ARTETV', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        playlist_id, lang = self._extract_url_info(url) +        collection = self._download_json( +            'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos' +            % (lang, playlist_id), playlist_id) +        title = collection.get('title') +        description = collection.get('shortDescription') or collection.get('teaserText') +        entries = [ +            self._extract_from_json_url( +                video['jsonUrl'], video.get('programId') or playlist_id, lang) +            for video in collection['videos'] if video.get('jsonUrl')] +        return self.playlist_result(entries, playlist_id, title, description) | 
