diff options
| -rw-r--r-- | youtube_dl/extractor/extractors.py | 5 | ||||
| -rw-r--r-- | youtube_dl/extractor/raywenderlich.py | 189 | 
2 files changed, 137 insertions, 57 deletions
| diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index eb22d0999..23ad42da9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -899,7 +899,10 @@ from .rai import (      RaiPlayPlaylistIE,      RaiIE,  ) -from .raywenderlich import RayWenderlichIE +from .raywenderlich import ( +    RayWenderlichIE, +    RayWenderlichCourseIE, +)  from .rbmaradio import RBMARadioIE  from .rds import RDSIE  from .redbulltv import RedBullTVIE diff --git a/youtube_dl/extractor/raywenderlich.py b/youtube_dl/extractor/raywenderlich.py index 640c3ee23..5411ece21 100644 --- a/youtube_dl/extractor/raywenderlich.py +++ b/youtube_dl/extractor/raywenderlich.py @@ -4,24 +4,37 @@ import re  from .common import InfoExtractor  from .vimeo import VimeoIE +from ..compat import compat_str  from ..utils import ( -    extract_attributes,      ExtractorError, -    smuggle_url, -    unsmuggle_url, +    int_or_none, +    merge_dicts, +    try_get, +    unescapeHTML, +    unified_timestamp,      urljoin,  )  class RayWenderlichIE(InfoExtractor): -    _VALID_URL = r'https?://videos\.raywenderlich\.com/courses/(?P<course_id>[^/]+)/lessons/(?P<id>\d+)' +    _VALID_URL = r'''(?x) +                    https?:// +                        (?: +                            videos\.raywenderlich\.com/courses| +                            (?:www\.)?raywenderlich\.com +                        )/ +                        (?P<course_id>[^/]+)/lessons/(?P<id>\d+) +                    '''      _TESTS = [{ -        'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1', +        'url': 'https://www.raywenderlich.com/3530-testing-in-ios/lessons/1',          'info_dict': {              'id': '248377018',              'ext': 'mp4', -            'title': 'Testing In iOS Episode 1: Introduction', +            'title': 'Introduction', +            'description': 'md5:804d031b3efa9fcb49777d512d74f722', +            'timestamp': 1513906277, +            'upload_date': '20171222',              'duration': 133,              'uploader': 'Ray Wenderlich',              'uploader_id': 'user3304672', @@ -34,69 +47,133 @@ class RayWenderlichIE(InfoExtractor):          'expected_warnings': ['HTTP Error 403: Forbidden'],      }, {          'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1', +        'only_matching': True, +    }] + +    @staticmethod +    def _extract_video_id(data, lesson_id): +        if not data: +            return +        groups = try_get(data, lambda x: x['groups'], list) or [] +        if not groups: +            return +        for group in groups: +            if not isinstance(group, dict): +                continue +            contents = try_get(data, lambda x: x['contents'], list) or [] +            for content in contents: +                if not isinstance(content, dict): +                    continue +                ordinal = int_or_none(content.get('ordinal')) +                if ordinal != lesson_id: +                    continue +                video_id = content.get('identifier') +                if video_id: +                    return compat_str(video_id) + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        course_id, lesson_id = mobj.group('course_id', 'id') +        display_id = '%s/%s' % (course_id, lesson_id) + +        webpage = self._download_webpage(url, display_id) + +        thumbnail = self._og_search_thumbnail( +            webpage, default=None) or self._html_search_meta( +            'twitter:image', webpage, 'thumbnail') + +        if '>Subscribe to unlock' in webpage: +            raise ExtractorError( +                'This content is only available for subscribers', +                expected=True) + +        info = { +            'thumbnail': thumbnail, +        } + +        vimeo_id = self._search_regex( +            r'data-vimeo-id=["\'](\d+)', webpage, 'vimeo id', default=None) + +        if not vimeo_id: +            data = self._parse_json( +                self._search_regex( +                    r'data-collection=(["\'])(?P<data>{.+?})\1', webpage, +                    'data collection', default='{}', group='data'), +                display_id, transform_source=unescapeHTML, fatal=False) +            video_id = self._extract_video_id( +                data, lesson_id) or self._search_regex( +                r'/videos/(\d+)/', thumbnail, 'video id') +            headers = { +                'Referer': url, +                'X-Requested-With': 'XMLHttpRequest', +            } +            csrf_token = self._html_search_meta( +                'csrf-token', webpage, 'csrf token', default=None) +            if csrf_token: +                headers['X-CSRF-Token'] = csrf_token +            video = self._download_json( +                'https://videos.raywenderlich.com/api/v1/videos/%s.json' +                % video_id, display_id, headers=headers)['video'] +            vimeo_id = video['clips'][0]['provider_id'] +            info.update({ +                '_type': 'url_transparent', +                'title': video.get('name'), +                'description': video.get('description') or video.get( +                    'meta_description'), +                'duration': int_or_none(video.get('duration')), +                'timestamp': unified_timestamp(video.get('created_at')), +            }) + +        return merge_dicts(info, self.url_result( +            VimeoIE._smuggle_referrer( +                'https://player.vimeo.com/video/%s' % vimeo_id, url), +            ie=VimeoIE.ie_key(), video_id=vimeo_id)) + + +class RayWenderlichCourseIE(InfoExtractor): +    _VALID_URL = r'''(?x) +                    https?:// +                        (?: +                            videos\.raywenderlich\.com/courses| +                            (?:www\.)?raywenderlich\.com +                        )/ +                        (?P<id>[^/]+) +                    ''' + +    _TEST = { +        'url': 'https://www.raywenderlich.com/3530-testing-in-ios',          'info_dict': {              'title': 'Testing in iOS', -            'id': '105-testing-in-ios', +            'id': '3530-testing-in-ios',          },          'params': {              'noplaylist': False,          },          'playlist_count': 29, -    }] +    } + +    @classmethod +    def suitable(cls, url): +        return False if RayWenderlichIE.suitable(url) else super( +            RayWenderlichCourseIE, cls).suitable(url)      def _real_extract(self, url): -        url, smuggled_data = unsmuggle_url(url, {}) +        course_id = self._match_id(url) -        mobj = re.match(self._VALID_URL, url) -        course_id, lesson_id = mobj.group('course_id', 'id') -        video_id = '%s/%s' % (course_id, lesson_id) - -        webpage = self._download_webpage(url, video_id) - -        no_playlist = self._downloader.params.get('noplaylist') -        if no_playlist or smuggled_data.get('force_video', False): -            if no_playlist: -                self.to_screen( -                    'Downloading just video %s because of --no-playlist' -                    % video_id) -            if '>Subscribe to unlock' in webpage: -                raise ExtractorError( -                    'This content is only available for subscribers', -                    expected=True) -            vimeo_id = self._search_regex( -                r'data-vimeo-id=["\'](\d+)', webpage, 'video id') -            return self.url_result( -                VimeoIE._smuggle_referrer( -                    'https://player.vimeo.com/video/%s' % vimeo_id, url), -                ie=VimeoIE.ie_key(), video_id=vimeo_id) - -        self.to_screen( -            'Downloading playlist %s - add --no-playlist to just download video' -            % course_id) - -        lesson_ids = set((lesson_id, )) -        for lesson in re.findall( -                r'(<a[^>]+\bclass=["\']lesson-link[^>]+>)', webpage): -            attrs = extract_attributes(lesson) -            if not attrs: -                continue -            lesson_url = attrs.get('href') -            if not lesson_url: -                continue -            lesson_id = self._search_regex( -                r'/lessons/(\d+)', lesson_url, 'lesson id', default=None) -            if not lesson_id: -                continue -            lesson_ids.add(lesson_id) +        webpage = self._download_webpage(url, course_id)          entries = [] -        for lesson_id in sorted(lesson_ids): +        lesson_urls = set() +        for lesson_url in re.findall( +                r'<a[^>]+\bhref=["\'](/%s/lessons/\d+)' % course_id, webpage): +            if lesson_url in lesson_urls: +                continue +            lesson_urls.add(lesson_url)              entries.append(self.url_result( -                smuggle_url(urljoin(url, lesson_id), {'force_video': True}), -                ie=RayWenderlichIE.ie_key())) +                urljoin(url, lesson_url), ie=RayWenderlichIE.ie_key())) -        title = self._search_regex( -            r'class=["\']course-title[^>]+>([^<]+)', webpage, 'course title', -            default=None) +        title = self._og_search_title( +            webpage, default=None) or self._html_search_meta( +            'twitter:title', webpage, 'title', default=None)          return self.playlist_result(entries, course_id, title) | 
