diff options
Diffstat (limited to 'youtube_dl/extractor/linuxacademy.py')
| -rw-r--r-- | youtube_dl/extractor/linuxacademy.py | 174 | 
1 files changed, 174 insertions, 0 deletions
| diff --git a/youtube_dl/extractor/linuxacademy.py b/youtube_dl/extractor/linuxacademy.py new file mode 100644 index 000000000..a78c6556e --- /dev/null +++ b/youtube_dl/extractor/linuxacademy.py @@ -0,0 +1,174 @@ +from __future__ import unicode_literals + +import json +import random +import re + +from .common import InfoExtractor +from ..compat import ( +    compat_b64decode, +    compat_HTTPError, +    compat_str, +) +from ..utils import ( +    ExtractorError, +    orderedSet, +    unescapeHTML, +    urlencode_postdata, +    urljoin, +) + + +class LinuxAcademyIE(InfoExtractor): +    _VALID_URL = r'''(?x) +                    https?:// +                        (?:www\.)?linuxacademy\.com/cp/ +                        (?: +                            courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)| +                            modules/view/id/(?P<course_id>\d+) +                        ) +                    ''' +    _TESTS = [{ +        'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2/module/154', +        'info_dict': { +            'id': '1498-2', +            'ext': 'mp4', +            'title': "Introduction to the Practitioner's Brief", +        }, +        'params': { +            'skip_download': True, +        }, +        'skip': 'Requires Linux Academy account credentials', +    }, { +        'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2', +        'only_matching': True, +    }, { +        'url': 'https://linuxacademy.com/cp/modules/view/id/154', +        'info_dict': { +            'id': '154', +            'title': 'AWS Certified Cloud Practitioner', +            'description': 'md5:039db7e60e4aac9cf43630e0a75fa834', +        }, +        'playlist_count': 41, +        'skip': 'Requires Linux Academy account credentials', +    }] + +    _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize' +    _ORIGIN_URL = 'https://linuxacademy.com' +    _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx' +    _NETRC_MACHINE = 'linuxacademy' + +    def _real_initialize(self): +        self._login() + +    def _login(self): +        username, password = self._get_login_info() +        if username is None: +            return + +        def random_string(): +            return ''.join([ +                random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~') +                for _ in range(32)]) + +        webpage, urlh = self._download_webpage_handle( +            self._AUTHORIZE_URL, None, 'Downloading authorize page', query={ +                'client_id': self._CLIENT_ID, +                'response_type': 'token id_token', +                'redirect_uri': self._ORIGIN_URL, +                'scope': 'openid email user_impersonation profile', +                'audience': self._ORIGIN_URL, +                'state': random_string(), +                'nonce': random_string(), +            }) + +        login_data = self._parse_json( +            self._search_regex( +                r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, +                'login info', group='value'), None, +            transform_source=lambda x: compat_b64decode(x).decode('utf-8') +        )['extraParams'] + +        login_data.update({ +            'client_id': self._CLIENT_ID, +            'redirect_uri': self._ORIGIN_URL, +            'tenant': 'lacausers', +            'connection': 'Username-Password-Authentication', +            'username': username, +            'password': password, +            'sso': 'true', +        }) + +        login_state_url = compat_str(urlh.geturl()) + +        try: +            login_page = self._download_webpage( +                'https://login.linuxacademy.com/usernamepassword/login', None, +                'Downloading login page', data=json.dumps(login_data).encode(), +                headers={ +                    'Content-Type': 'application/json', +                    'Origin': 'https://login.linuxacademy.com', +                    'Referer': login_state_url, +                }) +        except ExtractorError as e: +            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: +                error = self._parse_json(e.cause.read(), None) +                message = error.get('description') or error['code'] +                raise ExtractorError( +                    '%s said: %s' % (self.IE_NAME, message), expected=True) +            raise + +        callback_page, urlh = self._download_webpage_handle( +            'https://login.linuxacademy.com/login/callback', None, +            'Downloading callback page', +            data=urlencode_postdata(self._hidden_inputs(login_page)), +            headers={ +                'Content-Type': 'application/x-www-form-urlencoded', +                'Origin': 'https://login.linuxacademy.com', +                'Referer': login_state_url, +            }) + +        access_token = self._search_regex( +            r'access_token=([^=&]+)', compat_str(urlh.geturl()), +            'access token') + +        self._download_webpage( +            'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s' +            % access_token, None, 'Downloading token validation page') + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id') +        item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id) + +        webpage = self._download_webpage(url, item_id) + +        # course path +        if course_id: +            entries = [ +                self.url_result( +                    urljoin(url, lesson_url), ie=LinuxAcademyIE.ie_key()) +                for lesson_url in orderedSet(re.findall( +                    r'<a[^>]+\bhref=["\'](/cp/courses/lesson/course/\d+/lesson/\d+/module/\d+)', +                    webpage))] +            title = unescapeHTML(self._html_search_regex( +                (r'class=["\']course-title["\'][^>]*>(?P<value>[^<]+)', +                 r'var\s+title\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), +                webpage, 'title', default=None, group='value')) +            description = unescapeHTML(self._html_search_regex( +                r'var\s+description\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', +                webpage, 'description', default=None, group='value')) +            return self.playlist_result(entries, course_id, title, description) + +        # single video path +        info = self._extract_jwplayer_data( +            webpage, item_id, require_title=False, m3u8_id='hls',) +        title = self._search_regex( +            (r'>Lecture\s*:\s*(?P<value>[^<]+)', +             r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, +            'title', group='value') +        info.update({ +            'id': item_id, +            'title': title, +        }) +        return info | 
