diff options
Diffstat (limited to 'youtube_dl/extractor/lecturio.py')
| -rw-r--r-- | youtube_dl/extractor/lecturio.py | 186 | 
1 files changed, 186 insertions, 0 deletions
# coding: utf-8
# Reconstructed from the diff that adds youtube_dl/extractor/lecturio.py
# (new file mode 100644, index 000000000..62ff28e02).
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
    determine_ext,
    extract_attributes,
    ExtractorError,
    float_or_none,
    int_or_none,
    str_or_none,
    url_or_none,
    urlencode_postdata,
    urljoin,
)


class LecturioBaseIE(InfoExtractor):
    """Base class for the lecturio extractors; holds the shared login flow."""

    _LOGIN_URL = 'https://app.lecturio.com/en/login'
    _NETRC_MACHINE = 'lecturio'

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def _login(self):
        """Submit the login form with the configured credentials.

        Returns silently when no username is available, when the login page
        request ends up somewhere other than the login URL (treated as
        "already logged in"), or when the form submission does so.

        Raises:
            ExtractorError: if the response to the form submission is still
                served from the login URL. The message includes the page's
                error list when one is found (and the error is then marked
                as expected).
        """
        username, password = self._get_login_info()
        if username is None:
            return

        # Sets some cookies
        _, urlh = self._download_webpage_handle(
            self._LOGIN_URL, None, 'Downloading login popup')

        def is_logged(url_handle):
            # Not ending up on the login URL is taken as proof of a session.
            return self._LOGIN_URL not in compat_str(url_handle.geturl())

        # Already logged in
        if is_logged(urlh):
            return

        login_form = {
            'signin[email]': username,
            'signin[password]': password,
            'signin[remember]': 'on',
        }

        response, urlh = self._download_webpage_handle(
            self._LOGIN_URL, None, 'Logging in',
            data=urlencode_postdata(login_form))

        # Logged in successfully
        if is_logged(urlh):
            return

        errors = self._html_search_regex(
            r'(?s)<ul[^>]+class=["\']error_list[^>]+>(.+?)</ul>', response,
            'errors', default=None)
        if errors:
            raise ExtractorError('Unable to login: %s' % errors, expected=True)
        raise ExtractorError('Unable to log in')


class LecturioIE(LecturioBaseIE):
    """Extractor for a single lecture (``*.lecture`` URLs)."""

    _VALID_URL = r'https://app\.lecturio\.com/[^/]+/(?P<id>[^/?#&]+)\.lecture'
    _TEST = {
        'url': 'https://app.lecturio.com/medical-courses/important-concepts-and-terms-introduction-to-microbiology.lecture#tab/videos',
        'md5': 'f576a797a5b7a5e4e4bbdfc25a6a6870',
        'info_dict': {
            'id': '39634',
            'ext': 'mp4',
            'title': 'Important Concepts and Terms – Introduction to Microbiology',
        },
        'skip': 'Requires lecturio account credentials',
    }

    # Language names mapped to two-letter codes; used to normalise the
    # language key of each subtitle track (unknown values pass through).
    _CC_LANGS = {
        'German': 'de',
        'English': 'en',
        'Spanish': 'es',
        'French': 'fr',
        'Polish': 'pl',
        'Russian': 'ru',
    }

    def _real_extract(self, url):
        """Build the info dict for the lecture at ``url``.

        Downloads the lecture's player page, reads the numeric lecture id
        and the lecture data link from it, fetches that JSON and collects
        the media formats. Subtitle tracks are read from the player page's
        ``subtitleUrls`` object.

        Returns:
            dict with ``id``, ``title``, ``formats``, ``subtitles`` and
            ``automatic_captions``.
        """
        display_id = self._match_id(url)

        webpage = self._download_webpage(
            'https://app.lecturio.com/en/lecture/%s/player.html' % display_id,
            display_id)

        lecture_id = self._search_regex(
            r'lecture_id\s*=\s*(?:L_)?(\d+)', webpage, 'lecture id')

        api_url = self._search_regex(
            r'lectureDataLink\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
            'api url', group='url')

        video = self._download_json(api_url, display_id)

        title = video['title'].strip()

        formats = []
        for format_ in video['content']['media']:
            if not isinstance(format_, dict):
                continue
            file_ = format_.get('file')
            if not file_:
                continue
            ext = determine_ext(file_)
            if ext == 'smil':
                # smil contains only broken RTMP formats anyway
                continue
            file_url = url_or_none(file_)
            if not file_url:
                continue
            label = str_or_none(format_.get('label'))
            filesize = int_or_none(format_.get('fileSize'))
            formats.append({
                'url': file_url,
                'format_id': label,
                # NOTE(review): the x1000 scaling presumably converts a
                # kilo-unit size to bytes - confirm against the API.
                'filesize': float_or_none(filesize, invscale=1000)
            })
        self._sort_formats(formats)

        subtitles = {}
        automatic_captions = {}
        # With fatal=False a malformed blob makes _parse_json return None;
        # fall back to an empty dict so the extraction continues without
        # subtitles instead of failing on None.items().
        cc = self._parse_json(
            self._search_regex(
                r'subtitleUrls\s*:\s*({.+?})\s*,', webpage, 'subtitles',
                default='{}'), display_id, fatal=False) or {}
        for cc_label, cc_url in cc.items():
            cc_url = url_or_none(cc_url)
            if not cc_url:
                continue
            # Tracks labelled as auto-translated are reported separately.
            sub_dict = automatic_captions if 'auto-translated' in cc_label else subtitles
            # Prefer the two-letter code found in the URL; otherwise use the
            # first word of the label (mapped through _CC_LANGS below).
            lang = self._search_regex(
                r'/([a-z]{2})_', cc_url, 'lang', default=cc_label.split()[0])
            sub_dict.setdefault(self._CC_LANGS.get(lang, lang), []).append({
                'url': cc_url,
            })

        return {
            'id': lecture_id,
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'automatic_captions': automatic_captions,
        }


class LecturioCourseIE(LecturioBaseIE):
    """Extractor for a course page (``*.course`` URLs), returned as a playlist."""

    _VALID_URL = r'https://app\.lecturio\.com/[^/]+/(?P<id>[^/?#&]+)\.course'
    _TEST = {
        'url': 'https://app.lecturio.com/medical-courses/microbiology-introduction.course#/',
        'info_dict': {
            'id': 'microbiology-introduction',
            'title': 'Microbiology: Introduction',
        },
        'playlist_count': 45,
        'skip': 'Requires lecturio account credentials',
    }

    def _real_extract(self, url):
        """Collect every lecture linked from the course page into a playlist.

        Each tag carrying a ``data-url`` attribute that points at a
        ``.lecture`` page becomes one entry delegated to ``LecturioIE``.
        The playlist title is taken from the ``content-title`` span when
        present.
        """
        display_id = self._match_id(url)

        webpage = self._download_webpage(url, display_id)

        entries = []
        for mobj in re.finditer(
                r'(?s)<[^>]+\bdata-url=(["\'])(?:(?!\1).)+\.lecture\b[^>]+>',
                webpage):
            params = extract_attributes(mobj.group(0))
            # data-url may be relative, so resolve it against the course URL.
            lecture_url = urljoin(url, params.get('data-url'))
            lecture_id = params.get('data-id')
            entries.append(self.url_result(
                lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id))

        title = self._search_regex(
            r'<span[^>]+class=["\']content-title[^>]+>([^<]+)', webpage,
            'title', default=None)

        return self.playlist_result(entries, display_id, title)
