diff options
Diffstat (limited to 'youtube_dl/extractor/teamtreehouse.py')
| -rw-r--r-- | youtube_dl/extractor/teamtreehouse.py | 140 | 
1 files changed, 140 insertions, 0 deletions
| diff --git a/youtube_dl/extractor/teamtreehouse.py b/youtube_dl/extractor/teamtreehouse.py new file mode 100644 index 000000000..d347e97ef --- /dev/null +++ b/youtube_dl/extractor/teamtreehouse.py @@ -0,0 +1,140 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    clean_html, +    determine_ext, +    ExtractorError, +    float_or_none, +    get_element_by_class, +    get_element_by_id, +    parse_duration, +    remove_end, +    urlencode_postdata, +    urljoin, +) + + +class TeamTreeHouseIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?teamtreehouse\.com/library/(?P<id>[^/]+)' +    _TESTS = [{ +        # Course +        'url': 'https://teamtreehouse.com/library/introduction-to-user-authentication-in-php', +        'info_dict': { +            'id': 'introduction-to-user-authentication-in-php', +            'title': 'Introduction to User Authentication in PHP', +            'description': 'md5:405d7b4287a159b27ddf30ca72b5b053', +        }, +        'playlist_mincount': 24, +    }, { +        # WorkShop +        'url': 'https://teamtreehouse.com/library/deploying-a-react-app', +        'info_dict': { +            'id': 'deploying-a-react-app', +            'title': 'Deploying a React App', +            'description': 'md5:10a82e3ddff18c14ac13581c9b8e5921', +        }, +        'playlist_mincount': 4, +    }, { +        # Video +        'url': 'https://teamtreehouse.com/library/application-overview-2', +        'info_dict': { +            'id': 'application-overview-2', +            'ext': 'mp4', +            'title': 'Application Overview', +            'description': 'md5:4b0a234385c27140a4378de5f1e15127', +        }, +        'expected_warnings': ['This is just a preview'], +    }] +    _NETRC_MACHINE = 'teamtreehouse' + +    def _real_initialize(self): +        email, password = self._get_login_info() +        if email is None: +            return + +        signin_page = self._download_webpage( +            'https://teamtreehouse.com/signin', +            None, 'Downloading signin page') +        data = self._form_hidden_inputs('new_user_session', signin_page) +        data.update({ +            'user_session[email]': email, +            'user_session[password]': password, +        }) +        error_message = get_element_by_class('error-message', self._download_webpage( +            'https://teamtreehouse.com/person_session', +            None, 'Logging in', data=urlencode_postdata(data))) +        if error_message: +            raise ExtractorError(clean_html(error_message), expected=True) + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) +        title = self._html_search_meta(['og:title', 'twitter:title'], webpage) +        description = self._html_search_meta( +            ['description', 'og:description', 'twitter:description'], webpage) +        entries = self._parse_html5_media_entries(url, webpage, display_id) +        if entries: +            info = entries[0] + +            for subtitles in info.get('subtitles', {}).values(): +                for subtitle in subtitles: +                    subtitle['ext'] = determine_ext(subtitle['url'], 'srt') + +            is_preview = 'data-preview="true"' in webpage +            if is_preview: +                self.report_warning( +                    'This is just a preview. You need to be signed in with a Basic account to download the entire video.', display_id) +                duration = 30 +            else: +                duration = float_or_none(self._search_regex( +                    r'data-duration="(\d+)"', webpage, 'duration'), 1000) +                if not duration: +                    duration = parse_duration(get_element_by_id( +                        'video-duration', webpage)) + +            info.update({ +                'id': display_id, +                'title': title, +                'description': description, +                'duration': duration, +            }) +            return info +        else: +            def extract_urls(html, extract_info=None): +                for path in re.findall(r'<a[^>]+href="([^"]+)"', html): +                    page_url = urljoin(url, path) +                    entry = { +                        '_type': 'url_transparent', +                        'id': self._match_id(page_url), +                        'url': page_url, +                        'id_key': self.ie_key(), +                    } +                    if extract_info: +                        entry.update(extract_info) +                    entries.append(entry) + +            workshop_videos = self._search_regex( +                r'(?s)<ul[^>]+id="workshop-videos"[^>]*>(.+?)</ul>', +                webpage, 'workshop videos', default=None) +            if workshop_videos: +                extract_urls(workshop_videos) +            else: +                stages_path = self._search_regex( +                    r'(?s)<div[^>]+id="syllabus-stages"[^>]+data-url="([^"]+)"', +                    webpage, 'stages path') +                if stages_path: +                    stages_page = self._download_webpage( +                        urljoin(url, stages_path), display_id, 'Downloading stages page') +                    for chapter_number, (chapter, steps_list) in enumerate(re.findall(r'(?s)<h2[^>]*>\s*(.+?)\s*</h2>.+?<ul[^>]*>(.+?)</ul>', stages_page), 1): +                        extract_urls(steps_list, { +                            'chapter': chapter, +                            'chapter_number': chapter_number, +                        }) +                    title = remove_end(title, ' Course') + +            return self.playlist_result( +                entries, display_id, title, description) | 
