diff options
Diffstat (limited to 'youtube_dl/extractor/stitcher.py')
| -rw-r--r-- | youtube_dl/extractor/stitcher.py | 135 |
1 files changed, 99 insertions, 36 deletions
diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py index 97d1ff681..822782507 100644 --- a/youtube_dl/extractor/stitcher.py +++ b/youtube_dl/extractor/stitcher.py @@ -1,28 +1,74 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - determine_ext, + clean_html, + clean_podcast_url, + ExtractorError, int_or_none, - js_to_json, - unescapeHTML, + str_or_none, + try_get, + url_or_none, ) -class StitcherIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/(?:[^/]+/)+e/(?:(?P<display_id>[^/#?&]+?)-)?(?P<id>\d+)(?:[/#?&]|$)' +class StitcherBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/' + + def _call_api(self, path, video_id, query): + resp = self._download_json( + 'https://api.prod.stitcher.com/' + path, + video_id, query=query) + error_massage = try_get(resp, lambda x: x['errors'][0]['message']) + if error_massage: + raise ExtractorError(error_massage, expected=True) + return resp['data'] + + def _extract_description(self, data): + return clean_html(data.get('html_description') or data.get('description')) + + def _extract_audio_url(self, episode): + return url_or_none(episode.get('audio_url') or episode.get('guid')) + + def _extract_show_info(self, show): + return { + 'thumbnail': show.get('image_base_url'), + 'series': show.get('title'), + } + + def _extract_episode(self, episode, audio_url, show_info): + info = { + 'id': compat_str(episode['id']), + 'display_id': episode.get('slug'), + 'title': episode['title'].strip(), + 'description': self._extract_description(episode), + 'duration': int_or_none(episode.get('duration')), + 'url': clean_podcast_url(audio_url), + 'vcodec': 'none', + 'timestamp': int_or_none(episode.get('date_published')), + 'season_number': int_or_none(episode.get('season')), + 'season_id': str_or_none(episode.get('season_id')), + } + info.update(show_info) + return info + + +class StitcherIE(StitcherBaseIE): + _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?:[^/]+/)+e(?:pisode)?/(?:[^/#?&]+-)?(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true', - 'md5': '391dd4e021e6edeb7b8e68fbf2e9e940', + 'md5': 'e9635098e0da10b21a0e2b85585530f6', 'info_dict': { 'id': '40789481', 'ext': 'mp3', 'title': 'Machine Learning Mastery and Cancer Clusters', - 'description': 'md5:55163197a44e915a14a1ac3a1de0f2d3', + 'description': 'md5:547adb4081864be114ae3831b4c2b42f', 'duration': 1604, 'thumbnail': r're:^https?://.*\.jpg', + 'upload_date': '20151008', + 'timestamp': 1444285800, + 'series': 'Talking Machines', }, }, { 'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true', @@ -38,6 +84,7 @@ class StitcherIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Page Not Found', }, { # escaped title 'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true', @@ -45,37 +92,53 @@ class StitcherIE(InfoExtractor): }, { 'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true', 'only_matching': True, + }, { + 'url': 'https://www.stitcher.com/show/threedom/episode/circles-on-a-stick-200212584', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - audio_id = mobj.group('id') - display_id = mobj.group('display_id') or audio_id + audio_id = self._match_id(url) + data = self._call_api( + 'shows/episodes', audio_id, {'episode_ids': audio_id}) + episode = data['episodes'][0] + audio_url = self._extract_audio_url(episode) + if not audio_url: + self.raise_login_required() + show = try_get(data, lambda x: x['shows'][0], dict) or {} + return self._extract_episode( + episode, audio_url, self._extract_show_info(show)) + - webpage = self._download_webpage(url, display_id) +class StitcherShowIE(StitcherBaseIE): + _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?P<id>[^/#?&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'http://www.stitcher.com/podcast/the-talking-machines', + 'info_dict': { + 'id': 'the-talking-machines', + 'title': 'Talking Machines', + 'description': 'md5:831f0995e40f26c10231af39cf1ebf0b', + }, + 'playlist_mincount': 106, + }, { + 'url': 'https://www.stitcher.com/show/the-talking-machines', + 'only_matching': True, + }] - episode = self._parse_json( - js_to_json(self._search_regex( - r'(?s)var\s+stitcher(?:Config)?\s*=\s*({.+?});\n', webpage, 'episode config')), - display_id)['config']['episode'] + def _real_extract(self, url): + show_slug = self._match_id(url) + data = self._call_api( + 'search/show/%s/allEpisodes' % show_slug, show_slug, {'count': 10000}) + show = try_get(data, lambda x: x['shows'][0], dict) or {} + show_info = self._extract_show_info(show) - title = unescapeHTML(episode['title']) - formats = [{ - 'url': episode[episode_key], - 'ext': determine_ext(episode[episode_key]) or 'mp3', - 'vcodec': 'none', - } for episode_key in ('episodeURL',) if episode.get(episode_key)] - description = self._search_regex( - r'Episode Info:\s*</span>([^<]+)<', webpage, 'description', fatal=False) - duration = int_or_none(episode.get('duration')) - thumbnail = episode.get('episodeImage') + entries = [] + for episode in (data.get('episodes') or []): + audio_url = self._extract_audio_url(episode) + if not audio_url: + continue + entries.append(self._extract_episode(episode, audio_url, show_info)) - return { - 'id': audio_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'duration': duration, - 'thumbnail': thumbnail, - 'formats': formats, - } + return self.playlist_result( + entries, show_slug, show.get('title'), + self._extract_description(show)) |
