diff options
Diffstat (limited to 'youtube_dl/extractor/closertotruth.py')
-rw-r--r-- | youtube_dl/extractor/closertotruth.py | 117 |
1 files changed, 70 insertions, 47 deletions
diff --git a/youtube_dl/extractor/closertotruth.py b/youtube_dl/extractor/closertotruth.py index d04ff5e4f..26243d52d 100644 --- a/youtube_dl/extractor/closertotruth.py +++ b/youtube_dl/extractor/closertotruth.py @@ -1,69 +1,92 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor class CloserToTruthIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(episodes/|(series|interviews)/(?:[^#]+#video-)?(?P<id>\d+))' - _TESTS = [ - { - 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688', - 'md5': '5c548bde260a9247ddfdc07c7458ed29', - 'info_dict': { - 'id': '0_zof1ktre', - 'ext': 'mov', - 'title': 'Solutions to the Mind-Body Problem?', - 'upload_date': '20140221', - 'timestamp': 1392956007, - 'uploader_id': 'CTTXML' - } + _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688', + 'info_dict': { + 'id': '0_zof1ktre', + 'display_id': 'solutions-the-mind-body-problem', + 'ext': 'mov', + 'title': 'Solutions to the Mind-Body Problem?', + 'upload_date': '20140221', + 'timestamp': 1392956007, + 'uploader_id': 'CTTXML' + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://closertotruth.com/episodes/how-do-brains-work', + 'info_dict': { + 'id': '0_iuxai6g6', + 'display_id': 'how-do-brains-work', + 'ext': 'mov', + 'title': 'How do Brains Work?', + 'upload_date': '20140221', + 'timestamp': 1392956024, + 'uploader_id': 'CTTXML' }, - { - 'url': 'http://closertotruth.com/interviews/1725', - 'md5': 'b00598fd6a38372edb976408f72c5792', - 'info_dict': { - 'id': '0_19qv5rn1', - 'ext': 'mov', - 'title': 'AyaFr-002 - Francisco J. Ayala', - 'upload_date': '20140307', - 'timestamp': 1394236431, - 'uploader_id': 'CTTXML' - } + 'params': { + 'skip_download': True, }, - { - 'url': 'http://closertotruth.com/episodes/how-do-brains-work', - 'md5': '4dd96aa0a5c296afa5c0bd24895c2f16', - 'info_dict': { - 'id': '0_iuxai6g6', - 'ext': 'mov', - 'title': 'How do Brains Work?', - 'upload_date': '20140221', - 'timestamp': 1392956024, - 'uploader_id': 'CTTXML' - } + }, { + 'url': 'http://closertotruth.com/interviews/1725', + 'info_dict': { + 'id': '1725', + 'title': 'AyaFr-002', }, - ] + 'playlist_mincount': 2, + }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) - video_title = self._search_regex(r'<title>(.+) \|.+</title>', webpage, 'video title') + webpage = self._download_webpage(url, display_id) - entry_id = self._search_regex(r'<a[^>]+id="(?:video-%s|embed-kaltura)"[^>]+data-kaltura="([^"]+)' % video_id, webpage, "video entry_id") + partner_id = self._search_regex( + r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)', + webpage, 'kaltura partner_id') - interviewee_name = self._search_regex(r'<div id="(?:node_interview_full_group_white_wrapper|node_interview_series_full_group_ajax_content)"(?:.|\n)*<h3>(.*)</h3>.+', webpage, "video interviewee_name", False) + title = self._search_regex( + r'<title>(.+?)\s*\|\s*.+?</title>', webpage, 'video title') - if interviewee_name: - video_title = video_title + ' - ' + interviewee_name + select = self._search_regex( + r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>', + webpage, 'select version', default=None) + if select: + entry_ids = set() + entries = [] + for mobj in re.finditer( + r'<option[^>]+value=(["\'])(?P<id>[0-9a-z_]+)(?:#.+?)?\1[^>]*>(?P<title>[^<]+)', + webpage): + entry_id = mobj.group('id') + if entry_id in entry_ids: + continue + entry_ids.add(entry_id) + entries.append({ + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, entry_id), + 'ie_key': 'Kaltura', + 'title': mobj.group('title'), + }) + if entries: + return self.playlist_result(entries, display_id, title) - p_id = self._search_regex(r'<script[^>]+src=["\'].+?partner_id/(\d+)', webpage, "kaltura partner_id") + entry_id = self._search_regex( + r'<a[^>]+id=(["\'])embed-kaltura\1[^>]+data-kaltura=(["\'])(?P<id>[0-9a-z_]+)\2', + webpage, 'kaltura entry_id', group='id') return { '_type': 'url_transparent', - 'id': entry_id, - 'url': 'kaltura:%s:%s' % (p_id, entry_id), + 'display_id': display_id, + 'url': 'kaltura:%s:%s' % (partner_id, entry_id), 'ie_key': 'Kaltura', - 'title': video_title + 'title': title } |