diff options
author | Sergey M․ <dstftw@gmail.com> | 2015-05-10 18:29:15 +0600 |
---|---|---|
committer | Sergey M․ <dstftw@gmail.com> | 2015-05-10 18:29:15 +0600 |
commit | a6762c4a22325b5b69770de82df8725d2eb5c3df (patch) | |
tree | 60a928a907d44a49c1d2c5cec077ee5c1806d327 | |
parent | 98c2c0febc7e686278ceaadc73eb40fee1b46752 (diff) |
[voicerepublic] Make more robust and extract more metadata
-rw-r--r-- | youtube_dl/extractor/voicerepublic.py | 95 |
1 file changed, 71 insertions, 24 deletions
```diff
diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py
index a3e40b940..1106c655b 100644
--- a/youtube_dl/extractor/voicerepublic.py
+++ b/youtube_dl/extractor/voicerepublic.py
@@ -1,52 +1,99 @@
-# coding: utf-8
 from __future__ import unicode_literals
 
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_request
-from ..utils import ExtractorError
+from ..compat import (
+    compat_urllib_request,
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    determine_ext,
+    int_or_none,
+)
 
 
 class VoiceRepublicIE(InfoExtractor):
-    _VALID_URL = r'https?://voicerepublic\.com/talks/(?P<id>[0-9a-z-]+)'
-    _TEST = {
-        'url': 'https://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state',
+    _VALID_URL = r'https?://voicerepublic\.com/(?:talks|embed)/(?P<id>[0-9a-z-]+)'
+    _TESTS = [{
+        'url': 'http://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state',
         'md5': '0554a24d1657915aa8e8f84e15dc9353',
         'info_dict': {
             'id': '2296',
+            'display_id': 'watching-the-watchers-building-a-sousveillance-state',
             'ext': 'm4a',
             'title': 'Watching the Watchers: Building a Sousveillance State',
-            'thumbnail': 'https://voicerepublic.com/system/flyer/2296.png',
             'description': 'md5:715ba964958afa2398df615809cfecb1',
+            'thumbnail': 're:^https?://.*\.(?:png|jpg)$',
+            'duration': 1800,
+            'view_count': int,
         }
-    }
+    }, {
+        'url': 'http://voicerepublic.com/embed/watching-the-watchers-building-a-sousveillance-state',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
-        req = compat_urllib_request.Request(url)
+
+        req = compat_urllib_request.Request(
+            compat_urlparse.urljoin(url, '/talks/%s' % display_id))
         # Older versions of Firefox get redirected to an "upgrade browser" page
         req.add_header('User-Agent', 'youtube-dl')
         webpage = self._download_webpage(req, display_id)
-        thumbnail = self._og_search_thumbnail(webpage)
-        video_id = self._search_regex(r'/(\d+)\.png', thumbnail, 'id')
 
-        if '<a>Queued for processing, please stand by...</a>' in webpage:
-            raise ExtractorError('Audio is still queued for processing')
+        if '>Queued for processing, please stand by...<' in webpage:
+            raise ExtractorError(
+                'Audio is still queued for processing', expected=True)
 
-        formats = [{
-            'url': 'https://voicerepublic.com' + path,
-            'ext': ext,
-            'format_id': ext,
-            'vcodec': 'none',
-        } for ext, path in re.findall(r"data-([^=]+)='(/[^']+\.\1)'", webpage)]
+        data = self._parse_json(
+            self._search_regex(
+                r'(?s)return ({.+?});\s*\n', webpage,
+                'data', default=None),
+            display_id, fatal=False)
+
+        if data:
+            title = data['title']
+            description = data.get('teaser')
+            talk_id = data.get('talk_id') or display_id
+            talk = data['talk']
+            duration = int_or_none(talk.get('duration'))
+            formats = [{
+                'url': compat_urlparse.urljoin(url, talk_url),
+                'format_id': format_id,
+                'ext': determine_ext(talk_url) or format_id,
+                'vcodec': 'none',
+            } for format_id, talk_url in talk['links'].items()]
+        else:
+            title = self._og_search_title(webpage)
+            description = self._html_search_regex(
+                r"(?s)<div class='talk-teaser'[^>]*>(.+?)</div>",
+                webpage, 'description', fatal=False)
+            talk_id = self._search_regex(
+                [r"id='jc-(\d+)'", r"data-shareable-id='(\d+)'"],
+                webpage, 'talk id', default=None) or display_id
+            duration = None
+            formats = [{
+                'url': compat_urlparse.urljoin(url, talk_url),
+                'format_id': format_id,
+                'ext': determine_ext(talk_url) or format_id,
+                'vcodec': 'none',
+            } for format_id, talk_url in re.findall(r"data-([^=]+)='([^']+)'", webpage)]
         self._sort_formats(formats)
 
+        thumbnail = self._og_search_thumbnail(webpage)
+        view_count = int_or_none(self._search_regex(
+            r"class='play-count[^']*'>\s*(\d+) plays",
+            webpage, 'play count', fatal=False))
+
         return {
-            'id': video_id,
-            'title': self._og_search_title(webpage),
-            'formats': formats,
-            'url': self._og_search_url(webpage),
+            'id': talk_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
             'thumbnail': thumbnail,
-            'description': self._og_search_description(webpage),
+            'duration': duration,
+            'view_count': view_count,
+            'formats': formats,
         }
```