diff options
author | Philipp Hagemeister <phihag@phihag.de> | 2014-10-27 01:33:49 +0100 |
---|---|---|
committer | Philipp Hagemeister <phihag@phihag.de> | 2014-10-27 01:33:51 +0100 |
commit | ecfe623422f61df886b83e6eae98a0e4a297802e (patch) | |
tree | 9a01cac9a8427a8b0cb12320acfe1af8bc45bd08 /youtube_dl/extractor | |
parent | 4a6c94288ae42402f4a61a924318cedc5d1bb511 (diff) |
[heise] Fix extraction
Now they use an XML format instead of JSON.
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/heise.py | 48 |
1 file changed, 21 insertions, 27 deletions
diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index f97b1e085..d41c0413f 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( get_meta_content, + int_or_none, parse_iso8601, ) @@ -28,20 +29,26 @@ class HeiseIE(InfoExtractor): 'timestamp': 1411812600, 'upload_date': '20140927', 'description': 'In uplink-Episode 3.3 geht es darum, wie man sich von Cloud-Anbietern emanzipieren kann, worauf man beim Kauf einer Tastatur achten sollte und was Smartphones über uns verraten.', + 'thumbnail': 're:https?://.*\.jpg$', } } def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - json_url = self._search_regex( - r'json_url:\s*"([^"]+)"', webpage, 'json URL') - config = self._download_json(json_url, video_id) + + container_id = self._search_regex( + r'<div class="videoplayerjw".*?data-container="([0-9]+)"', + webpage, 'container ID') + sequenz_id = self._search_regex( + r'<div class="videoplayerjw".*?data-sequenz="([0-9]+)"', + webpage, 'sequenz ID') + data_url = 'http://www.heise.de/videout/feed?container=%s&sequenz=%s' % (container_id, sequenz_id) + doc = self._download_xml(data_url, video_id) info = { 'id': video_id, - 'thumbnail': config.get('poster'), + 'thumbnail': self._og_search_thumbnail(webpage), 'timestamp': parse_iso8601(get_meta_content('date', webpage)), 'description': self._og_search_description(webpage), } @@ -49,32 +56,19 @@ class HeiseIE(InfoExtractor): title = get_meta_content('fulltitle', webpage) if title: info['title'] = title - elif config.get('title'): - info['title'] = config['title'] else: info['title'] = self._og_search_title(webpage) formats = [] - for t, rs in config['formats'].items(): - if not rs or not hasattr(rs, 'items'): - self._downloader.report_warning( - 'formats: {0}: no resolutions'.format(t)) - continue - - for 
height_str, obj in rs.items(): - format_id = '{0}_{1}'.format(t, height_str) - - if not obj or not obj.get('url'): - self._downloader.report_warning( - 'formats: {0}: no url'.format(format_id)) - continue - - formats.append({ - 'url': obj['url'], - 'format_id': format_id, - 'height': self._int(height_str, 'height'), - }) - + for source_node in doc.findall('.//{http://rss.jwpcdn.com/}source'): + label = source_node.attrib['label'] + height = int_or_none(self._search_regex( + r'^(.*?_)?([0-9]+)p$', label, 'height', default=None)) + formats.append({ + 'url': source_node.attrib['file'], + 'format_note': label, + 'height': height, + }) self._sort_formats(formats) info['formats'] = formats |