diff options
| -rw-r--r-- | youtube_dl/extractor/zdf.py | 111 | 
1 files changed, 40 insertions, 71 deletions
| diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 221f16686..74c76a9a0 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -1,12 +1,14 @@  # coding: utf-8  from __future__ import unicode_literals +import functools  import re  from .common import InfoExtractor  from ..utils import (      int_or_none,      unified_strdate, +    OnDemandPagedList,  ) @@ -86,28 +88,8 @@ def extract_from_xml_url(ie, video_id, xml_url):      } -def extract_channel_from_xml_url(ie, channel_id, xml_url): -    doc = ie._download_xml( -        xml_url, channel_id, -        note='Downloading channel info', -        errnote='Failed to download channel info') - -    title = doc.find('.//information/title').text -    description = doc.find('.//information/detail').text -    assets = [{'id': asset.find('./details/assetId').text, -               'type': asset.find('./type').text, -               } for asset in doc.findall('.//teasers/teaser')] - -    return { -        'id': channel_id, -        'title': title, -        'description': description, -        'assets': assets, -    } - -  class ZDFIE(InfoExtractor): -    _VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P<hash>#)?/(.*beitrag/(?:video/)?)(?P<id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?' +    _VALID_URL = r'(?:zdf:|zdf:video:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/(.*beitrag/(?:video/)?))(?P<id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?'      _TEST = {          'url': 'http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt', @@ -124,67 +106,54 @@ class ZDFIE(InfoExtractor):          'skip': 'Videos on ZDF.de are depublicised in short order',      } -    def _extract_video(self, video_id): +    def _real_extract(self, url): +        video_id = self._match_id(url)          xml_url = 'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id          return extract_from_xml_url(self, video_id, xml_url) -    def _real_extract(self, url): -        return self._extract_video(self._match_id(url)) - - -class ZDFChannelIE(ZDFIE): -    _VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P<hash>#)?/(.*kanaluebersicht/)(?P<id>[0-9]+)' +class ZDFChannelIE(InfoExtractor): +    _VALID_URL = r'(?:zdf:topic:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/.*kanaluebersicht/)(?P<id>[0-9]+)'      _TEST = {          'url': 'http://www.zdf.de/ZDFmediathek#/kanaluebersicht/1586442/sendung/Titanic',          'info_dict': {              'id': '1586442', -            'title': 'Titanic', -            'description': 'md5:444c048cfe3fdc2561be7de4bcbf1d04',          }, -        'playlist_count': 3, +        'playlist_count': 4,      } +    _PAGE_SIZE = 50 + +    def _fetch_page(self, channel_id, page): +        offset = page * self._PAGE_SIZE +        xml_url = ( +            'http://www.zdf.de/ZDFmediathek/xmlservice/web/aktuellste?ak=web&offset=%d&maxLength=%d&id=%s' +            % (offset, self._PAGE_SIZE, channel_id)) +        doc = self._download_xml( +            xml_url, channel_id, +            note='Downloading channel info', +            errnote='Failed to download channel info') + +        title = doc.find('.//information/title').text +        description = doc.find('.//information/detail').text +        for asset in doc.findall('.//teasers/teaser'): +            a_type = asset.find('./type').text +            a_id = asset.find('./details/assetId').text +            if a_type not in ('video', 'topic'): +                continue +            yield { +                '_type': 'url', +                'playlist_title': title, +                'playlist_description': description, +                'url': 'zdf:%s:%s' % (a_type, a_id), +            } + +    def _real_extract(self, url): +        channel_id = self._match_id(url) +        entries = OnDemandPagedList( +            functools.partial(self._fetch_page, channel_id), self._PAGE_SIZE) -    def _extract_channel(self, channel_id): -        def load_chunks(channel_id, chunk_length): -            offset = 0 -            while True: -                url = ('http://www.zdf.de/ZDFmediathek/xmlservice/web/aktuellste?ak=web&offset=%d&maxLength=%d&id=%s' -                       % (offset, chunk_length, channel_id)) -                result = extract_channel_from_xml_url(self, channel_id, url) -                yield result -                if len(result['assets']) < chunk_length: -                    return -                offset += chunk_length - -        def load_channel(channel_id): -            chunks = list(load_chunks(channel_id, 50))  # The server rejects higher values -            assets = [asset for chunk in chunks for asset in chunk['assets']] -            video_ids = [asset['id'] for asset in -                         filter(lambda asset: asset['type'] == 'video', -                                assets)] -            topic_ids = [asset['id'] for asset in -                         filter(lambda asset: asset['type'] == 'thema', -                                assets)] -            if topic_ids: -                video_ids = reduce(list.__add__, -                                   [load_channel(topic_id)['video_ids'] -                                    for topic_id in topic_ids], -                                   video_ids) - -            result = chunks[0] -            result['video_ids'] = video_ids -            return result - -        channel = load_channel(channel_id)          return {              '_type': 'playlist', -            'id': channel['id'], -            'title': channel['title'], -            'description': channel['description'], -            'entries': [self._extract_video(video_id) -                        for video_id in channel['video_ids']], +            'id': channel_id, +            'entries': entries,          } - -    def _real_extract(self, url): -        return self._extract_channel(self._match_id(url)) | 
