diff options
Diffstat (limited to 'youtube_dl/extractor/ard.py')
| -rw-r--r-- | youtube_dl/extractor/ard.py | 91 | 
1 files changed, 60 insertions, 31 deletions
| diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index dbf8eed99..b88f71bc4 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -1,22 +1,28 @@ +# coding: utf-8 +from __future__ import unicode_literals +  import re  from .common import InfoExtractor  from ..utils import ( +    determine_ext,      ExtractorError,  ) +  class ARDIE(InfoExtractor): -    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?' -    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>' -    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)' +    _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?' +      _TEST = { -        u'url': u'http://www.ardmediathek.de/das-erste/tagesschau-in-100-sek?documentId=14077640', -        u'file': u'14077640.mp4', -        u'md5': u'6ca8824255460c787376353f9e20bbd8', -        u'info_dict': { -            u"title": u"11.04.2013 09:23 Uhr - Tagesschau in 100 Sekunden" +        'url': 'http://www.ardmediathek.de/das-erste/guenther-jauch/edward-snowden-im-interview-held-oder-verraeter?documentId=19288786', +        'file': '19288786.mp4', +        'md5': '515bf47ce209fb3f5a61b7aad364634c', +        'info_dict': { +            'title': 'Edward Snowden im Interview - Held oder Verräter?', +            'description': 'Edward Snowden hat alles aufs Spiel gesetzt, um die weltweite \xdcberwachung durch die Geheimdienste zu enttarnen. Nun stellt sich der ehemalige NSA-Mitarbeiter erstmals weltweit in einem TV-Interview den Fragen eines NDR-Journalisten. Die Sendung vom Sonntagabend.', +            'thumbnail': 'http://www.ardmediathek.de/ard/servlet/contentblob/19/28/87/90/19288790/bild/2250037',          }, -        u'skip': u'Requires rtmpdump' +        'skip': 'Blocked outside of Germany',      }      def _real_extract(self, url): @@ -29,26 +35,49 @@ class ARDIE(InfoExtractor):          else:              video_id = m.group('video_id') -        # determine title and media streams from webpage -        html = self._download_webpage(url, video_id) -        title = re.search(self._TITLE, html).group('title') -        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)] +        webpage = self._download_webpage(url, video_id) + +        title = self._html_search_regex( +            r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>', webpage, 'title') +        description = self._html_search_meta( +            'dcterms.abstract', webpage, 'description') +        thumbnail = self._og_search_thumbnail(webpage) + +        streams = [ +            mo.groupdict() +            for mo in re.finditer( +                r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)', webpage)]          if not streams: -            assert '"fsk"' in html -            raise ExtractorError(u'This video is only available after 8:00 pm') - -        # choose default media type and highest quality for now -        stream = max([s for s in streams if int(s["media_type"]) == 0], -                     key=lambda s: int(s["quality"])) - -        # there's two possibilities: RTMP stream or HTTP download -        info = {'id': video_id, 'title': title, 'ext': 'mp4'} -        if stream['rtmp_url']: -            self.to_screen(u'RTMP download detected') -            assert stream['video_url'].startswith('mp4:') -            info["url"] = stream["rtmp_url"] -            info["play_path"] = stream['video_url'] -        else: -            assert stream["video_url"].endswith('.mp4') -            info["url"] = stream["video_url"] -        return [info] +            if '"fsk"' in webpage: +                raise ExtractorError('This video is only available after 20:00') + +        formats = [] +        for s in streams: +            format = { +                'quality': int(s['quality']), +            } +            if s.get('rtmp_url'): +                format['protocol'] = 'rtmp' +                format['url'] = s['rtmp_url'] +                format['playpath'] = s['video_url'] +            else: +                format['url'] = s['video_url'] + +            quality_name = self._search_regex( +                r'[,.]([a-zA-Z0-9_-]+),?\.mp4', format['url'], +                'quality name', default='NA') +            format['format_id'] = '%s-%s-%s-%s' % ( +                determine_ext(format['url']), quality_name, s['media_type'], +                s['quality']) + +            formats.append(format) + +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'formats': formats, +            'thumbnail': thumbnail, +        } | 
