diff options
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/kika.py | 101 | ||||
| -rw-r--r-- | youtube_dl/extractor/mdr.py | 174 | 
3 files changed, 132 insertions, 144 deletions
| diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5ad4e9c36..f98e6487e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -274,7 +274,6 @@ from .karrierevideos import KarriereVideosIE  from .keezmovies import KeezMoviesIE  from .khanacademy import KhanAcademyIE  from .kickstarter import KickStarterIE -from .kika import KikaIE  from .keek import KeekIE  from .kontrtube import KontrTubeIE  from .krasview import KrasViewIE diff --git a/youtube_dl/extractor/kika.py b/youtube_dl/extractor/kika.py deleted file mode 100644 index 5337ac439..000000000 --- a/youtube_dl/extractor/kika.py +++ /dev/null @@ -1,101 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ExtractorError - - -class KikaIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?kika\.de/(?:[a-z-]+/)*(?:video|(?:einzel)?sendung)(?P<id>\d+).*' - -    _TESTS = [ -        { -            'url': 'http://www.kika.de/baumhaus/videos/video19636.html', -            'md5': '4930515e36b06c111213e80d1e4aad0e', -            'info_dict': { -                'id': '19636', -                'ext': 'mp4', -                'title': 'Baumhaus vom 30. Oktober 2015', -                'description': None, -            }, -        }, -        { -            'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', -            'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', -            'info_dict': { -                'id': '8182', -                'ext': 'mp4', -                'title': 'Beutolomäus und der geheime Weihnachtswunsch', -                'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', -            }, -        }, -        { -            'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', -            'md5': '4930515e36b06c111213e80d1e4aad0e', -            'info_dict': { -                'id': '19636', -                'ext': 'mp4', -                'title': 'Baumhaus vom 30. Oktober 2015', -                'description': None, -            }, -        }, -        { -            'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', -            'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', -            'info_dict': { -                'id': '8182', -                'ext': 'mp4', -                'title': 'Beutolomäus und der geheime Weihnachtswunsch', -                'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', -            }, -        }, -    ] - -    def _real_extract(self, url): -        # broadcast_id may be the same as the video_id -        broadcast_id = self._match_id(url) -        webpage = self._download_webpage(url, broadcast_id) - -        xml_re = r'sectionArticle[ "](?:(?!sectionA[ "])(?:.|\n))*?dataURL:\'(?:/[a-z-]+?)*?/video(\d+)-avCustom\.xml' -        video_id = self._search_regex(xml_re, webpage, "xml_url", default=None) -        if not video_id: -            err_msg = 'Video %s is not available online' % broadcast_id -            raise ExtractorError(err_msg, expected=True) - -        xml_url = 'http://www.kika.de/video%s-avCustom.xml' % (video_id) -        xml_tree = self._download_xml(xml_url, video_id) - -        title = xml_tree.find('title').text -        webpage_url = xml_tree.find('htmlUrl').text - -        # Try to get the description, not available for all videos -        try: -            broadcast_elem = xml_tree.find('broadcast') -            description = broadcast_elem.find('broadcastDescription').text -        except AttributeError: -            description = None - -        # duration string format is mm:ss (even if it is >= 1 hour, e.g. 78:42) -        tmp = xml_tree.find('duration').text.split(':') -        duration = int(tmp[0]) * 60 + int(tmp[1]) - -        formats = [{ -            'url': elem.find('progressiveDownloadUrl').text, -            'ext': elem.find('mediaType').text.lower(), -            'format': elem.find('profileName').text, -            'width': int(elem.find('frameWidth').text), -            'height': int(elem.find('frameHeight').text), -            'abr': int(elem.find('bitrateAudio').text), -            'vbr': int(elem.find('bitrateVideo').text), -            'filesize': int(elem.find('fileSize').text), -        } for elem in xml_tree.find('assets')] -        self._sort_formats(formats) - -        return { -            'id': video_id, -            'title': title, -            'description': description, -            'formats': formats, -            'duration': duration, -            'webpage_url': webpage_url, -        } diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index fc7499958..541ddd909 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -1,64 +1,154 @@ +# coding: utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( +    determine_ext, +    int_or_none, +    parse_duration, +    parse_iso8601, +    xpath_text, +)  class MDRIE(InfoExtractor): -    _VALID_URL = r'^(?P<domain>https?://(?:www\.)?mdr\.de)/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)(?:_|\.html)' +    IE_DESC = 'MDR.DE and KiKA' +    _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+(?P<id>\d+)(?:_.+?)?\.html' -    # No tests, MDR regularily deletes its videos -    _TEST = { +    _TESTS = [{ +        # MDR regularily deletes its videos          'url': 'http://www.mdr.de/fakt/video189002.html',          'only_matching': True, -    } +    }, { +        'url': 'http://www.kika.de/baumhaus/videos/video19636.html', +        'md5': '4930515e36b06c111213e80d1e4aad0e', +        'info_dict': { +            'id': '19636', +            'ext': 'mp4', +            'title': 'Baumhaus vom 30. Oktober 2015', +            'duration': 134, +            'uploader': 'KIKA', +        }, +    }, { +        'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', +        'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', +        'info_dict': { +            'id': '8182', +            'ext': 'mp4', +            'title': 'Beutolomäus und der geheime Weihnachtswunsch', +            'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', +            'timestamp': 1419047100, +            'upload_date': '20141220', +            'duration': 4628, +            'uploader': 'KIKA', +        }, +    }, { +        'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', +        'only_matching': True, +    }, { +        'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', +        'only_matching': True, +    }]      def _real_extract(self, url): -        m = re.match(self._VALID_URL, url) -        video_id = m.group('video_id') -        domain = m.group('domain') +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) -        # determine title and media streams from webpage -        html = self._download_webpage(url, video_id) +        data_url = self._search_regex( +            r'dataURL\s*:\s*(["\'])(?P<url>/.+/(?:video|audio)[0-9]+-avCustom\.xml)\1', +            webpage, 'data url', group='url') -        title = self._html_search_regex(r'<h[12]>(.*?)</h[12]>', html, 'title') -        xmlurl = self._search_regex( -            r'dataURL:\'(/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, 'XML URL') +        doc = self._download_xml( +            compat_urlparse.urljoin(url, data_url), video_id) + +        title = (xpath_text(doc, './title', 'title', default=None) or +                 xpath_text(doc, './broadcast/broadcastName', 'title')) -        doc = self._download_xml(domain + xmlurl, video_id)          formats = [] -        for a in doc.findall('./assets/asset'): -            url_el = a.find('./progressiveDownloadUrl') -            if url_el is None: -                continue -            abr = int(a.find('bitrateAudio').text) // 1000 -            media_type = a.find('mediaType').text -            format = { -                'abr': abr, -                'filesize': int(a.find('fileSize').text), -                'url': url_el.text, -            } - -            vbr_el = a.find('bitrateVideo') -            if vbr_el is None: -                format.update({ -                    'vcodec': 'none', -                    'format_id': '%s-%d' % (media_type, abr), -                }) -            else: -                vbr = int(vbr_el.text) // 1000 -                format.update({ -                    'vbr': vbr, -                    'width': int(a.find('frameWidth').text), -                    'height': int(a.find('frameHeight').text), -                    'format_id': '%s-%d' % (media_type, vbr), -                }) -            formats.append(format) +        processed_urls = [] +        for asset in doc.findall('./assets/asset'): +            for source in ( +                    'progressiveDownload', +                    'dynamicHttpStreamingRedirector', +                    'adaptiveHttpStreamingRedirector'): +                url_el = asset.find('./%sUrl' % source) +                if url_el is None: +                    continue + +                video_url = url_el.text +                if video_url in processed_urls: +                    continue + +                processed_urls.append(video_url) + +                vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) +                abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) + +                url_formats = [] + +                ext = determine_ext(url_el.text) +                if ext == 'm3u8': +                    url_formats = self._extract_m3u8_formats( +                        video_url, video_id, 'mp4', entry_protocol='m3u8_native', +                        preference=0, m3u8_id='HLS', fatal=False) +                elif ext == 'f4m': +                    url_formats = self._extract_f4m_formats( +                        video_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, +                        preference=0, f4m_id='HDS', fatal=False) +                else: +                    media_type = xpath_text(asset, './mediaType', 'media type', default='MP4') +                    vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) +                    abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) +                    filesize = int_or_none(xpath_text(asset, './fileSize', 'file size')) + +                    f = { +                        'url': video_url, +                        'format_id': '%s-%d' % (media_type, vbr or abr), +                        'filesize': filesize, +                        'abr': abr, +                        'preference': 1, +                    } + +                    if vbr: +                        width = int_or_none(xpath_text(asset, './frameWidth', 'width')) +                        height = int_or_none(xpath_text(asset, './frameHeight', 'height')) +                        f.update({ +                            'vbr': vbr, +                            'width': width, +                            'height': height, +                        }) + +                    url_formats.append(f) + +                if not vbr: +                    for f in url_formats: +                        abr = f.get('tbr') or abr +                        if 'tbr' in f: +                            del f['tbr'] +                        f.update({ +                            'abr': abr, +                            'vcodec': 'none', +                        }) + +                if url_formats: +                    formats.extend(url_formats)          self._sort_formats(formats) +        description = xpath_text(doc, './broadcast/broadcastDescription', 'description') +        timestamp = parse_iso8601( +            xpath_text(doc, './broadcast/broadcastDate', 'timestamp', default=None) or +            xpath_text(doc, './broadcast/broadcastStartDate', 'timestamp', default=None)) +        duration = parse_duration(xpath_text(doc, './duration', 'duration')) +        uploader = xpath_text(doc, './rights', 'uploader') +          return {              'id': video_id,              'title': title, +            'description': description, +            'timestamp': timestamp, +            'duration': duration, +            'uploader': uploader,              'formats': formats,          } | 
