diff options
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/golem.py | 131 | 
2 files changed, 132 insertions, 0 deletions
class GolemIE(InfoExtractor):
    """Information extractor for videos on video.golem.de.

    The video id is the ``id`` group captured by ``_VALID_URL``. It is used
    to download an XML configuration document from which the title, the
    available formats, the thumbnails and the duration are read.
    """

    _VALID_URL = r'^https?://video\.golem\.de/.+?/(?P<id>.+?)/'
    _TEST = {
        'url': 'http://video.golem.de/handy/14095/iphone-6-und-6-plus-test.html',
        'md5': 'c1a2c0a3c863319651c7c992c5ee29bf',
        'info_dict': {
            'id': '14095',
            'format_id': 'high',
            'ext': 'mp4',
            'title': 'iPhone 6 und 6 Plus - Test',
            'duration': 300,
            'filesize': 65309548,
        }
    }

    # Explicit field indices are used in every format template of this
    # class: auto-numbered '{}' fields are rejected by Python 2.6.
    _CONFIG = 'https://video.golem.de/xml/{0}.xml'
    # Base against which URLs found in the configuration are resolved.
    _PREFIX = 'http://video.golem.de'

    def _warn(self, fmt, *args):
        """Report a warning for the video currently being extracted.

        ``fmt`` is a ``str.format`` template that is filled with ``args``.
        Reads ``self._id``, which ``_real_extract`` assigns before any
        parsing starts.
        """
        self.report_warning(fmt.format(*args), self._id)

    def _int(self, value, label):
        """Convert ``value`` to an int, warning instead of raising.

        Returns None when ``value`` is None (nothing to convert, no
        warning) or when it is not a valid integer literal (a warning of
        the form ``"<label>: <error>"`` is reported).
        """
        if value is None:
            return None
        try:
            return int(value)
        except ValueError as e:
            self._warn('{0}: {1}', label, e)
            return None

    def _extract_format(self, elem):
        """Build a format dict from one element of the configuration.

        The element's tag becomes the ``format_id``. Returns None, after a
        warning, when the element carries no usable ``url`` text.
        ``ext``, ``filesize``, ``width`` and ``height`` are only set when
        they can be determined; problems are reported as warnings.
        """
        format_id = elem.tag

        url = elem.findtext('./url')
        # findtext() yields None for a missing <url> child and '' for an
        # empty one; neither can be joined into a meaningful URL.
        if not url:
            self._warn('{0}: url: empty, skipping', format_id)
            return None

        fmt = {
            'format_id': format_id,
            'url': compat_urlparse.urljoin(self._PREFIX, url),
        }

        # The extension is whatever follows the last dot of <filename>.
        filename = elem.findtext('./filename', '')
        if '.' in filename:
            fmt['ext'] = filename.rsplit('.', 1)[1]
        else:
            self._warn('{0}: ext: missing extension', format_id)

        numeric_fields = (
            ('filesize', elem.findtext('./filesize')),
            ('width', elem.get('width')),
            ('height', elem.get('height')),
        )
        for key, raw in numeric_fields:
            number = self._int(raw, '{0}: {1}'.format(format_id, key))
            if number is not None:
                fmt[key] = number

        return fmt

    def _extract_thumbnail(self, elem):
        """Build a thumbnail dict from one ``teaser`` element.

        Returns None when the element carries no usable ``url`` text.
        ``width`` and ``height`` are read from the element's attributes
        and only set when they parse as integers.
        """
        url = elem.findtext('./url')
        if not url:
            return None

        thumb = {
            'url': compat_urlparse.urljoin(self._PREFIX, url),
        }

        for key in ('width', 'height'):
            number = self._int(elem.get(key), 'thumbnail: {0}'.format(key))
            if number is not None:
                thumb[key] = number

        return thumb

    def _real_extract(self, url):
        """Download the XML configuration for ``url`` and return the info dict.

        The result always contains ``id``, ``title`` (falling back to
        ``'golem'``), ``formats`` and ``thumbnails``; ``duration`` is added
        when ``playtime`` is present and numeric.
        """
        mobj = re.match(self._VALID_URL, url)
        # Kept on the instance because _warn() reports against it.
        self._id = mobj.group('id')

        config = self._download_xml(self._CONFIG.format(self._id), self._id)

        info = {
            'id': self._id,
            'title': config.findtext('./title', 'golem'),
        }

        # Direct children that own a <url> child describe formats. The
        # filter is spelled out rather than written as the XPath predicate
        # './*[url]', which the ElementTree shipped with Python 2.6 does
        # not understand.
        formats = []
        for e in config:
            if e.find('url') is None:
                continue
            fmt = self._extract_format(e)
            if fmt is not None:
                formats.append(fmt)
        self._sort_formats(formats)
        info['formats'] = formats

        # Same reasoning as above for './/teaser[url]'.
        thumbnails = []
        for e in config.findall('.//teaser'):
            if e.find('url') is None:
                continue
            thumb = self._extract_thumbnail(e)
            if thumb is not None:
                thumbnails.append(thumb)
        info['thumbnails'] = thumbnails

        playtime = config.findtext('./playtime')
        if playtime is not None:
            try:
                # int() makes the result an integer on Python 2 as well,
                # where round() returns a float.
                info['duration'] = int(round(float(playtime)))
            except (ValueError, OverflowError) as e:
                # OverflowError covers an infinite playtime value.
                self._warn('duration: {0}', e)

        return info
