diff options
| author | Philipp Hagemeister <phihag@phihag.de> | 2015-01-09 23:59:18 +0100 | 
|---|---|---|
| committer | Philipp Hagemeister <phihag@phihag.de> | 2015-01-09 23:59:18 +0100 | 
| commit | dd622d7c4ec15aec6ea5d66ed88a236147373e95 (patch) | |
| tree | 22b2ca7b1fbcdd96e0e3ed70b1aa5cf15c42d9b9 | |
| parent | b8da6b9fc6b05dff5dfbe98e20546b7e5e0ab170 (diff) | |
[netzkino] Add new extractor (Fixes #4669)
| -rw-r--r-- | test/helper.py | 14 | ||||
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/common.py | 11 | ||||
| -rw-r--r-- | youtube_dl/extractor/netzkino.py | 86 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 4 | 
5 files changed, 116 insertions, 0 deletions
diff --git a/test/helper.py b/test/helper.py index 77225e4f7..c416f388c 100644 --- a/test/helper.py +++ b/test/helper.py @@ -110,6 +110,20 @@ def expect_info_dict(self, got_dict, expected_dict):          else:              if isinstance(expected, compat_str) and expected.startswith('md5:'):                  got = 'md5:' + md5(got_dict.get(info_field)) +            elif isinstance(expected, compat_str) and expected.startswith('mincount:'): +                got = got_dict.get(info_field) +                self.assertTrue( +                    isinstance(got, list), +                    'Expected field %s to be a list, but it is of type %s' % ( +                        info_field, type(got).__name__)) +                expected_num = int(expected.partition(':')[2]) +                assertGreaterEqual( +                    self, len(got), expected_num, +                    'Expected %d items in field %s, but only got %d' % ( +                        expected_num, info_field, len(got) +                    ) +                ) +                continue              else:                  got = got_dict.get(info_field)              self.assertEqual(expected, got, diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 8dacc2c54..5da7568ca 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -274,6 +274,7 @@ from .nbc import (  )  from .ndr import NDRIE  from .ndtv import NDTVIE +from .netzkino import NetzkinoIE  from .nerdcubed import NerdCubedFeedIE  from .newgrounds import NewgroundsIE  from .newstube import NewstubeIE diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index d703893dc..b4cd59e43 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -147,6 +147,17 @@ class InfoExtractor(object):      like_count:     Number of positive ratings of the video      dislike_count:  Number of negative ratings of the video      comment_count:  Number of comments on the video +    comments:       A list of comments, each with one or more of the following +                    properties (all but one of text or html optional): +                        * "author" - human-readable name of the comment author +                        * "author_id" - user ID of the comment author +                        * "id" - Comment ID +                        * "html" - Comment as HTML +                        * "text" - Plain text of the comment +                        * "timestamp" - UNIX timestamp of comment +                        * "parent" - ID of the comment this one is replying to. +                                     Set to "root" to indicate that this is a +                                     comment to the original video.      age_limit:      Age restriction for the video, as an integer (years)      webpage_url:    The url to the video webpage, if given to youtube-dl it                      should allow to get the same result again. (It will be set diff --git a/youtube_dl/extractor/netzkino.py b/youtube_dl/extractor/netzkino.py new file mode 100644 index 000000000..93567d1e3 --- /dev/null +++ b/youtube_dl/extractor/netzkino.py @@ -0,0 +1,86 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    clean_html, +    int_or_none, +    js_to_json, +    parse_iso8601, +) + + +class NetzkinoIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/(?P<category>[^/]+)/(?P<id>[^/]+)' + +    _TEST = { +        'url': 'http://www.netzkino.de/#!/scifikino/rakete-zum-mond', +        'md5': '92a3f8b76f8d7220acce5377ea5d4873', +        'info_dict': { +            'id': 'rakete-zum-mond', +            'ext': 'mp4', +            'title': 'Rakete zum Mond (Endstation Mond, Destination Moon)', +            'comments': 'mincount:3', +            'description': 'md5:1eddeacc7e62d5a25a2d1a7290c64a28', +            'upload_date': '20120813', +            'thumbnail': 're:https?://.*\.jpg$', +            'timestamp': 1344858571, +            'age_limit': 12, +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        category_id = mobj.group('category') +        video_id = mobj.group('id') + +        api_url = 'http://api.netzkino.de.simplecache.net/capi-2.0a/categories/%s.json?d=www' % category_id +        api_info = self._download_json(api_url, video_id) +        info = next( +            p for p in api_info['posts'] if p['slug'] == video_id) +        custom_fields = info['custom_fields'] + +        production_js = self._download_webpage( +            'http://www.netzkino.de/beta/dist/production.min.js', video_id, +            note='Downloading player code') +        avo_js = self._search_regex( +            r'window\.avoCore\s*=.*?urlTemplate:\s*(\{.*?"\})', +            production_js, 'URL templates') +        templates = self._parse_json( +            avo_js, video_id, transform_source=js_to_json) + +        suffix = { +            'hds': '.mp4/manifest.f4m', +            'hls': '.mp4/master.m3u8', +            'pmd': '.mp4', +        } +        film_fn = custom_fields['Streaming'][0] +        formats = [{ +            'format_id': key, +            'ext': 'mp4', +            'url': tpl.replace('{}', film_fn) + suffix[key], +        } for key, tpl in templates.items()] +        self._sort_formats(formats) + +        comments = [{ +            'timestamp': parse_iso8601(c.get('date'), delimiter=' '), +            'id': c['id'], +            'author': c['name'], +            'html': c['content'], +            'parent': 'root' if c.get('parent', 0) == 0 else c['parent'], +        } for c in info.get('comments', [])] + +        return { +            'id': video_id, +            'formats': formats, +            'comments': comments, +            'title': info['title'], +            'age_limit': int_or_none(custom_fields.get('FSK')[0]), +            'timestamp': parse_iso8601(info.get('date'), delimiter=' '), +            'description': clean_html(info.get('content')), +            'thumbnail': info.get('thumbnail'), +            'playlist_title': api_info.get('title'), +            'playlist_id': category_id, +        } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 29739a483..079e8d2c3 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -205,6 +205,10 @@ def get_element_by_attribute(attribute, value, html):  def clean_html(html):      """Clean an HTML snippet into a readable string""" + +    if html is None:  # Convenience for sanitizing descriptions etc. +        return html +      # Newline vs <br />      html = html.replace('\n', ' ')      html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)  | 
