[netzkino] Add new extractor (Fixes #4669)

author: Philipp Hagemeister <phihag@phihag.de> 2015-01-09 23:59:18 +0100
committer: Philipp Hagemeister <phihag@phihag.de> 2015-01-09 23:59:18 +0100
commit: dd622d7c4ec15aec6ea5d66ed88a236147373e95 (patch)
tree: 22b2ca7b1fbcdd96e0e3ed70b1aa5cf15c42d9b9 /youtube_dl
parent: b8da6b9fc6b05dff5dfbe98e20546b7e5e0ab170 (diff)
4 files changed, 102 insertions, 0 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 8dacc2c54..5da7568ca 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -274,6 +274,7 @@ from .nbc import (
 )
 from .ndr import NDRIE
 from .ndtv import NDTVIE
+from .netzkino import NetzkinoIE
 from .nerdcubed import NerdCubedFeedIE
 from .newgrounds import NewgroundsIE
 from .newstube import NewstubeIE
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index d703893dc..b4cd59e43 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -147,6 +147,17 @@ class InfoExtractor(object):
     like_count:     Number of positive ratings of the video
     dislike_count:  Number of negative ratings of the video
     comment_count:  Number of comments on the video
+    comments:       A list of comments, each with one or more of the following
+                    properties (all optional, but one of text or html is required):
+                        * "author" - human-readable name of the comment author
+                        * "author_id" - user ID of the comment author
+                        * "id" - Comment ID
+                        * "html" - Comment as HTML
+                        * "text" - Plain text of the comment
+                        * "timestamp" - UNIX timestamp of comment
+                        * "parent" - ID of the comment this one is replying to.
+                                     Set to "root" to indicate that this is a
+                                     comment to the original video.
     age_limit:      Age restriction for the video, as an integer (years)
     webpage_url:    The url to the video webpage, if given to youtube-dl it
                     should allow to get the same result again. (It will be set
diff --git a/youtube_dl/extractor/netzkino.py b/youtube_dl/extractor/netzkino.py
new file mode 100644
index 000000000..93567d1e3
--- /dev/null
+++ b/youtube_dl/extractor/netzkino.py
@@ -0,0 +1,86 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    int_or_none,
+    js_to_json,
+    parse_iso8601,
+)
+
+
+class NetzkinoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/(?P<category>[^/]+)/(?P<id>[^/]+)'
+
+    _TEST = {
+        'url': 'http://www.netzkino.de/#!/scifikino/rakete-zum-mond',
+        'md5': '92a3f8b76f8d7220acce5377ea5d4873',
+        'info_dict': {
+            'id': 'rakete-zum-mond',
+            'ext': 'mp4',
+            'title': 'Rakete zum Mond (Endstation Mond, Destination Moon)',
+            'comments': 'mincount:3',
+            'description': 'md5:1eddeacc7e62d5a25a2d1a7290c64a28',
+            'upload_date': '20120813',
+            'thumbnail': r're:https?://.*\.jpg$',
+            'timestamp': 1344858571,
+            'age_limit': 12,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        category_id = mobj.group('category')
+        video_id = mobj.group('id')
+
+        api_url = 'http://api.netzkino.de.simplecache.net/capi-2.0a/categories/%s.json?d=www' % category_id
+        api_info = self._download_json(api_url, video_id)
+        info = next(
+            p for p in api_info['posts'] if p['slug'] == video_id)
+        custom_fields = info['custom_fields']
+
+        production_js = self._download_webpage(
+            'http://www.netzkino.de/beta/dist/production.min.js', video_id,
+            note='Downloading player code')
+        avo_js = self._search_regex(
+            r'window\.avoCore\s*=.*?urlTemplate:\s*(\{.*?"\})',
+            production_js, 'URL templates')
+        templates = self._parse_json(
+            avo_js, video_id, transform_source=js_to_json)
+
+        suffix = {
+            'hds': '.mp4/manifest.f4m',
+            'hls': '.mp4/master.m3u8',
+            'pmd': '.mp4',
+        }
+        film_fn = custom_fields['Streaming'][0]
+        formats = [{
+            'format_id': key,
+            'ext': 'mp4',
+            'url': tpl.replace('{}', film_fn) + suffix[key],
+        } for key, tpl in templates.items()]
+        self._sort_formats(formats)
+
+        comments = [{
+            'timestamp': parse_iso8601(c.get('date'), delimiter=' '),
+            'id': c['id'],
+            'author': c['name'],
+            'html': c['content'],
+            'parent': 'root' if c.get('parent', 0) == 0 else c['parent'],
+        } for c in info.get('comments', [])]
+        # 'FSK' may be missing or empty; the [None] fallback keeps the lookup from raising.
+        return {
+            'id': video_id,
+            'formats': formats,
+            'comments': comments,
+            'title': info['title'],
+            'age_limit': int_or_none((custom_fields.get('FSK') or [None])[0]),
+            'timestamp': parse_iso8601(info.get('date'), delimiter=' '),
+            'description': clean_html(info.get('content')),
+            'thumbnail': info.get('thumbnail'),
+            'playlist_title': api_info.get('title'),
+            'playlist_id': category_id,
+        }
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 29739a483..079e8d2c3 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -205,6 +205,10 @@ def get_element_by_attribute(attribute, value, html):
 
 def clean_html(html):
     """Clean an HTML snippet into a readable string"""
+
+    if html is None:  # Convenience for sanitizing descriptions etc.
+        return html
+
     # Newline vs <br />
     html = html.replace('\n', ' ')
     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
author	Philipp Hagemeister <phihag@phihag.de>	2015-01-09 23:59:18 +0100
committer	Philipp Hagemeister <phihag@phihag.de>	2015-01-09 23:59:18 +0100
commit	dd622d7c4ec15aec6ea5d66ed88a236147373e95 (patch)
tree	22b2ca7b1fbcdd96e0e3ed70b1aa5cf15c42d9b9 /youtube_dl
parent	b8da6b9fc6b05dff5dfbe98e20546b7e5e0ab170 (diff)