[snagfilms] Improve and simplify

author: Sergey M․ <dstftw@gmail.com> 2015-06-27 18:20:42 +0600
committer: Sergey M․ <dstftw@gmail.com> 2015-06-27 18:20:42 +0600
commit: 654fd03c73fa0e4407a71c07d821b45321c3cdb8 (patch)
tree: c033dcbe6aa11078ee9b2b3fd46b38585feb9855 /youtube_dl
parent: 533b99fbf9e0961076798c21bfb9fd320363c2dc (diff)
2 files changed, 115 insertions, 63 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 59068a8b8..7e74a971d 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -493,7 +493,10 @@ from .smotri import (
     SmotriUserIE,
     SmotriBroadcastIE,
 )
-from .snagfilms import SnagFilmsIE
+from .snagfilms import (
+    SnagFilmsIE,
+    SnagFilmsEmbedIE,
+)
 from .snotr import SnotrIE
 from .sohu import SohuIE
 from .soompi import (
diff --git a/youtube_dl/extractor/snagfilms.py b/youtube_dl/extractor/snagfilms.py
index 74cd2698d..6e103bd49 100644
--- a/youtube_dl/extractor/snagfilms.py
+++ b/youtube_dl/extractor/snagfilms.py
@@ -1,84 +1,133 @@
-from re import match,DOTALL
+from __future__ import unicode_literals
+
+import re
+
 from .common import InfoExtractor
-from ..utils import js_to_json
+from ..utils import (
+    clean_html,
+    determine_ext,
+    int_or_none,
+    js_to_json,
+    parse_duration,
+)
 
-class SnagFilmsIE(InfoExtractor):
-    _VALID_URL = r'(?:https?://)?(?:www.|embed.)?snagfilms\.com/(?:films/title/(?P<display_id>.+?)|embed/player\?.*filmId=(?P<id>.+?))(?:&|/|$)'
+
+class SnagFilmsEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www|embed)\.)?snagfilms\.com/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})'
     _TESTS = [{
-        'url': 'http://www.snagfilms.com/films/title/lost_for_life',
-        'info_dict':
-        {
-            'id': '0000014c-de2f-d5d6-abcf-ffef58af0017',
-            'display_id': 'lost_for_life',
-            'ext': 'mp4',
-            'title': 'Lost for Life',
-            'duration': 4489,
-            'description': 'In the United States, more than 2500 individuals are serving life-without-parole sentences for crimes they committed when they were seventeen years old or younger. Children as young as thirteen are among the thousands serving these sentences. Directed by Joshua Rof&eacute; (who spent four intensive years on the project), LOST FOR LIFE tells the stories of these individuals, their families and the families of juvenile murder victims. This searingly powerful documentary tackles this contentious issue from multiple perspectives and explores the complexity of the lives of those affected. What is justice when a kid kills? Can a horrific act place a life beyond redemption? Could you forgive?<br />',
-            'categories': ['Documentary','Crime','Award Winning','Festivals']
-        }
-    },{
-        'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831',
-        'info_dict':
-        {
+        'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500',
+        'md5': '2924e9215c6eff7a55ed35b72276bd93',
+        'info_dict': {
             'id': '74849a00-85a9-11e1-9660-123139220831',
-            'display_id': 'while_we_watch',
             'ext': 'mp4',
             'title': '#whilewewatch',
-            'duration': 2311,
-            'description': 'A gripping portrait of the Occupy Wall Street media revolution,&nbsp;#WHILEWEWATCH is the first definitive film to emerge from Zuccotti Park—with full access and cooperation from masterminds who made #OccupyWallStreet a reality.&nbsp;The #OccupyWallStreet media team had no fear of a critical city government, big corporations, hostile police or a lagging mainstream media to tell their story. Through rain, snow, grueling days and sleeping on concrete, they pump out exhilarating ideas to the world. With little money, they rely on Twitter, texting, Wi-Fi, posters, Tumblr, live streams, YouTube, Facebook, dramatic marches, drumbeats and chants. As the film unfolds, we witness the burgeoning power of social media.<br />',
-            'categories': ['Documentary','Politics']
         }
+    }, {
+        'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        display_id, video_id = match(self._VALID_URL,url).groups()
-        if display_id is None:
-            embed_webpage = self._download_webpage('http://www.snagfilms.com/embed/player?filmId=' + video_id, video_id)
-            display_id = self._html_search_regex(
-                r"snagfilms\.com/films/title/(?P<display_id>.+?)(?:/|')",
-                embed_webpage,
-                'display_id'
-            )
-        webpage = self._download_webpage('http://www.snagfilms.com/films/title/' + display_id, display_id)
-
-        json_data = self._parse_json(self._html_search_regex(
-            r'"data":{"film":(?P<data>{.*?}})}',
-            webpage,
-            'data'
-        ), display_id)
-
-        if video_id is None:
-            video_id = json_data['id']
-            embed_webpage = self._download_webpage('http://www.snagfilms.com/embed/player?filmId=' + video_id, video_id)
-
-        title = json_data['title']
-        duration = int(json_data['duration'])
-        description = json_data['synopsis']
-        categories = [category['title'] for category in json_data['categories']]
-        thumbnail = json_data['image']
-
-        sources = self._parse_json(js_to_json(self._html_search_regex(
-            r'sources: (?P<sources>\[.*?\])',
-            embed_webpage,
-            'sources',
-            flags=DOTALL
-        )), video_id)
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
 
         formats = []
-        for source in sources:
-            if source['type'] == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(source['file'], video_id))
+        for source in self._parse_json(js_to_json(self._search_regex(
+                r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id):
+            file_ = source.get('file')
+            if not file_:
+                continue
+            type_ = source.get('type')
+            format_id = source.get('label')
+            ext = determine_ext(file_)
+            if any(_ == 'm3u8' for _ in (type_, ext)):
+                formats.extend(self._extract_m3u8_formats(
+                    file_, video_id, 'mp4', m3u8_id='hls'))
             else:
-                formats.append({'url': source['file'],'ext': source['type'], 'resolution': source['label']})
+                bitrate = int_or_none(self._search_regex(
+                    r'(\d+)kbps', file_, 'bitrate', default=None))
+                height = int_or_none(self._search_regex(
+                    r'^(\d+)[pP]$', format_id, 'height', default=None))
+                formats.append({
+                    'url': file_,
+                    'format_id': format_id,
+                    'tbr': bitrate,
+                    'height': height,
+                })
         self._sort_formats(formats)
 
+        title = self._search_regex(
+            [r"title\s*:\s*'([^']+)'", r'<title>([^<]+)</title>'],
+            webpage, 'title')
+
         return {
             'id': video_id,
+            'title': title,
+            'formats': formats,
+        }
+
+
+class SnagFilmsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?snagfilms\.com/films/title/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'http://www.snagfilms.com/films/title/lost_for_life',
+        'md5': '19844f897b35af219773fd63bdec2942',
+        'info_dict': {
+            'id': '0000014c-de2f-d5d6-abcf-ffef58af0017',
+            'display_id': 'lost_for_life',
+            'ext': 'mp4',
+            'title': 'Lost for Life',
+            'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'duration': 4489,
+            'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals']
+        }
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id')
+
+        snag = self._parse_json(
+            self._search_regex(
+                'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag'),
+            display_id)
+
+        for item in snag:
+            if item.get('data', {}).get('film', {}).get('id') == film_id:
+                data = item['data']['film']
+                title = data['title']
+                description = clean_html(data.get('synopsis'))
+                thumbnail = data.get('image')
+                duration = int_or_none(data.get('duration') or data.get('runtime'))
+                categories = [
+                    category['title'] for category in data.get('categories', [])
+                    if category.get('title')]
+                break
+        else:
+            title = self._search_regex(
+                r'itemprop="title">([^<]+)<', webpage, 'title')
+            description = self._html_search_regex(
+                r'(?s)<div itemprop="description" class="film-synopsis-inner ">(.+?)</div>',
+                webpage, 'description', default=None) or self._og_search_description(webpage)
+            thumbnail = self._og_search_thumbnail(webpage)
+            duration = parse_duration(self._search_regex(
+                r'<span itemprop="duration" class="film-duration strong">([^<]+)<',
+                webpage, 'duration', fatal=False))
+            categories = re.findall(r'<a href="/movies/[^"]+">([^<]+)</a>', webpage)
+
+        return {
+            '_type': 'url_transparent',
+            'url': 'http://embed.snagfilms.com/embed/player?filmId=%s' % film_id,
+            'id': film_id,
             'display_id': display_id,
             'title': title,
-            'duration': duration,
             'description': description,
-            'categories': categories,
             'thumbnail': thumbnail,
-            'formats': formats,
+            'duration': duration,
+            'categories': categories,
         }
author: Sergey M․ <dstftw@gmail.com> 2015-06-27 18:20:42 +0600
committer: Sergey M․ <dstftw@gmail.com> 2015-06-27 18:20:42 +0600
commit: 654fd03c73fa0e4407a71c07d821b45321c3cdb8 (patch)
tree: c033dcbe6aa11078ee9b2b3fd46b38585feb9855 /youtube_dl
parent: 533b99fbf9e0961076798c21bfb9fd320363c2dc (diff)