diff options
| author | Sergey M․ <dstftw@gmail.com> | 2016-06-22 23:36:07 +0700 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2016-06-22 23:36:07 +0700 | 
| commit | 23bdae0955ae5e0adaf6212bb7aa6cec77ae4d1d (patch) | |
| tree | 507a00130e448d33582aa309377da3488d1ef2f1 | |
| parent | ca74c90bf55cfb16f5eadc3a63e9389202ee80f5 (diff) | |
[svt] Various improvements
+ [svt:play] Add fallback path looking for video id and fix extraction for oppetarkiv
* [svt:base] Detect geo restriction
* [svt:base] Extract series related metadata
| -rw-r--r-- | youtube_dl/extractor/svt.py | 84 | 
1 files changed, 61 insertions, 23 deletions
| diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 6526a6345..67f56fab8 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -7,13 +7,13 @@ from .common import InfoExtractor  from ..utils import (      determine_ext,      dict_get, +    int_or_none, +    try_get,  )  class SVTBaseIE(InfoExtractor): -    def _extract_video(self, info, video_id): -        video_info = self._get_video_info(info) - +    def _extract_video(self, video_info, video_id):          formats = []          for vr in video_info['videoReferences']:              player_type = vr.get('playerType') @@ -37,6 +37,8 @@ class SVTBaseIE(InfoExtractor):                      'format_id': player_type,                      'url': vurl,                  }) +        if not formats and video_info.get('rights', {}).get('geoBlockedSweden'): +            self.raise_geo_restricted('This video is only available in Sweden')          self._sort_formats(formats)          subtitles = {} @@ -52,15 +54,32 @@ class SVTBaseIE(InfoExtractor):                      subtitles.setdefault(subtitle_lang, []).append({'url': subtitle_url}) -        duration = video_info.get('materialLength') -        age_limit = 18 if video_info.get('inappropriateForChildren') else 0 +        title = video_info.get('title') + +        series = video_info.get('programTitle') +        season_number = int_or_none(video_info.get('season')) +        episode = video_info.get('episodeTitle') +        episode_number = int_or_none(video_info.get('episodeNumber')) + +        duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration'))) +        age_limit = None +        adult = dict_get( +            video_info, ('inappropriateForChildren', 'blockedForChildren'), +            skip_false_values=False) +        if adult is not None: +            age_limit = 18 if adult else 0          return {              'id': video_id, +            'title': title,              'formats': formats,              'subtitles': subtitles,              'duration': duration,              'age_limit': age_limit, +            'series': series, +            'season_number': season_number, +            'episode': episode, +            'episode_number': episode_number,          } @@ -85,9 +104,6 @@ class SVTIE(SVTBaseIE):          if mobj:              return mobj.group('url') -    def _get_video_info(self, info): -        return info['video'] -      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          widget_id = mobj.group('widget_id') @@ -97,7 +113,7 @@ class SVTIE(SVTBaseIE):              'http://www.svt.se/wd?widgetId=%s&articleId=%s&format=json&type=embed&output=json' % (widget_id, article_id),              article_id) -        info_dict = self._extract_video(info, article_id) +        info_dict = self._extract_video(info['video'], article_id)          info_dict['title'] = info['context']['title']          return info_dict @@ -105,7 +121,7 @@ class SVTIE(SVTBaseIE):  class SVTPlayIE(SVTBaseIE):      IE_DESC = 'SVT Play and Öppet arkiv'      _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)' -    _TEST = { +    _TESTS = [{          'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2',          'md5': '2b6704fe4a28801e1a098bbf3c5ac611',          'info_dict': { @@ -121,25 +137,47 @@ class SVTPlayIE(SVTBaseIE):                  }]              },          }, -    } - -    def _get_video_info(self, info): -        return info['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'] +    }, { +        # geo restricted to Sweden +        'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) -        data = self._parse_json(self._search_regex( -            r'root\["__svtplay"\]\s*=\s*([^;]+);', webpage, 'embedded data'), video_id) +        data = self._parse_json( +            self._search_regex( +                r'root\["__svtplay"\]\s*=\s*([^;]+);', +                webpage, 'embedded data', default='{}'), +            video_id, fatal=False)          thumbnail = self._og_search_thumbnail(webpage) -        info_dict = self._extract_video(data, video_id) -        info_dict.update({ -            'title': data['context']['dispatcher']['stores']['MetaStore']['title'], -            'thumbnail': thumbnail, -        }) - -        return info_dict +        if data: +            video_info = try_get( +                data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'], +                dict) +            if video_info: +                info_dict = self._extract_video(video_info, video_id) +                info_dict.update({ +                    'title': data['context']['dispatcher']['stores']['MetaStore']['title'], +                    'thumbnail': thumbnail, +                }) +                return info_dict + +        video_id = self._search_regex( +            r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)', +            webpage, 'video id', default=None) + +        if video_id: +            data = self._download_json( +                'http://www.svt.se/videoplayer-api/video/%s' % video_id, video_id) +            info_dict = self._extract_video(data, video_id) +            if not info_dict.get('title'): +                info_dict['title'] = re.sub( +                    r'\s*\|\s*.+?$', '', +                    info_dict.get('episode') or self._og_search_title(webpage)) +            return info_dict | 
