diff options
| -rw-r--r-- | youtube_dl/extractor/extractors.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/minds.py | 296 | 
2 files changed, 165 insertions, 133 deletions
| diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 90012fc4f..29b0e615e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -653,8 +653,8 @@ from .microsoftvirtualacademy import (  )  from .minds import (      MindsIE, -    MindsActivityIE,      MindsChannelIE, +    MindsGroupIE,  )  from .ministrygrid import MinistryGridIE  from .minoto import MinotoIE diff --git a/youtube_dl/extractor/minds.py b/youtube_dl/extractor/minds.py index 4523d0938..8e9f0f825 100644 --- a/youtube_dl/extractor/minds.py +++ b/youtube_dl/extractor/minds.py @@ -1,164 +1,196 @@  # coding: utf-8  from __future__ import unicode_literals -import re  from .common import InfoExtractor  from ..compat import compat_str -from ..utils import (int_or_none, sanitized_Request, str_or_none, -                     unified_strdate) - - -class MindsIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?minds\.com/media/(?P<id>[0-9]+)' -    _TEST = { +from ..utils import ( +    clean_html, +    int_or_none, +    str_or_none, +    strip_or_none, +) + + +class MindsBaseIE(InfoExtractor): +    _VALID_URL_BASE = r'https?://(?:www\.)?minds\.com/' + +    def _call_api(self, path, video_id, resource, query=None): +        api_url = 'https://www.minds.com/api/' + path +        token = self._get_cookies(api_url).get('XSRF-TOKEN') +        return self._download_json( +            api_url, video_id, 'Downloading %s JSON metadata' % resource, headers={ +                'Referer': 'https://www.minds.com/', +                'X-XSRF-TOKEN': token.value if token else '', +            }, query=query) + + +class MindsIE(MindsBaseIE): +    IE_NAME = 'minds' +    _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'(?:media|newsfeed|archive/view)/(?P<id>[0-9]+)' +    _TESTS = [{          'url': 'https://www.minds.com/media/100000000000086822',          'md5': '215a658184a419764852239d4970b045',          'info_dict': {              'id': '100000000000086822',              'ext': 'mp4',              'title': 'Minds intro sequence', -            'thumbnail': 'https://cdn-cinemr.minds.com/cinemr_com/334128440657580032/thumbnail-00001.png', -            'uploader_id': '100000000000000341', -            'description': '<?xml encoding="utf-8" ?>', +            'thumbnail': r're:https?://.+\.png', +            'uploader_id': 'ottman',              'upload_date': '20130524',              'timestamp': 1369404826, +            'uploader': 'Bill Ottman', +            'view_count': int, +            'like_count': int, +            'dislike_count': int, +            'tags': ['animation'], +            'comment_count': int, +            'license': 'attribution-cc',          }, -        'params': { -            'skip_download': True, +    }, { +        # entity.type == 'activity' and empty title +        'url': 'https://www.minds.com/newsfeed/798025111988506624', +        'md5': 'b2733a74af78d7fd3f541c4cbbaa5950', +        'info_dict': { +            'id': '798022190320226304', +            'ext': 'mp4', +            'title': '798022190320226304', +            'uploader': 'ColinFlaherty', +            'upload_date': '20180111', +            'timestamp': 1515639316, +            'uploader_id': 'ColinFlaherty',          }, -    } +    }, { +        'url': 'https://www.minds.com/archive/view/715172106794442752', +        'only_matching': True, +    }, { +        # youtube perma_url +        'url': 'https://www.minds.com/newsfeed/1197131838022602752', +        'only_matching': True, +    }]      def _real_extract(self, url): -        video_id = self._match_id(url) -        video_api_url = 'https://www.minds.com/api/v1/media/%s' % video_id -        token = self._get_cookies(url).get('XSRF-TOKEN') -        headers = { -            'authority': 'www.minds.com', -            'referer': url, -            'x-xsrf-token': token.value if token else '', -        } -        data = self._download_json(video_api_url, video_id, headers=headers, -                                   query={'children': 'false'}) -        formats = [] -        owner = data.get('ownerObj', {}) - -        transcodes = data.get('transcodes', {}) -        # These keys are the width so keep the highest width last -        keys = sorted(transcodes.keys()) - -        for format_id in keys: -            is_numeric = re.match('^[0-9]+\.mp4', format_id) -            video_url = transcodes[format_id] -            info = { -                'url': video_url, -                'format_id': format_id, -                'http_headers': headers, -            } -            if is_numeric: -                info['width'] = int(format_id.split('.')[0]) -            formats.append(info) - -        uploader_id = str_or_none(owner.get('guid') or -                                  data.get('owner_guid') or -                                  owner.get('legacy_guid') or -                                  owner.get('owner_guid')) -        description = str_or_none(data.get('description')) -        if description: -            description = description.strip() -        uploader_url = age_limit = thumbnail = None - -        if owner.get('username'): -            uploader_url = 'https://www.minds.com/%s' % owner.get('username') -        if data.get('mature') is True: -            age_limit = 18 - -        thumbnail_api_url = data.get('thumbnail_src') -        if thumbnail_api_url: -            req = sanitized_Request(thumbnail_api_url) -            req.get_method = lambda: 'HEAD' -            res = self._request_webpage(req, video_id) -            if res.headers.get('content-type', '').startswith('image/'): -                thumbnail = getattr(res, 'url', None) -        tags = data.get('tags', '').strip() -        if isinstance(tags, compat_str) and tags: -            tags = [x.strip() for x in tags.split(',')] +        entity_id = self._match_id(url) +        entity = self._call_api( +            'v1/entities/entity/' + entity_id, entity_id, 'entity')['entity'] +        if entity.get('type') == 'activity': +            if entity.get('custom_type') == 'video': +                video_id = entity['entity_guid'] +            else: +                return self.url_result(entity['perma_url'])          else: -            tags = None -        category = data.get('category') -        if isinstance(category, compat_str) and category: -            category = [category] -        else: -            category = None +            assert(entity['subtype'] == 'video') +            video_id = entity_id +        # 1080p and webm formats available only on the sources array +        video = self._call_api( +            'v2/media/video/' + video_id, video_id, 'video') + +        formats = [] +        for source in (video.get('sources') or []): +            src = source.get('src') +            if not src: +                continue +            formats.append({ +                'format_id': source.get('label'), +                'height': int_or_none(source.get('size')), +                'url': src, +            }) +        self._sort_formats(formats) + +        entity = video.get('entity') or entity +        owner = entity.get('ownerObj') or {} +        uploader_id = owner.get('username') + +        tags = entity.get('tags') +        if tags and isinstance(tags, compat_str): +            tags = [tags] + +        thumbnail = None +        poster = video.get('poster') or entity.get('thumbnail_src') +        if poster: +            urlh = self._request_webpage(poster, video_id, fatal=False) +            if urlh: +                thumbnail = urlh.geturl()          return {              'id': video_id, -            'title': data['title'], +            'title': entity.get('title') or video_id,              'formats': formats, -            'description': description, -            'license': str_or_none(data.get('license')), -            'creator': str_or_none(owner.get('name') or owner.get('username')), -            'release_date': unified_strdate(data.get('time_created')), -            'timestamp': int_or_none(data.get('time_created')), +            'description': clean_html(entity.get('description')) or None, +            'license': str_or_none(entity.get('license')), +            'timestamp': int_or_none(entity.get('time_created')), +            'uploader': strip_or_none(owner.get('name')),              'uploader_id': uploader_id, -            'uploader_url': uploader_url, -            'view_count': int_or_none(data.get('play:count')), -            'like_count': int_or_none(data.get('thumbs:up:count')), -            'dislike_count': int_or_none(data.get('thumbs:down:count')), -            'average_rating': int_or_none(data.get('rating')), -            'age_limit': age_limit, -            'categories': [str_or_none(data.get('category'))], +            'uploader_url': 'https://www.minds.com/' + uploader_id if uploader_id else None, +            'view_count': int_or_none(entity.get('play:count')), +            'like_count': int_or_none(entity.get('thumbs:up:count')), +            'dislike_count': int_or_none(entity.get('thumbs:down:count')),              'tags': tags, -            # As of 20181020 the API is returning `false` for this value both -            # at top level and within the entity.comments:count path. The only -            # other way to get this is to fetch all comments and count. -            'comment_count': int_or_none(data.get('comments:count')), +            'comment_count': int_or_none(entity.get('comments:count')),              'thumbnail': thumbnail,          } -class MindsActivityIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?minds\.com/newsfeed/(?P<id>[0-9]+)' +class MindsFeedBaseIE(MindsBaseIE): +    _PAGE_SIZE = 150 -    def _real_extract(self, url): -        guid = self._match_id(url) -        api_url = 'https://www.minds.com/api/v1/newsfeed/single/%s' % guid -        token = self._get_cookies(url).get('XSRF-TOKEN') -        headers = { -            'authority': 'www.minds.com', -            'referer': url, -            'x-xsrf-token': token.value if token else '', -        } -        data = self._download_json(api_url, guid, headers=headers) -        return self.url_result('https://www.minds.com/media/%s' % data['activity']['entity_guid']) +    def _entries(self, feed_id): +        query = {'limit': self._PAGE_SIZE, 'sync': 1} +        i = 1 +        while True: +            data = self._call_api( +                'v2/feeds/container/%s/videos' % feed_id, +                feed_id, 'page %s' % i, query) +            entities = data.get('entities') or [] +            for entity in entities: +                guid = entity.get('guid') +                if not guid: +                    continue +                yield self.url_result( +                    'https://www.minds.com/newsfeed/' + guid, +                    MindsIE.ie_key(), guid) +            query['from_timestamp'] = data['load-next'] +            if not (query['from_timestamp'] and len(entities) == self._PAGE_SIZE): +                break +            i += 1 +    def _real_extract(self, url): +        feed_id = self._match_id(url) +        feed = self._call_api( +            'v1/%s/%s' % (self._FEED_PATH, feed_id), +            feed_id, self._FEED_TYPE)[self._FEED_TYPE] + +        return self.playlist_result( +            self._entries(feed['guid']), feed_id, +            strip_or_none(feed.get('name')), +            feed.get('briefdescription')) + + +class MindsChannelIE(MindsFeedBaseIE): +    _FEED_TYPE = 'channel' +    IE_NAME = 'minds:' + _FEED_TYPE +    _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'(?!(?:newsfeed|media|api|archive|groups)/)(?P<id>[^/?&#]+)' +    _FEED_PATH = 'channel' +    _TEST = { +        'url': 'https://www.minds.com/ottman', +        'info_dict': { +            'id': 'ottman', +            'title': 'Bill Ottman', +            'description': 'Co-creator & CEO @minds', +        }, +        'playlist_mincount': 54, +    } -class MindsChannelIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?minds\.com/(?!newsfeed|media|api)(?P<id>[^/]+)' -    def _real_extract(self, url): -        channel_name = self._match_id(url) -        api_url = 'https://www.minds.com/api/v1/channel/%s' % channel_name -        token = self._get_cookies(url).get('XSRF-TOKEN') -        headers = { -            'authority': 'www.minds.com', -            'referer': url, -            'x-xsrf-token': token.value if token else '', -        } -        data = self._download_json(api_url, channel_name, headers=headers) -        channel = data.get('channel', {}) -        params = {'limit': 12, 'offset': ''} -        api_url = 'https://www.minds.com/api/v1/newsfeed/personal/%s' % channel['guid'] -        entries = [] -        while True: -            data = self._download_json(api_url, channel['guid'], -                                       headers=headers, query=params) -            activity = data.get('activity', []) -            if len(activity) == 0 or not data.get('load-next'): -                break -            for info in activity: -                if info.get('custom_type') != 'video': -                    continue -                entries.append(self.url_result('https://www.minds.com/media/%s' % info['entity_guid'])) -            params['offset'] = data['load-next'] -        return self.playlist_result(entries, -                                    playlist_title='%s activity' % channel_name) +class MindsGroupIE(MindsFeedBaseIE): +    _FEED_TYPE = 'group' +    IE_NAME = 'minds:' + _FEED_TYPE +    _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'groups/profile/(?P<id>[0-9]+)' +    _FEED_PATH = 'groups/group' +    _TEST = { +        'url': 'https://www.minds.com/groups/profile/785582576369672204/feed/videos', +        'info_dict': { +            'id': '785582576369672204', +            'title': 'Cooking Videos', +        }, +        'playlist_mincount': 1, +    } | 
