diff options
| -rw-r--r-- | youtube_dl/extractor/ninegag.py | 189 | 
1 files changed, 83 insertions, 106 deletions
| diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py index 3753bc0a2..440f865bc 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/youtube_dl/extractor/ninegag.py @@ -1,148 +1,125 @@  from __future__ import unicode_literals -import re -  from .common import InfoExtractor  from ..utils import (      determine_ext, -    url_or_none, +    ExtractorError,      int_or_none, -    float_or_none, -    ExtractorError +    try_get, +    url_or_none,  )  class NineGagIE(InfoExtractor):      IE_NAME = '9gag' -    _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P<id>[a-zA-Z0-9]+)' +    _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P<id>[^/?&#]+)' -    _TESTS = [{ -        'url': 'https://9gag.com/gag/an5Qz5b', -        'info_dict': { -            'id': 'an5Qz5b', -            'ext': 'webm', -            'title': 'Dogs playing tetherball', -            'upload_date': '20191108', -            'timestamp': 1573243994, -            'age_limit': 0, -            'categories': [ -                'Wholesome' -            ], -            'tags': [ -                'Dog' -            ] -        } -    }, { +    _TEST = {          'url': 'https://9gag.com/gag/ae5Ag7B',          'info_dict': {              'id': 'ae5Ag7B', -            'ext': 'webm', +            'ext': 'mp4',              'title': 'Capybara Agility Training',              'upload_date': '20191108',              'timestamp': 1573237208, -            'age_limit': 0, -            'categories': [ -                'Awesome' -            ], -            'tags': [ -                'Weimaraner', -                'American Pit Bull Terrier' -            ] +            'categories': ['Awesome'], +            'tags': ['Weimaraner', 'American Pit Bull Terrier'], +            'duration': 44, +            'like_count': int, +            'dislike_count': int, +            'comment_count': int,          } -    }] - -    _EXTERNAL_VIDEO_PROVIDERS = { -        'Youtube': 'https://youtube.com/watch?v=%s'      }      def _real_extract(self, url): -        video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) -        rawJsonData = self._search_regex( -            r'window._config\s*=\s*JSON.parse\(["\']({.+?})["\']\);', -            webpage, -            'data') -        rawJsonData = rawJsonData.replace('\\"', '"').replace('\\\\/', '/') -        data = self._parse_json(rawJsonData, video_id)['data']['post'] - -        if data['type'] == 'Video': -            vid = data['video']['id'] -            ie_key = data['video']['source'].capitalize() -            return { -                '_type': 'url_transparent', -                'url': self._EXTERNAL_VIDEO_PROVIDERS[ie_key] % vid, -                'ie_key': ie_key, -                'id': vid, -                'duration': data['video'].get('duration'), -                'start_time': data['video'].get('startTs') -            } +        post_id = self._match_id(url) +        post = self._download_json( +            'https://9gag.com/v1/post', post_id, query={ +                'id': post_id +            })['data']['post'] -        if data['type'] == 'EmbedVideo': -            vid = data['video']['id'] -            ie_key = data['video']['source'].capitalize() -            return { -                '_type': 'url_transparent', -                'url': data['video']['embedUrl'], -                #'ie_key': vid, -                'start_time': data['video'].get('startTs') -            } - -        if data['type'] != 'Animated': +        if post.get('type') != 'Animated':              raise ExtractorError(                  'The given url does not contain a video',                  expected=True) +        title = post['title'] +          duration = None          formats = []          thumbnails = [] -        for key in data['images']: -            image = data['images'][key] -            if 'duration' in image and duration is None: -                duration = int_or_none(image['duration']) -            url = url_or_none(image.get('url')) -            if url == None: +        for key, image in (post.get('images') or {}).items(): +            image_url = url_or_none(image.get('url')) +            if not image_url:                  continue -            ext = determine_ext(url) -            if ext == 'jpg' or ext == 'png': -                thumbnail = { -                    'url': url, -                    'width': float_or_none(image.get('width')), -                    'height': float_or_none(image.get('height')) -                } -                thumbnails.append(thumbnail) -            elif ext == 'webm' or ext == 'mp4': -                formats.append({ -                    'format_id': re.sub(r'.*_([^\.]+).(.*)', r'\1_\2', url), +            ext = determine_ext(image_url) +            image_id = key.strip('image') +            common = { +                'url': image_url, +                'width': int_or_none(image.get('width')), +                'height': int_or_none(image.get('height')), +            } +            if ext in ('jpg', 'png'): +                webp_url = image.get('webpUrl') +                if webp_url: +                    t = common.copy() +                    t.update({ +                        'id': image_id + '-webp', +                        'url': webp_url, +                    }) +                    thumbnails.append(t) +                common.update({ +                    'id': image_id,                      'ext': ext, -                    'url': url, -                    'width': float_or_none(image.get('width')), -                    'height': float_or_none(image.get('height'))                  }) -        section = None -        postSection = data.get('postSection') -        if postSection != None and 'name' in postSection: -            section = re.sub(r'\\[^\\]{5}', '', postSection['name']) -        age_limit = int_or_none(data.get('nsfw')) -        if age_limit != None: -            age_limit = age_limit * 18 +                thumbnails.append(common) +            elif ext in ('webm', 'mp4'): +                if not duration: +                    duration = int_or_none(image.get('duration')) +                common['acodec'] = 'none' if image.get('hasAudio') == 0 else None +                for vcodec in ('vp8', 'vp9', 'h265'): +                    c_url = image.get(vcodec + 'Url') +                    if not c_url: +                        continue +                    c_f = common.copy() +                    c_f.update({ +                        'format_id': image_id + '-' + vcodec, +                        'url': c_url, +                        'vcodec': vcodec, +                    }) +                    formats.append(c_f) +                common.update({ +                    'ext': ext, +                    'format_id': image_id, +                }) +                formats.append(common) +        self._sort_formats(formats) + +        section = try_get(post, lambda x: x['postSection']['name']) +          tags = None -        if 'tags' in data: +        post_tags = post.get('tags') +        if post_tags:              tags = [] -            for tag in data.get('tags') or []: -                tags.append(tag.get('key')) +            for tag in post_tags: +                tag_key = tag.get('key') +                if not tag_key: +                    continue +                tags.append(tag_key) + +        get_count = lambda x: int_or_none(post.get(x + 'Count'))          return { -            'id': video_id, -            'title': data['title'], -            'timestamp': int_or_none(data.get('creationTs')), +            'id': post_id, +            'title': title, +            'timestamp': int_or_none(post.get('creationTs')),              'duration': duration,              'formats': formats,              'thumbnails': thumbnails, -            'like_count': int_or_none(data.get('upVoteCount')), -            'dislike_count': int_or_none(data.get('downVoteCount')), -            'comment_count': int_or_none(data.get('commentsCount')), -            'age_limit': age_limit, -            'categories': [section], +            'like_count': get_count('upVote'), +            'dislike_count': get_count('downVote'), +            'comment_count': get_count('comments'), +            'age_limit': 18 if post.get('nsfw') == 1 else None, +            'categories': [section] if section else None,              'tags': tags, -            'is_live': False          } | 
