diff options
Diffstat (limited to 'youtube_dl/extractor/twitter.py')
| -rw-r--r-- | youtube_dl/extractor/twitter.py | 259 | 
1 files changed, 202 insertions, 57 deletions
| diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index a161f046b..e70b2ab3c 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -10,21 +10,26 @@ from ..utils import (      remove_end,      int_or_none,      ExtractorError, -    sanitized_Request,  ) -class TwitterCardIE(InfoExtractor): +class TwitterBaseIE(InfoExtractor): +    def _get_vmap_video_url(self, vmap_url, video_id): +        vmap_data = self._download_xml(vmap_url, video_id) +        return xpath_text(vmap_data, './/MediaFile').strip() + + +class TwitterCardIE(TwitterBaseIE):      IE_NAME = 'twitter:card' -    _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)' +    _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?:cards/tfw/v1|videos/tweet)/(?P<id>\d+)'      _TESTS = [          {              'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', -            'md5': '4fa26a35f9d1bf4b646590ba8e84be19', +            # MD5 checksums are different in different places              'info_dict': {                  'id': '560070183650213889',                  'ext': 'mp4', -                'title': 'TwitterCard', +                'title': 'Twitter Card',                  'thumbnail': 're:^https?://.*\.jpg$',                  'duration': 30.033,              } @@ -35,14 +40,14 @@ class TwitterCardIE(InfoExtractor):              'info_dict': {                  'id': '623160978427936768',                  'ext': 'mp4', -                'title': 'TwitterCard', +                'title': 'Twitter Card',                  'thumbnail': 're:^https?://.*\.jpg',                  'duration': 80.155,              },          },          {              'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977', -            'md5': 'b6f35e8b08a0bec6c8af77a2f4b3a814', +            'md5': 'd4724ffe6d2437886d004fa5de1043b3',              'info_dict': {                  'id': 'dq4Oj5quskI',                  'ext': 'mp4', @@ -62,69 +67,106 @@ class TwitterCardIE(InfoExtractor):                  'ext': 'mp4',                  'upload_date': '20151113',                  'uploader_id': '1189339351084113920', -                'uploader': '@ArsenalTerje', -                'title': 'Vine by @ArsenalTerje', +                'uploader': 'ArsenalTerje', +                'title': 'Vine by ArsenalTerje',              },              'add_ie': ['Vine'], -        } +        }, { +            'url': 'https://twitter.com/i/videos/tweet/705235433198714880', +            'md5': '3846d0a07109b5ab622425449b59049d', +            'info_dict': { +                'id': '705235433198714880', +                'ext': 'mp4', +                'title': 'Twitter web player', +                'thumbnail': 're:^https?://.*\.jpg', +            }, +        },      ]      def _real_extract(self, url):          video_id = self._match_id(url) -        # Different formats served for different User-Agents -        USER_AGENTS = [ -            'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',  # mp4 -            'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',  # webm -        ] -          config = None          formats = [] -        for user_agent in USER_AGENTS: -            request = sanitized_Request(url) -            request.add_header('User-Agent', user_agent) -            webpage = self._download_webpage(request, video_id) - -            iframe_url = self._html_search_regex( -                r'<iframe[^>]+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', -                webpage, 'video iframe', default=None) -            if iframe_url: -                return self.url_result(iframe_url) - -            config = self._parse_json(self._html_search_regex( -                r'data-player-config="([^"]+)"', webpage, 'data player config'), -                video_id) -            if 'playlist' not in config: -                if 'vmapUrl' in config: -                    vmap_data = self._download_xml(config['vmapUrl'], video_id) -                    video_url = xpath_text(vmap_data, './/MediaFile').strip() -                    formats.append({ -                        'url': video_url, -                    }) -                    break   # same video regardless of UA -                continue - -            video_url = config['playlist'][0]['source'] +        duration = None -            f = { -                'url': video_url, -            } +        webpage = self._download_webpage(url, video_id) + +        iframe_url = self._html_search_regex( +            r'<iframe[^>]+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', +            webpage, 'video iframe', default=None) +        if iframe_url: +            return self.url_result(iframe_url) + +        config = self._parse_json(self._html_search_regex( +            r'data-(?:player-)?config="([^"]+)"', webpage, 'data player config'), +            video_id) +        def _search_dimensions_in_video_url(a_format, video_url):              m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url)              if m: -                f.update({ +                a_format.update({                      'width': int(m.group('width')),                      'height': int(m.group('height')),                  }) + +        playlist = config.get('playlist') +        if playlist: +            video_url = playlist[0]['source'] + +            f = { +                'url': video_url, +            } + +            _search_dimensions_in_video_url(f, video_url) +              formats.append(f) + +        vmap_url = config.get('vmapUrl') or config.get('vmap_url') +        if vmap_url: +            formats.append({ +                'url': self._get_vmap_video_url(vmap_url, video_id), +            }) + +        media_info = None + +        for entity in config.get('status', {}).get('entities', []): +            if 'mediaInfo' in entity: +                media_info = entity['mediaInfo'] + +        if media_info: +            for media_variant in media_info['variants']: +                media_url = media_variant['url'] +                if media_url.endswith('.m3u8'): +                    formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls')) +                elif media_url.endswith('.mpd'): +                    formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash')) +                else: +                    vbr = int_or_none(media_variant.get('bitRate'), scale=1000) +                    a_format = { +                        'url': media_url, +                        'format_id': 'http-%d' % vbr if vbr else 'http', +                        'vbr': vbr, +                    } +                    # Reported bitRate may be zero +                    if not a_format['vbr']: +                        del a_format['vbr'] + +                    _search_dimensions_in_video_url(a_format, media_url) + +                    formats.append(a_format) + +            duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9) +          self._sort_formats(formats) -        thumbnail = config.get('posterImageUrl') -        duration = float_or_none(config.get('duration')) +        title = self._search_regex(r'<title>([^<]+)</title>', webpage, 'title') +        thumbnail = config.get('posterImageUrl') or config.get('image_src') +        duration = float_or_none(config.get('duration')) or duration          return {              'id': video_id, -            'title': 'TwitterCard', +            'title': title,              'thumbnail': thumbnail,              'duration': duration,              'formats': formats, @@ -138,7 +180,6 @@ class TwitterIE(InfoExtractor):      _TESTS = [{          'url': 'https://twitter.com/freethenipple/status/643211948184596480', -        'md5': 'db6612ec5d03355953c3ca9250c97e5e',          'info_dict': {              'id': '643211948184596480',              'ext': 'mp4', @@ -149,6 +190,9 @@ class TwitterIE(InfoExtractor):              'uploader': 'FREE THE NIPPLE',              'uploader_id': 'freethenipple',          }, +        'params': { +            'skip_download': True,  # requires ffmpeg +        },      }, {          'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1',          'md5': 'f36dcd5fb92bf7057f155e7d927eeb42', @@ -161,6 +205,7 @@ class TwitterIE(InfoExtractor):              'uploader': 'Gifs',              'uploader_id': 'giphz',          }, +        'expected_warnings': ['height', 'width'],      }, {          'url': 'https://twitter.com/starwars/status/665052190608723968',          'md5': '39b7199856dee6cd4432e72c74bc69d4', @@ -172,6 +217,36 @@ class TwitterIE(InfoExtractor):              'uploader_id': 'starwars',              'uploader': 'Star Wars',          }, +    }, { +        'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880', +        'info_dict': { +            'id': '705235433198714880', +            'ext': 'mp4', +            'title': 'Brent Yarina - Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight.', +            'description': 'Brent Yarina on Twitter: "Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight."', +            'uploader_id': 'BTNBrentYarina', +            'uploader': 'Brent Yarina', +        }, +        'params': { +            # The same video as https://twitter.com/i/videos/tweet/705235433198714880 +            # Test case of TwitterCardIE +            'skip_download': True, +        }, +    }, { +        'url': 'https://twitter.com/jaydingeer/status/700207533655363584', +        'md5': '', +        'info_dict': { +            'id': '700207533655363584', +            'ext': 'mp4', +            'title': 'jay - BEAT PROD: @suhmeduh #Damndaniel', +            'description': 'jay on Twitter: "BEAT PROD: @suhmeduh  https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', +            'thumbnail': 're:^https?://.*\.jpg', +            'uploader': 'jay', +            'uploader_id': 'jaydingeer', +        }, +        'params': { +            'skip_download': True,  # requires ffmpeg +        },      }]      def _real_extract(self, url): @@ -208,21 +283,91 @@ class TwitterIE(InfoExtractor):              return info          mobj = re.search(r'''(?x) -            <video[^>]+class="animated-gif"[^>]+ -                (?:data-height="(?P<height>\d+)")?[^>]+ -                (?:data-width="(?P<width>\d+)")?[^>]+ -                (?:poster="(?P<poster>[^"]+)")?[^>]*>\s* +            <video[^>]+class="animated-gif"(?P<more_info>[^>]+)>\s*                  <source[^>]+video-src="(?P<url>[^"]+)"          ''', webpage)          if mobj: +            more_info = mobj.group('more_info') +            height = int_or_none(self._search_regex( +                r'data-height="(\d+)"', more_info, 'height', fatal=False)) +            width = int_or_none(self._search_regex( +                r'data-width="(\d+)"', more_info, 'width', fatal=False)) +            thumbnail = self._search_regex( +                r'poster="([^"]+)"', more_info, 'poster', fatal=False)              info.update({                  'id': twid,                  'url': mobj.group('url'), -                'height': int_or_none(mobj.group('height')), -                'width': int_or_none(mobj.group('width')), -                'thumbnail': mobj.group('poster'), +                'height': height, +                'width': width, +                'thumbnail': thumbnail,              })              return info -        raise ExtractorError('There\'s not video in this tweet.') +        if 'class="PlayableMedia' in webpage: +            info.update({ +                '_type': 'url_transparent', +                'ie_key': 'TwitterCard', +                'url': '%s//twitter.com/i/videos/tweet/%s' % (self.http_scheme(), twid), +            }) + +            return info + +        raise ExtractorError('There\'s no video in this tweet.') + + +class TwitterAmplifyIE(TwitterBaseIE): +    IE_NAME = 'twitter:amplify' +    _VALID_URL = 'https?://amp\.twimg\.com/v/(?P<id>[0-9a-f\-]{36})' + +    _TEST = { +        'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951', +        'md5': '7df102d0b9fd7066b86f3159f8e81bf6', +        'info_dict': { +            'id': '0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951', +            'ext': 'mp4', +            'title': 'Twitter Video', +            'thumbnail': 're:^https?://.*', +        }, +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) + +        vmap_url = self._html_search_meta( +            'twitter:amplify:vmap', webpage, 'vmap url') +        video_url = self._get_vmap_video_url(vmap_url, video_id) + +        thumbnails = [] +        thumbnail = self._html_search_meta( +            'twitter:image:src', webpage, 'thumbnail', fatal=False) + +        def _find_dimension(target): +            w = int_or_none(self._html_search_meta( +                'twitter:%s:width' % target, webpage, fatal=False)) +            h = int_or_none(self._html_search_meta( +                'twitter:%s:height' % target, webpage, fatal=False)) +            return w, h + +        if thumbnail: +            thumbnail_w, thumbnail_h = _find_dimension('image') +            thumbnails.append({ +                'url': thumbnail, +                'width': thumbnail_w, +                'height': thumbnail_h, +            }) + +        video_w, video_h = _find_dimension('player') +        formats = [{ +            'url': video_url, +            'width': video_w, +            'height': video_h, +        }] + +        return { +            'id': video_id, +            'title': 'Twitter Video', +            'formats': formats, +            'thumbnails': thumbnails, +        } | 
