diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--  youtube_dl/extractor/extractors.py      1
-rw-r--r--  youtube_dl/extractor/generic.py        17
-rw-r--r--  youtube_dl/extractor/kinja.py         221
-rw-r--r--  youtube_dl/extractor/onionstudios.py   54
4 files changed, 241 insertions, 52 deletions
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9f43b284d..9e3b554fa 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -513,6 +513,7 @@ from .keezmovies import KeezMoviesIE  from .ketnet import KetnetIE  from .khanacademy import KhanAcademyIE  from .kickstarter import KickStarterIE +from .kinja import KinjaEmbedIE  from .kinopoisk import KinoPoiskIE  from .konserthusetplay import KonserthusetPlayIE  from .kontrtube import KontrTubeIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 1c0780e98..3d919f656 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -119,6 +119,7 @@ from .viqeo import ViqeoIE  from .expressen import ExpressenIE  from .zype import ZypeIE  from .odnoklassniki import OdnoklassnikiIE +from .kinja import KinjaEmbedIE  class GenericIE(InfoExtractor): @@ -1487,16 +1488,18 @@ class GenericIE(InfoExtractor):                  'timestamp': 1432570283,              },          }, -        # OnionStudios embed +        # Kinja embed          {              'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537',              'info_dict': { -                'id': '2855', +                'id': '106351',                  'ext': 'mp4',                  'title': 'Don’t Understand Bitcoin? 
This Man Will Mumble An Explanation At You', +                'description': 'Migrated from OnionStudios',                  'thumbnail': r're:^https?://.*\.jpe?g$', -                'uploader': 'ClickHole', -                'uploader_id': 'clickhole', +                'uploader': 'clickhole', +                'upload_date': '20150527', +                'timestamp': 1432744860,              }          },          # SnagFilms embed @@ -2894,6 +2897,12 @@ class GenericIE(InfoExtractor):          if senate_isvp_url:              return self.url_result(senate_isvp_url, 'SenateISVP') +        # Look for Kinja embeds +        kinja_embed_urls = KinjaEmbedIE._extract_urls(webpage, url) +        if kinja_embed_urls: +            return self.playlist_from_matches( +                kinja_embed_urls, video_id, video_title) +          # Look for OnionStudios embeds          onionstudios_url = OnionStudiosIE._extract_url(webpage)          if onionstudios_url: diff --git a/youtube_dl/extractor/kinja.py b/youtube_dl/extractor/kinja.py new file mode 100644 index 000000000..79e3026d2 --- /dev/null +++ b/youtube_dl/extractor/kinja.py @@ -0,0 +1,221 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( +    compat_str, +    compat_urllib_parse_unquote, +) +from ..utils import ( +    int_or_none, +    parse_iso8601, +    strip_or_none, +    try_get, +    unescapeHTML, +    urljoin, +) + + +class KinjaEmbedIE(InfoExtractor): +    IENAME = 'kinja:embed' +    _DOMAIN_REGEX = r'''(?:[^.]+\.)? 
+        (?: +            avclub| +            clickhole| +            deadspin| +            gizmodo| +            jalopnik| +            jezebel| +            kinja| +            kotaku| +            lifehacker| +            splinternews| +            the(?:inventory|onion|root|takeout) +        )\.com''' +    _COMMON_REGEX = r'''/ +        (?: +            ajax/inset| +            embed/video +        )/iframe\?.*?\bid=''' +    _VALID_URL = r'''(?x)https?://%s%s +        (?P<type> +            fb| +            imgur| +            instagram| +            jwp(?:layer)?-video| +            kinjavideo| +            mcp| +            megaphone| +            ooyala| +            soundcloud(?:-playlist)?| +            tumblr-post| +            twitch-stream| +            twitter| +            ustream-channel| +            vimeo| +            vine| +            youtube-(?:list|video) +        )-(?P<id>[^&]+)''' % (_DOMAIN_REGEX, _COMMON_REGEX) +    _TESTS = [{ +        'url': 'https://kinja.com/ajax/inset/iframe?id=fb-10103303356633621', +        'only_matching': True, +    }, { +        'url': 'https://kinja.com/ajax/inset/iframe?id=kinjavideo-100313', +        'only_matching': True, +    }, { +        'url': 'https://kinja.com/ajax/inset/iframe?id=megaphone-PPY1300931075', +        'only_matching': True, +    }, { +        'url': 'https://kinja.com/ajax/inset/iframe?id=ooyala-xzMXhleDpopuT0u1ijt_qZj3Va-34pEX%2FZTIxYmJjZDM2NWYzZDViZGRiOWJjYzc5', +        'only_matching': True, +    }, { +        'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-128574047', +        'only_matching': True, +    }, { +        'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-playlist-317413750', +        'only_matching': True, +    }, { +        'url': 'https://kinja.com/ajax/inset/iframe?id=tumblr-post-160130699814-daydreams-at-midnight', +        'only_matching': True, +    }, { +        'url': 'https://kinja.com/ajax/inset/iframe?id=twitch-stream-libratus_extra', +      
  'only_matching': True, +    }, { +        'url': 'https://kinja.com/ajax/inset/iframe?id=twitter-1068875942473404422', +        'only_matching': True, +    }, { +        'url': 'https://kinja.com/ajax/inset/iframe?id=ustream-channel-10414700', +        'only_matching': True, +    }, { +        'url': 'https://kinja.com/ajax/inset/iframe?id=vimeo-120153502', +        'only_matching': True, +    }, { +        'url': 'https://kinja.com/ajax/inset/iframe?id=vine-5BlvV5qqPrD', +        'only_matching': True, +    }, { +        'url': 'https://kinja.com/ajax/inset/iframe?id=youtube-list-BCQ3KyrPjgA/PLE6509247C270A72E', +        'only_matching': True, +    }, { +        'url': 'https://kinja.com/ajax/inset/iframe?id=youtube-video-00QyL0AgPAE', +        'only_matching': True, +    }] +    _JWPLATFORM_PROVIDER = ('cdn.jwplayer.com/v2/media/', 'JWPlatform') +    _PROVIDER_MAP = { +        'fb': ('facebook.com/video.php?v=', 'Facebook'), +        'imgur': ('imgur.com/', 'Imgur'), +        'instagram': ('instagram.com/p/', 'Instagram'), +        'jwplayer-video': _JWPLATFORM_PROVIDER, +        'jwp-video': _JWPLATFORM_PROVIDER, +        'megaphone': ('player.megaphone.fm/', 'Generic'), +        'ooyala': ('player.ooyala.com/player.js?embedCode=', 'Ooyala'), +        'soundcloud': ('api.soundcloud.com/tracks/', 'Soundcloud'), +        'soundcloud-playlist': ('api.soundcloud.com/playlists/', 'SoundcloudPlaylist'), +        'tumblr-post': ('%s.tumblr.com/post/%s', 'Tumblr'), +        'twitch-stream': ('twitch.tv/', 'TwitchStream'), +        'twitter': ('twitter.com/i/cards/tfw/v1/', 'TwitterCard'), +        'ustream-channel': ('ustream.tv/embed/', 'Ustream'), +        'vimeo': ('vimeo.com/', 'Vimeo'), +        'vine': ('vine.co/v/', 'Vine'), +        'youtube-list': ('youtube.com/embed/%s?list=%s', 'YoutubePlaylist'), +        'youtube-video': ('youtube.com/embed/', 'Youtube'), +    } + +    @staticmethod +    def _extract_urls(webpage, url): +        return [urljoin(url, 
unescapeHTML(mobj.group('url'))) for mobj in re.finditer( +            r'(?x)<iframe[^>]+?src=(?P<q>["\'])(?P<url>(?:(?:https?:)?//%s)?%s(?:(?!\1).)+)\1' % (KinjaEmbedIE._DOMAIN_REGEX, KinjaEmbedIE._COMMON_REGEX), +            webpage)] + +    def _real_extract(self, url): +        video_type, video_id = re.match(self._VALID_URL, url).groups() + +        provider = self._PROVIDER_MAP.get(video_type) +        if provider: +            video_id = compat_urllib_parse_unquote(video_id) +            if video_type == 'tumblr-post': +                video_id, blog = video_id.split('-', 1) +                result_url = provider[0] % (blog, video_id) +            elif video_type == 'youtube-list': +                video_id, playlist_id = video_id.split('/') +                result_url = provider[0] % (video_id, playlist_id) +            else: +                if video_type == 'ooyala': +                    video_id = video_id.split('/')[0] +                result_url = provider[0] + video_id +            return self.url_result('http://' + result_url, provider[1]) + +        if video_type == 'kinjavideo': +            data = self._download_json( +                'https://kinja.com/api/core/video/views/videoById', +                video_id, query={'videoId': video_id})['data'] +            title = data['title'] + +            formats = [] +            for k in ('signedPlaylist', 'streaming'): +                m3u8_url = data.get(k + 'Url') +                if m3u8_url: +                    formats.extend(self._extract_m3u8_formats( +                        m3u8_url, video_id, 'mp4', 'm3u8_native', +                        m3u8_id='hls', fatal=False)) +            self._sort_formats(formats) + +            thumbnail = None +            poster = data.get('poster') or {} +            poster_id = poster.get('id') +            if poster_id: +                thumbnail = 'https://i.kinja-img.com/gawker-media/image/upload/%s.%s' % (poster_id, poster.get('format') or 'jpg') + +        
    return { +                'id': video_id, +                'title': title, +                'description': strip_or_none(data.get('description')), +                'formats': formats, +                'tags': data.get('tags'), +                'timestamp': int_or_none(try_get( +                    data, lambda x: x['postInfo']['publishTimeMillis']), 1000), +                'thumbnail': thumbnail, +                'uploader': data.get('network'), +            } +        else: +            video_data = self._download_json( +                'https://api.vmh.univision.com/metadata/v1/content/' + video_id, +                video_id)['videoMetadata'] +            iptc = video_data['photoVideoMetadataIPTC'] +            title = iptc['title']['en'] +            fmg = video_data.get('photoVideoMetadata_fmg') or {} +            tvss_domain = fmg.get('tvssDomain') or 'https://auth.univision.com' +            data = self._download_json( +                tvss_domain + '/api/v3/video-auth/url-signature-tokens', +                video_id, query={'mcpids': video_id})['data'][0] +            formats = [] + +            rendition_url = data.get('renditionUrl') +            if rendition_url: +                formats = self._extract_m3u8_formats( +                    rendition_url, video_id, 'mp4', +                    'm3u8_native', m3u8_id='hls', fatal=False) + +            fallback_rendition_url = data.get('fallbackRenditionUrl') +            if fallback_rendition_url: +                formats.append({ +                    'format_id': 'fallback', +                    'tbr': int_or_none(self._search_regex( +                        r'_(\d+)\.mp4', fallback_rendition_url, +                        'bitrate', default=None)), +                    'url': fallback_rendition_url, +                }) + +            self._sort_formats(formats) + +            return { +                'id': video_id, +                'title': title, +                'thumbnail': try_get(iptc, lambda x: 
x['cloudinaryLink']['link'], compat_str), +                'uploader': fmg.get('network'), +                'duration': int_or_none(iptc.get('fileDuration')), +                'formats': formats, +                'description': try_get(iptc, lambda x: x['description']['en'], compat_str), +                'timestamp': parse_iso8601(iptc.get('dateReleased')), +            } diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py index 7f8c6f0d3..cf5c39e66 100644 --- a/youtube_dl/extractor/onionstudios.py +++ b/youtube_dl/extractor/onionstudios.py @@ -4,13 +4,8 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..utils import ( -    compat_str, -    int_or_none, -    js_to_json, -    parse_iso8601, -    try_get, -) +from ..compat import compat_str +from ..utils import js_to_json  class OnionStudiosIE(InfoExtractor): @@ -20,7 +15,7 @@ class OnionStudiosIE(InfoExtractor):          'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937',          'md5': '5a118d466d62b5cd03647cf2c593977f',          'info_dict': { -            'id': '2937', +            'id': '3459881',              'ext': 'mp4',              'title': 'Hannibal charges forward, stops for a cocktail',              'description': 'md5:545299bda6abf87e5ec666548c6a9448', @@ -53,43 +48,6 @@ class OnionStudiosIE(InfoExtractor):          mcp_id = compat_str(self._parse_json(self._search_regex(              r'window\.mcpMapping\s*=\s*({.+?});', webpage,              'MCP Mapping'), video_id, js_to_json)[video_id]['mcp_id']) -        video_data = self._download_json( -            'https://api.vmh.univision.com/metadata/v1/content/' + mcp_id, -            mcp_id)['videoMetadata'] -        iptc = video_data['photoVideoMetadataIPTC'] -        title = iptc['title']['en'] -        fmg = video_data.get('photoVideoMetadata_fmg') or {} -        tvss_domain = fmg.get('tvssDomain') or 
'https://auth.univision.com' -        data = self._download_json( -            tvss_domain + '/api/v3/video-auth/url-signature-tokens', -            mcp_id, query={'mcpids': mcp_id})['data'][0] -        formats = [] - -        rendition_url = data.get('renditionUrl') -        if rendition_url: -            formats = self._extract_m3u8_formats( -                rendition_url, mcp_id, 'mp4', -                'm3u8_native', m3u8_id='hls', fatal=False) - -        fallback_rendition_url = data.get('fallbackRenditionUrl') -        if fallback_rendition_url: -            formats.append({ -                'format_id': 'fallback', -                'tbr': int_or_none(self._search_regex( -                    r'_(\d+)\.mp4', fallback_rendition_url, -                    'bitrate', default=None)), -                'url': fallback_rendition_url, -            }) - -        self._sort_formats(formats) - -        return { -            'id': video_id, -            'title': title, -            'thumbnail': try_get(iptc, lambda x: x['cloudinaryLink']['link'], compat_str), -            'uploader': fmg.get('network'), -            'duration': int_or_none(iptc.get('fileDuration')), -            'formats': formats, -            'description': try_get(iptc, lambda x: x['description']['en'], compat_str), -            'timestamp': parse_iso8601(iptc.get('dateReleased')), -        } +        return self.url_result( +            'http://kinja.com/ajax/inset/iframe?id=mcp-' + mcp_id, +            'KinjaEmbed', mcp_id)  | 
