yt_dlp/extractor/substack.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135

import re
import urllib.parse

from .common import InfoExtractor
from ..networking import HEADRequest
from ..utils import (
    determine_ext,
    js_to_json,
    str_or_none,
)
from ..utils.traversal import traverse_obj


class SubstackIE(InfoExtractor):
    _VALID_URL = r'https?://(?P<username>[\w-]+)\.substack\.com/p/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://haleynahman.substack.com/p/i-made-a-vlog?s=r',
        'md5': 'f27e4fc6252001d48d479f45e65cdfd5',
        'info_dict': {
            'id': '47660949',
            'ext': 'mp4',
            'title': 'I MADE A VLOG',
            'description': 'md5:9248af9a759321e1027226f988f54d96',
            'thumbnail': 'md5:bec758a34d8ee9142d43bcebdf33af18',
            'uploader': 'Maybe Baby',
            'uploader_id': '33628',
        },
    }, {
        'url': 'https://haleynahman.substack.com/p/-dear-danny-i-found-my-boyfriends?s=r',
        'md5': '0a63eacec877a1171a62cfa69710fcea',
        'info_dict': {
            'id': '51045592',
            'ext': 'mpga',
            'title': "🎧 Dear Danny: I found my boyfriend's secret Twitter account",
            'description': 'md5:a57f2439319e56e0af92dd0c95d75797',
            'thumbnail': 'md5:daa40b6b79249417c14ff8103db29639',
            'uploader': 'Maybe Baby',
            'uploader_id': '33628',
        },
    }, {
        'url': 'https://andrewzimmern.substack.com/p/mussels-with-black-bean-sauce-recipe',
        'md5': 'fd3c07077b02444ff0130715b5f632bb',
        'info_dict': {
            'id': '47368578',
            'ext': 'mp4',
            'title': 'Mussels with Black Bean Sauce: Recipe of the Week #7',
            'description': 'md5:b96234a2906c7d854d5229818d889515',
            'thumbnail': 'md5:e30bfaa9da40e82aa62354263a9dd232',
            'uploader': "Andrew Zimmern's Spilled Milk ",
            'uploader_id': '577659',
        },
    }, {
        # Podcast that needs its file extension resolved to mp3
        'url': 'https://persuasion1.substack.com/p/summers',
        'md5': '1456a755d46084744facdfac9edf900f',
        'info_dict': {
            'id': '141970405',
            'ext': 'mp3',
            'title': 'Larry Summers on What Went Wrong on Campus',
            'description': 'Yascha Mounk and Larry Summers also discuss the promise and perils of artificial intelligence.',
            'thumbnail': r're:https://substackcdn\.com/image/.+\.jpeg',
            'uploader': 'Persuasion',
            'uploader_id': '61579',
        },
    }]

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        if not re.search(r'<script[^>]+src=["\']https://substackcdn.com/[^"\']+\.js', webpage):
            return

        mobj = re.search(r'{[^}]*\\?["\']subdomain\\?["\']\s*:\s*\\?["\'](?P<subdomain>[^\\"\']+)', webpage)
        if mobj:
            parsed = urllib.parse.urlparse(url)
            yield parsed._replace(netloc=f'{mobj.group("subdomain")}.substack.com').geturl()
            raise cls.StopExtraction

    def _extract_video_formats(self, video_id, url):
        formats, subtitles = [], {}
        for video_format in ('hls', 'mp4'):
            video_url = urllib.parse.urljoin(url, f'/api/v1/video/upload/{video_id}/src?type={video_format}')

            if video_format == 'hls':
                fmts, subs = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 'mp4', fatal=False)
                formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)
            else:
                formats.append({
                    'url': video_url,
                    'ext': video_format,
                })

        return formats, subtitles

    def _real_extract(self, url):
        display_id, username = self._match_valid_url(url).group('id', 'username')
        webpage = self._download_webpage(url, display_id)

        webpage_info = self._parse_json(self._search_json(
            r'window\._preloads\s*=\s*JSON\.parse\(', webpage, 'json string',
            display_id, transform_source=js_to_json, contains_pattern=r'"{(?s:.+)}"'), display_id)

        canonical_url = url
        domain = traverse_obj(webpage_info, ('domainInfo', 'customDomain', {str}))
        if domain:
            canonical_url = urllib.parse.urlparse(url)._replace(netloc=domain).geturl()

        post_type = webpage_info['post']['type']
        formats, subtitles = [], {}
        if post_type == 'podcast':
            fmt = {'url': webpage_info['post']['podcast_url']}
            if not determine_ext(fmt['url'], default_ext=None):
                # The redirected format URL expires but the original URL doesn't,
                # so we only want to extract the extension from this request
                fmt['ext'] = determine_ext(self._request_webpage(
                    HEADRequest(fmt['url']), display_id,
                    'Resolving podcast file extension',
                    'Podcast URL is invalid').url)
            formats.append(fmt)
        elif post_type == 'video':
            formats, subtitles = self._extract_video_formats(webpage_info['post']['videoUpload']['id'], canonical_url)
        else:
            self.raise_no_formats(f'Page type "{post_type}" is not supported')

        return {
            'id': str(webpage_info['post']['id']),
            'formats': formats,
            'subtitles': subtitles,
            'title': traverse_obj(webpage_info, ('post', 'title')),
            'description': traverse_obj(webpage_info, ('post', 'description')),
            'thumbnail': traverse_obj(webpage_info, ('post', 'cover_image')),
            'uploader': traverse_obj(webpage_info, ('pub', 'name')),
            'uploader_id': str_or_none(traverse_obj(webpage_info, ('post', 'publication_id'))),
            'webpage_url': canonical_url,
        }