diff options
Diffstat (limited to 'youtube_dl/extractor')
| -rw-r--r-- | youtube_dl/extractor/soundcloud.py | 210 | 
1 files changed, 152 insertions, 58 deletions
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 15da3496e..277c3c7b4 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -15,7 +15,12 @@ from ..compat import (  )  from ..utils import (      ExtractorError, +    float_or_none,      int_or_none, +    KNOWN_EXTENSIONS, +    merge_dicts, +    mimetype2ext, +    str_or_none,      try_get,      unified_timestamp,      update_url_query, @@ -57,7 +62,7 @@ class SoundcloudIE(InfoExtractor):                  'uploader': 'E.T. ExTerrestrial Music',                  'timestamp': 1349920598,                  'upload_date': '20121011', -                'duration': 143, +                'duration': 143.216,                  'license': 'all-rights-reserved',                  'view_count': int,                  'like_count': int, @@ -100,7 +105,7 @@ class SoundcloudIE(InfoExtractor):                  'uploader': 'jaimeMF',                  'timestamp': 1386604920,                  'upload_date': '20131209', -                'duration': 9, +                'duration': 9.927,                  'license': 'all-rights-reserved',                  'view_count': int,                  'like_count': int, @@ -120,7 +125,7 @@ class SoundcloudIE(InfoExtractor):                  'uploader': 'jaimeMF',                  'timestamp': 1386604920,                  'upload_date': '20131209', -                'duration': 9, +                'duration': 9.927,                  'license': 'all-rights-reserved',                  'view_count': int,                  'like_count': int, @@ -140,7 +145,7 @@ class SoundcloudIE(InfoExtractor):                  'uploader': 'oddsamples',                  'timestamp': 1389232924,                  'upload_date': '20140109', -                'duration': 17, +                'duration': 17.346,                  'license': 'cc-by-sa',                  'view_count': int,                  'like_count': int, @@ -160,7 +165,7 @@ class 
SoundcloudIE(InfoExtractor):                  'uploader': 'Ori Uplift Music',                  'timestamp': 1504206263,                  'upload_date': '20170831', -                'duration': 7449, +                'duration': 7449.096,                  'license': 'all-rights-reserved',                  'view_count': int,                  'like_count': int, @@ -180,7 +185,7 @@ class SoundcloudIE(InfoExtractor):                  'uploader': 'garyvee',                  'timestamp': 1488152409,                  'upload_date': '20170226', -                'duration': 207, +                'duration': 207.012,                  'thumbnail': r're:https?://.*\.jpg',                  'license': 'all-rights-reserved',                  'view_count': int, @@ -192,9 +197,31 @@ class SoundcloudIE(InfoExtractor):                  'skip_download': True,              },          }, +        # not available via api.soundcloud.com/i1/tracks/id/streams +        { +            'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer', +            'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7', +            'info_dict': { +                'id': '583011102', +                'ext': 'mp3', +                'title': 'Mezzo Valzer', +                'description': 'md5:4138d582f81866a530317bae316e8b61', +                'uploader': 'Giovanni Sarani', +                'timestamp': 1551394171, +                'upload_date': '20190228', +                'duration': 180.157, +                'thumbnail': r're:https?://.*\.jpg', +                'license': 'all-rights-reserved', +                'view_count': int, +                'like_count': int, +                'comment_count': int, +                'repost_count': int, +            }, +            'expected_warnings': ['Unable to download JSON metadata'], +        }      ] -    _CLIENT_ID = 'NmW1FlPaiL94ueEu7oziOWjYEzZzQDcK' +    _CLIENT_ID = 'FweeGBOOEOYJWLJN3oEyToGLKhmSz0I7'      @staticmethod      def _extract_urls(webpage): @@ -202,10 
+229,6 @@ class SoundcloudIE(InfoExtractor):              r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1',              webpage)] -    def report_resolve(self, video_id): -        """Report information extraction.""" -        self.to_screen('%s: Resolving id' % video_id) -      @classmethod      def _resolv_url(cls, url):          return 'https://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID @@ -224,6 +247,10 @@ class SoundcloudIE(InfoExtractor):          def extract_count(key):              return int_or_none(info.get('%s_count' % key)) +        like_count = extract_count('favoritings') +        if like_count is None: +            like_count = extract_count('likes') +          result = {              'id': track_id,              'uploader': username, @@ -231,15 +258,17 @@ class SoundcloudIE(InfoExtractor):              'title': title,              'description': info.get('description'),              'thumbnail': thumbnail, -            'duration': int_or_none(info.get('duration'), 1000), +            'duration': float_or_none(info.get('duration'), 1000),              'webpage_url': info.get('permalink_url'),              'license': info.get('license'),              'view_count': extract_count('playback'), -            'like_count': extract_count('favoritings'), +            'like_count': like_count,              'comment_count': extract_count('comment'),              'repost_count': extract_count('reposts'),              'genre': info.get('genre'),          } + +        format_urls = set()          formats = []          query = {'client_id': self._CLIENT_ID}          if secret_token is not None: @@ -248,6 +277,7 @@ class SoundcloudIE(InfoExtractor):              # We can build a direct link to the song              format_url = update_url_query(                  'https://api.soundcloud.com/tracks/%s/download' % track_id, query) +            format_urls.add(format_url)              formats.append({ 
                 'format_id': 'download',                  'ext': info.get('original_format', 'mp3'), @@ -256,44 +286,91 @@ class SoundcloudIE(InfoExtractor):                  'preference': 10,              }) -        # We have to retrieve the url +        # Old API, does not work for some tracks (e.g. +        # https://soundcloud.com/giovannisarani/mezzo-valzer)          format_dict = self._download_json(              'https://api.soundcloud.com/i1/tracks/%s/streams' % track_id, -            track_id, 'Downloading track url', query=query) - -        for key, stream_url in format_dict.items(): -            ext, abr = 'mp3', None -            mobj = re.search(r'_([^_]+)_(\d+)_url', key) -            if mobj: -                ext, abr = mobj.groups() -                abr = int(abr) -            if key.startswith('http'): -                stream_formats = [{ -                    'format_id': key, -                    'ext': ext, -                    'url': stream_url, -                }] -            elif key.startswith('rtmp'): -                # The url doesn't have an rtmp app, we have to extract the playpath -                url, path = stream_url.split('mp3:', 1) -                stream_formats = [{ -                    'format_id': key, -                    'url': url, -                    'play_path': 'mp3:' + path, -                    'ext': 'flv', -                }] -            elif key.startswith('hls'): -                stream_formats = self._extract_m3u8_formats( -                    stream_url, track_id, ext, entry_protocol='m3u8_native', -                    m3u8_id=key, fatal=False) -            else: +            track_id, 'Downloading track url', query=query, fatal=False) + +        if format_dict: +            for key, stream_url in format_dict.items(): +                if stream_url in format_urls: +                    continue +                format_urls.add(stream_url) +                ext, abr = 'mp3', None +                mobj = 
re.search(r'_([^_]+)_(\d+)_url', key) +                if mobj: +                    ext, abr = mobj.groups() +                    abr = int(abr) +                if key.startswith('http'): +                    stream_formats = [{ +                        'format_id': key, +                        'ext': ext, +                        'url': stream_url, +                    }] +                elif key.startswith('rtmp'): +                    # The url doesn't have an rtmp app, we have to extract the playpath +                    url, path = stream_url.split('mp3:', 1) +                    stream_formats = [{ +                        'format_id': key, +                        'url': url, +                        'play_path': 'mp3:' + path, +                        'ext': 'flv', +                    }] +                elif key.startswith('hls'): +                    stream_formats = self._extract_m3u8_formats( +                        stream_url, track_id, ext, entry_protocol='m3u8_native', +                        m3u8_id=key, fatal=False) +                else: +                    continue + +                if abr: +                    for f in stream_formats: +                        f['abr'] = abr + +                formats.extend(stream_formats) + +        # New API +        transcodings = try_get( +            info, lambda x: x['media']['transcodings'], list) or [] +        for t in transcodings: +            if not isinstance(t, dict):                  continue - -            if abr: -                for f in stream_formats: -                    f['abr'] = abr - -            formats.extend(stream_formats) +            format_url = url_or_none(t.get('url')) +            if not format_url: +                continue +            stream = self._download_json( +                update_url_query(format_url, query), track_id, fatal=False) +            if not isinstance(stream, dict): +                continue +            stream_url = url_or_none(stream.get('url')) 
+            if not stream_url: +                continue +            if stream_url in format_urls: +                continue +            format_urls.add(stream_url) +            protocol = try_get(t, lambda x: x['format']['protocol'], compat_str) +            if protocol != 'hls' and '/hls' in format_url: +                protocol = 'hls' +            ext = None +            preset = str_or_none(t.get('preset')) +            if preset: +                ext = preset.split('_')[0] +                if ext not in KNOWN_EXTENSIONS: +                    mimetype = try_get( +                        t, lambda x: x['format']['mime_type'], compat_str) +                    ext = mimetype2ext(mimetype) or 'mp3' +            format_id_list = [] +            if protocol: +                format_id_list.append(protocol) +            format_id_list.append(ext) +            format_id = '_'.join(format_id_list) +            formats.append({ +                'url': stream_url, +                'format_id': format_id, +                'ext': ext, +                'protocol': 'm3u8_native' if protocol == 'hls' else 'http', +            })          if not formats:              # We fallback to the stream_url in the original info, this @@ -303,11 +380,11 @@ class SoundcloudIE(InfoExtractor):                  'url': update_url_query(info['stream_url'], query),                  'ext': 'mp3',              }) +            self._check_formats(formats, track_id)          for f in formats:              f['vcodec'] = 'none' -        self._check_formats(formats, track_id)          self._sort_formats(formats)          result['formats'] = formats @@ -319,6 +396,7 @@ class SoundcloudIE(InfoExtractor):              raise ExtractorError('Invalid URL: %s' % url)          track_id = mobj.group('track_id') +        new_info = {}          if track_id is not None:              info_json_url = 'https://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID @@ -344,13 +422,31 @@ 
class SoundcloudIE(InfoExtractor):              if token:                  resolve_title += '/%s' % token -            self.report_resolve(full_title) - -            url = 'https://soundcloud.com/%s' % resolve_title -            info_json_url = self._resolv_url(url) -        info = self._download_json(info_json_url, full_title, 'Downloading info JSON') - -        return self._extract_info_dict(info, full_title, secret_token=token) +            webpage = self._download_webpage(url, full_title, fatal=False) +            if webpage: +                entries = self._parse_json( +                    self._search_regex( +                        r'var\s+c\s*=\s*(\[.+?\])\s*,\s*o\s*=Date\b', webpage, +                        'data', default='[]'), full_title, fatal=False) +                if entries: +                    for e in entries: +                        if not isinstance(e, dict): +                            continue +                        if e.get('id') != 67: +                            continue +                        data = try_get(e, lambda x: x['data'][0], dict) +                        if data: +                            new_info = data +                            break +                info_json_url = self._resolv_url( +                    'https://soundcloud.com/%s' % resolve_title) + +        # Contains some additional info missing from new_info +        info = self._download_json( +            info_json_url, full_title, 'Downloading info JSON') + +        return self._extract_info_dict( +            merge_dicts(info, new_info), full_title, secret_token=token)  class SoundcloudPlaylistBaseIE(SoundcloudIE): @@ -396,8 +492,6 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):              full_title += '/' + token              url += '/' + token -        self.report_resolve(full_title) -          resolv_url = self._resolv_url(url)          info = self._download_json(resolv_url, full_title)  | 
