diff options
Diffstat (limited to 'youtube_dl/extractor/yandexmusic.py')
| -rw-r--r-- | youtube_dl/extractor/yandexmusic.py | 157 | 
1 files changed, 118 insertions, 39 deletions
diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 025716958..fd6268ba4 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -10,17 +10,35 @@ from ..utils import (      ExtractorError,      int_or_none,      float_or_none, -    sanitized_Request, -    urlencode_postdata,  )  class YandexMusicBaseIE(InfoExtractor):      @staticmethod      def _handle_error(response): -        error = response.get('error') -        if error: -            raise ExtractorError(error, expected=True) +        if isinstance(response, dict): +            error = response.get('error') +            if error: +                raise ExtractorError(error, expected=True) +            if response.get('type') == 'captcha' or 'captcha' in response: +                YandexMusicBaseIE._raise_captcha() + +    @staticmethod +    def _raise_captcha(): +        raise ExtractorError( +            'YandexMusic has considered youtube-dl requests automated and ' +            'asks you to solve a CAPTCHA. You can either wait for some ' +            'time until unblocked and optionally use --sleep-interval ' +            'in future or alternatively you can go to https://music.yandex.ru/ ' +            'solve CAPTCHA, then export cookies and pass cookie file to ' +            'youtube-dl with --cookies', +            expected=True) + +    def _download_webpage(self, *args, **kwargs): +        webpage = super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs) +        if 'Нам очень жаль, но запросы, поступившие с вашего IP-адреса, похожи на автоматические.' in webpage: +            self._raise_captcha() +        return webpage      def _download_json(self, *args, **kwargs):          response = super(YandexMusicBaseIE, self)._download_json(*args, **kwargs) @@ -39,10 +57,16 @@ class YandexMusicTrackIE(YandexMusicBaseIE):          'info_dict': {              'id': '4878838',              'ext': 'mp3', -            'title': 'Carlo Ambrosio - Gypsy Eyes 1', +            'title': 'Carlo Ambrosio & Fabio Di Bari, Carlo Ambrosio - Gypsy Eyes 1',              'filesize': 4628061,              'duration': 193.04, -        } +            'track': 'Gypsy Eyes 1', +            'album': 'Gypsy Soul', +            'album_artist': 'Carlo Ambrosio', +            'artist': 'Carlo Ambrosio & Fabio Di Bari, Carlo Ambrosio', +            'release_year': '2009', +        }, +        'skip': 'Travis CI servers blocked by YandexMusic',      }      def _get_track_url(self, storage_dir, track_id): @@ -51,6 +75,12 @@ class YandexMusicTrackIE(YandexMusicBaseIE):              % storage_dir,              track_id, 'Downloading track location JSON') +        # Each string is now wrapped in a list, this is probably only temporarily thus +        # supporting both scenarios (see https://github.com/rg3/youtube-dl/issues/10193) +        for k, v in data.items(): +            if v and isinstance(v, list): +                data[k] = v[0] +          key = hashlib.md5(('XGRlBW9FXlekgbPrRHuSiA' + data['path'][1:] + data['s']).encode('utf-8')).hexdigest()          storage = storage_dir.split('.') @@ -64,16 +94,45 @@ class YandexMusicTrackIE(YandexMusicBaseIE):              thumbnail = cover_uri.replace('%%', 'orig')              if not thumbnail.startswith('http'):                  thumbnail = 'http://' + thumbnail -        return { + +        track_title = track['title'] +        track_info = {              'id': track['id'],              'ext': 'mp3',              'url': self._get_track_url(track['storageDir'], track['id']), -            'title': '%s - %s' % (track['artists'][0]['name'], track['title']),              'filesize': int_or_none(track.get('fileSize')),              'duration': float_or_none(track.get('durationMs'), 1000),              'thumbnail': thumbnail, +            'track': track_title,          } +        def extract_artist(artist_list): +            if artist_list and isinstance(artist_list, list): +                artists_names = [a['name'] for a in artist_list if a.get('name')] +                if artists_names: +                    return ', '.join(artists_names) + +        albums = track.get('albums') +        if albums and isinstance(albums, list): +            album = albums[0] +            if isinstance(album, dict): +                year = album.get('year') +                track_info.update({ +                    'album': album.get('title'), +                    'album_artist': extract_artist(album.get('artists')), +                    'release_year': compat_str(year) if year else None, +                }) + +        track_artist = extract_artist(track.get('artists')) +        if track_artist: +            track_info.update({ +                'artist': track_artist, +                'title': '%s - %s' % (track_artist, track_title), +            }) +        else: +            track_info['title'] = track_title +        return track_info +      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          album_id, track_id = mobj.group('album_id'), mobj.group('id') @@ -105,6 +164,7 @@ class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE):              'title': 'Carlo Ambrosio - Gypsy Soul (2009)',          },          'playlist_count': 50, +        'skip': 'Travis CI servers blocked by YandexMusic',      }      def _real_extract(self, url): @@ -127,7 +187,7 @@ class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE):  class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):      IE_NAME = 'yandexmusic:playlist'      IE_DESC = 'Яндекс.Музыка - Плейлист' -    _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P<id>\d+)' +    _VALID_URL = r'https?://music\.yandex\.(?P<tld>ru|kz|ua|by)/users/(?P<user>[^/]+)/playlists/(?P<id>\d+)'      _TESTS = [{          'url': 'http://music.yandex.ru/users/music.partners/playlists/1245', @@ -137,6 +197,7 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):              'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9',          },          'playlist_count': 6, +        'skip': 'Travis CI servers blocked by YandexMusic',      }, {          # playlist exceeding the limit of 150 tracks shipped with webpage (see          # https://github.com/rg3/youtube-dl/issues/6666) @@ -145,46 +206,64 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):              'id': '1036',              'title': 'Музыка 90-х',          }, -        'playlist_count': 310, +        'playlist_mincount': 300, +        'skip': 'Travis CI servers blocked by YandexMusic',      }]      def _real_extract(self, url): -        playlist_id = self._match_id(url) - -        webpage = self._download_webpage(url, playlist_id) - -        mu = self._parse_json( -            self._search_regex( -                r'var\s+Mu\s*=\s*({.+?});\s*</script>', webpage, 'player'), -            playlist_id) - -        playlist = mu['pageData']['playlist'] -        tracks, track_ids = playlist['tracks'], playlist['trackIds'] - -        # tracks dictionary shipped with webpage is limited to 150 tracks, +        mobj = re.match(self._VALID_URL, url) +        tld = mobj.group('tld') +        user = mobj.group('user') +        playlist_id = mobj.group('id') + +        playlist = self._download_json( +            'https://music.yandex.%s/handlers/playlist.jsx' % tld, +            playlist_id, 'Downloading missing tracks JSON', +            fatal=False, +            headers={ +                'Referer': url, +                'X-Requested-With': 'XMLHttpRequest', +                'X-Retpath-Y': url, +            }, +            query={ +                'owner': user, +                'kinds': playlist_id, +                'light': 'true', +                'lang': tld, +                'external-domain': 'music.yandex.%s' % tld, +                'overembed': 'false', +            })['playlist'] + +        tracks, track_ids = playlist['tracks'], map(compat_str, playlist['trackIds']) + +        # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks,          # missing tracks should be retrieved manually.          if len(tracks) < len(track_ids): -            present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')]) -            missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids) -            request = sanitized_Request( -                'https://music.yandex.ru/handlers/track-entries.jsx', -                urlencode_postdata({ +            present_track_ids = set([ +                compat_str(track['id']) +                for track in tracks if track.get('id')]) +            missing_track_ids = [ +                track_id for track_id in track_ids +                if track_id not in present_track_ids] +            missing_tracks = self._download_json( +                'https://music.yandex.%s/handlers/track-entries.jsx' % tld, +                playlist_id, 'Downloading missing tracks JSON', +                fatal=False, +                headers={ +                    'Referer': url, +                    'X-Requested-With': 'XMLHttpRequest', +                }, +                query={                      'entries': ','.join(missing_track_ids), -                    'lang': mu.get('settings', {}).get('lang', 'en'), -                    'external-domain': 'music.yandex.ru', +                    'lang': tld, +                    'external-domain': 'music.yandex.%s' % tld,                      'overembed': 'false', -                    'sign': mu.get('authData', {}).get('user', {}).get('sign'),                      'strict': 'true', -                })) -            request.add_header('Referer', url) -            request.add_header('X-Requested-With', 'XMLHttpRequest') - -            missing_tracks = self._download_json( -                request, playlist_id, 'Downloading missing tracks JSON', fatal=False) +                })              if missing_tracks:                  tracks.extend(missing_tracks)          return self.playlist_result(              self._build_playlist(tracks),              compat_str(playlist_id), -            playlist['title'], playlist.get('description')) +            playlist.get('title'), playlist.get('description'))  | 
