diff options
Diffstat (limited to 'youtube_dl/extractor/soundcloud.py')
| -rw-r--r-- | youtube_dl/extractor/soundcloud.py | 237 | 
1 files changed, 143 insertions, 94 deletions
| diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 29cd5617c..e22ff9c38 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -1,3 +1,4 @@ +# encoding: utf-8  import json  import re  import itertools @@ -23,25 +24,72 @@ class SoundcloudIE(InfoExtractor):       """      _VALID_URL = r'''^(?:https?://)? -                    (?:(?:(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)/?(?:[?].*)?$) +                    (?:(?:(?:www\.|m\.)?soundcloud\.com/ +                            (?P<uploader>[\w\d-]+)/ +                            (?!sets/)(?P<title>[\w\d-]+)/? +                            (?P<token>[^?]+?)?(?:[?].*)?$)                         |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)) -                       |(?P<widget>w.soundcloud.com/player/?.*?url=.*) +                       |(?P<widget>w\.soundcloud\.com/player/?.*?url=.*)                      )                      '''      IE_NAME = u'soundcloud' -    _TEST = { -        u'url': u'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', -        u'file': u'62986583.mp3', -        u'md5': u'ebef0a451b909710ed1d7787dddbf0d7', -        u'info_dict': { -            u"upload_date": u"20121011",  -            u"description": u"No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",  -            u"uploader": u"E.T. ExTerrestrial Music",  -            u"title": u"Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1" -        } -    } +    _TESTS = [ +        { +            u'url': u'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', +            u'file': u'62986583.mp3', +            u'md5': u'ebef0a451b909710ed1d7787dddbf0d7', +            u'info_dict': { +                u"upload_date": u"20121011",  +                u"description": u"No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",  +                u"uploader": u"E.T. ExTerrestrial Music",  +                u"title": u"Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1" +            } +        }, +        # not streamable song +        { +            u'url': u'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep', +            u'info_dict': { +                u'id': u'47127627', +                u'ext': u'mp3', +                u'title': u'Goldrushed', +                u'uploader': u'The Royal Concept', +                u'upload_date': u'20120521', +            }, +            u'params': { +                # rtmp +                u'skip_download': True, +            }, +        }, +        # private link +        { +            u'url': u'https://soundcloud.com/jaimemf/youtube-dl-test-video-a-y-baw/s-8Pjrp', +            u'md5': u'aa0dd32bfea9b0c5ef4f02aacd080604', +            u'info_dict': { +                u'id': u'123998367', +                u'ext': u'mp3', +                u'title': u'Youtube - Dl Test Video \'\' Ä↭', +                u'uploader': u'jaimeMF', +                u'description': u'test chars:  \"\'/\\ä↭', +                u'upload_date': u'20131209', +            }, +        }, +        # downloadable song +        { +            u'url': u'https://soundcloud.com/simgretina/just-your-problem-baby-1', +            u'md5': u'56a8b69568acaa967b4c49f9d1d52d19', +            u'info_dict': { +                u'id': u'105614606', +                u'ext': u'wav', +                u'title': u'Just Your Problem Baby (Acapella)', +                u'description': u'Vocals', +                u'uploader': u'Sim Gretina', +                u'upload_date': u'20130815', +            }, +        }, +    ]      _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28' +    _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'      @classmethod      def suitable(cls, url): @@ -55,25 +103,85 @@ class SoundcloudIE(InfoExtractor):      def _resolv_url(cls, url):          return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID -    def _extract_info_dict(self, info, full_title=None, quiet=False): -        video_id = info['id'] -        name = full_title or video_id -        if quiet == False: +    def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None): +        track_id = compat_str(info['id']) +        name = full_title or track_id +        if quiet:              self.report_extraction(name)          thumbnail = info['artwork_url']          if thumbnail is not None:              thumbnail = thumbnail.replace('-large', '-t500x500') -        return { -            'id':       info['id'], -            'url':      info['stream_url'] + '?client_id=' + self._CLIENT_ID, +        ext = u'mp3' +        result = { +            'id': track_id,              'uploader': info['user']['username'],              'upload_date': unified_strdate(info['created_at']), -            'title':    info['title'], -            'ext':      u'mp3', +            'title': info['title'],              'description': info['description'],              'thumbnail': thumbnail,          } +        if info.get('downloadable', False): +            # We can build a direct link to the song +            format_url = ( +                u'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format( +                    track_id, self._CLIENT_ID)) +            result['formats'] = [{ +                'format_id': 'download', +                'ext': info.get('original_format', u'mp3'), +                'url': format_url, +                'vcodec': 'none', +            }] +        else: +            # We have to retrieve the url +            streams_url = ('http://api.soundcloud.com/i1/tracks/{0}/streams?' +                'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token)) +            stream_json = self._download_webpage( +                streams_url, +                track_id, u'Downloading track url') + +            formats = [] +            format_dict = json.loads(stream_json) +            for key, stream_url in format_dict.items(): +                if key.startswith(u'http'): +                    formats.append({ +                        'format_id': key, +                        'ext': ext, +                        'url': stream_url, +                        'vcodec': 'none', +                    }) +                elif key.startswith(u'rtmp'): +                    # The url doesn't have an rtmp app, we have to extract the playpath +                    url, path = stream_url.split('mp3:', 1) +                    formats.append({ +                        'format_id': key, +                        'url': url, +                        'play_path': 'mp3:' + path, +                        'ext': ext, +                        'vcodec': 'none', +                    }) + +            if not formats: +                # We fallback to the stream_url in the original info, this +                # cannot be always used, sometimes it can give an HTTP 404 error +                formats.append({ +                    'format_id': u'fallback', +                    'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID, +                    'ext': ext, +                    'vcodec': 'none', +                }) + +            def format_pref(f): +                if f['format_id'].startswith('http'): +                    return 2 +                if f['format_id'].startswith('rtmp'): +                    return 1 +                return 0 + +            formats.sort(key=format_pref) +            result['formats'] = formats + +        return result      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) @@ -81,6 +189,7 @@ class SoundcloudIE(InfoExtractor):              raise ExtractorError(u'Invalid URL: %s' % url)          track_id = mobj.group('track_id') +        token = None          if track_id is not None:              info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID              full_title = track_id @@ -89,87 +198,28 @@ class SoundcloudIE(InfoExtractor):              return self.url_result(query['url'][0], ie='Soundcloud')          else:              # extract uploader (which is in the url) -            uploader = mobj.group(1) +            uploader = mobj.group('uploader')              # extract simple title (uploader + slug of song title) -            slug_title =  mobj.group(2) -            full_title = '%s/%s' % (uploader, slug_title) +            slug_title =  mobj.group('title') +            token = mobj.group('token') +            full_title = resolve_title = '%s/%s' % (uploader, slug_title) +            if token: +                resolve_title += '/%s' % token              self.report_resolve(full_title) -            url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title) +            url = 'http://soundcloud.com/%s' % resolve_title              info_json_url = self._resolv_url(url)          info_json = self._download_webpage(info_json_url, full_title, u'Downloading info JSON')          info = json.loads(info_json) -        return self._extract_info_dict(info, full_title) +        return self._extract_info_dict(info, full_title, secret_token=token)  class SoundcloudSetIE(SoundcloudIE):      _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$'      IE_NAME = u'soundcloud:set' -    _TEST = { -        u"url":"https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep", -        u"playlist": [ -            { -                u"file":"30510138.mp3", -                u"md5":"f9136bf103901728f29e419d2c70f55d", -                u"info_dict": { -                    u"upload_date": u"20111213", -                    u"description": u"The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", -                    u"uploader": u"The Royal Concept", -                    u"title": u"D-D-Dance" -                } -            }, -            { -                u"file":"47127625.mp3", -                u"md5":"09b6758a018470570f8fd423c9453dd8", -                u"info_dict": { -                    u"upload_date": u"20120521", -                    u"description": u"The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", -                    u"uploader": u"The Royal Concept", -                    u"title": u"The Royal Concept - Gimme Twice" -                } -            }, -            { -                u"file":"47127627.mp3", -                u"md5":"154abd4e418cea19c3b901f1e1306d9c", -                u"info_dict": { -                    u"upload_date": u"20120521", -                    u"uploader": u"The Royal Concept", -                    u"title": u"Goldrushed" -                } -            }, -            { -                u"file":"47127629.mp3", -                u"md5":"2f5471edc79ad3f33a683153e96a79c1", -                u"info_dict": { -                    u"upload_date": u"20120521", -                    u"description": u"The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", -                    u"uploader": u"The Royal Concept", -                    u"title": u"In the End" -                } -            }, -            { -                u"file":"47127631.mp3", -                u"md5":"f9ba87aa940af7213f98949254f1c6e2", -                u"info_dict": { -                    u"upload_date": u"20120521", -                    u"description": u"The Royal Concept from Stockholm\r\nFilip / David / Povel / Magnus\r\nwww.theroyalconceptband.com", -                    u"uploader": u"The Royal Concept", -                    u"title": u"Knocked Up" -                } -            }, -            { -                u"file":"75206121.mp3", -                u"md5":"f9d1fe9406717e302980c30de4af9353", -                u"info_dict": { -                    u"upload_date": u"20130116", -                    u"description": u"The unreleased track World on Fire premiered on the CW's hit show Arrow (8pm/7pm central).  \r\nAs a gift to our fans we would like to offer you a free download of the track!  ", -                    u"uploader": u"The Royal Concept", -                    u"title": u"World On Fire" -                } -            } -        ] -    } +    # it's in tests/test_playlists.py +    _TESTS = []      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) @@ -188,7 +238,6 @@ class SoundcloudSetIE(SoundcloudIE):          resolv_url = self._resolv_url(url)          info_json = self._download_webpage(resolv_url, full_title) -        videos = []          info = json.loads(info_json)          if 'errors' in info:              for err in info['errors']: @@ -204,11 +253,11 @@ class SoundcloudSetIE(SoundcloudIE):  class SoundcloudUserIE(SoundcloudIE): -    _VALID_URL = r'https?://(www\.)?soundcloud.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$' +    _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$'      IE_NAME = u'soundcloud:user'      # it's in tests/test_playlists.py -    _TEST = None +    _TESTS = []      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) | 
