diff options
author | Leslie P. Polzer <polzer@gnu.org> | 2015-02-20 14:48:12 +0100 |
---|---|---|
committer | Leslie P. Polzer <polzer@gnu.org> | 2015-02-20 14:48:12 +0100 |
commit | 365577f5676d63089cb834855dd4cdce7d0dc8aa (patch) | |
tree | 65e40e488b391525a0d681bb82c92e1c391a02cc /youtube_dl/extractor/chirbit.py | |
parent | 5da6bd00837236cf8a5dc5aeeadae5cfed7f2021 (diff) |
[chirbit] add profile extractor.
Diffstat (limited to 'youtube_dl/extractor/chirbit.py')
-rw-r--r-- | youtube_dl/extractor/chirbit.py | 63 |
1 files changed, 63 insertions, 0 deletions
diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index 06a3e1a7a..47ce94aa0 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -1,7 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..utils import clean_html class ChirbitIE(InfoExtractor): @@ -32,3 +35,63 @@ class ChirbitIE(InfoExtractor): 'title': audio_title, 'url': audio_url } + +class ChirbitProfileIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?P<id>[^/]+)' + _TEST = { + 'url': 'http://chirbit.com/ScarletBeauty', + 'playlist_count': 3, + 'info_dict': { + '_type': 'playlist', + 'title': 'ScarletBeauty', + 'id': 'ScarletBeauty' + } + } + + def _real_extract(self, url): + profile_id = self._match_id(url) + + # Chirbit has a pretty weird "Last Page" navigation behavior. + # We grab the profile's oldest entry to determine when to + # stop fetching entries. + oldestpage = self._download_webpage(url + '/24599', profile_id) + oldest_page_entries = re.findall( + r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''', + oldestpage); + oldestentry = clean_html(oldest_page_entries[-1]); + + ids = [] + titles = [] + n = 0 + while True: + page = self._download_webpage(url + '/' + str(n), profile_id) + page_ids = re.findall( + r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''', + page); + page_titles = re.findall( + r'''<div\s+class="chirbit_title"\s*>(.*?)</div>''', + page); + ids += page_ids + titles += page_titles + if oldestentry in page_ids: + break + n += 1 + + entries = [] + i = 0 + for id in ids: + entries.append({ + 'id': id, + 'title': titles[i], + 'url': 'http://audio.chirbit.com/' + id + '.mp3' + }); + i += 1 + + info_dict = { + '_type': 'playlist', + 'id': profile_id, + 'title': profile_id, + 'entries': entries + } + + return info_dict; |