diff options
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 6 | ||||
| -rw-r--r-- | youtube_dl/extractor/chirbit.py | 97 | ||||
| -rw-r--r-- | youtube_dl/extractor/soundgasm.py | 36 | 
3 files changed, 138 insertions, 1 deletion
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 4d3e79de9..c3088fba2 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -63,6 +63,7 @@ from .ccc import CCCIE
 from .ceskatelevize import CeskaTelevizeIE
 from .channel9 import Channel9IE
 from .chilloutzone import ChilloutzoneIE
+from .chirbit import ChirbitIE, ChirbitProfileIE
 from .cinchcast import CinchcastIE
 from .clipfish import ClipfishIE
 from .cliphunter import CliphunterIE
@@ -425,7 +426,10 @@ from .soundcloud import (
     SoundcloudUserIE,
     SoundcloudPlaylistIE
 )
-from .soundgasm import SoundgasmIE
+from .soundgasm import (
+    SoundgasmIE,
+    SoundgasmProfileIE
+)
 from .southpark import (
     SouthParkIE,
     SouthparkDeIE,
diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py
new file mode 100644
index 000000000..443192f43
--- /dev/null
+++ b/youtube_dl/extractor/chirbit.py
@@ -0,0 +1,97 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import clean_html
+
+
+class ChirbitIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'http://chirb.it/PrIPv5',
+        'md5': '9847b0dad6ac3e074568bf2cfb197de8',
+        'info_dict': {
+            'id': 'PrIPv5',
+            'display_id': 'kukushtv_1423231243',
+            'ext': 'mp3',
+            'title': 'Фасадстрой',
+            'url': 'http://audio.chirbit.com/kukushtv_1423231243.mp3'
+        }
+    }
+
+    def _real_extract(self, url):
+        audio_linkid = self._match_id(url)
+        webpage = self._download_webpage(url, audio_linkid)
+
+        audio_title = self._html_search_regex(r'<h2\s+itemprop="name">(.*?)</h2>', webpage, 'title')
+        audio_id = self._html_search_regex(r'\("setFile",\s+"http://audio.chirbit.com/(.*?).mp3"\)', webpage, 'audio ID')
+        audio_url = 'http://audio.chirbit.com/' + audio_id + '.mp3';
+
+        return {
+            'id': audio_linkid,
+            'display_id': audio_id,
+            'title': audio_title,
+            'url': audio_url
+        }
+
+class ChirbitProfileIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?P<id>[^/]+)/?$'
+    _TEST = {
+        'url': 'http://chirbit.com/ScarletBeauty',
+        'playlist_count': 3,
+        'info_dict': {
+            '_type': 'playlist',
+            'title': 'ScarletBeauty',
+            'id': 'ScarletBeauty'
+        }
+    }
+
+    def _real_extract(self, url):
+        profile_id = self._match_id(url)
+
+        # Chirbit has a pretty weird "Last Page" navigation behavior.
+        # We grab the profile's oldest entry to determine when to
+        # stop fetching entries.
+        oldestpage = self._download_webpage(url + '/24599', profile_id)
+        oldest_page_entries = re.findall(
+            r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''',
+            oldestpage);
+        oldestentry = clean_html(oldest_page_entries[-1]);
+
+        ids = []
+        titles = []
+        n = 0
+        while True:
+            page = self._download_webpage(url + '/' + str(n), profile_id)
+            page_ids = re.findall(
+                r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''',
+                page);
+            page_titles = re.findall(
+                r'''<div\s+class="chirbit_title"\s*>(.*?)</div>''',
+                page);
+            ids += page_ids
+            titles += page_titles
+            if oldestentry in page_ids:
+                break
+            n += 1
+
+        entries = []
+        i = 0
+        for id in ids:
+            entries.append({
+                'id': id,
+                'title': titles[i],
+                'url': 'http://audio.chirbit.com/' + id + '.mp3'
+            });
+            i += 1
+
+        info_dict = {
+            '_type': 'playlist',
+            'id': profile_id,
+            'title': profile_id,
+            'entries': entries
+        }
+
+        return info_dict;
diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py
index a4f8ce6c3..e568ff18c 100644
--- a/youtube_dl/extractor/soundgasm.py
+++ b/youtube_dl/extractor/soundgasm.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..utils import clean_html
 
 
 class SoundgasmIE(InfoExtractor):
@@ -38,3 +39,38 @@ class SoundgasmIE(InfoExtractor):
             'title': audio_title,
             'description': description
         }
+
+class SoundgasmProfileIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[0-9a-zA-Z_\-]+)/?$'
+    _TEST = {
+        'url': 'http://soundgasm.net/u/ytdl',
+        'playlist_count': 1,
+        'info_dict': {
+            '_type': 'playlist',
+            'id': 'ytdl',
+            'title': 'ytdl'
+        }
+    }
+
+    def _real_extract(self, url):
+        profile_id = self._match_id(url)
+        webpage = self._download_webpage(url, profile_id)
+
+        ids = re.findall(r'''<a\s+href=".+?/u/%s/([^/]+)">''' % re.escape(profile_id), webpage)
+        ids = [clean_html(id) for id in ids]
+
+        entries = []
+        for id in ids:
+            entries.append({
+                '_type': 'url',
+                'url': ('http://soundgasm.net/u/%s/%s' % (profile_id, id))
+            })
+
+        info_dict = {
+            '_type': 'playlist',
+            'id': profile_id,
+            'title': profile_id,
+            'entries': entries
+        }
+
+        return info_dict;
