aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/mixcloud.py
diff options
context:
space:
mode:
authorYen Chi Hsuan <yan12125@gmail.com>2016-04-15 17:14:17 +0800
committerYen Chi Hsuan <yan12125@gmail.com>2016-04-15 17:14:17 +0800
commite6da9240d44774495a7ae0f2780bd42e4d2628f5 (patch)
treee4e23b9d8fcb68a7db0e39bc3ad3e43245035051 /youtube_dl/extractor/mixcloud.py
parentdd91dfcd67a0d1db25836f734579742ff73b0c66 (diff)
downloadyoutube-dl-e6da9240d44774495a7ae0f2780bd42e4d2628f5.tar.xz
[mixcloud:stream] Add new extractor
Closes #7633
Diffstat (limited to 'youtube_dl/extractor/mixcloud.py')
-rw-r--r--youtube_dl/extractor/mixcloud.py75
1 files changed, 66 insertions, 9 deletions
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index d4a4963ee..483f6925f 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -2,6 +2,7 @@ from __future__ import unicode_literals
import base64
import functools
+import itertools
import re
from .common import InfoExtractor
@@ -123,18 +124,26 @@ class MixcloudIE(InfoExtractor):
class MixcloudPlaylistBaseIE(InfoExtractor):
_PAGE_SIZE = 24
- def _fetch_tracks_page(self, path, video_id, page_name, current_page):
- resp = self._download_webpage(
+ def _find_urls_in_page(self, page):
+ for url in re.findall(r'm-play-button m-url="(?P<url>[^"]+)"', page):
+ yield self.url_result(
+ compat_urlparse.urljoin('https://www.mixcloud.com', clean_html(url)),
+ MixcloudIE.ie_key())
+
+ def _fetch_tracks_page(self, path, video_id, page_name, current_page, real_page_number=None):
+ real_page_number = real_page_number or current_page + 1
+ return self._download_webpage(
'https://www.mixcloud.com/%s/' % path, video_id,
note='Download %s (page %d)' % (page_name, current_page + 1),
errnote='Unable to download %s' % page_name,
- query={'page': (current_page + 1), 'list': 'main', '_ajax': '1'},
+ query={'page': real_page_number, 'list': 'main', '_ajax': '1'},
headers={'X-Requested-With': 'XMLHttpRequest'})
- for url in re.findall(r'm-play-button m-url="(?P<url>[^"]+)"', resp):
- yield self.url_result(
- compat_urlparse.urljoin('https://www.mixcloud.com', clean_html(url)),
- MixcloudIE.ie_key())
+ def _tracks_page_func(self, page, video_id, page_name, current_page):
+ resp = self._fetch_tracks_page(page, video_id, page_name, current_page)
+
+ for item in self._find_urls_in_page(resp):
+ yield item
def _get_user_description(self, page_content):
return self._html_search_regex(
@@ -207,7 +216,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
entries = OnDemandPagedList(
functools.partial(
- self._fetch_tracks_page,
+ self._tracks_page_func,
'%s/%s' % (user_id, list_type), video_id, 'list of %s' % list_type),
self._PAGE_SIZE, use_cache=True)
@@ -255,8 +264,56 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
entries = OnDemandPagedList(
functools.partial(
- self._fetch_tracks_page,
+ self._tracks_page_func,
'%s/playlists/%s' % (user_id, playlist_id), video_id, 'tracklist'),
self._PAGE_SIZE)
return self.playlist_result(entries, video_id, playlist_title, description)
+
+
+class MixcloudStreamIE(MixcloudPlaylistBaseIE):
+ _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/stream/?$'
+ IE_NAME = 'mixcloud:stream'
+
+ _TEST = {
+ 'url': 'https://www.mixcloud.com/FirstEar/stream/',
+ 'info_dict': {
+ 'id': 'FirstEar',
+ 'title': 'First Ear',
+ 'description': 'Curators of good music\nfirstearmusic.com',
+ },
+ 'playlist_mincount': 192,
+ }
+
+ def _real_extract(self, url):
+ user_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, user_id)
+
+ entries = []
+ prev_page_url = None
+
+ def _handle_page(page):
+ entries.extend(self._find_urls_in_page(page))
+ return self._search_regex(
+ r'm-next-page-url="([^"]+)"', page,
+ 'next page URL', default=None)
+
+ next_page_url = _handle_page(webpage)
+
+ for idx in itertools.count(0):
+ if not next_page_url or prev_page_url == next_page_url:
+ break
+
+ prev_page_url = next_page_url
+ current_page = int(self._search_regex(
+ r'\?page=(\d+)', next_page_url, 'next page number'))
+
+ next_page_url = _handle_page(self._fetch_tracks_page(
+ '%s/stream' % user_id, user_id, 'stream', idx,
+ real_page_number=current_page))
+
+ username = self._og_search_title(webpage)
+ description = self._get_user_description(webpage)
+
+ return self.playlist_result(entries, user_id, username, description)