From 5da6bd00837236cf8a5dc5aeeadae5cfed7f2021 Mon Sep 17 00:00:00 2001 From: "Leslie P. Polzer" Date: Fri, 20 Feb 2015 10:49:45 +0100 Subject: [chirbit] Add new extractor. --- youtube_dl/extractor/chirbit.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 youtube_dl/extractor/chirbit.py (limited to 'youtube_dl/extractor/chirbit.py') diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py new file mode 100644 index 000000000..06a3e1a7a --- /dev/null +++ b/youtube_dl/extractor/chirbit.py @@ -0,0 +1,34 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class ChirbitIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?P<id>[^/]+)' + _TEST = { + 'url': 'http://chirb.it/PrIPv5', + 'md5': '9847b0dad6ac3e074568bf2cfb197de8', + 'info_dict': { + 'id': 'PrIPv5', + 'display_id': 'kukushtv_1423231243', + 'ext': 'mp3', + 'title': 'Фасадстрой', + 'url': 'http://audio.chirbit.com/kukushtv_1423231243.mp3' + } + } + + def _real_extract(self, url): + audio_linkid = self._match_id(url) + webpage = self._download_webpage(url, audio_linkid) + + audio_title = self._html_search_regex(r'(.*?)', webpage, 'title') + audio_id = self._html_search_regex(r'\("setFile",\s+"http://audio.chirbit.com/(.*?).mp3"\)', webpage, 'audio ID') + audio_url = 'http://audio.chirbit.com/' + audio_id + '.mp3'; + + return { + 'id': audio_linkid, + 'display_id': audio_id, + 'title': audio_title, + 'url': audio_url + } -- cgit v1.2.3 From 365577f5676d63089cb834855dd4cdce7d0dc8aa Mon Sep 17 00:00:00 2001 From: "Leslie P. Polzer" Date: Fri, 20 Feb 2015 14:48:12 +0100 Subject: [chirbit] add profile extractor. 
--- youtube_dl/extractor/chirbit.py | 63 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'youtube_dl/extractor/chirbit.py') diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index 06a3e1a7a..47ce94aa0 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -1,7 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..utils import clean_html class ChirbitIE(InfoExtractor): @@ -32,3 +35,63 @@ class ChirbitIE(InfoExtractor): 'title': audio_title, 'url': audio_url } + +class ChirbitProfileIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?P<id>[^/]+)' + _TEST = { + 'url': 'http://chirbit.com/ScarletBeauty', + 'playlist_count': 3, + 'info_dict': { + '_type': 'playlist', + 'title': 'ScarletBeauty', + 'id': 'ScarletBeauty' + } + } + + def _real_extract(self, url): + profile_id = self._match_id(url) + + # Chirbit has a pretty weird "Last Page" navigation behavior. + # We grab the profile's oldest entry to determine when to + # stop fetching entries. 
+ oldestpage = self._download_webpage(url + '/24599', profile_id) + oldest_page_entries = re.findall( + r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''', + oldestpage); + oldestentry = clean_html(oldest_page_entries[-1]); + + ids = [] + titles = [] + n = 0 + while True: + page = self._download_webpage(url + '/' + str(n), profile_id) + page_ids = re.findall( + r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''', + page); + page_titles = re.findall( + r'''(.*?)''', + page); + ids += page_ids + titles += page_titles + if oldestentry in page_ids: + break + n += 1 + + entries = [] + i = 0 + for id in ids: + entries.append({ + 'id': id, + 'title': titles[i], + 'url': 'http://audio.chirbit.com/' + id + '.mp3' + }); + i += 1 + + info_dict = { + '_type': 'playlist', + 'id': profile_id, + 'title': profile_id, + 'entries': entries + } + + return info_dict; -- cgit v1.2.3 From ddc369f073fda4ddd429c2d9a104e561cefd417f Mon Sep 17 00:00:00 2001 From: "Leslie P. Polzer" Date: Mon, 23 Feb 2015 12:00:43 +0100 Subject: [chirbit] fix profile downloader regex. 
--- youtube_dl/extractor/chirbit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor/chirbit.py') diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index 47ce94aa0..443192f43 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -37,7 +37,7 @@ class ChirbitIE(InfoExtractor): } class ChirbitProfileIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?P<id>[^/]+)/?$' _TEST = { 'url': 'http://chirbit.com/ScarletBeauty', 'playlist_count': 3, -- cgit v1.2.3 From a65d4e7f1458a681f250d6e2e0190644b50d6793 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 23 Feb 2015 21:15:16 +0600 Subject: [chirbit] Simplify and extract profile from RSS (#5032) --- youtube_dl/extractor/chirbit.py | 113 +++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 64 deletions(-) (limited to 'youtube_dl/extractor/chirbit.py') diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index 443192f43..124307b7c 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -1,97 +1,82 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import clean_html +from ..utils import ( + parse_duration, + int_or_none, +) class ChirbitIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?P<id>[^/]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P<id>[\da-zA-Z]+)' + _TESTS = [{ 'url': 'http://chirb.it/PrIPv5', 'md5': '9847b0dad6ac3e074568bf2cfb197de8', 'info_dict': { 'id': 'PrIPv5', - 'display_id': 'kukushtv_1423231243', 'ext': 'mp3', 'title': 'Фасадстрой', - 'url': 'http://audio.chirbit.com/kukushtv_1423231243.mp3' + 'duration': 52, + 'view_count': int, + 'comment_count': int, } - } + }, { + 'url': 
'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5', + 'only_matching': True, + }] def _real_extract(self, url): - audio_linkid = self._match_id(url) - webpage = self._download_webpage(url, audio_linkid) + audio_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://chirb.it/%s' % audio_id, audio_id) + + audio_url = self._search_regex( + r'"setFile"\s*,\s*"([^"]+)"', webpage, 'audio url') - audio_title = self._html_search_regex(r'(.*?)', webpage, 'title') - audio_id = self._html_search_regex(r'\("setFile",\s+"http://audio.chirbit.com/(.*?).mp3"\)', webpage, 'audio ID') - audio_url = 'http://audio.chirbit.com/' + audio_id + '.mp3'; + title = self._search_regex( + r'itemprop="name">([^<]+)', webpage, 'title') + duration = parse_duration(self._html_search_meta( + 'duration', webpage, 'duration', fatal=False)) + view_count = int_or_none(self._search_regex( + r'itemprop="playCount"\s*>(\d+)', webpage, + 'listen count', fatal=False)) + comment_count = int_or_none(self._search_regex( + r'>(\d+) Comments?:', webpage, + 'comment count', fatal=False)) return { - 'id': audio_linkid, - 'display_id': audio_id, - 'title': audio_title, - 'url': audio_url + 'id': audio_id, + 'url': audio_url, + 'title': title, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, } + class ChirbitProfileIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?P<id>[^/]+)/?$' + _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?:rss/)?(?P<id>[^/]+)' _TEST = { 'url': 'http://chirbit.com/ScarletBeauty', - 'playlist_count': 3, 'info_dict': { - '_type': 'playlist', - 'title': 'ScarletBeauty', - 'id': 'ScarletBeauty' - } + 'id': 'ScarletBeauty', + 'title': 'Chirbits by ScarletBeauty', + }, + 'playlist_mincount': 3, } def _real_extract(self, url): profile_id = self._match_id(url) - # Chirbit has a pretty weird "Last Page" navigation behavior. - # We grab the profile's oldest entry to determine when to - # stop fetching entries. 
- oldestpage = self._download_webpage(url + '/24599', profile_id) - oldest_page_entries = re.findall( - r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''', - oldestpage); - oldestentry = clean_html(oldest_page_entries[-1]); + rss = self._download_xml( + 'http://chirbit.com/rss/%s' % profile_id, profile_id) - ids = [] - titles = [] - n = 0 - while True: - page = self._download_webpage(url + '/' + str(n), profile_id) - page_ids = re.findall( - r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''', - page); - page_titles = re.findall( - r'''(.*?)''', - page); - ids += page_ids - titles += page_titles - if oldestentry in page_ids: - break - n += 1 + entries = [ + self.url_result(audio_url.text, 'Chirbit') + for audio_url in rss.findall('./channel/item/link')] - entries = [] - i = 0 - for id in ids: - entries.append({ - 'id': id, - 'title': titles[i], - 'url': 'http://audio.chirbit.com/' + id + '.mp3' - }); - i += 1 - - info_dict = { - '_type': 'playlist', - 'id': profile_id, - 'title': profile_id, - 'entries': entries - } + title = rss.find('./channel/title').text - return info_dict; + return self.playlist_result(entries, profile_id, title) -- cgit v1.2.3 From 04e8c1108023d9fe5c466d16f988a469e04f326e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 23 Feb 2015 21:28:14 +0600 Subject: [chirbit] Clarify extractors' IE_NAMEs --- youtube_dl/extractor/chirbit.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'youtube_dl/extractor/chirbit.py') diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index 124307b7c..b1eeaf101 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -9,6 +9,7 @@ from ..utils import ( class ChirbitIE(InfoExtractor): + IE_NAME = 'chirbit' _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P<id>[\da-zA-Z]+)' _TESTS = [{ 'url': 'http://chirb.it/PrIPv5', @@ -57,6 +58,7 @@ class ChirbitIE(InfoExtractor): class 
ChirbitProfileIE(InfoExtractor): + IE_NAME = 'chirbit:profile' _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?:rss/)?(?P<id>[^/]+)' _TEST = { 'url': 'http://chirbit.com/ScarletBeauty', -- cgit v1.2.3