diff options
Diffstat (limited to 'youtube_dl/extractor/gogoanime.py')
-rw-r--r-- | youtube_dl/extractor/gogoanime.py | 76 |
1 files changed, 76 insertions, 0 deletions
diff --git a/youtube_dl/extractor/gogoanime.py b/youtube_dl/extractor/gogoanime.py new file mode 100644 index 000000000..d4f4ecc58 --- /dev/null +++ b/youtube_dl/extractor/gogoanime.py @@ -0,0 +1,76 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + compat_urllib_parse, + get_element_by_attribute, + unescapeHTML +) + + +class GoGoAnimeIE(InfoExtractor): + IE_NAME = 'gogoanime' + IE_DESC = 'GoGoAnime' + _VALID_URL = r'http://www.gogoanime.com/(?P<id>[A-Za-z0-9-]+)' + + _TEST = { + 'url': 'http://www.gogoanime.com/mahou-shoujo-madoka-magica-movie-1', + 'info_dict': { + 'id': 'mahou-shoujo-madoka-magica-movie-1' + }, + 'playlist_count': 3 + } + + def _real_extract(self, url): + video_id = self._match_id(url) + page = self._download_webpage(url, video_id) + + if 'Oops! Page Not Found</font>' in page: + raise ExtractorError('Video does not exist', expected=True) + + content = get_element_by_attribute("class", "postcontent", page) + vids = re.findall(r'<iframe[^>]*?src=[\'"](h[^\'"]+)[\'"]', content) + vids = [ + unescapeHTML(compat_urllib_parse.unquote(x)) + for x in vids if not re.search(r".*videofun.*", x)] + + if re.search(r'<div class="postcontent">[^<]*<p><iframe src=[\'"][^>]+></iframe><br />', page): + return self.playlist_result([self.url_result(vid) for vid in vids], video_id) + + title = self._html_search_regex( + r'<div class="postdesc">[^<]*<h1>([^<]+)</h1>', page, 'title') + + return { + '_type': 'url', + 'id': video_id, + 'url': vids[0], + 'title': title, + } + + +class GoGoAnimeSearchIE(InfoExtractor): + IE_NAME = 'gogoanime:search' + IE_DESC = 'GoGoAnime Search' + + _VALID_URL = r'http://www\.gogoanime\.com/.*\?s=(?P<id>[^&]*)' + _TEST = { + 'url': 'http://www.gogoanime.com/?s=bokusatsu', + 'info_dict': { + 'id': 'bokusatsu' + }, + 'playlist_count': 6 + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + posts = re.findall( + r'<div class="postlist">[^<]*<p[^>]*>[^<]*<a href="(?P<url>[^"]+)"', + webpage) + + return self.playlist_result( + [self.url_result(p) for p in posts], playlist_id) |