diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/anitube.py | 32 | ||||
-rw-r--r-- | youtube_dl/extractor/niconico.py | 65 | ||||
-rw-r--r-- | youtube_dl/extractor/teachertube.py | 21 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 4 |
4 files changed, 69 insertions, 53 deletions
diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py index 2b019daa9..31f0d417c 100644 --- a/youtube_dl/extractor/anitube.py +++ b/youtube_dl/extractor/anitube.py @@ -1,22 +1,24 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor class AnitubeIE(InfoExtractor): - IE_NAME = u'anitube.se' + IE_NAME = 'anitube.se' _VALID_URL = r'https?://(?:www\.)?anitube\.se/video/(?P<id>\d+)' _TEST = { - u'url': u'http://www.anitube.se/video/36621', - u'md5': u'59d0eeae28ea0bc8c05e7af429998d43', - u'file': u'36621.mp4', - u'info_dict': { - u'id': u'36621', - u'ext': u'mp4', - u'title': u'Recorder to Randoseru 01', + 'url': 'http://www.anitube.se/video/36621', + 'md5': '59d0eeae28ea0bc8c05e7af429998d43', + 'info_dict': { + 'id': '36621', + 'ext': 'mp4', + 'title': 'Recorder to Randoseru 01', + 'duration': 180.19, }, - u'skip': u'Blocked in the US', + 'skip': 'Blocked in the US', } def _real_extract(self, url): @@ -24,13 +26,15 @@ class AnitubeIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - key = self._html_search_regex(r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)', - webpage, u'key') + key = self._html_search_regex( + r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)', webpage, 'key') - config_xml = self._download_xml('http://www.anitube.se/nuevo/econfig.php?key=%s' % key, - key) + config_xml = self._download_xml( + 'http://www.anitube.se/nuevo/econfig.php?key=%s' % key, key) video_title = config_xml.find('title').text + thumbnail = config_xml.find('image').text + duration = float(config_xml.find('duration').text) formats = [] video_url = config_xml.find('file') @@ -49,5 +53,7 @@ class AnitubeIE(InfoExtractor): return { 'id': video_id, 'title': video_title, + 'thumbnail': thumbnail, + 'duration': duration, 'formats': formats } diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index ba7464cb8..d98131271 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -8,10 +8,10 @@ from ..utils import ( compat_urllib_parse, compat_urllib_request, compat_urlparse, - compat_str, - ExtractorError, unified_strdate, + parse_duration, + int_or_none, ) @@ -30,6 +30,7 @@ class NiconicoIE(InfoExtractor): 'uploader_id': '2698420', 'upload_date': '20131123', 'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org', + 'duration': 33, }, 'params': { 'username': 'ydl.niconico@gmail.com', @@ -37,7 +38,7 @@ class NiconicoIE(InfoExtractor): }, } - _VALID_URL = r'^https?://(?:www\.|secure\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$' + _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/((?:[a-z]{2})?[0-9]+)' _NETRC_MACHINE = 'niconico' # Determine whether the downloader uses authentication to download video _AUTHENTICATE = False @@ -109,37 +110,39 @@ class NiconicoIE(InfoExtractor): video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0] # Start extracting information - video_title = video_info.find('.//title').text - video_extension = video_info.find('.//movie_type').text - video_format = video_extension.upper() - video_thumbnail = video_info.find('.//thumbnail_url').text - video_description = video_info.find('.//description').text - video_uploader_id = video_info.find('.//user_id').text - video_upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0]) - video_view_count = video_info.find('.//view_counter').text - video_webpage_url = video_info.find('.//watch_url').text - - # uploader - video_uploader = video_uploader_id - url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id - try: - user_info = self._download_xml( - url, video_id, note='Downloading user information') - video_uploader = user_info.find('.//nickname').text - except ExtractorError as err: - self._downloader.report_warning('Unable to download user info webpage: %s' % compat_str(err)) + title = video_info.find('.//title').text + extension = video_info.find('.//movie_type').text + video_format = extension.upper() + thumbnail = video_info.find('.//thumbnail_url').text + description = video_info.find('.//description').text + upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0]) + view_count = int_or_none(video_info.find('.//view_counter').text) + comment_count = int_or_none(video_info.find('.//comment_num').text) + duration = parse_duration(video_info.find('.//length').text) + webpage_url = video_info.find('.//watch_url').text + + if video_info.find('.//ch_id') is not None: + uploader_id = video_info.find('.//ch_id').text + uploader = video_info.find('.//ch_name').text + elif video_info.find('.//user_id') is not None: + uploader_id = video_info.find('.//user_id').text + uploader = video_info.find('.//user_nickname').text + else: + uploader_id = uploader = None return { 'id': video_id, 'url': video_real_url, - 'title': video_title, - 'ext': video_extension, + 'title': title, + 'ext': extension, 'format': video_format, - 'thumbnail': video_thumbnail, - 'description': video_description, - 'uploader': video_uploader, - 'upload_date': video_upload_date, - 'uploader_id': video_uploader_id, - 'view_count': video_view_count, - 'webpage_url': video_webpage_url, + 'thumbnail': thumbnail, + 'description': description, + 'uploader': uploader, + 'upload_date': upload_date, + 'uploader_id': uploader_id, + 'view_count': view_count, + 'comment_count': comment_count, + 'duration': duration, + 'webpage_url': webpage_url, } diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py index 1a438e1e4..2c2113b14 100644 --- a/youtube_dl/extractor/teachertube.py +++ b/youtube_dl/extractor/teachertube.py @@ -14,7 +14,7 @@ class TeacherTubeIE(InfoExtractor): IE_NAME = 'teachertube' IE_DESC = 'teachertube.com videos' - _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(viewVideo\.php\?video_id=|music\.php\?music_id=|video/|audio/)(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(viewVideo\.php\?video_id=|music\.php\?music_id=|video/(?:[\da-z-]+-)?|audio/)(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.teachertube.com/viewVideo.php?video_id=339997', @@ -45,6 +45,15 @@ class TeacherTubeIE(InfoExtractor): 'title': 'PER ASPERA AD ASTRA', 'description': 'RADIJSKA EMISIJA ZRAKOPLOVNE TEHNI?KE ?KOLE P', }, + }, { + 'url': 'http://www.teachertube.com/video/intro-video-schleicher-297790', + 'md5': '9c79fbb2dd7154823996fc28d4a26998', + 'info_dict': { + 'id': '297790', + 'ext': 'mp4', + 'title': 'Intro Video - Schleicher', + 'description': 'Intro Video - Why to flip, how flipping will', + }, }] def _real_extract(self, url): @@ -92,23 +101,21 @@ class TeacherTubeUserIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(user/profile|collection)/(?P<user>[0-9a-zA-Z]+)/?' + _MEDIA_RE = r'(?s)"sidebar_thumb_time">[0-9:]+</div>.+?<a href="(https?://(?:www\.)?teachertube\.com/(?:video|audio)/[^"]+)">' + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user_id = mobj.group('user') urls = [] webpage = self._download_webpage(url, user_id) - urls.extend(re.findall( - r'"sidebar_thumb_time">[0-9:]+</div>\s+<a href="(https?://(?:www\.)?teachertube\.com/(?:video|audio)/[^"]+)">', - webpage)) + urls.extend(re.findall(self._MEDIA_RE, webpage)) pages = re.findall(r'/ajax-user/user-videos/%s\?page=([0-9]+)' % user_id, webpage)[1:-1] for p in pages: more = 'http://www.teachertube.com/ajax-user/user-videos/%s?page=%s' % (user_id, p) webpage = self._download_webpage(more, user_id, 'Downloading page %s/%s' % (p, len(pages) + 1)) - urls.extend(re.findall( - r'"sidebar_thumb_time">[0-9:]+</div>\s+<a href="(https?://(?:www\.)?teachertube\.com/(?:video|audio)/[^"]+)">', - webpage)) + urls.extend(re.findall(self._MEDIA_RE, webpage)) entries = [] for url in urls: diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index bf0fbc924..f420b8148 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1698,14 +1698,14 @@ class YoutubeSearchURLIE(InfoExtractor): webpage = self._download_webpage(url, query) result_code = self._search_regex( - r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML') + r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML') part_codes = re.findall( r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code) entries = [] for part_code in part_codes: part_title = self._html_search_regex( - r'(?s)title="([^"]+)"', part_code, 'item title', fatal=False) + [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False) part_url_snippet = self._html_search_regex( r'(?s)href="([^"]+)"', part_code, 'item URL') part_url = compat_urlparse.urljoin( |