diff options
author | remitamine <remitamine@gmail.com> | 2016-03-05 18:35:48 +0100 |
---|---|---|
committer | remitamine <remitamine@gmail.com> | 2016-03-05 18:35:48 +0100 |
commit | a9793f58a1971e3f458be01200df485f4e9b0bda (patch) | |
tree | b2079a2fa3c44641e81dc18e17e0276eb0c15155 | |
parent | 7177fd24f8ed262024501c1020e5791c811fbfe4 (diff) | |
parent | 1d4c9ed90ca8c01fcfdb838bafaa201c04bd599e (diff) |
Merge pull request #8754 from remitamine/5min
Update info extraction for 5min-related web sites and add support for AOL Features (features.aol.com).
-rw-r--r-- | youtube_dl/extractor/__init__.py | 5 | ||||
-rw-r--r-- | youtube_dl/extractor/aol.py | 70 | ||||
-rw-r--r-- | youtube_dl/extractor/engadget.py | 25 | ||||
-rw-r--r-- | youtube_dl/extractor/fivemin.py | 51 |
4 files changed, 60 insertions, 91 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 08b3dc673..899bf8114 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -23,7 +23,10 @@ from .alphaporno import AlphaPornoIE from .animeondemand import AnimeOnDemandIE from .anitube import AnitubeIE from .anysex import AnySexIE -from .aol import AolIE +from .aol import ( + AolIE, + AolFeaturesIE, +) from .allocine import AllocineIE from .aparat import AparatIE from .appleconnect import AppleConnectIE diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index b51eafc45..b761b2cc4 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -1,24 +1,11 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor class AolIE(InfoExtractor): IE_NAME = 'on.aol.com' - _VALID_URL = r'''(?x) - (?: - aol-video:| - http://on\.aol\.com/ - (?: - video/.*-| - playlist/(?P<playlist_display_id>[^/?#]+?)-(?P<playlist_id>[0-9]+)[?#].*_videoid= - ) - ) - (?P<id>[0-9]+) - (?:$|\?) - ''' + _VALID_URL = r'(?:aol-video:|http://on\.aol\.com/video/.*-)(?P<id>[0-9]+)(?:$|\?)' _TESTS = [{ 'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img', @@ -29,42 +16,31 @@ class AolIE(InfoExtractor): 'title': 'U.S. 
Official Warns Of \'Largest Ever\' IRS Phone Scam', }, 'add_ie': ['FiveMin'], - }, { - 'url': 'http://on.aol.com/playlist/brace-yourself---todays-weirdest-news-152147?icid=OnHomepageC4_Omg_Img#_videoid=518184316', - 'info_dict': { - 'id': '152147', - 'title': 'Brace Yourself - Today\'s Weirdest News', - }, - 'playlist_mincount': 10, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - playlist_id = mobj.group('playlist_id') - if not playlist_id or self._downloader.params.get('noplaylist'): - return self.url_result('5min:%s' % video_id) + video_id = self._match_id(url) + return self.url_result('5min:%s' % video_id) - self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - webpage = self._download_webpage(url, playlist_id) - title = self._html_search_regex( - r'<h1 class="video-title[^"]*">(.+?)</h1>', webpage, 'title') - playlist_html = self._search_regex( - r"(?s)<ul\s+class='video-related[^']*'>(.*?)</ul>", webpage, - 'playlist HTML') - entries = [{ - '_type': 'url', - 'url': 'aol-video:%s' % m.group('id'), - 'ie_key': 'Aol', - } for m in re.finditer( - r"<a\s+href='.*videoid=(?P<id>[0-9]+)'\s+class='video-thumb'>", - playlist_html)] +class AolFeaturesIE(InfoExtractor): + IE_NAME = 'features.aol.com' + _VALID_URL = r'http://features\.aol\.com/video/(?P<id>[^/?#]+)' - return { - '_type': 'playlist', - 'id': playlist_id, - 'display_id': mobj.group('playlist_display_id'), - 'title': title, - 'entries': entries, - } + _TESTS = [{ + 'url': 'http://features.aol.com/video/behind-secret-second-careers-late-night-talk-show-hosts', + 'md5': '7db483bb0c09c85e241f84a34238cc75', + 'info_dict': { + 'id': '519507715', + 'ext': 'mp4', + 'title': 'What To Watch - February 17, 2016', + }, + 'add_ie': ['FiveMin'], + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + return 
self.url_result(self._search_regex( + r'<script type="text/javascript" src="(https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js[^"]+)"', + webpage, '5min embed url'), 'FiveMin') diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py index e4180701d..e5e57d485 100644 --- a/youtube_dl/extractor/engadget.py +++ b/youtube_dl/extractor/engadget.py @@ -1,21 +1,13 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import ( - url_basename, -) class EngadgetIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://www.engadget.com/ - (?:video(?:/5min)?/(?P<id>\d+)| - [\d/]+/.*?) - ''' + _VALID_URL = r'https?://www.engadget.com/video/(?P<id>\d+)' _TEST = { - 'url': 'http://www.engadget.com/video/5min/518153925/', + 'url': 'http://www.engadget.com/video/518153925/', 'md5': 'c6820d4828a5064447a4d9fc73f312c9', 'info_dict': { 'id': '518153925', @@ -27,15 +19,4 @@ class EngadgetIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - - if video_id is not None: - return self.url_result('5min:%s' % video_id) - else: - title = url_basename(url) - webpage = self._download_webpage(url, title) - ids = re.findall(r'<iframe[^>]+?playList=(\d+)', webpage) - return { - '_type': 'playlist', - 'title': title, - 'entries': [self.url_result('5min:%s' % vid) for vid in ids] - } + return self.url_result('5min:%s' % video_id) diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py index 2955965d9..67d50a386 100644 --- a/youtube_dl/extractor/fivemin.py +++ b/youtube_dl/extractor/fivemin.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_urllib_parse, @@ -16,12 +18,7 @@ from ..utils import ( class FiveMinIE(InfoExtractor): IE_NAME = '5min' - _VALID_URL = r'''(?x) - (?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?:.*?&)?playList=| - https?://(?:(?:massively|www)\.)?joystiq\.com/video/| - 5min:) 
- (?P<id>\d+) - ''' + _VALID_URL = r'(?:5min:(?P<id>\d+)(?::(?P<sid>\d+))?|https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?P<query>.*))' _TESTS = [ { @@ -45,6 +42,7 @@ class FiveMinIE(InfoExtractor): 'title': 'How to Make a Next-Level Fruit Salad', 'duration': 184, }, + 'skip': 'no longer available', }, ] _ERRORS = { @@ -91,20 +89,33 @@ class FiveMinIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + sid = mobj.group('sid') + + if mobj.group('query'): + qs = compat_parse_qs(mobj.group('query')) + if not qs.get('playList'): + raise ExtractorError('Invalid URL', expected=True) + video_id = qs['playList'][0] + if qs.get('sid'): + sid = qs['sid'][0] + embed_url = 'https://embed.5min.com/playerseed/?playList=%s' % video_id - embed_page = self._download_webpage(embed_url, video_id, - 'Downloading embed page') - sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid') - query = compat_urllib_parse.urlencode({ - 'func': 'GetResults', - 'playlist': video_id, - 'sid': sid, - 'isPlayerSeed': 'true', - 'url': embed_url, - }) + if not sid: + embed_page = self._download_webpage(embed_url, video_id, + 'Downloading embed page') + sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid') + response = self._download_json( - 'https://syn.5min.com/handlers/SenseHandler.ashx?' + query, + 'https://syn.5min.com/handlers/SenseHandler.ashx?' 
+ + compat_urllib_parse.urlencode({ + 'func': 'GetResults', + 'playlist': video_id, + 'sid': sid, + 'isPlayerSeed': 'true', + 'url': embed_url, + }), video_id) if not response['success']: raise ExtractorError( @@ -118,9 +129,7 @@ class FiveMinIE(InfoExtractor): parsed_video_url = compat_urllib_parse_urlparse(compat_parse_qs( compat_urllib_parse_urlparse(info['EmbededURL']).query)['videoUrl'][0]) for rendition in info['Renditions']: - if rendition['RenditionType'] == 'm3u8': - formats.extend(self._extract_m3u8_formats(rendition['Url'], video_id, m3u8_id='hls')) - elif rendition['RenditionType'] == 'aac': + if rendition['RenditionType'] == 'aac' or rendition['RenditionType'] == 'm3u8': continue else: rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(path=replace_extension(parsed_video_url.path.replace('//', '/%s/' % rendition['ID']), rendition['RenditionType']))) |