diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/arte.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/cnn.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/drtv.py | 37 | ||||
-rw-r--r-- | youtube_dl/extractor/empflix.py | 32 | ||||
-rw-r--r-- | youtube_dl/extractor/karrierevideos.py | 96 | ||||
-rw-r--r-- | youtube_dl/extractor/nextmedia.py | 40 | ||||
-rw-r--r-- | youtube_dl/extractor/prosiebensat1.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/rtbf.py | 30 | ||||
-rw-r--r-- | youtube_dl/extractor/rutv.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/tnaflix.py | 34 |
11 files changed, 202 insertions, 81 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 24efb7ce5..79bcd9106 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -244,6 +244,7 @@ from .kaltura import KalturaIE from .kanalplay import KanalPlayIE from .kankan import KankanIE from .karaoketv import KaraoketvIE +from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE @@ -338,8 +339,7 @@ from .newstube import NewstubeIE from .nextmedia import ( NextMediaIE, NextMediaActionNewsIE, - AppleDailyRealtimeNewsIE, - AppleDailyAnimationNewsIE + AppleDailyIE, ) from .nfb import NFBIE from .nfl import NFLIE diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 8273bd6c9..fce38248d 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -195,7 +195,9 @@ class ArteTVFutureIE(ArteTVPlus7IE): def _real_extract(self, url): anchor_id, lang = self._extract_url_info(url) webpage = self._download_webpage(url, anchor_id) - row = get_element_by_id(anchor_id, webpage) + row = self._search_regex( + r'(?s)id="%s"[^>]*>.+?(<div[^>]*arte_vp_url[^>]*>)' % anchor_id, + webpage, 'row') return self._extract_from_webpage(row, anchor_id, lang) diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 5efc5f4fe..3b1bd4033 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -12,7 +12,7 @@ from ..utils import ( class CNNIE(InfoExtractor): _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/ - (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z]{3,5})(?:-ap)?|(?=&)))''' + (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))''' _TESTS = [{ 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index f25ab319e..baa24c6d1 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -1,8 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor, ExtractorError -from ..utils import parse_iso8601 +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + parse_iso8601, +) class DRTVIE(InfoExtractor): @@ -60,19 +63,31 @@ class DRTVIE(InfoExtractor): restricted_to_denmark = asset['RestrictedToDenmark'] spoken_subtitles = asset['Target'] == 'SpokenSubtitles' for link in asset['Links']: - target = link['Target'] uri = link['Uri'] + target = link['Target'] format_id = target - preference = -1 if target == 'HDS' else -2 + preference = None if spoken_subtitles: - preference -= 2 + preference = -1 format_id += '-spoken-subtitles' - formats.append({ - 'url': uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43' if target == 'HDS' else uri, - 'format_id': format_id, - 'ext': link['FileFormat'], - 'preference': preference, - }) + if target == 'HDS': + formats.extend(self._extract_f4m_formats( + uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', + video_id, preference, f4m_id=format_id)) + elif target == 'HLS': + formats.extend(self._extract_m3u8_formats( + uri, video_id, 'mp4', preference=preference, + m3u8_id=format_id)) + else: + bitrate = link.get('Bitrate') + if bitrate: + format_id += '-%s' % bitrate + formats.append({ + 'url': uri, + 'format_id': format_id, + 'tbr': bitrate, + 'ext': link.get('FileFormat'), + }) subtitles_list = asset.get('SubtitlesList') if isinstance(subtitles_list, list): LANGS = { diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py index 70f8efe27..9a5a8f4bb 100644 --- a/youtube_dl/extractor/empflix.py +++ b/youtube_dl/extractor/empflix.py @@ -4,22 +4,28 @@ from .tnaflix import TNAFlixIE class EMPFlixIE(TNAFlixIE): - _VALID_URL = r'^https?://www\.empflix\.com/videos/(?P<display_id>[0-9a-zA-Z-]+)-(?P<id>[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P<display_id>.+?)-(?P<id>[0-9]+)\.html' _TITLE_REGEX = r'name="title" value="(?P<title>[^"]*)"' _DESCRIPTION_REGEX = r'name="description" value="([^"]*)"' _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"' - _TEST = { - 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', - 'md5': 'b1bc15b6412d33902d6e5952035fcabc', - 'info_dict': { - 'id': '33051', - 'display_id': 'Amateur-Finger-Fuck', - 'ext': 'mp4', - 'title': 'Amateur Finger Fuck', - 'description': 'Amateur solo finger fucking.', - 'thumbnail': 're:https?://.*\.jpg$', - 'age_limit': 18, + _TESTS = [ + { + 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', + 'md5': 'b1bc15b6412d33902d6e5952035fcabc', + 'info_dict': { + 'id': '33051', + 'display_id': 'Amateur-Finger-Fuck', + 'ext': 'mp4', + 'title': 'Amateur Finger Fuck', + 'description': 'Amateur solo finger fucking.', + 'thumbnail': 're:https?://.*\.jpg$', + 'age_limit': 18, + } + }, + { + 'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', + 'matching_only': True, } - } + ] diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py new file mode 100644 index 000000000..bed94bc93 --- /dev/null +++ b/youtube_dl/extractor/karrierevideos.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + fix_xml_ampersands, + float_or_none, + xpath_with_ns, + xpath_text, +) + + +class KarriereVideosIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin', + 'info_dict': { + 'id': '32c91', + 'ext': 'flv', + 'title': 'AltenpflegerIn', + 'description': 'md5:dbadd1259fde2159a9b28667cb664ae2', + 'thumbnail': 're:^http://.*\.png', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # broken ampersands + 'url': 'http://www.karrierevideos.at/orientierung/vaeterkarenz-und-neue-chancen-fuer-muetter-baby-was-nun', + 'info_dict': { + 'id': '5sniu', + 'ext': 'flv', + 'title': 'Väterkarenz und neue Chancen für Mütter - "Baby - was nun?"', + 'description': 'md5:97092c6ad1fd7d38e9d6a5fdeb2bcc33', + 'thumbnail': 're:^http://.*\.png', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = (self._html_search_meta('title', webpage, default=None) or + self._search_regex(r'<h1 class="title">([^<]+)</h1>')) + + video_id = self._search_regex( + r'/config/video/(.+?)\.xml', webpage, 'video id') + playlist = self._download_xml( + 'http://www.karrierevideos.at/player-playlist.xml.php?p=%s' % video_id, + video_id, transform_source=fix_xml_ampersands) + + NS_MAP = { + 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats' + } + + def ns(path): + return xpath_with_ns(path, NS_MAP) + + item = playlist.find('./tracklist/item') + video_file = xpath_text( + item, ns('./jwplayer:file'), 'video url', fatal=True) + streamer = xpath_text( + item, ns('./jwplayer:streamer'), 'streamer', fatal=True) + + uploader = xpath_text( + item, ns('./jwplayer:author'), 'uploader') + duration = float_or_none( + xpath_text(item, ns('./jwplayer:duration'), 'duration')) + + description = self._html_search_regex( + r'(?s)<div class="leadtext">(.+?)</div>', + webpage, 'description') + + thumbnail = self._html_search_meta( + 'thumbnail', webpage, 'thumbnail') + if thumbnail: + thumbnail = compat_urlparse.urljoin(url, thumbnail) + + return { + 'id': video_id, + 'url': streamer.replace('rtmpt', 'rtmp'), + 'play_path': 'mp4:%s' % video_file, + 'ext': 'flv', + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'duration': duration, + } diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py index 02dba4ef6..d1b7cff4c 100644 --- a/youtube_dl/extractor/nextmedia.py +++ b/youtube_dl/extractor/nextmedia.py @@ -89,8 +89,8 @@ class NextMediaActionNewsIE(NextMediaIE): return self._extract_from_nextmedia_page(news_id, url, article_page) -class AppleDailyRealtimeNewsIE(NextMediaIE): - _VALID_URL = r'http://(www|ent).appledaily.com.tw/(realtimenews|enews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' +class AppleDailyIE(NextMediaIE): + _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' _TESTS = [{ 'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694', 'md5': 'a843ab23d150977cc55ef94f1e2c1e4d', @@ -99,7 +99,7 @@ class AppleDailyRealtimeNewsIE(NextMediaIE): 'ext': 'mp4', 'title': '周亭羽走過摩鐵陰霾2男陪吃 九把刀孤寒看醫生', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'md5:b23787119933404ce515c6356a8c355c', + 'description': 'md5:2acd430e59956dc47cd7f67cb3c003f4', 'upload_date': '20150128', } }, { @@ -110,26 +110,10 @@ class AppleDailyRealtimeNewsIE(NextMediaIE): 'ext': 'mp4', 'title': '不滿被踩腳 山東兩大媽一路打下車', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'md5:2648aaf6fc4f401f6de35a91d111aa1d', + 'description': 'md5:175b4260c1d7c085993474217e4ab1b4', 'upload_date': '20150128', } - }] - - _URL_PATTERN = r'\{url: \'(.+)\'\}' - - def _fetch_title(self, page): - return self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title') - - def _fetch_thumbnail(self, page): - return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False) - - def _fetch_timestamp(self, page): - return None - - -class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE): - _VALID_URL = 'http://www.appledaily.com.tw/animation/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' - _TESTS = [{ + }, { 'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003671', 'md5': '03df296d95dedc2d5886debbb80cb43f', 'info_dict': { @@ -154,10 +138,22 @@ class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE): 'expected_warnings': [ 'video thumbnail', ] + }, { + 'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/', + 'only_matching': True, }] + _URL_PATTERN = r'\{url: \'(.+)\'\}' + def _fetch_title(self, page): - return self._html_search_meta('description', page, 'news title') + return (self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title', default=None) or + self._html_search_meta('description', page, 'news title')) + + def _fetch_thumbnail(self, page): + return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False) + + def _fetch_timestamp(self, page): + return None def _fetch_description(self, page): return self._html_search_meta('description', page, 'news description') diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 7cc799664..255d4abc1 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -17,7 +17,7 @@ from ..utils import ( class ProSiebenSat1IE(InfoExtractor): IE_NAME = 'prosiebensat1' IE_DESC = 'ProSiebenSat.1 Digital' - _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P<id>.+)' + _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at)|ran\.de|fem\.com)/(?P<id>.+)' _TESTS = [ { diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index dce64e151..5a381d9ce 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -1,10 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import json - from .common import InfoExtractor +from ..utils import ( + int_or_none, + unescapeHTML, +) class RTBFIE(InfoExtractor): @@ -16,25 +17,24 @@ class RTBFIE(InfoExtractor): 'id': '1921274', 'ext': 'mp4', 'title': 'Les Diables au coeur (épisode 2)', - 'description': 'Football - Diables Rouges', 'duration': 3099, - 'timestamp': 1398456336, - 'upload_date': '20140425', } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) - page = self._download_webpage('https://www.rtbf.be/video/embed?id=%s' % video_id, video_id) + webpage = self._download_webpage( + 'http://www.rtbf.be/video/embed?id=%s' % video_id, video_id) - data = json.loads(self._html_search_regex( - r'<div class="js-player-embed(?: player-embed)?" data-video="([^"]+)"', page, 'data video'))['data'] + data = self._parse_json( + unescapeHTML(self._search_regex( + r'data-video="([^"]+)"', webpage, 'data video')), + video_id) video_url = data.get('downloadUrl') or data.get('url') - if data['provider'].lower() == 'youtube': + if data.get('provider').lower() == 'youtube': return self.url_result(video_url, 'Youtube') return { @@ -42,8 +42,8 @@ class RTBFIE(InfoExtractor): 'url': video_url, 'title': data['title'], 'description': data.get('description') or data.get('subtitle'), - 'thumbnail': data['thumbnail']['large'], + 'thumbnail': data.get('thumbnail'), 'duration': data.get('duration') or data.get('realDuration'), - 'timestamp': data['created'], - 'view_count': data['viewCount'], + 'timestamp': int_or_none(data.get('created')), + 'view_count': int_or_none(data.get('viewCount')), } diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py index 55604637d..d9df06861 100644 --- a/youtube_dl/extractor/rutv.py +++ b/youtube_dl/extractor/rutv.py @@ -104,7 +104,7 @@ class RUTVIE(InfoExtractor): @classmethod def _extract_url(cls, webpage): mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.rutv\.ru/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage) + r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage) if mobj: return mobj.group('url') diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index d48cbbf14..59af9aba0 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -10,26 +10,32 @@ from ..utils import ( class TNAFlixIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/(?P<cat_id>[\w-]+)/(?P<display_id>[\w-]+)/video(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)' _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>' _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>' _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"' - _TEST = { - 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', - 'md5': 'ecf3498417d09216374fc5907f9c6ec0', - 'info_dict': { - 'id': '553878', - 'display_id': 'Carmella-Decesare-striptease', - 'ext': 'mp4', - 'title': 'Carmella Decesare - striptease', - 'description': '', - 'thumbnail': 're:https?://.*\.jpg$', - 'duration': 91, - 'age_limit': 18, + _TESTS = [ + { + 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', + 'md5': 'ecf3498417d09216374fc5907f9c6ec0', + 'info_dict': { + 'id': '553878', + 'display_id': 'Carmella-Decesare-striptease', + 'ext': 'mp4', + 'title': 'Carmella Decesare - striptease', + 'description': '', + 'thumbnail': 're:https?://.*\.jpg$', + 'duration': 91, + 'age_limit': 18, + } + }, + { + 'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632', + 'matching_only': True, } - } + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) |