diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 9 | ||||
-rw-r--r-- | youtube_dl/extractor/aenetworks.py (renamed from youtube_dl/extractor/ae.py) | 7 | ||||
-rw-r--r-- | youtube_dl/extractor/atresplayer.py | 5 | ||||
-rw-r--r-- | youtube_dl/extractor/bbc.py | 17 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 6 | ||||
-rw-r--r-- | youtube_dl/extractor/dailymotion.py | 8 | ||||
-rw-r--r-- | youtube_dl/extractor/digiteka.py (renamed from youtube_dl/extractor/ultimedia.py) | 7 | ||||
-rw-r--r-- | youtube_dl/extractor/generic.py | 12 | ||||
-rw-r--r-- | youtube_dl/extractor/lemonde.py | 34 | ||||
-rw-r--r-- | youtube_dl/extractor/letv.py | 80 | ||||
-rw-r--r-- | youtube_dl/extractor/ustream.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/viewster.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/weiqitv.py | 52 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 13 |
14 files changed, 215 insertions, 41 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 2c56797a5..bab3d7b46 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -15,7 +15,7 @@ from .adobetv import ( AdobeTVVideoIE, ) from .adultswim import AdultSwimIE -from .ae import AEIE +from .aenetworks import AENetworksIE from .aftonbladet import AftonbladetIE from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE @@ -333,10 +333,12 @@ from .kuwo import ( from .la7 import LA7IE from .laola1tv import Laola1TvIE from .lecture2go import Lecture2GoIE +from .lemonde import LemondeIE from .letv import ( LetvIE, LetvTvIE, - LetvPlaylistIE + LetvPlaylistIE, + LetvCloudIE, ) from .libsyn import LibsynIE from .lifenews import ( @@ -777,7 +779,7 @@ from .udemy import ( UdemyCourseIE ) from .udn import UDNEmbedIE -from .ultimedia import UltimediaIE +from .digiteka import DigitekaIE from .unistra import UnistraIE from .urort import UrortIE from .ustream import UstreamIE, UstreamChannelIE @@ -856,6 +858,7 @@ from .webofstories import ( WebOfStoriesPlaylistIE, ) from .weibo import WeiboIE +from .weiqitv import WeiqiTVIE from .wimp import WimpIE from .wistia import WistiaIE from .worldstarhiphop import WorldStarHipHopIE diff --git a/youtube_dl/extractor/ae.py b/youtube_dl/extractor/aenetworks.py index 3bc7c12fc..43d7b0523 100644 --- a/youtube_dl/extractor/ae.py +++ b/youtube_dl/extractor/aenetworks.py @@ -4,7 +4,9 @@ from .common import InfoExtractor from ..utils import smuggle_url -class AEIE(InfoExtractor): +class AENetworksIE(InfoExtractor): + IE_NAME = 'aenetworks' + IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network' _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?:[^/]+/)+(?P<id>[^/]+?)(?:$|[?#])' _TESTS = [{ @@ -20,13 +22,14 @@ class AEIE(InfoExtractor): 'skip_download': True, }, 'add_ie': ['ThePlatform'], + 'expected_warnings': ['JSON-LD'], }, { 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', 'info_dict': { 'id': 'eg47EERs_JsZ', 'ext': 'mp4', 'title': "Winter Is Coming", - 'description': 'md5:a40e370925074260b1c8a633c632c63a', + 'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', }, 'params': { # m3u8 download diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 3fb042cea..b8f9ae005 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -132,11 +132,6 @@ class AtresPlayerIE(InfoExtractor): }) formats.append(format_info) - m3u8_url = player.get('urlVideoHls') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, episode_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - timestamp = int_or_none(self._download_webpage( self._TIME_API_URL, video_id, 'Downloading timestamp', fatal=False), 1000, time.time()) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index ce99a34ab..1c493b72d 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -718,19 +718,10 @@ class BBCIE(BBCCoUkIE): webpage = self._download_webpage(url, playlist_id) - timestamp = None - playlist_title = None - playlist_description = None - - ld = self._parse_json( - self._search_regex( - r'(?s)<script type="application/ld\+json">(.+?)</script>', - webpage, 'ld json', default='{}'), - playlist_id, fatal=False) - if ld: - timestamp = parse_iso8601(ld.get('datePublished')) - playlist_title = ld.get('headline') - playlist_description = ld.get('articleBody') + json_ld_info = self._search_json_ld(webpage, playlist_id, default=None) + timestamp = json_ld_info.get('timestamp') + playlist_title = json_ld_info.get('title') + playlist_description = json_ld_info.get('description') if not timestamp: timestamp = parse_iso8601(self._search_regex( diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 92e2e4f43..8da70ae14 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -763,13 +763,13 @@ class InfoExtractor(object): return self._html_search_meta('twitter:player', html, 'twitter card player') - def _search_json_ld(self, html, video_id, fatal=True): + def _search_json_ld(self, html, video_id, **kwargs): json_ld = self._search_regex( r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>', - html, 'JSON-LD', fatal=fatal, group='json_ld') + html, 'JSON-LD', group='json_ld', **kwargs) if not json_ld: return {} - return self._json_ld(json_ld, video_id, fatal=fatal) + return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True)) def _json_ld(self, json_ld, video_id, fatal=True): if isinstance(json_ld, compat_str): diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index b687ec4d6..6e462af69 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -37,7 +37,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor): class DailymotionIE(DailymotionBaseInfoExtractor): - _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)' + _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:embed|swf|#)/)?video/(?P<id>[^/?_]+)' IE_NAME = 'dailymotion' _FORMATS = [ @@ -104,6 +104,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor): { 'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news', 'only_matching': True, + }, + { + 'url': 'http://www.dailymotion.com/swf/video/x3n92nf', + 'only_matching': True, } ] @@ -336,7 +340,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): class DailymotionUserIE(DailymotionPlaylistIE): IE_NAME = 'dailymotion:user' - _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)' _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' _TESTS = [{ 'url': 'https://www.dailymotion.com/user/nqtv', diff --git a/youtube_dl/extractor/ultimedia.py b/youtube_dl/extractor/digiteka.py index 60328123c..7bb79ffda 100644 --- a/youtube_dl/extractor/ultimedia.py +++ b/youtube_dl/extractor/digiteka.py @@ -7,9 +7,9 @@ from .common import InfoExtractor from ..utils import int_or_none -class UltimediaIE(InfoExtractor): +class DigitekaIE(InfoExtractor): _VALID_URL = r'''(?x) - https?://(?:www\.)?ultimedia\.com/ + https?://(?:www\.)?(?:digiteka\.net|ultimedia\.com)/ (?: deliver/ (?P<embed_type> @@ -56,6 +56,9 @@ class UltimediaIE(InfoExtractor): 'timestamp': 1424760500, 'uploader_id': '3rfzk', }, + }, { + 'url': 'https://www.digiteka.net/deliver/generic/iframe/mdtk/01637594/src/lqm3kl/zone/1/showtitle/1/autoplay/yes', + 'only_matching': True, }] @staticmethod diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index b3f8efc80..26d3698c8 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -57,7 +57,7 @@ from .pladform import PladformIE from .videomore import VideomoreIE from .googledrive import GoogleDriveIE from .jwplatform import JWPlatformIE -from .ultimedia import UltimediaIE +from .digiteka import DigitekaIE class GenericIE(InfoExtractor): @@ -1402,7 +1402,7 @@ class GenericIE(InfoExtractor): # Look for embedded Dailymotion player matches = re.findall( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage) + r'<(?:embed|iframe)[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage) if matches: return _playlist_from_matches( matches, lambda m: unescapeHTML(m[1])) @@ -1814,10 +1814,10 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(unescapeHTML(mobj.group('url')), 'ScreenwaveMedia') - # Look for Ulltimedia embeds - ultimedia_url = UltimediaIE._extract_url(webpage) - if ultimedia_url: - return self.url_result(self._proto_relative_url(ultimedia_url), 'Ultimedia') + # Look for Digiteka embeds + digiteka_url = DigitekaIE._extract_url(webpage) + if digiteka_url: + return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key()) # Look for AdobeTVVideo embeds mobj = re.search( diff --git a/youtube_dl/extractor/lemonde.py b/youtube_dl/extractor/lemonde.py new file mode 100644 index 000000000..be66fff03 --- /dev/null +++ b/youtube_dl/extractor/lemonde.py @@ -0,0 +1,34 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class LemondeIE(InfoExtractor): + _VALID_URL = r'https?://(?:.+?\.)?lemonde\.fr/(?:[^/]+/)*(?P<id>[^/]+)\.html' + _TESTS = [{ + 'url': 'http://www.lemonde.fr/police-justice/video/2016/01/19/comprendre-l-affaire-bygmalion-en-cinq-minutes_4849702_1653578.html', + 'md5': '01fb3c92de4c12c573343d63e163d302', + 'info_dict': { + 'id': 'lqm3kl', + 'ext': 'mp4', + 'title': "Comprendre l'affaire Bygmalion en 5 minutes", + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 320, + 'upload_date': '20160119', + 'timestamp': 1453194778, + 'uploader_id': '3pmkp', + }, + }, { + 'url': 'http://redaction.actu.lemonde.fr/societe/video/2016/01/18/calais-debut-des-travaux-de-defrichement-dans-la-jungle_4849233_3224.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + digiteka_url = self._proto_relative_url(self._search_regex( + r'url\s*:\s*(["\'])(?P<url>(?:https?://)?//(?:www\.)?(?:digiteka\.net|ultimedia\.com)/deliver/.+?)\1', + webpage, 'digiteka url', group='url')) + return self.url_result(digiteka_url, 'Digiteka') diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index be648000e..08bdae8a2 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import datetime import re import time +import base64 from .common import InfoExtractor from ..compat import ( @@ -16,7 +17,9 @@ from ..utils import ( parse_iso8601, sanitized_Request, int_or_none, + str_or_none, encode_data_uri, + url_basename, ) @@ -239,3 +242,80 @@ class LetvPlaylistIE(LetvTvIE): }, 'playlist_mincount': 7 }] + + +class LetvCloudIE(InfoExtractor): + IE_DESC = '乐视云' + _VALID_URL = r'https?://yuntv\.letv\.com/bcloud.html\?.+' + + _TESTS = [{ + 'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=467623dedf', + 'md5': '26450599afd64c513bc77030ad15db44', + 'info_dict': { + 'id': 'p7jnfw5hw9_467623dedf', + 'ext': 'mp4', + 'title': 'Video p7jnfw5hw9_467623dedf', + }, + }, { + 'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=ec93197892&pu=2c7cd40209&auto_play=1&gpcflag=1&width=640&height=360', + 'info_dict': { + 'id': 'p7jnfw5hw9_ec93197892', + 'ext': 'mp4', + 'title': 'Video p7jnfw5hw9_ec93197892', + }, + }, { + 'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=187060b6fd', + 'info_dict': { + 'id': 'p7jnfw5hw9_187060b6fd', + 'ext': 'mp4', + 'title': 'Video p7jnfw5hw9_187060b6fd', + }, + }] + + def _real_extract(self, url): + uu_mobj = re.search('uu=([\w]+)', url) + vu_mobj = re.search('vu=([\w]+)', url) + + if not uu_mobj or not vu_mobj: + raise ExtractorError('Invalid URL: %s' % url, expected=True) + + uu = uu_mobj.group(1) + vu = vu_mobj.group(1) + media_id = uu + '_' + vu + + play_json_req = sanitized_Request( + 'http://api.letvcloud.com/gpc.php?cf=html5&sign=signxxxxx&ver=2.2&format=json&' + + 'uu=' + uu + '&vu=' + vu) + play_json = self._download_json(play_json_req, media_id, 'Downloading playJson data') + + if not play_json.get('data'): + if play_json.get('message'): + raise ExtractorError('Letv cloud said: %s' % play_json['message'], expected=True) + elif play_json.get('code'): + raise ExtractorError('Letv cloud returned error %d' % play_json['code'], expected=True) + else: + raise ExtractorError('Letv cloud returned an unknwon error') + + def b64decode(s): + return base64.b64decode(s.encode('utf-8')).decode('utf-8') + + formats = [] + for media in play_json['data']['video_info']['media'].values(): + play_url = media['play_url'] + url = b64decode(play_url['main_url']) + decoded_url = b64decode(url_basename(url)) + formats.append({ + 'url': url, + 'ext': determine_ext(decoded_url), + 'format_id': int_or_none(play_url.get('vtype')), + 'format_note': str_or_none(play_url.get('definition')), + 'width': int_or_none(play_url.get('vwidth')), + 'height': int_or_none(play_url.get('vheight')), + }) + self._sort_formats(formats) + + return { + 'id': media_id, + 'title': 'Video %s' % media_id, + 'formats': formats, + } diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index 73b05ecab..b5fe753d7 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -47,7 +47,7 @@ class UstreamIE(InfoExtractor): m = re.match(self._VALID_URL, url) video_id = m.group('id') - # some sites use this embed format (see: http://github.com/rg3/youtube-dl/issues/2990) + # some sites use this embed format (see: https://github.com/rg3/youtube-dl/issues/2990) if m.group('type') == 'embed/recorded': video_id = m.group('id') desktop_url = 'http://www.ustream.tv/recorded/' + video_id diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 185b1c119..fe94a4793 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -155,10 +155,10 @@ class ViewsterIE(InfoExtractor): self._sort_formats(formats) - synopsis = info.get('Synopsis', {}) + synopsis = info.get('Synopsis') or {} # Prefer title outside synopsis since it's less messy title = (info.get('Title') or synopsis['Title']).strip() - description = synopsis.get('Detailed') or info.get('Synopsis', {}).get('Short') + description = synopsis.get('Detailed') or (info.get('Synopsis') or {}).get('Short') duration = int_or_none(info.get('Duration')) timestamp = parse_iso8601(info.get('ReleaseDate')) diff --git a/youtube_dl/extractor/weiqitv.py b/youtube_dl/extractor/weiqitv.py new file mode 100644 index 000000000..e333ae345 --- /dev/null +++ b/youtube_dl/extractor/weiqitv.py @@ -0,0 +1,52 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class WeiqiTVIE(InfoExtractor): + IE_DESC = 'WQTV' + _VALID_URL = r'http://www\.weiqitv\.com/index/video_play\?videoId=(?P<id>[A-Za-z0-9]+)' + + _TESTS = [{ + 'url': 'http://www.weiqitv.com/index/video_play?videoId=53c744f09874f0e76a8b46f3', + 'md5': '26450599afd64c513bc77030ad15db44', + 'info_dict': { + 'id': '53c744f09874f0e76a8b46f3', + 'ext': 'mp4', + 'title': '2013年度盘点', + }, + }, { + 'url': 'http://www.weiqitv.com/index/video_play?videoId=567379a2d4c36cca518b4569', + 'info_dict': { + 'id': '567379a2d4c36cca518b4569', + 'ext': 'mp4', + 'title': '民国围棋史', + }, + }, { + 'url': 'http://www.weiqitv.com/index/video_play?videoId=5430220a9874f088658b4567', + 'info_dict': { + 'id': '5430220a9874f088658b4567', + 'ext': 'mp4', + 'title': '二路托过的手段和运用', + }, + }] + + def _real_extract(self, url): + media_id = self._match_id(url) + page = self._download_webpage(url, media_id) + + info_json_str = self._search_regex( + 'var\s+video\s*=\s*(.+});', page, 'info json str') + info_json = self._parse_json(info_json_str, media_id) + + letvcloud_url = self._search_regex( + 'var\s+letvurl\s*=\s*"([^"]+)', page, 'letvcloud url') + + return { + '_type': 'url_transparent', + 'ie_key': 'LetvCloud', + 'url': letvcloud_url, + 'title': info_json['name'], + 'id': media_id, + } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e4f227f19..d31161d21 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -613,7 +613,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, 'params': { 'skip_download': 'requires avconv', - } + }, + 'skip': 'This live event has ended.', }, # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097) { @@ -706,6 +707,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, { # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468) + # Also tests cut-off URL expansion in video description (see + # https://github.com/rg3/youtube-dl/issues/1892, + # https://github.com/rg3/youtube-dl/issues/8164) 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg', 'info_dict': { 'id': 'lsguqyKfVQg', @@ -1237,7 +1241,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:[a-zA-Z-]+="[^"]+"\s+)*? (?:title|href)="([^"]+)"\s+ (?:[a-zA-Z-]+="[^"]+"\s+)*? - class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)".*?> + class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*> [^<]+\.{3}\s* </a> ''', r'\1', video_description) @@ -1505,6 +1509,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): for a_format in formats: a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' else: + unavailable_message = self._html_search_regex( + r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>', + video_webpage, 'unavailable message', default=None) + if unavailable_message: + raise ExtractorError(unavailable_message, expected=True) raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') # Look for the DASH manifest |