diff options
Diffstat (limited to 'youtube_dl/extractor')
91 files changed, 2443 insertions, 1223 deletions
diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py index 100cf997f..d57ad85c2 100644 --- a/youtube_dl/extractor/adobepass.py +++ b/youtube_dl/extractor/adobepass.py @@ -36,6 +36,11 @@ MSO_INFO = { 'username_field': 'Ecom_User_ID', 'password_field': 'Ecom_Password', }, + 'Brighthouse': { + 'name': 'Bright House Networks | Spectrum', + 'username_field': 'j_username', + 'password_field': 'j_password', + }, 'Charter_Direct': { 'name': 'Charter Spectrum', 'username_field': 'IDToken1', @@ -1308,6 +1313,12 @@ class AdobePassIE(InfoExtractor): _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' _MVPD_CACHE = 'ap-mvpd' + def _download_webpage_handle(self, *args, **kwargs): + headers = kwargs.get('headers', {}) + headers.update(self.geo_verification_headers()) + kwargs['headers'] = headers + return super(AdobePassIE, self)._download_webpage_handle(*args, **kwargs) + @staticmethod def _get_mvpd_resource(provider_id, title, guid, rating): channel = etree.Element('channel') diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 989505c82..acc4ce38d 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -5,91 +5,52 @@ import re from .turner import TurnerBaseIE from ..utils import ( - ExtractorError, int_or_none, + strip_or_none, ) class AdultSwimIE(TurnerBaseIE): - _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<is_playlist>playlists/)?(?P<show_path>[^/]+)/(?P<episode_path>[^/?#]+)/?' + _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<show_path>[^/?#]+)(?:/(?P<episode_path>[^/?#]+))?' 
_TESTS = [{ 'url': 'http://adultswim.com/videos/rick-and-morty/pilot', - 'playlist': [ - { - 'md5': '247572debc75c7652f253c8daa51a14d', - 'info_dict': { - 'id': 'rQxZvXQ4ROaSOqq-or2Mow-0', - 'ext': 'flv', - 'title': 'Rick and Morty - Pilot Part 1', - 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. " - }, - }, - { - 'md5': '77b0e037a4b20ec6b98671c4c379f48d', - 'info_dict': { - 'id': 'rQxZvXQ4ROaSOqq-or2Mow-3', - 'ext': 'flv', - 'title': 'Rick and Morty - Pilot Part 4', - 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. " - }, - }, - ], 'info_dict': { 'id': 'rQxZvXQ4ROaSOqq-or2Mow', + 'ext': 'mp4', 'title': 'Rick and Morty - Pilot', - 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. " + 'description': 'Rick moves in with his daughter\'s family and establishes himself as a bad influence on his grandson, Morty.', + 'timestamp': 1493267400, + 'upload_date': '20170427', }, - 'skip': 'This video is only available for registered users', - }, { - 'url': 'http://www.adultswim.com/videos/playlists/american-parenting/putting-francine-out-of-business/', - 'playlist': [ - { - 'md5': '2eb5c06d0f9a1539da3718d897f13ec5', - 'info_dict': { - 'id': '-t8CamQlQ2aYZ49ItZCFog-0', - 'ext': 'flv', - 'title': 'American Dad - Putting Francine Out of Business', - 'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].' - }, - } - ], - 'info_dict': { - 'id': '-t8CamQlQ2aYZ49ItZCFog', - 'title': 'American Dad - Putting Francine Out of Business', - 'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].' 
+ 'params': { + # m3u8 download + 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/', - 'playlist': [ - { - 'md5': '3e346a2ab0087d687a05e1e7f3b3e529', - 'info_dict': { - 'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0', - 'ext': 'mp4', - 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', - 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', - }, - } - ], 'info_dict': { 'id': 'sY3cMUR_TbuE4YmdjzbIcQ', + 'ext': 'mp4', 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', - 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', + 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.', + 'upload_date': '20080124', + 'timestamp': 1201150800, }, 'params': { # m3u8 download 'skip_download': True, - } + }, }, { - # heroMetadata.trailer 'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/', 'info_dict': { 'id': 'I0LQFQkaSUaFp8PnAWHhoQ', 'ext': 'mp4', 'title': 'Decker - Inside Decker: A New Hero', - 'description': 'md5:c916df071d425d62d70c86d4399d3ee0', - 'duration': 249.008, + 'description': 'The guys recap the conclusion of the season. 
They announce a new hero, take a peek into the Victorville Film Archive and welcome back the talented James Dean.', + 'timestamp': 1469480460, + 'upload_date': '20160725', }, 'params': { # m3u8 download @@ -97,136 +58,102 @@ class AdultSwimIE(TurnerBaseIE): }, 'expected_warnings': ['Unable to download f4m manifest'], }, { - 'url': 'http://www.adultswim.com/videos/toonami/friday-october-14th-2016/', + 'url': 'http://www.adultswim.com/videos/attack-on-titan', + 'info_dict': { + 'id': 'b7A69dzfRzuaXIECdxW8XQ', + 'title': 'Attack on Titan', + 'description': 'md5:6c8e003ea0777b47013e894767f5e114', + }, + 'playlist_mincount': 12, + }, { + 'url': 'http://www.adultswim.com/videos/streams/williams-stream', 'info_dict': { - 'id': 'eYiLsKVgQ6qTC6agD67Sig', - 'title': 'Toonami - Friday, October 14th, 2016', - 'description': 'md5:99892c96ffc85e159a428de85c30acde', + 'id': 'd8DEBj7QRfetLsRgFnGEyg', + 'ext': 'mp4', + 'title': r're:^Williams Stream \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'description': 'original programming', }, - 'playlist': [{ - 'md5': '', - 'info_dict': { - 'id': 'eYiLsKVgQ6qTC6agD67Sig', - 'ext': 'mp4', - 'title': 'Toonami - Friday, October 14th, 2016', - 'description': 'md5:99892c96ffc85e159a428de85c30acde', - }, - }], 'params': { # m3u8 download 'skip_download': True, }, - 'expected_warnings': ['Unable to download f4m manifest'], }] - @staticmethod - def find_video_info(collection, slug): - for video in collection.get('videos'): - if video.get('slug') == slug: - return video - - @staticmethod - def find_collection_by_linkURL(collections, linkURL): - for collection in collections: - if collection.get('linkURL') == linkURL: - return collection - - @staticmethod - def find_collection_containing_video(collections, slug): - for collection in collections: - for video in collection.get('videos'): - if video.get('slug') == slug: - return collection, video - return None, None - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - show_path = 
mobj.group('show_path') - episode_path = mobj.group('episode_path') - is_playlist = True if mobj.group('is_playlist') else False - - webpage = self._download_webpage(url, episode_path) - - # Extract the value of `bootstrappedData` from the Javascript in the page. - bootstrapped_data = self._parse_json(self._search_regex( - r'var bootstrappedData = ({.*});', webpage, 'bootstraped data'), episode_path) - - # Downloading videos from a /videos/playlist/ URL needs to be handled differently. - # NOTE: We are only downloading one video (the current one) not the playlist - if is_playlist: - collections = bootstrapped_data['playlists']['collections'] - collection = self.find_collection_by_linkURL(collections, show_path) - video_info = self.find_video_info(collection, episode_path) - - show_title = video_info['showTitle'] - segment_ids = [video_info['videoPlaybackID']] + show_path, episode_path = re.match(self._VALID_URL, url).groups() + display_id = episode_path or show_path + webpage = self._download_webpage(url, display_id) + initial_data = self._parse_json(self._search_regex( + r'AS_INITIAL_DATA(?:__)?\s*=\s*({.+?});', + webpage, 'initial data'), display_id) + + is_stream = show_path == 'streams' + if is_stream: + if not episode_path: + episode_path = 'live-stream' + + video_data = next(stream for stream_path, stream in initial_data['streams'].items() if stream_path == episode_path) + video_id = video_data.get('stream') + + if not video_id: + entries = [] + for episode in video_data.get('archiveEpisodes', []): + episode_url = episode.get('url') + if not episode_url: + continue + entries.append(self.url_result( + episode_url, 'AdultSwim', episode.get('id'))) + return self.playlist_result( + entries, video_data.get('id'), video_data.get('title'), + strip_or_none(video_data.get('description'))) else: - collections = bootstrapped_data['show']['collections'] - collection, video_info = self.find_collection_containing_video(collections, episode_path) - # Video wasn't found in 
the collections, let's try `slugged_video`. - if video_info is None: - if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path: - video_info = bootstrapped_data['slugged_video'] - if not video_info: - video_info = bootstrapped_data.get( - 'heroMetadata', {}).get('trailer', {}).get('video') - if not video_info: - video_info = bootstrapped_data.get('onlineOriginals', [None])[0] - if not video_info: - raise ExtractorError('Unable to find video info') - - show = bootstrapped_data['show'] - show_title = show['title'] - stream = video_info.get('stream') - if stream and stream.get('videoPlaybackID'): - segment_ids = [stream['videoPlaybackID']] - elif video_info.get('clips'): - segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']] - elif video_info.get('videoPlaybackID'): - segment_ids = [video_info['videoPlaybackID']] - elif video_info.get('id'): - segment_ids = [video_info['id']] - else: - if video_info.get('auth') is True: - raise ExtractorError( - 'This video is only available via cable service provider subscription that' - ' is not currently supported. 
You may want to use --cookies.', expected=True) - else: - raise ExtractorError('Unable to find stream or clips') - - episode_id = video_info['id'] - episode_title = video_info['title'] - episode_description = video_info.get('description') - episode_duration = int_or_none(video_info.get('duration')) - view_count = int_or_none(video_info.get('views')) + show_data = initial_data['show'] + + if not episode_path: + entries = [] + for video in show_data.get('videos', []): + slug = video.get('slug') + if not slug: + continue + entries.append(self.url_result( + 'http://adultswim.com/videos/%s/%s' % (show_path, slug), + 'AdultSwim', video.get('id'))) + return self.playlist_result( + entries, show_data.get('id'), show_data.get('title'), + strip_or_none(show_data.get('metadata', {}).get('description'))) + + video_data = show_data['sluggedVideo'] + video_id = video_data['id'] + + info = self._extract_cvp_info( + 'http://www.adultswim.com/videos/api/v0/assets?platform=desktop&id=' + video_id, + video_id, { + 'secure': { + 'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big', + 'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do', + }, + }, { + 'url': url, + 'site_name': 'AdultSwim', + 'auth_required': video_data.get('auth'), + }) - entries = [] - for part_num, segment_id in enumerate(segment_ids): - segement_info = self._extract_cvp_info( - 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=desktop' % segment_id, - segment_id, { - 'secure': { - 'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big', - 'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do', - }, - }) - segment_title = '%s - %s' % (show_title, episode_title) - if len(segment_ids) > 1: - segment_title += ' Part %d' % (part_num + 1) - segement_info.update({ - 'id': segment_id, - 'title': segment_title, - 'description': episode_description, + info.update({ + 'id': video_id, + 'display_id': 
display_id, + 'description': info.get('description') or strip_or_none(video_data.get('description')), + }) + if not is_stream: + info.update({ + 'duration': info.get('duration') or int_or_none(video_data.get('duration')), + 'timestamp': info.get('timestamp') or int_or_none(video_data.get('launch_date')), + 'season_number': info.get('season_number') or int_or_none(video_data.get('season_number')), + 'episode': info['title'], + 'episode_number': info.get('episode_number') or int_or_none(video_data.get('episode_number')), }) - entries.append(segement_info) - return { - '_type': 'playlist', - 'id': episode_id, - 'display_id': episode_path, - 'entries': entries, - 'title': '%s - %s' % (show_title, episode_title), - 'description': episode_description, - 'duration': episode_duration, - 'view_count': view_count, - } + info['series'] = video_data.get('collection_title') or info.get('series') + if info['series'] and info['series'] != info['title']: + info['title'] = '%s - %s' % (info['series'], info['title']) + + return info diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index c01c67303..2dcdba9d2 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -101,10 +101,14 @@ class AENetworksIE(AENetworksBaseIE): for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage): entries.append(self.url_result( compat_urlparse.urljoin(url, season_url_path), 'AENetworks')) - return self.playlist_result( - entries, self._html_search_meta('aetn:SeriesId', webpage), - self._html_search_meta('aetn:SeriesTitle', webpage)) - elif url_parts_len == 2: + if entries: + return self.playlist_result( + entries, self._html_search_meta('aetn:SeriesId', webpage), + self._html_search_meta('aetn:SeriesTitle', webpage)) + else: + # single season + url_parts_len = 2 + if url_parts_len == 2: entries = [] for episode_item in 
re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage): episode_attributes = extract_attributes(episode_item) @@ -112,7 +116,7 @@ class AENetworksIE(AENetworksBaseIE): url, episode_attributes['data-canonical']) entries.append(self.url_result( episode_url, 'AENetworks', - episode_attributes['data-videoid'])) + episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id'))) return self.playlist_result( entries, self._html_search_meta('aetn:SeasonId', webpage)) diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index 78d29c861..c8cb91dcb 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -207,11 +207,10 @@ class AfreecaTVIE(InfoExtractor): file_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', note='Downloading part %d m3u8 information' % file_num) - title = title if one else '%s (part %d)' % (title, file_num) file_info = common_entry.copy() file_info.update({ 'id': format_id, - 'title': title, + 'title': title if one else '%s (part %d)' % (title, file_num), 'upload_date': upload_date, 'duration': file_duration, 'formats': formats, diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index 388e578d5..c68be3134 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -4,9 +4,9 @@ from .common import InfoExtractor class AlJazeeraIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/programmes/.*?/(?P<id>[^/]+)\.html' + _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?:programmes|video)/.*?/(?P<id>[^/]+)\.html' - _TEST = { + _TESTS = [{ 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html', 'info_dict': { 'id': '3792260579001', @@ -19,7 +19,10 @@ class AlJazeeraIE(InfoExtractor): }, 'add_ie': ['BrightcoveNew'], 'skip': 'Not accessible from Travis CI server', - } + }, { + 'url': 
'http://www.aljazeera.com/video/news/2017/05/sierra-leone-709-carat-diamond-auctioned-170511100111930.html', + 'only_matching': True, + }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s' def _real_extract(self, url): diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index 98f8e69cd..fde1a8ff7 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -34,9 +34,12 @@ class AMPIE(InfoExtractor): if isinstance(media_thumbnail, dict): media_thumbnail = [media_thumbnail] for thumbnail_data in media_thumbnail: - thumbnail = thumbnail_data['@attributes'] + thumbnail = thumbnail_data.get('@attributes', {}) + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue thumbnails.append({ - 'url': self._proto_relative_url(thumbnail['url'], 'http:'), + 'url': self._proto_relative_url(thumbnail_url, 'http:'), 'width': int_or_none(thumbnail.get('width')), 'height': int_or_none(thumbnail.get('height')), }) @@ -47,9 +50,14 @@ class AMPIE(InfoExtractor): if isinstance(media_subtitle, dict): media_subtitle = [media_subtitle] for subtitle_data in media_subtitle: - subtitle = subtitle_data['@attributes'] - lang = subtitle.get('lang') or 'en' - subtitles[lang] = [{'url': subtitle['href']}] + subtitle = subtitle_data.get('@attributes', {}) + subtitle_href = subtitle.get('href') + if not subtitle_href: + continue + subtitles.setdefault(subtitle.get('lang') or 'en', []).append({ + 'url': subtitle_href, + 'ext': mimetype2ext(subtitle.get('type')) or determine_ext(subtitle_href), + }) formats = [] media_content = get_media_node('content') diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py index 623f44dce..8023da702 100644 --- a/youtube_dl/extractor/anvato.py +++ b/youtube_dl/extractor/anvato.py @@ -5,6 +5,7 @@ import base64 import hashlib import json import random +import re import time from .common import InfoExtractor @@ -16,6 +17,7 @@ from ..utils 
import ( intlist_to_bytes, int_or_none, strip_jsonp, + unescapeHTML, ) @@ -26,6 +28,8 @@ def md5_text(s): class AnvatoIE(InfoExtractor): + _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)' + # Copied from anvplayer.min.js _ANVACK_TABLE = { 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ', @@ -114,6 +118,22 @@ class AnvatoIE(InfoExtractor): 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ' } + _MCP_TO_ACCESS_KEY_TABLE = { + 'qa': 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922', + 'lin': 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749', + 'univison': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa', + 'uni': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa', + 'dev': 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a', + 'sps': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336', + 'spsstg': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336', + 'anv': 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3', + 'gray': 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900', + 'hearst': 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99', + 'cbs': 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe', + 'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582' + } + + _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1' _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' def __init__(self, *args, **kwargs): @@ -178,12 +198,7 @@ class AnvatoIE(InfoExtractor): } if ext == 'm3u8' or media_format in ('m3u8', 'm3u8-variant'): - # Not using _extract_m3u8_formats here as individual media - # playlists are also included in published_urls. 
- if tbr is None: - formats.append(self._m3u8_meta_format(video_url, ext='mp4', m3u8_id='hls')) - continue - else: + if tbr is not None: a_format.update({ 'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])), 'ext': 'mp4', @@ -222,9 +237,42 @@ class AnvatoIE(InfoExtractor): 'subtitles': subtitles, } + @staticmethod + def _extract_urls(ie, webpage, video_id): + entries = [] + for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage): + anvplayer_data = ie._parse_json( + mobj.group('anvp'), video_id, transform_source=unescapeHTML, + fatal=False) + if not anvplayer_data: + continue + video = anvplayer_data.get('video') + if not isinstance(video, compat_str) or not video.isdigit(): + continue + access_key = anvplayer_data.get('accessKey') + if not access_key: + mcp = anvplayer_data.get('mcp') + if mcp: + access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get( + mcp.lower()) + if not access_key: + continue + entries.append(ie.url_result( + 'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(), + video_id=video)) + return entries + def _extract_anvato_videos(self, webpage, video_id): - anvplayer_data = self._parse_json(self._html_search_regex( - r'<script[^>]+data-anvp=\'([^\']+)\'', webpage, - 'Anvato player data'), video_id) + anvplayer_data = self._parse_json( + self._html_search_regex( + self._ANVP_RE, webpage, 'Anvato player data', group='anvp'), + video_id) return self._get_anvato_videos( anvplayer_data['accessKey'], anvplayer_data['video']) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + access_key, video_id = mobj.group('access_key_or_mcp', 'id') + if access_key not in self._ANVACK_TABLE: + access_key = self._MCP_TO_ACCESS_KEY_TABLE[access_key] + return self._get_anvato_videos(access_key, video_id) diff --git a/youtube_dl/extractor/appleconnect.py b/youtube_dl/extractor/appleconnect.py index ea7a70393..a84b8b1eb 100644 --- a/youtube_dl/extractor/appleconnect.py +++ b/youtube_dl/extractor/appleconnect.py @@ -12,13 +12,13 @@ class 
AppleConnectIE(InfoExtractor): _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)' _TEST = { 'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3', - 'md5': '10d0f2799111df4cb1c924520ca78f98', + 'md5': 'e7c38568a01ea45402570e6029206723', 'info_dict': { 'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3', 'ext': 'm4v', 'title': 'Energy', 'uploader': 'Drake', - 'thumbnail': 'http://is5.mzstatic.com/image/thumb/Video5/v4/78/61/c5/7861c5fa-ad6d-294b-1464-cf7605b911d6/source/1920x1080sr.jpg', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20150710', 'timestamp': 1436545535, }, diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index a6801f3d4..b45b431e1 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -70,7 +70,8 @@ class AppleTrailersIE(InfoExtractor): }, { 'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/', 'info_dict': { - 'id': 'blackthorn', + 'id': '4489', + 'title': 'Blackthorn', }, 'playlist_mincount': 2, 'expected_warnings': ['Unable to download JSON metadata'], @@ -261,7 +262,7 @@ class AppleTrailersSectionIE(InfoExtractor): 'title': 'Most Popular', 'id': 'mostpopular', }, - 'playlist_mincount': 80, + 'playlist_mincount': 30, }, { 'url': 'http://trailers.apple.com/#section=moviestudios', 'info_dict': { diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index e21045bed..3c7d7250b 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -24,12 +24,12 @@ class ArchiveOrgIE(InfoExtractor): } }, { 'url': 'https://archive.org/details/Cops1922', - 'md5': 'bc73c8ab3838b5a8fc6c6651fa7b58ba', + 'md5': '0869000b4ce265e8ca62738b336b268a', 'info_dict': { 'id': 'Cops1922', 'ext': 'mp4', 'title': 'Buster Keaton\'s "Cops" (1922)', - 'description': 'md5:b4544662605877edd99df22f9620d858', + 'description': 'md5:89e7c77bf5d965dd5c0372cfb49470f6', } }, { 
'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 99af6dc5a..01fa308ff 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -36,7 +36,7 @@ class AtresPlayerIE(InfoExtractor): }, { 'url': 'http://www.atresplayer.com/television/especial/videoencuentros/temporada-1/capitulo-112-david-bustamante_2014121600375.html', - 'md5': '0d0e918533bbd4b263f2de4d197d4aac', + 'md5': '6e52cbb513c405e403dbacb7aacf8747', 'info_dict': { 'id': 'capitulo-112-david-bustamante', 'ext': 'flv', diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py index 8fc5f65c6..e48bb8972 100644 --- a/youtube_dl/extractor/audioboom.py +++ b/youtube_dl/extractor/audioboom.py @@ -16,7 +16,7 @@ class AudioBoomIE(InfoExtractor): 'title': '3/09/2016 Czaban Hour 3', 'description': 'Guest: Nate Davis - NFL free agency, Guest: Stan Gans', 'duration': 2245.72, - 'uploader': 'Steve Czaban', + 'uploader': 'SB Nation A.M.', 'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio', } }, { diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 056e06376..489d0ba53 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -34,12 +34,12 @@ class BandcampIE(InfoExtractor): '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', - 'md5': '73d0b3171568232574e45652f8720b5c', + 'md5': '0369ace6b939f0927e62c67a1a8d9fa7', 'info_dict': { 'id': '2650410135', - 'ext': 'mp3', - 'title': 'Lanius (Battle)', - 'uploader': 'Ben Prunty Music', + 'ext': 'aiff', + 'title': 'Ben Prunty - Lanius (Battle)', + 'uploader': 'Ben Prunty', }, }] @@ -47,6 +47,7 @@ class BandcampIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) title = mobj.group('title') webpage = 
self._download_webpage(url, title) + thumbnail = self._html_search_meta('og:image', webpage, default=None) m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) if not m_download: m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage) @@ -75,6 +76,7 @@ class BandcampIE(InfoExtractor): return { 'id': track_id, 'title': data['title'], + 'thumbnail': thumbnail, 'formats': formats, 'duration': float_or_none(data.get('duration')), } @@ -143,7 +145,7 @@ class BandcampIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'thumbnail': info.get('thumb_url'), + 'thumbnail': info.get('thumb_url') or thumbnail, 'uploader': info.get('artist'), 'artist': artist, 'track': track, diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index b0b7914d8..d5c5822f2 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -16,7 +16,7 @@ class BeegIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?beeg\.com/(?P<id>\d+)' _TEST = { 'url': 'http://beeg.com/5416503', - 'md5': '46c384def73b33dbc581262e5ee67cef', + 'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820', 'info_dict': { 'id': '5416503', 'ext': 'mp4', diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 80dd8382e..1e3f25515 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -122,6 +122,11 @@ class BiliBiliIE(InfoExtractor): 'preference': -2 if 'hd.mp4' in backup_url else -3, }) + for a_format in formats: + a_format.setdefault('http_headers', {}).update({ + 'Referer': url, + }) + self._sort_formats(formats) entries.append({ diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py index 7a8e1f60b..e829974ff 100644 --- a/youtube_dl/extractor/bleacherreport.py +++ b/youtube_dl/extractor/bleacherreport.py @@ -35,7 +35,7 @@ class BleacherReportIE(InfoExtractor): 'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo', 'timestamp': 1446839961, 
'uploader': 'Sean Fay', - 'description': 'md5:825e94e0f3521df52fa83b2ed198fa20', + 'description': 'md5:b1601e2314c4d8eec23b6eafe086a757', 'uploader_id': 6466954, 'upload_date': '20151011', }, @@ -90,17 +90,13 @@ class BleacherReportCMSIE(AMPIE): _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36})' _TESTS = [{ 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1', - 'md5': '8c2c12e3af7805152675446c905d159b', + 'md5': '2e4b0a997f9228ffa31fada5c53d1ed1', 'info_dict': { 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Cena vs. Rollins Would Expose the Heavyweight Division', 'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index ff0aa11b1..2c32b6ae2 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -77,7 +77,7 @@ class BRIE(InfoExtractor): 'description': 'md5:bb659990e9e59905c3d41e369db1fbe3', 'duration': 893, 'uploader': 'Eva Maria Steimle', - 'upload_date': '20140117', + 'upload_date': '20170208', } }, ] diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 97602ca30..0ed59bcbc 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -5,6 +5,7 @@ import re import json from .common import InfoExtractor +from .adobepass import AdobePassIE from ..compat import ( compat_etree_fromstring, compat_parse_qs, @@ -448,7 +449,7 @@ class BrightcoveLegacyIE(InfoExtractor): return info -class BrightcoveNewIE(InfoExtractor): +class BrightcoveNewIE(AdobePassIE): IE_NAME = 'brightcove:new' _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+|ref:[^&]+)' _TESTS = [{ @@ -522,7 +523,7 @@ class 
BrightcoveNewIE(InfoExtractor): # [2] looks like: for video, script_tag, account_id, player_id, embed in re.findall( r'''(?isx) - (<video\s+[^>]*data-video-id=['"]?[^>]+>) + (<video\s+[^>]*\bdata-video-id\s*=\s*['"]?[^>]+>) (?:.*? (<script[^>]+ src=["\'](?:https?:)?//players\.brightcove\.net/ @@ -602,6 +603,20 @@ class BrightcoveNewIE(InfoExtractor): raise ExtractorError(message, expected=True) raise + errors = json_data.get('errors') + if errors and errors[0].get('error_subcode') == 'TVE_AUTH': + custom_fields = json_data['custom_fields'] + tve_token = self._extract_mvpd_auth( + smuggled_data['source_url'], video_id, + custom_fields['bcadobepassrequestorid'], + custom_fields['bcadobepassresourceid']) + json_data = self._download_json( + api_url, video_id, headers={ + 'Accept': 'application/json;pk=%s' % policy_key + }, query={ + 'tveToken': tve_token, + }) + title = json_data['name'].strip() formats = [] @@ -667,7 +682,6 @@ class BrightcoveNewIE(InfoExtractor): }) formats.append(f) - errors = json_data.get('errors') if not formats and errors: error = errors[0] raise ExtractorError( @@ -684,7 +698,7 @@ class BrightcoveNewIE(InfoExtractor): is_live = False duration = float_or_none(json_data.get('duration'), 1000) - if duration and duration < 0: + if duration is not None and duration <= 0: is_live = True return { diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index f1f128c45..acd87e371 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -16,13 +16,10 @@ class Canalc2IE(InfoExtractor): 'md5': '060158428b650f896c542dfbb3d6487f', 'info_dict': { 'id': '12163', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Terrasses du Numérique', 'duration': 122, }, - 'params': { - 'skip_download': True, # Requires rtmpdump - } }, { 'url': 'http://archives-canalc2.u-strasbg.fr/video.asp?idVideo=11427&voir=oui', 'only_matching': True, diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 
cf678e7f8..87ad14e91 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -96,6 +96,7 @@ class CBCIE(InfoExtractor): 'info_dict': { 'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks', 'id': 'dog-indoor-exercise-winter-1.3928238', + 'description': 'md5:c18552e41726ee95bd75210d1ca9194c', }, 'playlist_mincount': 6, }] @@ -165,12 +166,11 @@ class CBCPlayerIE(InfoExtractor): 'uploader': 'CBCC-NEW', }, }, { - # available only when we add `formats=MPEG4,FLV,MP3` to theplatform url 'url': 'http://www.cbc.ca/player/play/2164402062', - 'md5': '17a61eb813539abea40618d6323a7f82', + 'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6', 'info_dict': { 'id': '2164402062', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Cancer survivor four times over', 'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.', 'timestamp': 1320410746, diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py index 8d5f11dd1..7d78e3aae 100644 --- a/youtube_dl/extractor/cbslocal.py +++ b/youtube_dl/extractor/cbslocal.py @@ -60,8 +60,8 @@ class CBSLocalIE(AnvatoIE): 'title': 'A Very Blue Anniversary', 'description': 'CBS2’s Cindy Hsu has more.', 'thumbnail': 're:^https?://.*', - 'timestamp': 1479962220, - 'upload_date': '20161124', + 'timestamp': int, + 'upload_date': r're:^\d{8}$', 'uploader': 'CBS', 'subtitles': { 'en': 'mincount:5', diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 1ee35b501..78b7a923c 100755 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -9,7 +9,10 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + multipart_encode, parse_duration, + random_birthday, + urljoin, ) @@ -27,7 +30,8 @@ class CDAIE(InfoExtractor): 'description': 'md5:269ccd135d550da90d1662651fcb9772', 'thumbnail': r're:^https?://.*\.jpg$', 'average_rating': float, - 'duration': 39 + 'duration': 39, + 'age_limit': 0, } }, { 'url': 
'http://www.cda.pl/video/57413289', @@ -41,13 +45,41 @@ class CDAIE(InfoExtractor): 'uploader': 'crash404', 'view_count': int, 'average_rating': float, - 'duration': 137 + 'duration': 137, + 'age_limit': 0, } }, { + # Age-restricted + 'url': 'http://www.cda.pl/video/1273454c4', + 'info_dict': { + 'id': '1273454c4', + 'ext': 'mp4', + 'title': 'Bronson (2008) napisy HD 1080p', + 'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c', + 'height': 1080, + 'uploader': 'boniek61', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 5554, + 'age_limit': 18, + 'view_count': int, + 'average_rating': float, + }, + }, { 'url': 'http://ebd.cda.pl/0x0/5749950c', 'only_matching': True, }] + def _download_age_confirm_page(self, url, video_id, *args, **kwargs): + form_data = random_birthday('rok', 'miesiac', 'dzien') + form_data.update({'return': url, 'module': 'video', 'module_id': video_id}) + data, content_type = multipart_encode(form_data) + return self._download_webpage( + urljoin(url, '/a/validatebirth'), video_id, *args, + data=data, headers={ + 'Referer': url, + 'Content-Type': content_type, + }, **kwargs) + def _real_extract(self, url): video_id = self._match_id(url) self._set_cookie('cda.pl', 'cda.player', 'html5') @@ -57,6 +89,13 @@ class CDAIE(InfoExtractor): if 'Ten film jest dostępny dla użytkowników premium' in webpage: raise ExtractorError('This video is only available for premium users.', expected=True) + need_confirm_age = False + if self._html_search_regex(r'(<form[^>]+action="/a/validatebirth")', + webpage, 'birthday validate form', default=None): + webpage = self._download_age_confirm_page( + url, video_id, note='Confirming age') + need_confirm_age = True + formats = [] uploader = self._search_regex(r'''(?x) @@ -81,6 +120,7 @@ class CDAIE(InfoExtractor): 'thumbnail': self._og_search_thumbnail(webpage), 'formats': formats, 'duration': None, + 'age_limit': 18 if need_confirm_age else 0, } def extract_format(page, version): @@ -121,7 +161,12 @@ class 
CDAIE(InfoExtractor): for href, resolution in re.findall( r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)', webpage): - webpage = self._download_webpage( + if need_confirm_age: + handler = self._download_age_confirm_page + else: + handler = self._download_webpage + + webpage = handler( self._BASE_URL + href, video_id, 'Downloading %s version information' % resolution, fatal=False) if not webpage: @@ -129,6 +174,7 @@ class CDAIE(InfoExtractor): # invalid version is requested. self.report_warning('Unable to download %s version information' % resolution) continue + extract_format(webpage, resolution) self._sort_formats(formats) diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index bb52e0c6f..0920f6219 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -12,7 +12,7 @@ class ClipfishIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.clipfish.de/special/ugly-americans/video/4343170/s01-e01-ugly-americans-date-in-der-hoelle/', - 'md5': '720563e467b86374c194bdead08d207d', + 'md5': 'b9a5dc46294154c1193e2d10e0c95693', 'info_dict': { 'id': '4343170', 'ext': 'mp4', diff --git a/youtube_dl/extractor/collegerama.py b/youtube_dl/extractor/collegerama.py index 18c734766..6a41db87c 100644 --- a/youtube_dl/extractor/collegerama.py +++ b/youtube_dl/extractor/collegerama.py @@ -21,7 +21,7 @@ class CollegeRamaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.', 'description': '', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$', 'duration': 7713.088, 'timestamp': 1413309600, 'upload_date': '20141014', @@ -35,6 +35,7 @@ class CollegeRamaIE(InfoExtractor): 'ext': 'wmv', 'title': '64ste Vakantiecursus: Afvalwater', 'description': 'md5:7fd774865cc69d972f542b157c328305', + 'thumbnail': 
r're:^https?://.*\.jpg(?:\?.*?)?$', 'duration': 10853, 'timestamp': 1326446400, 'upload_date': '20120113', diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 8b3f04c61..fec39da8b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -245,6 +245,10 @@ class InfoExtractor(object): specified in the URL. end_time: Time in seconds where the reproduction should end, as specified in the URL. + chapters: A list of dictionaries, with the following entries: + * "start_time" - The start time of the chapter in seconds + * "end_time" - The end time of the chapter in seconds + * "title" (optional, string) The following fields should only be used when the video belongs to some logical chapter or section: @@ -990,6 +994,7 @@ class InfoExtractor(object): 'tbr': int_or_none(e.get('bitrate')), 'width': int_or_none(e.get('width')), 'height': int_or_none(e.get('height')), + 'view_count': int_or_none(e.get('interactionCount')), }) for e in json_ld: @@ -1334,7 +1339,7 @@ class InfoExtractor(object): if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access return [] - formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)] + formats = [] format_url = lambda u: ( u @@ -1386,6 +1391,7 @@ class InfoExtractor(object): f = { 'format_id': '-'.join(format_id), 'url': format_url(media_url), + 'manifest_url': m3u8_url, 'language': media.get('LANGUAGE'), 'ext': ext, 'protocol': entry_protocol, @@ -1438,7 +1444,7 @@ class InfoExtractor(object): f = { 'format_id': '-'.join(format_id), 'url': manifest_url, - 'manifest_url': manifest_url, + 'manifest_url': m3u8_url, 'tbr': tbr, 'ext': ext, 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')), @@ -1995,6 +2001,12 @@ class InfoExtractor(object): compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id) def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): + """ + Parse formats from ISM manifest. + References: + 1. 
[MS-SSTR]: Smooth Streaming Protocol, + https://msdn.microsoft.com/en-us/library/ff469518.aspx + """ if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None: return [] @@ -2016,8 +2028,11 @@ class InfoExtractor(object): self.report_warning('%s is not a supported codec' % fourcc) continue tbr = int(track.attrib['Bitrate']) // 1000 - width = int_or_none(track.get('MaxWidth')) - height = int_or_none(track.get('MaxHeight')) + # [1] does not mention Width and Height attributes. However, + # they're often present while MaxWidth and MaxHeight are + # missing, so should be used as fallbacks + width = int_or_none(track.get('MaxWidth') or track.get('Width')) + height = int_or_none(track.get('MaxHeight') or track.get('Height')) sampling_rate = int_or_none(track.get('SamplingRate')) track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern) @@ -2168,7 +2183,7 @@ class InfoExtractor(object): def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): formats = [] hdcore_sign = 'hdcore=3.7.0' - f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') + f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') hds_host = hosts.get('hds') if hds_host: f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url) @@ -2190,8 +2205,9 @@ class InfoExtractor(object): def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) - url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url') - http_base_url = 'http' + url_base + url_base = self._search_regex( + r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url') + http_base_url = '%s:%s' % ('http', url_base) formats = [] if 'm3u8' not in skip_protocols: formats.extend(self._extract_m3u8_formats( @@ -2225,7 +2241,7 @@ class 
InfoExtractor(object): for protocol in ('rtmp', 'rtsp'): if protocol not in skip_protocols: formats.append({ - 'url': protocol + url_base, + 'url': '%s:%s' % (protocol, url_base), 'format_id': protocol, 'protocol': protocol, }) diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index d3463b874..0c3f0c0e4 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -16,7 +16,6 @@ from ..utils import ( mimetype2ext, orderedSet, parse_iso8601, - remove_end, ) @@ -50,10 +49,17 @@ class CondeNastIE(InfoExtractor): 'wmagazine': 'W Magazine', } - _VALID_URL = r'https?://(?:video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed(?:js)?)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys()) + _VALID_URL = r'''(?x)https?://(?:video|www|player(?:-backend)?)\.(?:%s)\.com/ + (?: + (?: + embed(?:js)?| + (?:script|inline)/video + )/(?P<id>[0-9a-f]{24})(?:/(?P<player_id>[0-9a-f]{24}))?(?:.+?\btarget=(?P<target>[^&]+))?| + (?P<type>watch|series|video)/(?P<display_id>[^/?#]+) + )''' % '|'.join(_SITES.keys()) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) - EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed(?:js)?)/.+?' % '|'.join(_SITES.keys()) + EMBED_URL = r'(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?' 
% '|'.join(_SITES.keys()) _TESTS = [{ 'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', @@ -89,6 +95,12 @@ class CondeNastIE(InfoExtractor): 'upload_date': '20150916', 'timestamp': 1442434955, } + }, { + 'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player', + 'only_matching': True, + }, { + 'url': 'http://player-backend.cnevids.com/script/video/59138decb57ac36b83000005.js', + 'only_matching': True, }] def _extract_series(self, url, webpage): @@ -104,7 +116,7 @@ class CondeNastIE(InfoExtractor): entries = [self.url_result(build_url(path), 'CondeNast') for path in paths] return self.playlist_result(entries, playlist_title=title) - def _extract_video(self, webpage, url_type): + def _extract_video_params(self, webpage): query = {} params = self._search_regex( r'(?s)var params = {(.+?)}[;,]', webpage, 'player params', default=None) @@ -123,17 +135,30 @@ class CondeNastIE(InfoExtractor): 'playerId': params['data-player'], 'target': params['id'], }) - video_id = query['videoId'] + return query + + def _extract_video(self, params): + video_id = params['videoId'] + video_info = None - info_page = self._download_json( - 'http://player.cnevids.com/player/video.js', - video_id, 'Downloading video info', fatal=False, query=query) - if info_page: - video_info = info_page.get('video') - if not video_info: + if params.get('playerId'): + info_page = self._download_json( + 'http://player.cnevids.com/player/video.js', + video_id, 'Downloading video info', fatal=False, query=params) + if info_page: + video_info = info_page.get('video') + if not video_info: + info_page = self._download_webpage( + 'http://player.cnevids.com/player/loader.js', + video_id, 'Downloading loader info', query=params) + else: info_page = self._download_webpage( - 'http://player.cnevids.com/player/loader.js', - video_id, 'Downloading loader info', query=query) + 'https://player.cnevids.com/inline/video/%s.js' % video_id, + video_id, 'Downloading 
inline info', query={ + 'target': params.get('target', 'embedplayer') + }) + + if not video_info: video_info = self._parse_json( self._search_regex( r'(?s)var\s+config\s*=\s*({.+?});', info_page, 'config'), @@ -161,9 +186,7 @@ class CondeNastIE(InfoExtractor): }) self._sort_formats(formats) - info = self._search_json_ld( - webpage, video_id, fatal=False) if url_type != 'embed' else {} - info.update({ + return { 'id': video_id, 'formats': formats, 'title': title, @@ -174,22 +197,26 @@ class CondeNastIE(InfoExtractor): 'series': video_info.get('series_title'), 'season': video_info.get('season_title'), 'timestamp': parse_iso8601(video_info.get('premiere_date')), - }) - return info + 'categories': video_info.get('categories'), + } def _real_extract(self, url): - site, url_type, item_id = re.match(self._VALID_URL, url).groups() + video_id, player_id, target, url_type, display_id = re.match(self._VALID_URL, url).groups() - # Convert JS embed to regular embed - if url_type == 'embedjs': - parsed_url = compat_urlparse.urlparse(url) - url = compat_urlparse.urlunparse(parsed_url._replace( - path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/'))) - url_type = 'embed' + if video_id: + return self._extract_video({ + 'videoId': video_id, + 'playerId': player_id, + 'target': target, + }) - webpage = self._download_webpage(url, item_id) + webpage = self._download_webpage(url, display_id) if url_type == 'series': return self._extract_series(url, webpage) else: - return self._extract_video(webpage, url_type) + params = self._extract_video_params(webpage) + info = self._search_json_ld( + webpage, display_id, fatal=False) + info.update(self._extract_video(params)) + return info diff --git a/youtube_dl/extractor/coub.py b/youtube_dl/extractor/coub.py index 5fa1f006b..6ea03e65c 100644 --- a/youtube_dl/extractor/coub.py +++ b/youtube_dl/extractor/coub.py @@ -24,12 +24,11 @@ class CoubIE(InfoExtractor): 'duration': 4.6, 'timestamp': 1428527772, 'upload_date': '20150408', 
- 'uploader': 'Артём Лоскутников', + 'uploader': 'Artyom Loskutnikov', 'uploader_id': 'artyom.loskutnikov', 'view_count': int, 'like_count': int, 'repost_count': int, - 'comment_count': int, 'age_limit': 0, }, }, { @@ -118,7 +117,6 @@ class CoubIE(InfoExtractor): view_count = int_or_none(coub.get('views_count') or coub.get('views_increase_count')) like_count = int_or_none(coub.get('likes_count')) repost_count = int_or_none(coub.get('recoubs_count')) - comment_count = int_or_none(coub.get('comments_count')) age_restricted = coub.get('age_restricted', coub.get('age_restricted_by_admin')) if age_restricted is not None: @@ -137,7 +135,6 @@ class CoubIE(InfoExtractor): 'view_count': view_count, 'like_count': like_count, 'repost_count': repost_count, - 'comment_count': comment_count, 'age_limit': age_limit, 'formats': formats, } diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index f919ed208..13f425b2b 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -21,9 +21,10 @@ class CrackleIE(InfoExtractor): 'season_number': 8, 'episode_number': 4, 'subtitles': { - 'en-US': [{ - 'ext': 'ttml', - }] + 'en-US': [ + {'ext': 'vtt'}, + {'ext': 'tt'}, + ] }, }, 'params': { diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 2ed8b30bb..2ffa4a7f8 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -171,7 +171,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'info_dict': { 'id': '727589', 'ext': 'mp4', - 'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance from this Judicial Injustice!", + 'title': "KONOSUBA -God's blessing on this wonderful world! 
2 Episode 1 – Give Me Deliverance From This Judicial Injustice!", 'description': 'md5:cbcf05e528124b0f3a0a419fc805ea7d', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Kadokawa Pictures Inc.', @@ -179,7 +179,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'series': "KONOSUBA -God's blessing on this wonderful world!", 'season': "KONOSUBA -God's blessing on this wonderful world! 2", 'season_number': 2, - 'episode': 'Give Me Deliverance from this Judicial Injustice!', + 'episode': 'Give Me Deliverance From This Judicial Injustice!', 'episode_number': 1, }, 'params': { diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index d4576160b..171820e27 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -10,6 +10,7 @@ from ..utils import ( smuggle_url, determine_ext, ExtractorError, + extract_attributes, ) from .senateisvp import SenateISVPIE from .ustream import UstreamIE @@ -68,6 +69,7 @@ class CSpanIE(InfoExtractor): 'uploader_id': '12987475', }, }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' def _real_extract(self, url): video_id = self._match_id(url) @@ -78,6 +80,19 @@ class CSpanIE(InfoExtractor): if ustream_url: return self.url_result(ustream_url, UstreamIE.ie_key()) + if '&vod' not in url: + bc = self._search_regex( + r"(<[^>]+id='brightcove-player-embed'[^>]+>)", + webpage, 'brightcove embed', default=None) + if bc: + bc_attr = extract_attributes(bc) + bc_url = self.BRIGHTCOVE_URL_TEMPLATE % ( + bc_attr.get('data-bcaccountid', '3162030207001'), + bc_attr.get('data-noprebcplayerid', 'SyGGpuJy3g'), + bc_attr.get('data-newbcplayerid', 'default'), + bc_attr['data-bcid']) + return self.url_result(smuggle_url(bc_url, {'source_url': url})) + # We first look for clipid, because clipprog always appears before patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')] results = list(filter(None, (re.search(p, webpage) for p in patterns))) diff --git 
a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py index 98c835bf1..538565c66 100644 --- a/youtube_dl/extractor/dailymail.py +++ b/youtube_dl/extractor/dailymail.py @@ -2,9 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( int_or_none, determine_protocol, + try_get, unescapeHTML, ) @@ -28,8 +30,14 @@ class DailyMailIE(InfoExtractor): video_data = self._parse_json(self._search_regex( r"data-opts='({.+?})'", webpage, 'video data'), video_id) title = unescapeHTML(video_data['title']) - video_sources = self._download_json(video_data.get( - 'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id) + + sources_url = (try_get( + video_data, + (lambda x: x['plugins']['sources']['url'], + lambda x: x['sources']['url']), compat_str) or + 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id) + + video_sources = self._download_json(sources_url, video_id) formats = [] for rendition in video_sources['renditions']: diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 246efde43..f8db76c18 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -38,7 +38,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor): class DailymotionIE(DailymotionBaseInfoExtractor): - _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:embed|swf|#)/)?video/(?P<id>[^/?_]+)' + _VALID_URL = r'(?i)https?://(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|#)/)?video|swf)/(?P<id>[^/?_]+)' IE_NAME = 'dailymotion' _FORMATS = [ @@ -49,68 +49,82 @@ class DailymotionIE(DailymotionBaseInfoExtractor): ('stream_h264_hd1080_url', 'hd180'), ] - _TESTS = [ - { - 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', - 'md5': 
'2137c41a8e78554bb09225b8eb322406', - 'info_dict': { - 'id': 'x2iuewm', - 'ext': 'mp4', - 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News', - 'description': 'Several come bundled with the Steam Controller.', - 'thumbnail': r're:^https?:.*\.(?:jpg|png)$', - 'duration': 74, - 'timestamp': 1425657362, - 'upload_date': '20150306', - 'uploader': 'IGN', - 'uploader_id': 'xijv66', - 'age_limit': 0, - 'view_count': int, - } + _TESTS = [{ + 'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news', + 'md5': '074b95bdee76b9e3654137aee9c79dfe', + 'info_dict': { + 'id': 'x5kesuj', + 'ext': 'mp4', + 'title': 'Office Christmas Party Review – Jason Bateman, Olivia Munn, T.J. Miller', + 'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. Miller', + 'thumbnail': r're:^https?:.*\.(?:jpg|png)$', + 'duration': 187, + 'timestamp': 1493651285, + 'upload_date': '20170501', + 'uploader': 'Deadline', + 'uploader_id': 'x1xm8ri', + 'age_limit': 0, + 'view_count': int, + }, + }, { + 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', + 'md5': '2137c41a8e78554bb09225b8eb322406', + 'info_dict': { + 'id': 'x2iuewm', + 'ext': 'mp4', + 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News', + 'description': 'Several come bundled with the Steam Controller.', + 'thumbnail': r're:^https?:.*\.(?:jpg|png)$', + 'duration': 74, + 'timestamp': 1425657362, + 'upload_date': '20150306', + 'uploader': 'IGN', + 'uploader_id': 'xijv66', + 'age_limit': 0, + 'view_count': int, }, + 'skip': 'video gone', + }, { # Vevo video - { - 'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi', - 'info_dict': { - 'title': 'Roar (Official)', - 'id': 'USUV71301934', - 'ext': 'mp4', - 'uploader': 'Katy Perry', - 'upload_date': '20130905', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 
'VEVO is only available in some countries', + 'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi', + 'info_dict': { + 'title': 'Roar (Official)', + 'id': 'USUV71301934', + 'ext': 'mp4', + 'uploader': 'Katy Perry', + 'upload_date': '20130905', + }, + 'params': { + 'skip_download': True, }, + 'skip': 'VEVO is only available in some countries', + }, { # age-restricted video - { - 'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband', - 'md5': '0d667a7b9cebecc3c89ee93099c4159d', - 'info_dict': { - 'id': 'xyh2zz', - 'ext': 'mp4', - 'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]', - 'uploader': 'HotWaves1012', - 'age_limit': 18, - }, - 'skip': 'video gone', + 'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband', + 'md5': '0d667a7b9cebecc3c89ee93099c4159d', + 'info_dict': { + 'id': 'xyh2zz', + 'ext': 'mp4', + 'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]', + 'uploader': 'HotWaves1012', + 'age_limit': 18, }, + 'skip': 'video gone', + }, { # geo-restricted, player v5 - { - 'url': 'http://www.dailymotion.com/video/xhza0o', - 'only_matching': True, - }, + 'url': 'http://www.dailymotion.com/video/xhza0o', + 'only_matching': True, + }, { # with subtitles - { - 'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news', - 'only_matching': True, - }, - { - 'url': 'http://www.dailymotion.com/swf/video/x3n92nf', - 'only_matching': True, - } - ] + 'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news', + 'only_matching': True, + }, { + 'url': 'http://www.dailymotion.com/swf/video/x3n92nf', + 'only_matching': True, + }, { + 'url': 'http://www.dailymotion.com/swf/x3ss1m_funny-magic-trick-barry-and-stuart_fun', + 'only_matching': True, + 
}] @staticmethod def _extract_urls(webpage): diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index bdfe638b4..5c9c0ecdc 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -21,7 +21,8 @@ class DemocracynowIE(InfoExtractor): 'info_dict': { 'id': '2015-0703-001', 'ext': 'mp4', - 'title': 'Daily Show', + 'title': 'Daily Show for July 03, 2015', + 'description': 'md5:80eb927244d6749900de6072c7cc2c86', }, }, { 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', diff --git a/youtube_dl/extractor/dotsub.py b/youtube_dl/extractor/dotsub.py index 1f75352ca..148605c0b 100644 --- a/youtube_dl/extractor/dotsub.py +++ b/youtube_dl/extractor/dotsub.py @@ -35,7 +35,7 @@ class DotsubIE(InfoExtractor): 'thumbnail': 're:^https?://dotsub.com/media/747bcf58-bd59-45b7-8c8c-ac312d084ee6/p', 'duration': 290, 'timestamp': 1476767794.2809999, - 'upload_date': '20160525', + 'upload_date': '20161018', 'uploader': 'parthivi001', 'uploader_id': 'user52596202', 'view_count': int, diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index 82d8a042f..9757f4422 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -3,11 +3,14 @@ from __future__ import unicode_literals import time import hashlib +import re from .common import InfoExtractor from ..utils import ( ExtractorError, unescapeHTML, + unified_strdate, + urljoin, ) @@ -20,7 +23,7 @@ class DouyuTVIE(InfoExtractor): 'id': '17732', 'display_id': 'iseven', 'ext': 'flv', - 'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': r're:.*m7show@163\.com.*', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': '7师傅', @@ -51,7 +54,7 @@ class DouyuTVIE(InfoExtractor): 'id': '17732', 'display_id': '17732', 'ext': 'flv', - 'title': 're:^清晨醒脑!T-ARA根本停不下来! 
[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': r're:.*m7show@163\.com.*', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': '7师傅', @@ -117,3 +120,82 @@ class DouyuTVIE(InfoExtractor): 'uploader': uploader, 'is_live': True, } + + +class DouyuShowIE(InfoExtractor): + _VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)' + + _TESTS = [{ + 'url': 'https://v.douyu.com/show/rjNBdvnVXNzvE2yw', + 'md5': '0c2cfd068ee2afe657801269b2d86214', + 'info_dict': { + 'id': 'rjNBdvnVXNzvE2yw', + 'ext': 'mp4', + 'title': '陈一发儿:砒霜 我有个室友系列!04-01 22点场', + 'duration': 7150.08, + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': '陈一发儿', + 'uploader_id': 'XrZwYelr5wbK', + 'uploader_url': 'https://v.douyu.com/author/XrZwYelr5wbK', + 'upload_date': '20170402', + }, + }, { + 'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw', + 'only_matching': True, + }] + + def _real_extract(self, url): + url = url.replace('vmobile.', 'v.') + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + room_info = self._parse_json(self._search_regex( + r'var\s+\$ROOM\s*=\s*({.+});', webpage, 'room info'), video_id) + + video_info = None + + for trial in range(5): + # Sometimes Douyu rejects our request. 
Let's try it more times + try: + video_info = self._download_json( + 'https://vmobile.douyu.com/video/getInfo', video_id, + query={'vid': video_id}, + headers={ + 'Referer': url, + 'x-requested-with': 'XMLHttpRequest', + }) + break + except ExtractorError: + self._sleep(1, video_id) + + if not video_info: + raise ExtractorError('Can\'t fetch video info') + + formats = self._extract_m3u8_formats( + video_info['data']['video_url'], video_id, + entry_protocol='m3u8_native', ext='mp4') + + upload_date = unified_strdate(self._html_search_regex( + r'<em>上传时间:</em><span>([^<]+)</span>', webpage, + 'upload date', fatal=False)) + + uploader = uploader_id = uploader_url = None + mobj = re.search( + r'(?m)<a[^>]+href="/author/([0-9a-zA-Z]+)".+?<strong[^>]+title="([^"]+)"', + webpage) + if mobj: + uploader_id, uploader = mobj.groups() + uploader_url = urljoin(url, '/author/' + uploader_id) + + return { + 'id': video_id, + 'title': room_info['name'], + 'formats': formats, + 'duration': room_info.get('duration'), + 'thumbnail': room_info.get('pic'), + 'upload_date': upload_date, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + } diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index e4917014a..c84624f1e 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -20,7 +20,7 @@ class DRTVIE(InfoExtractor): IE_NAME = 'drtv' _TESTS = [{ 'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10', - 'md5': '25e659cccc9a2ed956110a299fdf5983', + 'md5': '7ae17b4e18eb5d29212f424a7511c184', 'info_dict': { 'id': 'klassen-darlig-taber-10', 'ext': 'mp4', @@ -30,21 +30,37 @@ class DRTVIE(InfoExtractor): 'upload_date': '20160823', 'duration': 606.84, }, - 'params': { - 'skip_download': True, - }, }, { + # embed 'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang', - 'md5': '2c37175c718155930f939ef59952474a', 'info_dict': { 'id': 
'christiania-pusher-street-ryddes-drdkrjpo', 'ext': 'mp4', 'title': 'LIVE Christianias rydning af Pusher Street er i gang', - 'description': '- Det er det fedeste, der er sket i 20 år, fortæller christianit til DR Nyheder.', + 'description': 'md5:2a71898b15057e9b97334f61d04e6eb5', 'timestamp': 1472800279, 'upload_date': '20160902', 'duration': 131.4, }, + 'params': { + 'skip_download': True, + }, + }, { + # with SignLanguage formats + 'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder', + 'info_dict': { + 'id': 'historien-om-danmark-stenalder', + 'ext': 'mp4', + 'title': 'Historien om Danmark: Stenalder (1)', + 'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a', + 'timestamp': 1490401996, + 'upload_date': '20170325', + 'duration': 3502.04, + 'formats': 'mincount:20', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -88,7 +104,7 @@ class DRTVIE(InfoExtractor): elif kind in ('VideoResource', 'AudioResource'): duration = float_or_none(asset.get('DurationInMilliseconds'), 1000) restricted_to_denmark = asset.get('RestrictedToDenmark') - spoken_subtitles = asset.get('Target') == 'SpokenSubtitles' + asset_target = asset.get('Target') for link in asset.get('Links', []): uri = link.get('Uri') if not uri: @@ -96,9 +112,9 @@ class DRTVIE(InfoExtractor): target = link.get('Target') format_id = target or '' preference = None - if spoken_subtitles: + if asset_target in ('SpokenSubtitles', 'SignLanguage'): preference = -1 - format_id += '-spoken-subtitles' + format_id += '-%s' % asset_target if target == 'HDS': f4m_formats = self._extract_f4m_formats( uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 355a4e56f..ed603eb29 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -41,6 +41,7 @@ from .alphaporno import AlphaPornoIE from .amcnetworks import AMCNetworksIE from .animeondemand 
import AnimeOnDemandIE from .anitube import AnitubeIE +from .anvato import AnvatoIE from .anysex import AnySexIE from .aol import AolIE from .allocine import AllocineIE @@ -250,7 +251,10 @@ from .democracynow import DemocracynowIE from .dfb import DFBIE from .dhm import DHMIE from .dotsub import DotsubIE -from .douyutv import DouyuTVIE +from .douyutv import ( + DouyuShowIE, + DouyuTVIE, +) from .dplay import ( DPlayIE, DPlayItIE, @@ -349,9 +353,9 @@ from .foxsports import FoxSportsIE from .franceculture import FranceCultureIE from .franceinter import FranceInterIE from .francetv import ( - PluzzIE, - FranceTvInfoIE, FranceTVIE, + FranceTVEmbedIE, + FranceTVInfoIE, GenerationQuoiIE, CultureboxIE, ) @@ -541,6 +545,7 @@ from .mangomolo import ( ) from .matchtv import MatchTVIE from .mdr import MDRIE +from .mediaset import MediasetIE from .medici import MediciIE from .meipai import MeipaiIE from .melonvod import MelonVODIE @@ -662,6 +667,8 @@ from .nintendo import NintendoIE from .njpwworld import NJPWWorldIE from .nobelprize import NobelPrizeIE from .noco import NocoIE +from .nonktube import NonkTubeIE +from .noovo import NoovoIE from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE from .nova import NovaIE @@ -730,8 +737,8 @@ from .openload import OpenloadIE from .ora import OraTVIE from .orf import ( ORFTVthekIE, - ORFOE1IE, ORFFM4IE, + ORFOE1IE, ORFIPTVIE, ) from .packtpub import ( @@ -1096,6 +1103,10 @@ from .uplynk import ( UplynkIE, UplynkPreplayIE, ) +from .upskill import ( + UpskillIE, + UpskillCourseIE, +) from .urort import UrortIE from .urplay import URPlayIE from .usanetwork import USANetworkIE @@ -1123,6 +1134,7 @@ from .vgtv import ( from .vh1 import VH1IE from .vice import ( ViceIE, + ViceArticleIE, ViceShowIE, ) from .viceland import VicelandIE @@ -1298,5 +1310,6 @@ from .youtube import ( YoutubeWatchLaterIE, ) from .zapiks import ZapiksIE +from .zaq1 import Zaq1IE from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE 
diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py index a3bb98377..985542727 100644 --- a/youtube_dl/extractor/foxsports.py +++ b/youtube_dl/extractor/foxsports.py @@ -11,10 +11,10 @@ class FoxSportsIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*(?P<id>[^/]+)' _TEST = { - 'url': 'http://www.foxsports.com/video?vid=432609859715', + 'url': 'http://www.foxsports.com/tennessee/video/432609859715', 'md5': 'b49050e955bebe32c301972e4012ac17', 'info_dict': { - 'id': 'i0qKWsk3qJaM', + 'id': 'bwduI3X_TgUB', 'ext': 'mp4', 'title': 'Courtney Lee on going up 2-0 in series vs. Blazers', 'description': 'Courtney Lee talks about Memphis being focused.', @@ -31,8 +31,9 @@ class FoxSportsIE(InfoExtractor): webpage = self._download_webpage(url, video_id) config = self._parse_json( - self._search_regex( - r"data-player-config='([^']+)'", webpage, 'data player config'), + self._html_search_regex( + r"""class="[^"]*(?:fs-player|platformPlayer-wrapper)[^"]*".+?data-player-config='([^']+)'""", + webpage, 'data player config'), video_id) return self.url_result(smuggle_url(update_url_query( diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 48d43ae58..546d5caa0 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -21,11 +21,13 @@ from .dailymotion import ( class FranceTVBaseInfoExtractor(InfoExtractor): - def _extract_video(self, video_id, catalogue): + def _extract_video(self, video_id, catalogue=None): info = self._download_json( - 'http://webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=%s&catalogue=%s' - % (video_id, catalogue), - video_id, 'Downloading video JSON') + 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/', + video_id, 'Downloading video JSON', query={ + 'idDiffusion': video_id, + 'catalogue': catalogue or '', + }) if info.get('status') == 'NOK': raise ExtractorError( @@ -109,27 +111,97 @@ class 
FranceTVBaseInfoExtractor(InfoExtractor): } -class PluzzIE(FranceTVBaseInfoExtractor): - IE_NAME = 'pluzz.francetv.fr' - _VALID_URL = r'https?://(?:m\.)?pluzz\.francetv\.fr/videos/(?P<id>.+?)\.html' +class FranceTVIE(FranceTVBaseInfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?france\.tv|mobile\.france\.tv)/(?:[^/]+/)+(?P<id>[^/]+)\.html' - # Can't use tests, videos expire in 7 days + _TESTS = [{ + 'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html', + 'info_dict': { + 'id': '157550144', + 'ext': 'mp4', + 'title': '13h15, le dimanche... - Les mystères de Jésus', + 'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42', + 'timestamp': 1494156300, + 'upload_date': '20170507', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + }, { + # france3 + 'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html', + 'only_matching': True, + }, { + # france4 + 'url': 'https://www.france.tv/france-4/hero-corp/saison-1/134151-apres-le-calme.html', + 'only_matching': True, + }, { + # france5 + 'url': 'https://www.france.tv/france-5/c-a-dire/saison-10/137013-c-a-dire.html', + 'only_matching': True, + }, { + # franceo + 'url': 'https://www.france.tv/france-o/archipels/132249-mon-ancetre-l-esclave.html', + 'only_matching': True, + }, { + # france2 live + 'url': 'https://www.france.tv/france-2/direct.html', + 'only_matching': True, + }, { + 'url': 'https://www.france.tv/documentaires/histoire/136517-argentine-les-500-bebes-voles-de-la-dictature.html', + 'only_matching': True, + }, { + 'url': 'https://www.france.tv/jeux-et-divertissements/divertissements/133965-le-web-contre-attaque.html', + 'only_matching': True, + }, { + 'url': 'https://mobile.france.tv/france-5/c-dans-l-air/137347-emission-du-vendredi-12-mai-2017.html', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id 
= self._html_search_meta( - 'id_video', webpage, 'video id', default=None) + catalogue = None + video_id = self._search_regex( + r'data-main-video=(["\'])(?P<id>(?:(?!\1).)+)\1', + webpage, 'video id', default=None, group='id') + if not video_id: - video_id = self._search_regex( - r'data-diffusion=["\'](\d+)', webpage, 'video id') + video_id, catalogue = self._html_search_regex( + r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', + webpage, 'video ID').split('@') + return self._extract_video(video_id, catalogue) - return self._extract_video(video_id, 'Pluzz') +class FranceTVEmbedIE(FranceTVBaseInfoExtractor): + _VALID_URL = r'https?://embed\.francetv\.fr/*\?.*?\bue=(?P<id>[^&]+)' -class FranceTvInfoIE(FranceTVBaseInfoExtractor): + _TEST = { + 'url': 'http://embed.francetv.fr/?ue=7fd581a2ccf59d2fc5719c5c13cf6961', + 'info_dict': { + 'id': 'NI_983319', + 'ext': 'mp4', + 'title': 'Le Pen Reims', + 'upload_date': '20170505', + 'timestamp': 1493981780, + 'duration': 16, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'http://api-embed.webservices.francetelevisions.fr/key/%s' % video_id, + video_id) + + return self._extract_video(video['video_id'], video.get('catalog')) + + +class FranceTVInfoIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetvinfo.fr' _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<title>[^/?#&.]+)' @@ -233,124 +305,6 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): return self._extract_video(video_id, catalogue) -class FranceTVIE(FranceTVBaseInfoExtractor): - IE_NAME = 'francetv' - IE_DESC = 'France 2, 3, 4, 5 and Ô' - _VALID_URL = r'''(?x) - https?:// - (?: - (?:www\.)?france[2345o]\.fr/ - (?: - emissions/[^/]+/(?:videos|diffusions)| - emission/[^/]+| - videos| - jt - ) - /| - embed\.francetv\.fr/\?ue= - ) - (?P<id>[^/?]+) - ''' - - _TESTS = [ - # france2 - { - 'url': 
'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104', - 'md5': 'c03fc87cb85429ffd55df32b9fc05523', - 'info_dict': { - 'id': '109169362', - 'ext': 'flv', - 'title': '13h15, le dimanche...', - 'description': 'md5:9a0932bb465f22d377a449be9d1a0ff7', - 'upload_date': '20140914', - 'timestamp': 1410693600, - }, - }, - # france3 - { - 'url': 'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575', - 'md5': '679bb8f8921f8623bd658fa2f8364da0', - 'info_dict': { - 'id': '000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au', - 'ext': 'mp4', - 'title': 'Le scandale du prix des médicaments', - 'description': 'md5:1384089fbee2f04fc6c9de025ee2e9ce', - 'upload_date': '20131113', - 'timestamp': 1384380000, - }, - }, - # france4 - { - 'url': 'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', - 'md5': 'a182bf8d2c43d88d46ec48fbdd260c1c', - 'info_dict': { - 'id': 'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', - 'ext': 'mp4', - 'title': 'Hero Corp Making of - Extrait 1', - 'description': 'md5:c87d54871b1790679aec1197e73d650a', - 'upload_date': '20131106', - 'timestamp': 1383766500, - }, - }, - # france5 - { - 'url': 'http://www.france5.fr/emissions/c-a-dire/videos/quels_sont_les_enjeux_de_cette_rentree_politique__31-08-2015_908948?onglet=tous&page=1', - 'md5': 'f6c577df3806e26471b3d21631241fd0', - 'info_dict': { - 'id': '123327454', - 'ext': 'flv', - 'title': 'C à dire ?! 
- Quels sont les enjeux de cette rentrée politique ?', - 'description': 'md5:4a0d5cb5dce89d353522a84462bae5a4', - 'upload_date': '20150831', - 'timestamp': 1441035120, - }, - }, - # franceo - { - 'url': 'http://www.franceo.fr/jt/info-soir/18-07-2015', - 'md5': '47d5816d3b24351cdce512ad7ab31da8', - 'info_dict': { - 'id': '125377621', - 'ext': 'flv', - 'title': 'Infô soir', - 'description': 'md5:01b8c6915a3d93d8bbbd692651714309', - 'upload_date': '20150718', - 'timestamp': 1437241200, - 'duration': 414, - }, - }, - { - # francetv embed - 'url': 'http://embed.francetv.fr/?ue=8d7d3da1e3047c42ade5a5d7dfd3fc87', - 'info_dict': { - 'id': 'EV_30231', - 'ext': 'flv', - 'title': 'Alcaline, le concert avec Calogero', - 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff', - 'upload_date': '20150226', - 'timestamp': 1424989860, - 'duration': 5400, - }, - }, - { - 'url': 'http://www.france4.fr/emission/highlander/diffusion-du-17-07-2015-04h05', - 'only_matching': True, - }, - { - 'url': 'http://www.franceo.fr/videos/125377617', - 'only_matching': True, - } - ] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_id, catalogue = self._html_search_regex( - r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', - webpage, 'video ID').split('@') - return self._extract_video(video_id, catalogue) - - class GenerationQuoiIE(InfoExtractor): IE_NAME = 'france2.fr:generation-quoi' _VALID_URL = r'https?://generation-quoi\.france2\.fr/portrait/(?P<id>[^/?#]+)' diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index e44a2a87f..8c37509ec 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -2,15 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_urllib_parse_unquote_plus, -) +from ..compat import compat_HTTPError from ..utils import 
( determine_ext, int_or_none, js_to_json, - sanitized_Request, ExtractorError, urlencode_postdata ) @@ -20,6 +16,7 @@ class FunimationIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/shows/[^/]+/(?P<id>[^/?#&]+)' _NETRC_MACHINE = 'funimation' + _TOKEN = None _TESTS = [{ 'url': 'https://www.funimation.com/shows/hacksign/role-play/', @@ -38,56 +35,38 @@ class FunimationIE(InfoExtractor): }, { 'url': 'https://www.funimation.com/shows/attack-on-titan-junior-high/broadcast-dub-preview/', 'info_dict': { - 'id': '9635', + 'id': '210051', 'display_id': 'broadcast-dub-preview', 'ext': 'mp4', 'title': 'Attack on Titan: Junior High - Broadcast Dub Preview', - 'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803', 'thumbnail': r're:https?://.*\.(?:jpg|png)', }, - 'skip': 'Access without user interaction is forbidden by CloudFlare', + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/', 'only_matching': True, }] - _LOGIN_URL = 'http://www.funimation.com/login' - - def _extract_cloudflare_session_ua(self, url): - ci_session_cookie = self._get_cookies(url).get('ci_session') - if ci_session_cookie: - ci_session = compat_urllib_parse_unquote_plus(ci_session_cookie.value) - # ci_session is a string serialized by PHP function serialize() - # This case is simple enough to use regular expressions only - return self._search_regex( - r'"user_agent";s:\d+:"([^"]+)"', ci_session, 'user agent', - default=None) - def _login(self): (username, password) = self._get_login_info() if username is None: return - data = urlencode_postdata({ - 'email_field': username, - 'password_field': password, - }) - user_agent = self._extract_cloudflare_session_ua(self._LOGIN_URL) - if not user_agent: - user_agent = 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0' - login_request = sanitized_Request(self._LOGIN_URL, data, headers={ - 'User-Agent': user_agent, 
- 'Content-Type': 'application/x-www-form-urlencoded' - }) - login_page = self._download_webpage( - login_request, None, 'Logging in as %s' % username) - if any(p in login_page for p in ('funimation.com/logout', '>Log Out<')): - return - error = self._html_search_regex( - r'(?s)<div[^>]+id=["\']errorMessages["\'][^>]*>(.+?)</div>', - login_page, 'error messages', default=None) - if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) - raise ExtractorError('Unable to log in') + try: + data = self._download_json( + 'https://prod-api-funimationnow.dadcdigital.com/api/auth/login/', + None, 'Logging in as %s' % username, data=urlencode_postdata({ + 'username': username, + 'password': password, + })) + self._TOKEN = data['token'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + error = self._parse_json(e.cause.read().decode(), None)['error'] + raise ExtractorError(error, expected=True) + raise def _real_initialize(self): self._login() @@ -125,9 +104,12 @@ class FunimationIE(InfoExtractor): description = self._html_search_meta(['description', 'og:description'], webpage, fatal=True) try: + headers = {} + if self._TOKEN: + headers['Authorization'] = 'Token %s' % self._TOKEN sources = self._download_json( 'https://prod-api-funimationnow.dadcdigital.com/api/source/catalog/video/%s/signed/' % video_id, - video_id)['items'] + video_id, headers=headers)['items'] except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: error = self._parse_json(e.cause.read(), video_id)['errors'][0] diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 81c0ce9a3..49409369c 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -58,8 +58,7 @@ class FunnyOrDieIE(InfoExtractor): m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) source_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and 
f.get('resolution') != 'multiple', - m3u8_formats)) + lambda f: f.get('vcodec') != 'none', m3u8_formats)) bitrates = [int(bitrate) for bitrate in re.findall(r'[,/]v(\d+)(?=[,/])', m3u8_url)] bitrates.sort() diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 682c49e79..00d311158 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -78,8 +78,7 @@ class GameSpotIE(OnceIE): if m3u8_formats: self._sort_formats(m3u8_formats) m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - m3u8_formats)) + lambda f: f.get('vcodec') != 'none', m3u8_formats)) if len(qualities) == len(m3u8_formats): for q, m3u8_format in zip(qualities, m3u8_formats): f = m3u8_format.copy() diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 3136427db..f71d9092e 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -75,6 +75,19 @@ class GDCVaultIE(InfoExtractor): 'format': 'jp', # The japanese audio } }, + { + # gdc-player.html + 'url': 'http://www.gdcvault.com/play/1435/An-American-engine-in-Tokyo', + 'info_dict': { + 'id': '1435', + 'display_id': 'An-American-engine-in-Tokyo', + 'ext': 'flv', + 'title': 'An American Engine in Tokyo:/nThe collaboration of Epic Games and Square Enix/nFor THE LAST REMINANT', + }, + 'params': { + 'skip_download': True, # Requires rtmpdump + }, + }, ] def _login(self, webpage_url, display_id): @@ -128,7 +141,7 @@ class GDCVaultIE(InfoExtractor): 'title': title, } - PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/player.*?\.html.*?".*?</iframe>' + PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/(?:gdc-)?player.*?\.html.*?".*?</iframe>' xml_root = self._html_search_regex( PLAYER_REGEX, start_page, 'xml root', default=None) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 67184bc5d..c108d4a8a 100644 --- a/youtube_dl/extractor/generic.py +++ 
b/youtube_dl/extractor/generic.py @@ -86,6 +86,10 @@ from .openload import OpenloadIE from .videopress import VideoPressIE from .rutube import RutubeIE from .limelight import LimelightBaseIE +from .anvato import AnvatoIE +from .washingtonpost import WashingtonPostIE +from .wistia import WistiaIE +from .mediaset import MediasetIE class GenericIE(InfoExtractor): @@ -1427,6 +1431,22 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # Brightcove embed with whitespace around attribute names + 'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill', + 'info_dict': { + 'id': '3167554373001', + 'ext': 'mp4', + 'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill", + 'description': 'md5:57bacb0e0f29349de4972bfda3191713', + 'uploader_id': '1079349493', + 'upload_date': '20140207', + 'timestamp': 1391810548, + }, + 'params': { + 'skip_download': True, + }, + }, # Another form of arte.tv embed { 'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html', @@ -1677,6 +1697,42 @@ class GenericIE(InfoExtractor): }, 'playlist_mincount': 5, }, + { + 'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/', + 'info_dict': { + 'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest', + 'title': 'Standoff with Walnut Creek murder suspect ends', + 'description': 'md5:3ccc48a60fc9441eeccfc9c469ebf788', + }, + 'playlist_mincount': 4, + }, + { + # WashingtonPost embed + 'url': 'http://www.vanityfair.com/hollywood/2017/04/donald-trump-tv-pitches', + 'info_dict': { + 'id': '8caf6e88-d0ec-11e5-90d3-34c2c42653ac', + 'ext': 'mp4', + 'title': "No one has seen the drama series based on Trump's life \u2014 until now", + 'description': 'Donald Trump wanted a weekly TV drama based on his life. It never aired. 
But The Washington Post recently obtained a scene from the pilot script — and enlisted actors.', + 'timestamp': 1455216756, + 'uploader': 'The Washington Post', + 'upload_date': '20160211', + }, + 'add_ie': [WashingtonPostIE.ie_key()], + }, + { + # Mediaset embed + 'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml', + 'info_dict': { + 'id': '720642', + 'ext': 'mp4', + 'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela del patto di fiducia"', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [MediasetIE.ie_key()], + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -2070,57 +2126,20 @@ class GenericIE(InfoExtractor): playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) # Look for embedded Wistia player - match = re.search( - r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) - if match: - embed_url = self._proto_relative_url( - unescapeHTML(match.group('url'))) - return { - '_type': 'url_transparent', - 'url': embed_url, - 'ie_key': 'Wistia', - 'uploader': video_uploader, - } - - match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) - if match: + wistia_url = WistiaIE._extract_url(webpage) + if wistia_url: return { '_type': 'url_transparent', - 'url': 'wistia:%s' % match.group('id'), - 'ie_key': 'Wistia', + 'url': self._proto_relative_url(wistia_url), + 'ie_key': WistiaIE.ie_key(), 'uploader': video_uploader, } - match = re.search( - r'''(?sx) - <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? 
- <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2 - ''', webpage) - if match: - return self.url_result(self._proto_relative_url( - 'wistia:%s' % match.group('id')), 'Wistia') - # Look for SVT player svt_url = SVTIE._extract_url(webpage) if svt_url: return self.url_result(svt_url, 'SVT') - # Look for embedded condenast player - matches = re.findall( - r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")', - webpage) - if matches: - return { - '_type': 'playlist', - 'entries': [{ - '_type': 'url', - 'ie_key': 'CondeNast', - 'url': ma, - } for ma in matches], - 'title': video_title, - 'id': video_id, - } - # Look for Bandcamp pages with custom domain mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) if mobj is not None: @@ -2514,28 +2533,11 @@ class GenericIE(InfoExtractor): return self.playlist_result( limelight_urls, video_id, video_title, video_description) - mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage) - if mobj: - lm = { - 'Media': 'media', - 'Channel': 'channel', - 'ChannelList': 'channel_list', - } - return self.url_result(smuggle_url('limelight:%s:%s' % ( - lm[mobj.group(1)], mobj.group(2)), {'source_url': url}), - 'Limelight%s' % mobj.group(1), mobj.group(2)) - - mobj = re.search( - r'''(?sx) - <object[^>]+class=(["\'])LimelightEmbeddedPlayerFlash\1[^>]*>.*? 
- <param[^>]+ - name=(["\'])flashVars\2[^>]+ - value=(["\'])(?:(?!\3).)*mediaId=(?P<id>[a-z0-9]{32}) - ''', webpage) - if mobj: - return self.url_result(smuggle_url( - 'limelight:media:%s' % mobj.group('id'), - {'source_url': url}), 'LimelightMedia', mobj.group('id')) + # Look for Anvato embeds + anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id) + if anvato_urls: + return self.playlist_result( + anvato_urls, video_id, video_title, video_description) # Look for AdobeTVVideo embeds mobj = re.search( @@ -2654,6 +2656,18 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( rutube_urls, ie=RutubeIE.ie_key()) + # Look for WashingtonPost embeds + wapo_urls = WashingtonPostIE._extract_urls(webpage) + if wapo_urls: + return self.playlist_from_matches( + wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key()) + + # Look for Mediaset embeds + mediaset_urls = MediasetIE._extract_urls(webpage) + if mediaset_urls: + return self.playlist_from_matches( + mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key()) + # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( webpage, video_id, default={}, expected_type='VideoObject') diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 4c9be47b4..9c7b1bd37 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -36,22 +36,26 @@ class GoIE(AdobePassIE): 'requestor_id': 'DisneyXD', } } - _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys()) + _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:(?:[^/]+/)*(?P<id>vdka\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys()) _TESTS = [{ - 'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx', + 'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643', 'info_dict': { - 'id': '0_g86w5onx', + 'id': 'VDKA3807643', 'ext': 'mp4', - 
'title': 'Sneak Peek: Language Arts', - 'description': 'md5:7dcdab3b2d17e5217c953256af964e9c', + 'title': 'The Traitor in the White House', + 'description': 'md5:05b009d2d145a1e85d25111bd37222e8', }, 'params': { # m3u8 download 'skip_download': True, }, }, { - 'url': 'http://abc.go.com/shows/after-paradise/video/most-recent/vdka3335601', - 'only_matching': True, + 'url': 'http://watchdisneyxd.go.com/doraemon', + 'info_dict': { + 'title': 'Doraemon', + 'id': 'SH55574025', + }, + 'playlist_mincount': 51, }, { 'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding', 'only_matching': True, @@ -60,19 +64,36 @@ class GoIE(AdobePassIE): 'only_matching': True, }] + def _extract_videos(self, brand, video_id='-1', show_id='-1'): + display_id = video_id if video_id != '-1' else show_id + return self._download_json( + 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/%s/-1/%s/-1/-1.json' % (brand, show_id, video_id), + display_id)['video'] + def _real_extract(self, url): sub_domain, video_id, display_id = re.match(self._VALID_URL, url).groups() + site_info = self._SITE_INFO[sub_domain] + brand = site_info['brand'] if not video_id: webpage = self._download_webpage(url, display_id) video_id = self._search_regex( # There may be inner quotes, e.g. 
data-video-id="'VDKA3609139'" # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood - r'data-video-id=["\']*VDKA(\w+)', webpage, 'video id') - site_info = self._SITE_INFO[sub_domain] - brand = site_info['brand'] - video_data = self._download_json( - 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/-1/-1/%s/-1/-1.json' % (brand, video_id), - video_id)['video'][0] + r'data-video-id=["\']*(VDKA\w+)', webpage, 'video id', default=None) + if not video_id: + # show extraction works for Disney, DisneyJunior and DisneyXD + # ABC and Freeform has different layout + show_id = self._search_regex(r'data-show-id=["\']*(SH\d+)', webpage, 'show id') + videos = self._extract_videos(brand, show_id=show_id) + show_title = self._search_regex(r'data-show-title="([^"]+)"', webpage, 'show title', fatal=False) + entries = [] + for video in videos: + entries.append(self.url_result( + video['url'], 'Go', video.get('id'), video.get('title'))) + entries.reverse() + return self.playlist_result(entries, show_id, show_title) + video_data = self._extract_videos(brand, video_id)[0] + video_id = video_data['id'] title = video_data['title'] formats = [] @@ -105,7 +126,7 @@ class GoIE(AdobePassIE): self._initialize_geo_bypass(['US']) entitlement = self._download_json( 'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json', - video_id, data=urlencode_postdata(data), headers=self.geo_verification_headers()) + video_id, data=urlencode_postdata(data)) errors = entitlement.get('errors', {}).get('errors', []) if errors: for error in errors: diff --git a/youtube_dl/extractor/hitbox.py b/youtube_dl/extractor/hitbox.py index e21ebb8fb..1d905dc81 100644 --- a/youtube_dl/extractor/hitbox.py +++ b/youtube_dl/extractor/hitbox.py @@ -16,8 +16,8 @@ from ..utils import ( class HitboxIE(InfoExtractor): IE_NAME = 'hitbox' - _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/video/(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = 
r'https?://(?:www\.)?(?:hitbox|smashcast)\.tv/(?:[^/]+/)*videos?/(?P<id>[0-9]+)' + _TESTS = [{ 'url': 'http://www.hitbox.tv/video/203213', 'info_dict': { 'id': '203213', @@ -38,13 +38,15 @@ class HitboxIE(InfoExtractor): # m3u8 download 'skip_download': True, }, - } + }, { + 'url': 'https://www.smashcast.tv/hitboxlive/videos/203213', + 'only_matching': True, + }] def _extract_metadata(self, url, video_id): thumb_base = 'https://edge.sf.hitbox.tv' metadata = self._download_json( - '%s/%s' % (url, video_id), video_id, - 'Downloading metadata JSON') + '%s/%s' % (url, video_id), video_id, 'Downloading metadata JSON') date = 'media_live_since' media_type = 'livestream' @@ -63,14 +65,15 @@ class HitboxIE(InfoExtractor): views = int_or_none(video_meta.get('media_views')) timestamp = parse_iso8601(video_meta.get(date), ' ') categories = [video_meta.get('category_name')] - thumbs = [ - {'url': thumb_base + video_meta.get('media_thumbnail'), - 'width': 320, - 'height': 180}, - {'url': thumb_base + video_meta.get('media_thumbnail_large'), - 'width': 768, - 'height': 432}, - ] + thumbs = [{ + 'url': thumb_base + video_meta.get('media_thumbnail'), + 'width': 320, + 'height': 180 + }, { + 'url': thumb_base + video_meta.get('media_thumbnail_large'), + 'width': 768, + 'height': 432 + }] return { 'id': video_id, @@ -90,7 +93,7 @@ class HitboxIE(InfoExtractor): video_id = self._match_id(url) player_config = self._download_json( - 'https://www.hitbox.tv/api/player/config/video/%s' % video_id, + 'https://www.smashcast.tv/api/player/config/video/%s' % video_id, video_id, 'Downloading video JSON') formats = [] @@ -121,8 +124,7 @@ class HitboxIE(InfoExtractor): self._sort_formats(formats) metadata = self._extract_metadata( - 'https://www.hitbox.tv/api/media/video', - video_id) + 'https://www.smashcast.tv/api/media/video', video_id) metadata['formats'] = formats return metadata @@ -130,8 +132,8 @@ class HitboxIE(InfoExtractor): class HitboxLiveIE(HitboxIE): IE_NAME = 'hitbox:live' - 
_VALID_URL = r'https?://(?:www\.)?hitbox\.tv/(?!video)(?P<id>.+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:hitbox|smashcast)\.tv/(?P<id>[^/?#&]+)' + _TESTS = [{ 'url': 'http://www.hitbox.tv/dimak', 'info_dict': { 'id': 'dimak', @@ -146,13 +148,20 @@ class HitboxLiveIE(HitboxIE): # live 'skip_download': True, }, - } + }, { + 'url': 'https://www.smashcast.tv/dimak', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if HitboxIE.suitable(url) else super(HitboxLiveIE, cls).suitable(url) def _real_extract(self, url): video_id = self._match_id(url) player_config = self._download_json( - 'https://www.hitbox.tv/api/player/config/live/%s' % video_id, + 'https://www.smashcast.tv/api/player/config/live/%s' % video_id, video_id) formats = [] @@ -197,8 +206,7 @@ class HitboxLiveIE(HitboxIE): self._sort_formats(formats) metadata = self._extract_metadata( - 'https://www.hitbox.tv/api/media/live', - video_id) + 'https://www.smashcast.tv/api/media/live', video_id) metadata['formats'] = formats metadata['is_live'] = True metadata['title'] = self._live_title(metadata.get('title')) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index f95c00c73..3ff672a89 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -13,7 +13,7 @@ from ..utils import ( class ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' - _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video/[^/]+/|title/tt\d+.*?#lb-|videoplayer/)vi(?P<id>\d+)' + _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title).+?[/-]vi(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.imdb.com/video/imdb/vi2524815897', @@ -35,6 +35,9 @@ class ImdbIE(InfoExtractor): }, { 'url': 'http://www.imdb.com/videoplayer/vi1562949145', 'only_matching': True, + }, { + 'url': 'http://www.imdb.com/title/tt4218696/videoplayer/vi2608641561', + 'only_matching': True, }] def _real_extract(self, url): diff --git 
a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 9fb71e8ef..fe425e786 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -87,8 +87,8 @@ class InfoQIE(BokeCCBaseIE): def _extract_http_audio(self, webpage, video_id): fields = self._hidden_inputs(webpage) - http_audio_url = fields['filename'] - if http_audio_url is None: + http_audio_url = fields.get('filename') + if not http_audio_url: return [] cookies_header = {'Cookie': self._extract_cookies(webpage)} diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index 3190b187c..1f91ba017 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import json + from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -8,15 +10,15 @@ from ..utils import ( urlencode_postdata, xpath_element, xpath_text, - urljoin, update_url_query, + js_to_json, ) class Laola1TvEmbedIE(InfoExtractor): IE_NAME = 'laola1tv:embed' _VALID_URL = r'https?://(?:www\.)?laola1\.tv/titanplayer\.php\?.*?\bvideoid=(?P<id>\d+)' - _TEST = { + _TESTS = [{ # flashvars.premium = "false"; 'url': 'https://www.laola1.tv/titanplayer.php?videoid=708065&type=V&lang=en&portal=int&customer=1024', 'info_dict': { @@ -26,7 +28,30 @@ class Laola1TvEmbedIE(InfoExtractor): 'uploader': 'ITTF - International Table Tennis Federation', 'upload_date': '20161211', }, - } + }] + + def _extract_token_url(self, stream_access_url, video_id, data): + return self._download_json( + stream_access_url, video_id, headers={ + 'Content-Type': 'application/json', + }, data=json.dumps(data).encode())['data']['stream-access'][0] + + def _extract_formats(self, token_url, video_id): + token_doc = self._download_xml( + token_url, video_id, 'Downloading token', + headers=self.geo_verification_headers()) + + token_attrib = xpath_element(token_doc, './/token').attrib + + if token_attrib['status'] != '0': + 
raise ExtractorError( + 'Token error: %s' % token_attrib['comment'], expected=True) + + formats = self._extract_akamai_formats( + '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']), + video_id) + self._sort_formats(formats) + return formats def _real_extract(self, url): video_id = self._match_id(url) @@ -68,29 +93,16 @@ class Laola1TvEmbedIE(InfoExtractor): else: data_abo = urlencode_postdata( dict((i, v) for i, v in enumerate(_v('req_liga_abos').split(',')))) - token_url = self._download_json( - 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access', - video_id, query={ + stream_access_url = update_url_query( + 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access', { 'videoId': _v('id'), 'target': self._search_regex(r'vs_target = (\d+);', webpage, 'vs target'), 'label': _v('label'), 'area': _v('area'), - }, data=data_abo)['data']['stream-access'][0] - - token_doc = self._download_xml( - token_url, video_id, 'Downloading token', - headers=self.geo_verification_headers()) - - token_attrib = xpath_element(token_doc, './/token').attrib - - if token_attrib['status'] != '0': - raise ExtractorError( - 'Token error: %s' % token_attrib['comment'], expected=True) + }) + token_url = self._extract_token_url(stream_access_url, video_id, data_abo) - formats = self._extract_akamai_formats( - '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']), - video_id) - self._sort_formats(formats) + formats = self._extract_formats(token_url, video_id) categories_str = _v('meta_sports') categories = categories_str.split(',') if categories_str else [] @@ -107,7 +119,7 @@ class Laola1TvEmbedIE(InfoExtractor): } -class Laola1TvIE(InfoExtractor): +class Laola1TvIE(Laola1TvEmbedIE): IE_NAME = 'laola1tv' _VALID_URL = r'https?://(?:www\.)?laola1\.tv/[a-z]+-[a-z]+/[^/]+/(?P<id>[^/?#&]+)' _TESTS = [{ @@ -164,13 +176,42 @@ class Laola1TvIE(InfoExtractor): if 'Dieser Livestream ist bereits beendet.' 
in webpage: raise ExtractorError('This live stream has already finished.', expected=True) - iframe_url = urljoin(url, self._search_regex( - r'<iframe[^>]*?id="videoplayer"[^>]*?src="([^"]+)"', - webpage, 'iframe url')) + conf = self._parse_json(self._search_regex( + r'(?s)conf\s*=\s*({.+?});', webpage, 'conf'), + display_id, js_to_json) + + video_id = conf['videoid'] + + config = self._download_json(conf['configUrl'], video_id, query={ + 'videoid': video_id, + 'partnerid': conf['partnerid'], + 'language': conf.get('language', ''), + 'portal': conf.get('portalid', ''), + }) + error = config.get('error') + if error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + video_data = config['video'] + title = video_data['title'] + is_live = video_data.get('isLivestream') and video_data.get('isLive') + meta = video_data.get('metaInformation') + sports = meta.get('sports') + categories = sports.split(',') if sports else [] + + token_url = self._extract_token_url( + video_data['streamAccess'], video_id, + video_data['abo']['required']) + + formats = self._extract_formats(token_url, video_id) return { - '_type': 'url', + 'id': video_id, 'display_id': display_id, - 'url': iframe_url, - 'ie_key': 'Laola1TvEmbed', + 'title': self._live_title(title) if is_live else title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('image'), + 'categories': categories, + 'formats': formats, + 'is_live': is_live, } diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index 9eda956d2..0a07c1320 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -23,7 +23,6 @@ from ..utils import ( str_or_none, url_basename, urshift, - update_url_query, ) @@ -51,7 +50,7 @@ class LeIE(InfoExtractor): 'id': '1415246', 'ext': 'mp4', 'title': '美人天下01', - 'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda', + 'description': 'md5:28942e650e82ed4fcc8e4de919ee854d', }, 'params': { 'hls_prefer_native': True, 
@@ -69,7 +68,6 @@ class LeIE(InfoExtractor): 'params': { 'hls_prefer_native': True, }, - 'skip': 'Only available in China', }, { 'url': 'http://sports.le.com/video/25737697.html', 'only_matching': True, @@ -81,7 +79,7 @@ class LeIE(InfoExtractor): 'only_matching': True, }] - # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf + # ror() and calc_time_key() are reversed from a embedded swf file in LetvPlayer.swf def ror(self, param1, param2): _loc3_ = 0 while _loc3_ < param2: @@ -90,15 +88,8 @@ class LeIE(InfoExtractor): return param1 def calc_time_key(self, param1): - _loc2_ = 773625421 - _loc3_ = self.ror(param1, _loc2_ % 13) - _loc3_ = _loc3_ ^ _loc2_ - _loc3_ = self.ror(_loc3_, _loc2_ % 17) - return _loc3_ - - # reversed from http://jstatic.letvcdn.com/sdk/player.js - def get_mms_key(self, time): - return self.ror(time, 8) ^ 185025305 + _loc2_ = 185025305 + return self.ror(param1, _loc2_ % 17) ^ _loc2_ # see M3U8Encryption class in KLetvPlayer.swf @staticmethod @@ -122,7 +113,7 @@ class LeIE(InfoExtractor): def _check_errors(self, play_json): # Check for errors - playstatus = play_json['playstatus'] + playstatus = play_json['msgs']['playstatus'] if playstatus['status'] == 0: flag = playstatus['flag'] if flag == 1: @@ -134,58 +125,31 @@ class LeIE(InfoExtractor): media_id = self._match_id(url) page = self._download_webpage(url, media_id) - play_json_h5 = self._download_json( - 'http://api.le.com/mms/out/video/playJsonH5', - media_id, 'Downloading html5 playJson data', query={ - 'id': media_id, - 'platid': 3, - 'splatid': 304, - 'format': 1, - 'tkey': self.get_mms_key(int(time.time())), - 'domain': 'www.le.com', - 'tss': 'no', - }, - headers=self.geo_verification_headers()) - self._check_errors(play_json_h5) - play_json_flash = self._download_json( - 'http://api.le.com/mms/out/video/playJson', + 'http://player-pc.le.com/mms/out/video/playJson', media_id, 'Downloading flash playJson data', query={ 'id': media_id, 'platid': 1, 
'splatid': 101, 'format': 1, + 'source': 1000, 'tkey': self.calc_time_key(int(time.time())), 'domain': 'www.le.com', + 'region': 'cn', }, headers=self.geo_verification_headers()) self._check_errors(play_json_flash) - def get_h5_urls(media_url, format_id): - location = self._download_json( - media_url, media_id, - 'Download JSON metadata for format %s' % format_id, query={ - 'format': 1, - 'expect': 3, - 'tss': 'no', - })['location'] - - return { - 'http': update_url_query(location, {'tss': 'no'}), - 'hls': update_url_query(location, {'tss': 'ios'}), - } - def get_flash_urls(media_url, format_id): - media_url += '&' + compat_urllib_parse_urlencode({ - 'm3v': 1, - 'format': 1, - 'expect': 3, - 'rateid': format_id, - }) - nodes_data = self._download_json( media_url, media_id, - 'Download JSON metadata for format %s' % format_id) + 'Download JSON metadata for format %s' % format_id, + query={ + 'm3v': 1, + 'format': 1, + 'expect': 3, + 'tss': 'ios', + }) req = self._request_webpage( nodes_data['nodelist'][0]['location'], media_id, @@ -199,29 +163,28 @@ class LeIE(InfoExtractor): extracted_formats = [] formats = [] - for play_json, get_urls in ((play_json_h5, get_h5_urls), (play_json_flash, get_flash_urls)): - playurl = play_json['playurl'] - play_domain = playurl['domain'][0] - - for format_id, format_data in playurl.get('dispatch', []).items(): - if format_id in extracted_formats: - continue - extracted_formats.append(format_id) - - media_url = play_domain + format_data[0] - for protocol, format_url in get_urls(media_url, format_id).items(): - f = { - 'url': format_url, - 'ext': determine_ext(format_data[1]), - 'format_id': '%s-%s' % (protocol, format_id), - 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', - 'quality': int_or_none(format_id), - } - - if format_id[-1:] == 'p': - f['height'] = int_or_none(format_id[:-1]) - - formats.append(f) + playurl = play_json_flash['msgs']['playurl'] + play_domain = playurl['domain'][0] + + for format_id, format_data in 
playurl.get('dispatch', []).items(): + if format_id in extracted_formats: + continue + extracted_formats.append(format_id) + + media_url = play_domain + format_data[0] + for protocol, format_url in get_flash_urls(media_url, format_id).items(): + f = { + 'url': format_url, + 'ext': determine_ext(format_data[1]), + 'format_id': '%s-%s' % (protocol, format_id), + 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', + 'quality': int_or_none(format_id), + } + + if format_id[-1:] == 'p': + f['height'] = int_or_none(format_id[:-1]) + + formats.append(f) self._sort_formats(formats, ('height', 'quality', 'format_id')) publish_time = parse_iso8601(self._html_search_regex( diff --git a/youtube_dl/extractor/lego.py b/youtube_dl/extractor/lego.py index d3bca6435..b312e77f1 100644 --- a/youtube_dl/extractor/lego.py +++ b/youtube_dl/extractor/lego.py @@ -86,7 +86,7 @@ class LEGOIE(InfoExtractor): formats = self._extract_akamai_formats( '%si/s/public/%s_,%s,.mp4.csmil/master.m3u8' % (streaming_base, path, streaming_path), video_id) m3u8_formats = list(filter( - lambda f: f.get('protocol') == 'm3u8_native' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + lambda f: f.get('protocol') == 'm3u8_native' and f.get('vcodec') != 'none', formats)) if len(m3u8_formats) == len(self._BITRATES): self._sort_formats(m3u8_formats) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index c7de65353..c54519636 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -import json import re from .common import InfoExtractor @@ -11,10 +10,10 @@ class LiveLeakIE(InfoExtractor): _VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<id>[\w_]+)(?:.*)' _TESTS = [{ 'url': 'http://www.liveleak.com/view?i=757_1364311680', - 'md5': '50f79e05ba149149c1b4ea961223d5b3', + 'md5': '0813c2430bea7a46bf13acf3406992f4', 'info_dict': { 'id': '757_1364311680', - 'ext': 
'flv', + 'ext': 'mp4', 'description': 'extremely bad day for this guy..!', 'uploader': 'ljfriel2', 'title': 'Most unlucky car accident', @@ -22,7 +21,7 @@ class LiveLeakIE(InfoExtractor): } }, { 'url': 'http://www.liveleak.com/view?i=f93_1390833151', - 'md5': 'b13a29626183c9d33944e6a04f41aafc', + 'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf', 'info_dict': { 'id': 'f93_1390833151', 'ext': 'mp4', @@ -32,6 +31,7 @@ class LiveLeakIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$' } }, { + # Prochan embed 'url': 'http://www.liveleak.com/view?i=4f7_1392687779', 'md5': '42c6d97d54f1db107958760788c5f48f', 'info_dict': { @@ -41,11 +41,13 @@ class LiveLeakIE(InfoExtractor): 'uploader': 'CapObveus', 'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck', 'age_limit': 18, - } + }, + 'skip': 'Video is dead', }, { # Covers https://github.com/rg3/youtube-dl/pull/5983 + # Multiple resolutions 'url': 'http://www.liveleak.com/view?i=801_1409392012', - 'md5': '0b3bec2d888c20728ca2ad3642f0ef15', + 'md5': 'c3a449dbaca5c0d1825caecd52a57d7b', 'info_dict': { 'id': '801_1409392012', 'ext': 'mp4', @@ -93,57 +95,38 @@ class LiveLeakIE(InfoExtractor): webpage, 'age limit', default=None)) video_thumbnail = self._og_search_thumbnail(webpage) - sources_raw = self._search_regex( - r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None) - if sources_raw is None: - alt_source = self._search_regex( - r'(file: ".*?"),', webpage, 'video URL', default=None) - if alt_source: - sources_raw = '[{ %s}]' % alt_source - else: - # Maybe an embed? 
- embed_url = self._search_regex( - r'<iframe[^>]+src="(https?://(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"', - webpage, 'embed URL') - return { - '_type': 'url_transparent', - 'url': embed_url, - 'id': video_id, - 'title': video_title, - 'description': video_description, - 'uploader': video_uploader, - 'age_limit': age_limit, - } + entries = self._parse_html5_media_entries(url, webpage, video_id) + if not entries: + # Maybe an embed? + embed_url = self._search_regex( + r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"', + webpage, 'embed URL') + return { + '_type': 'url_transparent', + 'url': embed_url, + 'id': video_id, + 'title': video_title, + 'description': video_description, + 'uploader': video_uploader, + 'age_limit': age_limit, + } - sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw) - sources = json.loads(sources_json) + info_dict = entries[0] - formats = [{ - 'format_id': '%s' % i, - 'format_note': s.get('label'), - 'url': s['file'], - } for i, s in enumerate(sources)] + for a_format in info_dict['formats']: + if not a_format.get('height'): + a_format['height'] = self._search_regex( + r'([0-9]+)p\.mp4', a_format['url'], 'height label', default=None) - for i, s in enumerate(sources): - # Removing '.h264_*.mp4' gives the raw video, which is essentially - # the same video without the LiveLeak logo at the top (see - # https://github.com/rg3/youtube-dl/pull/4768) - orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file']) - if s['file'] != orig_url: - formats.append({ - 'format_id': 'original-%s' % i, - 'format_note': s.get('label'), - 'url': orig_url, - 'preference': 1, - }) - self._sort_formats(formats) + self._sort_formats(info_dict['formats']) - return { + info_dict.update({ 'id': video_id, 'title': video_title, 'description': video_description, 'uploader': video_uploader, - 'formats': formats, 'age_limit': age_limit, 'thumbnail': video_thumbnail, - } + }) + + return info_dict diff --git 
a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py new file mode 100644 index 000000000..9760eafd5 --- /dev/null +++ b/youtube_dl/extractor/mediaset.py @@ -0,0 +1,118 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + parse_duration, + try_get, + unified_strdate, +) + + +class MediasetIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + mediaset:| + https?:// + (?:www\.)?video\.mediaset\.it/ + (?: + (?:video|on-demand)/(?:[^/]+/)+[^/]+_| + player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid= + ) + )(?P<id>[0-9]+) + ''' + _TESTS = [{ + # full episode + 'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html', + 'md5': '9b75534d42c44ecef7bf1ffeacb7f85d', + 'info_dict': { + 'id': '661824', + 'ext': 'mp4', + 'title': 'Quarta puntata', + 'description': 'md5:7183696d6df570e3412a5ef74b27c5e2', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1414, + 'creator': 'mediaset', + 'upload_date': '20161107', + 'series': 'Hello Goodbye', + 'categories': ['reality'], + }, + 'expected_warnings': ['is not a supported codec'], + }, { + # clip + 'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html', + 'only_matching': True, + }, { + # iframe simple + 'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true', + 'only_matching': True, + }, { + # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/) + 'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true', + 'only_matching': True, + }, { + 'url': 'mediaset:661824', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + 
r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>https?://(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=\d+.*?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_list = self._download_json( + 'http://cdnsel01.mediaset.net/GetCdn.aspx', + video_id, 'Downloading video CDN JSON', query={ + 'streamid': video_id, + 'format': 'json', + })['videoList'] + + formats = [] + for format_url in video_list: + if '.ism' in format_url: + formats.extend(self._extract_ism_formats( + format_url, video_id, ism_id='mss', fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': determine_ext(format_url), + }) + self._sort_formats(formats) + + mediainfo = self._download_json( + 'http://plr.video.mediaset.it/html/metainfo.sjson', + video_id, 'Downloading video info JSON', query={ + 'id': video_id, + })['video'] + + title = mediainfo['title'] + + creator = try_get( + mediainfo, lambda x: x['brand-info']['publisher'], compat_str) + category = try_get( + mediainfo, lambda x: x['brand-info']['category'], compat_str) + categories = [category] if category else None + + return { + 'id': video_id, + 'title': title, + 'description': mediainfo.get('short-description'), + 'thumbnail': mediainfo.get('thumbnail'), + 'duration': parse_duration(mediainfo.get('duration')), + 'creator': creator, + 'upload_date': unified_strdate(mediainfo.get('production-date')), + 'webpage_url': mediainfo.get('url'), + 'series': mediainfo.get('brand-value'), + 'categories': categories, + 'formats': formats, + } diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 28b743cca..964dc542c 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -136,11 +136,9 @@ class MiTeleIE(InfoExtractor): video_id, 'Downloading gigya script') # Get a appKey/uuid for getting the session key - appKey_var = self._search_regex( - r'value\s*\(\s*["\']appGridApplicationKey["\']\s*,\s*([0-9a-f]+)', - 
gigya_sc, 'appKey variable') appKey = self._search_regex( - r'var\s+%s\s*=\s*["\']([0-9a-f]+)' % appKey_var, gigya_sc, 'appKey') + r'constant\s*\(\s*["\']_appGridApplicationKey["\']\s*,\s*["\']([0-9a-f]+)', + gigya_sc, 'appKey') session_json = self._download_json( 'https://appgrid-api.cloud.accedo.tv/session', diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py index f281238c9..e164d5940 100644 --- a/youtube_dl/extractor/myspace.py +++ b/youtube_dl/extractor/myspace.py @@ -12,64 +12,62 @@ from ..utils import ( class MySpaceIE(InfoExtractor): - _VALID_URL = r'https?://myspace\.com/([^/]+)/(?P<mediatype>video/[^/]+/|music/song/.*?)(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + myspace\.com/[^/]+/ + (?P<mediatype> + video/[^/]+/(?P<video_id>\d+)| + music/song/[^/?#&]+-(?P<song_id>\d+)-\d+(?:[/?#&]|$) + ) + ''' - _TESTS = [ - { - 'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919', - 'md5': '9c1483c106f4a695c47d2911feed50a7', - 'info_dict': { - 'id': '109594919', - 'ext': 'mp4', - 'title': 'Little Big Town', - 'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.', - 'uploader': 'Five Minutes to the Stage', - 'uploader_id': 'fiveminutestothestage', - 'timestamp': 1414108751, - 'upload_date': '20141023', - }, + _TESTS = [{ + 'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919', + 'md5': '9c1483c106f4a695c47d2911feed50a7', + 'info_dict': { + 'id': '109594919', + 'ext': 'mp4', + 'title': 'Little Big Town', + 'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.', + 'uploader': 'Five Minutes to the Stage', + 'uploader_id': 'fiveminutestothestage', + 'timestamp': 1414108751, + 'upload_date': '20141023', }, + }, { # songs - { - 'url': 
'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681', - 'md5': '1d7ee4604a3da226dd69a123f748b262', - 'info_dict': { - 'id': '93388656', - 'ext': 'm4a', - 'title': 'Of weakened soul...', - 'uploader': 'Killsorrow', - 'uploader_id': 'killsorrow', - }, - }, { - 'add_ie': ['Youtube'], - 'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041', - 'info_dict': { - 'id': 'xqds0B_meys', - 'ext': 'webm', - 'title': 'Three Days Grace - Animal I Have Become', - 'description': 'md5:8bd86b3693e72a077cf863a8530c54bb', - 'uploader': 'ThreeDaysGraceVEVO', - 'uploader_id': 'ThreeDaysGraceVEVO', - 'upload_date': '20091002', - }, - }, { - 'add_ie': ['Youtube'], - 'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426', - 'info_dict': { - 'id': 'ypWvQgnJrSU', - 'ext': 'mp4', - 'title': 'Starset - First Light', - 'description': 'md5:2d5db6c9d11d527683bcda818d332414', - 'uploader': 'Yumi K', - 'uploader_id': 'SorenPromotions', - 'upload_date': '20140725', - } + 'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681', + 'md5': '1d7ee4604a3da226dd69a123f748b262', + 'info_dict': { + 'id': '93388656', + 'ext': 'm4a', + 'title': 'Of weakened soul...', + 'uploader': 'Killsorrow', + 'uploader_id': 'killsorrow', }, - ] + }, { + 'add_ie': ['Youtube'], + 'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041', + 'info_dict': { + 'id': 'xqds0B_meys', + 'ext': 'webm', + 'title': 'Three Days Grace - Animal I Have Become', + 'description': 'md5:8bd86b3693e72a077cf863a8530c54bb', + 'uploader': 'ThreeDaysGraceVEVO', + 'uploader_id': 'ThreeDaysGraceVEVO', + 'upload_date': '20091002', + }, + }, { + 'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426', + 'only_matching': True, + }, { + 'url': 'https://myspace.com/thelargemouthbassband/music/song/02-pure-eyes.mp3-94422330-105113388', + 'only_matching': True, + }] def 
_real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = mobj.group('video_id') or mobj.group('song_id') is_song = mobj.group('mediatype').startswith('music/song') webpage = self._download_webpage(url, video_id) player_url = self._search_regex( diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index d2a44d05d..62db70b43 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -5,10 +5,8 @@ import re from .common import InfoExtractor from .theplatform import ThePlatformIE from .adobepass import AdobePassIE -from ..compat import compat_urllib_parse_urlparse from ..utils import ( find_xpath_attr, - lowercase_escape, smuggle_url, unescapeHTML, update_url_query, @@ -17,7 +15,7 @@ from ..utils import ( class NBCIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)' + _VALID_URL = r'(?P<permalink>https?://(?:www\.)?nbc\.com/[^/]+/video/[^/]+/(?P<id>n?\d+))' _TESTS = [ { @@ -37,16 +35,6 @@ class NBCIE(AdobePassIE): }, }, { - 'url': 'http://www.nbc.com/the-tonight-show/episodes/176', - 'info_dict': { - 'id': '176', - 'ext': 'flv', - 'title': 'Ricky Gervais, Steven Van Zandt, ILoveMakonnen', - 'description': 'A brand new episode of The Tonight Show welcomes Ricky Gervais, Steven Van Zandt and ILoveMakonnen.', - }, - 'skip': '404 Not Found', - }, - { 'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821', 'info_dict': { 'id': '2832821', @@ -64,11 +52,6 @@ class NBCIE(AdobePassIE): 'skip': 'Only works from US', }, { - # This video has expired but with an escaped embedURL - 'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515', - 'only_matching': True, - }, - { # HLS streams requires the 'hdnea3' cookie 'url': 'http://www.nbc.com/Kings/video/goliath/n1806', 'info_dict': { @@ -88,59 +71,38 @@ class NBCIE(AdobePassIE): ] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = 
self._download_webpage(url, video_id) - info = { + permalink, video_id = re.match(self._VALID_URL, url).groups() + video_data = self._download_json( + 'https://api.nbc.com/v3/videos', video_id, query={ + 'filter[permalink]': permalink, + })['data'][0]['attributes'] + query = { + 'mbr': 'true', + 'manifest': 'm3u', + } + video_id = video_data['guid'] + title = video_data['title'] + if video_data.get('entitlement') == 'auth': + resource = self._get_mvpd_resource( + 'nbcentertainment', title, video_id, + video_data.get('vChipRating')) + query['auth'] = self._extract_mvpd_auth( + url, video_id, 'nbcentertainment', resource) + theplatform_url = smuggle_url(update_url_query( + 'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id, + query), {'force_smil_url': True}) + return { '_type': 'url_transparent', - 'ie_key': 'ThePlatform', 'id': video_id, + 'title': title, + 'url': theplatform_url, + 'description': video_data.get('description'), + 'keywords': video_data.get('keywords'), + 'season_number': int_or_none(video_data.get('seasonNumber')), + 'episode_number': int_or_none(video_data.get('episodeNumber')), + 'series': video_data.get('showName'), + 'ie_key': 'ThePlatform', } - video_data = None - preload = self._search_regex( - r'PRELOAD\s*=\s*({.+})', webpage, 'preload data', default=None) - if preload: - preload_data = self._parse_json(preload, video_id) - path = compat_urllib_parse_urlparse(url).path.rstrip('/') - entity_id = preload_data.get('xref', {}).get(path) - video_data = preload_data.get('entities', {}).get(entity_id) - if video_data: - query = { - 'mbr': 'true', - 'manifest': 'm3u', - } - video_id = video_data['guid'] - title = video_data['title'] - if video_data.get('entitlement') == 'auth': - resource = self._get_mvpd_resource( - 'nbcentertainment', title, video_id, - video_data.get('vChipRating')) - query['auth'] = self._extract_mvpd_auth( - url, video_id, 'nbcentertainment', resource) - theplatform_url = smuggle_url(update_url_query( - 
'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id, - query), {'force_smil_url': True}) - info.update({ - 'id': video_id, - 'title': title, - 'url': theplatform_url, - 'description': video_data.get('description'), - 'keywords': video_data.get('keywords'), - 'season_number': int_or_none(video_data.get('seasonNumber')), - 'episode_number': int_or_none(video_data.get('episodeNumber')), - 'series': video_data.get('showName'), - }) - else: - theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex( - [ - r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"', - r'<iframe[^>]+src="((?:https?:)?//player\.theplatform\.com/[^"]+)"', - r'"embedURL"\s*:\s*"([^"]+)"' - ], - webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/'))) - if theplatform_url.startswith('//'): - theplatform_url = 'http:' + theplatform_url - info['url'] = smuggle_url(theplatform_url, {'source_url': url}) - return info class NBCSportsVPlayerIE(InfoExtractor): diff --git a/youtube_dl/extractor/njpwworld.py b/youtube_dl/extractor/njpwworld.py index f5e3f6815..9b5ad5a9f 100644 --- a/youtube_dl/extractor/njpwworld.py +++ b/youtube_dl/extractor/njpwworld.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( + extract_attributes, get_element_by_class, urlencode_postdata, ) @@ -56,17 +57,24 @@ class NJPWWorldIE(InfoExtractor): webpage = self._download_webpage(url, video_id) formats = [] - for player_url, kind in re.findall(r'<a[^>]+href="(/player[^"]+)".+?<img[^>]+src="[^"]+qf_btn_([^".]+)', webpage): - player_url = compat_urlparse.urljoin(url, player_url) - + for mobj in re.finditer(r'<a[^>]+\bhref=(["\'])/player.+?[^>]*>', webpage): + player = extract_attributes(mobj.group(0)) + player_path = player.get('href') + if not player_path: + continue + kind = self._search_regex( + r'(low|high)$', player.get('class') or '', 'kind', + default='low') + player_url = 
compat_urlparse.urljoin(url, player_path) player_page = self._download_webpage( player_url, video_id, note='Downloading player page') - entries = self._parse_html5_media_entries( player_url, player_page, video_id, m3u8_id='hls-%s' % kind, - m3u8_entry_protocol='m3u8_native', - preference=2 if 'hq' in kind else 1) - formats.extend(entries[0]['formats']) + m3u8_entry_protocol='m3u8_native') + kind_formats = entries[0]['formats'] + for f in kind_formats: + f['quality'] = 2 if kind == 'high' else 1 + formats.extend(kind_formats) self._sort_formats(formats) diff --git a/youtube_dl/extractor/nonktube.py b/youtube_dl/extractor/nonktube.py new file mode 100644 index 000000000..63e58aae2 --- /dev/null +++ b/youtube_dl/extractor/nonktube.py @@ -0,0 +1,33 @@ +from __future__ import unicode_literals + +from .nuevo import NuevoBaseIE + + +class NonkTubeIE(NuevoBaseIE): + _VALID_URL = r'https?://(?:www\.)?nonktube\.com/(?:(?:video|embed)/|media/nuevo/embed\.php\?.*?\bid=)(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.nonktube.com/video/118636/sensual-wife-uncensored-fucked-in-hairy-pussy-and-facialized', + 'info_dict': { + 'id': '118636', + 'ext': 'mp4', + 'title': 'Sensual Wife Uncensored Fucked In Hairy Pussy And Facialized', + 'age_limit': 18, + 'duration': 1150.98, + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'https://www.nonktube.com/embed/118636', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + info = self._extract_nuevo( + 'https://www.nonktube.com/media/nuevo/econfig.php?key=%s' + % video_id, video_id) + + info['age_limit'] = 18 + return info diff --git a/youtube_dl/extractor/noovo.py b/youtube_dl/extractor/noovo.py new file mode 100644 index 000000000..f7fa098a5 --- /dev/null +++ b/youtube_dl/extractor/noovo.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor +from ..compat import compat_str 
+from ..utils import ( + int_or_none, + smuggle_url, + try_get, +) + + +class NoovoIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?noovo\.ca/videos/(?P<id>[^/]+/[^/?#&]+)' + _TESTS = [{ + # clip + 'url': 'http://noovo.ca/videos/rpm-plus/chrysler-imperial', + 'info_dict': { + 'id': '5386045029001', + 'ext': 'mp4', + 'title': 'Chrysler Imperial', + 'description': 'md5:de3c898d1eb810f3e6243e08c8b4a056', + 'timestamp': 1491399228, + 'upload_date': '20170405', + 'uploader_id': '618566855001', + 'creator': 'vtele', + 'view_count': int, + 'series': 'RPM+', + }, + 'params': { + 'skip_download': True, + }, + }, { + # episode + 'url': 'http://noovo.ca/videos/l-amour-est-dans-le-pre/episode-13-8', + 'info_dict': { + 'id': '5395865725001', + 'title': 'Épisode 13 : Les retrouvailles', + 'description': 'md5:336d5ebc5436534e61d16e63ddfca327', + 'ext': 'mp4', + 'timestamp': 1492019320, + 'upload_date': '20170412', + 'uploader_id': '618566855001', + 'creator': 'vtele', + 'view_count': int, + 'series': "L'amour est dans le pré", + 'season_number': 5, + 'episode': 'Épisode 13', + 'episode_number': 13, + }, + 'params': { + 'skip_download': True, + }, + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/618566855001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + + data = self._download_json( + 'http://api.noovo.ca/api/v1/pages/single-episode/%s' % video_id, + video_id)['data'] + + content = try_get(data, lambda x: x['contents'][0]) + + brightcove_id = data.get('brightcoveId') or content['brightcoveId'] + + series = try_get( + data, ( + lambda x: x['show']['title'], + lambda x: x['season']['show']['title']), + compat_str) + + episode = None + og = data.get('og') + if isinstance(og, dict) and og.get('type') == 'video.episode': + episode = og.get('title') + + video = content or data + + return { + '_type': 'url_transparent', + 'ie_key': BrightcoveNewIE.ie_key(), + 'url': smuggle_url( + 
self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': ['CA']}), + 'id': brightcove_id, + 'title': video.get('title'), + 'creator': video.get('source'), + 'view_count': int_or_none(video.get('viewsCount')), + 'series': series, + 'season_number': int_or_none(try_get( + data, lambda x: x['season']['seasonNumber'])), + 'episode': episode, + 'episode_number': int_or_none(data.get('episodeNumber')), + } diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 7fe79cb53..3b4f51f61 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -148,13 +148,34 @@ class NRKBaseIE(InfoExtractor): vcodec = 'none' if data.get('mediaType') == 'Audio' else None - # TODO: extract chapters when https://github.com/rg3/youtube-dl/pull/9409 is merged - for entry in entries: entry.update(common_info) for f in entry['formats']: f['vcodec'] = vcodec + points = data.get('shortIndexPoints') + if isinstance(points, list): + chapters = [] + for next_num, point in enumerate(points, start=1): + if not isinstance(point, dict): + continue + start_time = parse_duration(point.get('startPoint')) + if start_time is None: + continue + end_time = parse_duration( + data.get('duration') + if next_num == len(points) + else points[next_num].get('startPoint')) + if end_time is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': point.get('title'), + }) + if chapters and len(entries) == 1: + entries[0]['chapters'] = chapters + return self.playlist_result(entries, video_id, title, description) diff --git a/youtube_dl/extractor/nuevo.py b/youtube_dl/extractor/nuevo.py index 87fb94d1f..be1e09d37 100644 --- a/youtube_dl/extractor/nuevo.py +++ b/youtube_dl/extractor/nuevo.py @@ -10,9 +10,10 @@ from ..utils import ( class NuevoBaseIE(InfoExtractor): - def _extract_nuevo(self, config_url, video_id): + def _extract_nuevo(self, config_url, video_id, headers={}): config = self._download_xml( - config_url, video_id, 
transform_source=lambda s: s.strip()) + config_url, video_id, transform_source=lambda s: s.strip(), + headers=headers) title = xpath_text(config, './title', 'title', fatal=True).strip() video_id = xpath_text(config, './mediaid', default=video_id) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 1e2c54e68..cc296eabd 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -2,8 +2,6 @@ from __future__ import unicode_literals import re -import calendar -import datetime from .common import InfoExtractor from ..compat import compat_str @@ -144,77 +142,25 @@ class ORFTVthekIE(InfoExtractor): } -class ORFOE1IE(InfoExtractor): - IE_NAME = 'orf:oe1' - IE_DESC = 'Radio Österreich 1' - _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole\?.*?\btrack_id=)(?P<id>[0-9]+)' - - # Audios on ORF radio are only available for 7 days, so we can't add tests. - _TESTS = [{ - 'url': 'http://oe1.orf.at/konsole?show=on_demand#?track_id=394211', - 'only_matching': True, - }, { - 'url': 'http://oe1.orf.at/konsole?show=ondemand&track_id=443608&load_day=/programm/konsole/tag/20160726', - 'only_matching': True, - }] - - def _real_extract(self, url): - show_id = self._match_id(url) - data = self._download_json( - 'http://oe1.orf.at/programm/%s/konsole' % show_id, - show_id - ) - - timestamp = datetime.datetime.strptime('%s %s' % ( - data['item']['day_label'], - data['item']['time'] - ), '%d.%m.%Y %H:%M') - unix_timestamp = calendar.timegm(timestamp.utctimetuple()) - - return { - 'id': show_id, - 'title': data['item']['title'], - 'url': data['item']['url_stream'], - 'ext': 'mp3', - 'description': data['item'].get('info'), - 'timestamp': unix_timestamp - } - - -class ORFFM4IE(InfoExtractor): - IE_NAME = 'orf:fm4' - IE_DESC = 'radio FM4' - _VALID_URL = r'https?://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)' - - _TEST = { - 'url': 'http://fm4.orf.at/player/20160110/IS/', - 'md5': '01e736e8f1cef7e13246e880a59ad298', - 
'info_dict': { - 'id': '2016-01-10_2100_tl_54_7DaysSun13_11244', - 'ext': 'mp3', - 'title': 'Im Sumpf', - 'description': 'md5:384c543f866c4e422a55f66a62d669cd', - 'duration': 7173, - 'timestamp': 1452456073, - 'upload_date': '20160110', - }, - 'skip': 'Live streams on FM4 got deleted soon', - } - +class ORFRadioIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + station = mobj.group('station') show_date = mobj.group('date') show_id = mobj.group('show') + if station == 'fm4': + show_id = '4%s' % show_id + data = self._download_json( - 'http://audioapi.orf.at/fm4/json/2.0/broadcasts/%s/4%s' % (show_date, show_id), + 'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s' % (station, show_id, show_date), show_id ) def extract_entry_dict(info, title, subtitle): return { 'id': info['loopStreamId'].replace('.mp3', ''), - 'url': 'http://loopstream01.apa.at/?channel=fm4&id=%s' % info['loopStreamId'], + 'url': 'http://loopstream01.apa.at/?channel=%s&id=%s' % (station, info['loopStreamId']), 'title': title, 'description': subtitle, 'duration': (info['end'] - info['start']) / 1000, @@ -233,6 +179,47 @@ class ORFFM4IE(InfoExtractor): } +class ORFFM4IE(ORFRadioIE): + IE_NAME = 'orf:fm4' + IE_DESC = 'radio FM4' + _VALID_URL = r'https?://(?P<station>fm4)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' + + _TEST = { + 'url': 'http://fm4.orf.at/player/20170107/CC', + 'md5': '2b0be47375432a7ef104453432a19212', + 'info_dict': { + 'id': '2017-01-07_2100_tl_54_7DaysSat18_31295', + 'ext': 'mp3', + 'title': 'Solid Steel Radioshow', + 'description': 'Die Mixshow von Coldcut und Ninja Tune.', + 'duration': 3599, + 'timestamp': 1483819257, + 'upload_date': '20170107', + }, + 'skip': 'Shows from ORF radios are only available for 7 days.' 
+ } + + +class ORFOE1IE(ORFRadioIE): + IE_NAME = 'orf:oe1' + IE_DESC = 'Radio Österreich 1' + _VALID_URL = r'https?://(?P<station>oe1)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' + + _TEST = { + 'url': 'http://oe1.orf.at/player/20170108/456544', + 'md5': '34d8a6e67ea888293741c86a099b745b', + 'info_dict': { + 'id': '2017-01-08_0759_tl_51_7DaysSun6_256141', + 'ext': 'mp3', + 'title': 'Morgenjournal', + 'duration': 609, + 'timestamp': 1483858796, + 'upload_date': '20170108', + }, + 'skip': 'Shows from ORF radios are only available for 7 days.' + } + + class ORFIPTVIE(InfoExtractor): IE_NAME = 'orf:iptv' IE_DESC = 'iptv.ORF.at' diff --git a/youtube_dl/extractor/packtpub.py b/youtube_dl/extractor/packtpub.py index 881f3bcc7..bb668c999 100644 --- a/youtube_dl/extractor/packtpub.py +++ b/youtube_dl/extractor/packtpub.py @@ -3,7 +3,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_HTTPError, +) from ..utils import ( clean_html, ExtractorError, @@ -11,6 +14,7 @@ from ..utils import ( strip_or_none, unified_timestamp, urljoin, + urlencode_postdata, ) @@ -34,6 +38,32 @@ class PacktPubIE(PacktPubBaseIE): 'upload_date': '20170331', }, } + _NETRC_MACHINE = 'packtpub' + _TOKEN = None + + def _real_initialize(self): + (username, password) = self._get_login_info() + if username is None: + return + webpage = self._download_webpage(self._PACKT_BASE, None) + login_form = self._form_hidden_inputs( + 'packt-user-login-form', webpage) + login_form.update({ + 'email': username, + 'password': password, + }) + self._download_webpage( + self._PACKT_BASE, None, 'Logging in as %s' % username, + data=urlencode_postdata(login_form)) + try: + self._TOKEN = self._download_json( + '%s/users/tokens/sessions' % self._MAPT_REST, None, + 'Downloading Authorization Token')['data']['token'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in 
(401, 404): + message = self._parse_json(e.cause.read().decode(), None)['message'] + raise ExtractorError(message, expected=True) + raise def _handle_error(self, response): if response.get('status') != 'success': @@ -51,14 +81,17 @@ class PacktPubIE(PacktPubBaseIE): course_id, chapter_id, video_id = mobj.group( 'course_id', 'chapter_id', 'id') + headers = {} + if self._TOKEN: + headers['Authorization'] = self._TOKEN video = self._download_json( '%s/users/me/products/%s/chapters/%s/sections/%s' % (self._MAPT_REST, course_id, chapter_id, video_id), video_id, - 'Downloading JSON video')['data'] + 'Downloading JSON video', headers=headers)['data'] content = video.get('content') if not content: - raise ExtractorError('This video is locked', expected=True) + self.raise_login_required('This video is locked') video_url = content['file'] diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 3e51b4dd7..16cc667d0 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -8,7 +8,9 @@ from ..utils import ( ExtractorError, determine_ext, int_or_none, + float_or_none, js_to_json, + orderedSet, strip_jsonp, strip_or_none, unified_strdate, @@ -264,6 +266,13 @@ class PBSIE(InfoExtractor): 'playlist_count': 2, }, { + 'url': 'http://www.pbs.org/wgbh/americanexperience/films/great-war/', + 'info_dict': { + 'id': 'great-war', + }, + 'playlist_count': 3, + }, + { 'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/', 'info_dict': { 'id': '2276541483', @@ -381,10 +390,10 @@ class PBSIE(InfoExtractor): # tabbed frontline videos MULTI_PART_REGEXES = ( r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"', - r'<a[^>]+href=["\']#video-\d+["\'][^>]+data-coveid=["\'](\d+)', + r'<a[^>]+href=["\']#(?:video-|part)\d+["\'][^>]+data-cove[Ii]d=["\'](\d+)', ) for p in MULTI_PART_REGEXES: - tabbed_videos = re.findall(p, webpage) + tabbed_videos = orderedSet(re.findall(p, webpage)) if tabbed_videos: return tabbed_videos, presumptive_id, 
upload_date, description @@ -464,6 +473,7 @@ class PBSIE(InfoExtractor): redirects.append(redirect) redirect_urls.add(redirect_url) + chapters = [] # Player pages may also serve different qualities for page in ('widget/partnerplayer', 'portalplayer'): player = self._download_webpage( @@ -479,6 +489,20 @@ class PBSIE(InfoExtractor): extract_redirect_urls(video_info) if not info: info = video_info + if not chapters: + for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player): + chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False) + if not chapter: + continue + start_time = float_or_none(chapter.get('start_time'), 1000) + duration = float_or_none(chapter.get('duration'), 1000) + if start_time is None or duration is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': start_time + duration, + 'title': chapter.get('title'), + }) formats = [] http_url = None @@ -515,7 +539,7 @@ class PBSIE(InfoExtractor): http_url = format_url self._remove_duplicate_formats(formats) m3u8_formats = list(filter( - lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none', formats)) if http_url: for m3u8_format in m3u8_formats: @@ -588,4 +612,5 @@ class PBSIE(InfoExtractor): 'upload_date': upload_date, 'formats': formats, 'subtitles': subtitles, + 'chapters': chapters, } diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index b25f1f193..1dcc8df00 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -33,7 +33,7 @@ class PornHubIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)| + (?:[a-z]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P<id>[\da-z]+) @@ -97,6 +97,9 @@ class PornHubIE(InfoExtractor): }, { 'url': 
'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex', 'only_matching': True, + }, { + 'url': 'http://www.pornhub.com/video/show?viewkey=648719015', + 'only_matching': True, }] @staticmethod diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py index ed38c77eb..e2202d603 100644 --- a/youtube_dl/extractor/r7.py +++ b/youtube_dl/extractor/r7.py @@ -62,8 +62,7 @@ class R7IE(InfoExtractor): # m3u8 format always matches the http format, let's copy metadata from # one to another m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - formats)) + lambda f: f.get('vcodec') != 'none', formats)) if len(m3u8_formats) == 1: f_copy = m3u8_formats[0].copy() f_copy.update(f) diff --git a/youtube_dl/extractor/rmcdecouverte.py b/youtube_dl/extractor/rmcdecouverte.py index 2340dae53..e921ca3e6 100644 --- a/youtube_dl/extractor/rmcdecouverte.py +++ b/youtube_dl/extractor/rmcdecouverte.py @@ -13,21 +13,20 @@ class RMCDecouverteIE(InfoExtractor): _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/mediaplayer-replay.*?\bid=(?P<id>\d+)' _TEST = { - 'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=1430&title=LES%20HEROS%20DU%2088e%20ETAGE', + 'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=13502&title=AQUAMEN:LES%20ROIS%20DES%20AQUARIUMS%20:UN%20DELICIEUX%20PROJET', 'info_dict': { - 'id': '5111223049001', + 'id': '5419055995001', 'ext': 'mp4', - 'title': ': LES HEROS DU 88e ETAGE', - 'description': 'Découvrez comment la bravoure de deux hommes dans la Tour Nord du World Trade Center a sauvé la vie d\'innombrables personnes le 11 septembre 2001.', + 'title': 'UN DELICIEUX PROJET', + 'description': 'md5:63610df7c8b1fc1698acd4d0d90ba8b5', 'uploader_id': '1969646226001', - 'upload_date': '20160904', - 'timestamp': 1472951103, + 'upload_date': '20170502', + 'timestamp': 1493745308, }, 'params': { - # rtmp download 'skip_download': True, }, - 'skip': 'Only works from France', + 'skip': 
'only available for a week', } BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s' @@ -35,5 +34,12 @@ class RMCDecouverteIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - brightcove_id = compat_parse_qs(compat_urlparse.urlparse(brightcove_legacy_url).query)['@videoPlayer'][0] - return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) + if brightcove_legacy_url: + brightcove_id = compat_parse_qs(compat_urlparse.urlparse( + brightcove_legacy_url).query)['@videoPlayer'][0] + else: + brightcove_id = self._search_regex( + r'data-video-id=["\'](\d+)', webpage, 'brightcove id') + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', + brightcove_id) diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index 9e533103c..58e0b4c80 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -26,7 +26,7 @@ class StreamCZIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti', - 'md5': '6d3ca61a8d0633c9c542b92fcb936b0c', + 'md5': '934bb6a6d220d99c010783c9719960d5', 'info_dict': { 'id': '765767', 'ext': 'mp4', @@ -37,7 +37,7 @@ class StreamCZIE(InfoExtractor): }, }, { 'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka', - 'md5': 'e54a254fb8b871968fd8403255f28589', + 'md5': '849a88c1e1ca47d41403c2ba5e59e261', 'info_dict': { 'id': '10002447', 'ext': 'mp4', @@ -85,6 +85,14 @@ class StreamCZIE(InfoExtractor): else: title = data['name'] + subtitles = {} + srt_url = data.get('subtitles_srt') + if srt_url: + subtitles['cs'] = [{ + 'ext': 'srt', + 'url': srt_url, + }] + return { 'id': video_id, 'title': title, @@ -93,4 +101,5 @@ class StreamCZIE(InfoExtractor): 'description': data.get('web_site_text'), 'duration': 
int_or_none(data.get('duration')), 'view_count': int_or_none(data.get('views')), + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 1b1afab32..3f3c681ae 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -210,7 +210,7 @@ class TEDIE(InfoExtractor): resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False)) m3u8_formats = list(filter( - lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none', formats)) if http_url: for m3u8_format in m3u8_formats: diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 9a424b1c6..de236bbba 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -80,14 +80,33 @@ class ThePlatformBaseIE(OnceIE): 'url': src, }) + duration = info.get('duration') + tp_chapters = info.get('chapters', []) + chapters = [] + if tp_chapters: + def _add_chapter(start_time, end_time): + start_time = float_or_none(start_time, 1000) + end_time = float_or_none(end_time, 1000) + if start_time is None or end_time is None: + return + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + }) + + for chapter in tp_chapters[:-1]: + _add_chapter(chapter.get('startTime'), chapter.get('endTime')) + _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration) + return { 'title': info['title'], 'subtitles': subtitles, 'description': info['description'], 'thumbnail': info['defaultThumbnailUrl'], - 'duration': int_or_none(info.get('duration'), 1000), + 'duration': float_or_none(duration, 1000), 'timestamp': int_or_none(info.get('pubDate'), 1000) or None, 'uploader': info.get('billingCode'), + 'chapters': chapters, } def _extract_theplatform_metadata(self, path, video_id): diff --git a/youtube_dl/extractor/thescene.py 
b/youtube_dl/extractor/thescene.py index b8504f0eb..cd642355c 100644 --- a/youtube_dl/extractor/thescene.py +++ b/youtube_dl/extractor/thescene.py @@ -3,10 +3,6 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urlparse -from ..utils import ( - int_or_none, - qualities, -) class TheSceneIE(InfoExtractor): @@ -24,6 +20,9 @@ class TheSceneIE(InfoExtractor): 'season': 'Ready To Wear Spring 2013', 'tags': list, 'categories': list, + 'upload_date': '20120913', + 'timestamp': 1347512400, + 'uploader': 'vogue', }, } @@ -37,32 +36,9 @@ class TheSceneIE(InfoExtractor): self._html_search_regex( r'id=\'js-player-script\'[^>]+src=\'(.+?)\'', webpage, 'player url')) - player = self._download_webpage(player_url, display_id) - info = self._parse_json( - self._search_regex( - r'(?m)video\s*:\s*({.+?}),$', player, 'info json'), - display_id) - - video_id = info['id'] - title = info['title'] - - qualities_order = qualities(('low', 'high')) - formats = [{ - 'format_id': '{0}-{1}'.format(f['type'].split('/')[0], f['quality']), - 'url': f['src'], - 'quality': qualities_order(f['quality']), - } for f in info['sources']] - self._sort_formats(formats) - return { - 'id': video_id, + '_type': 'url_transparent', 'display_id': display_id, - 'title': title, - 'formats': formats, - 'thumbnail': info.get('poster_frame'), - 'duration': int_or_none(info.get('duration')), - 'series': info.get('series_title'), - 'season': info.get('season_title'), - 'tags': info.get('tags'), - 'categories': info.get('categories'), + 'url': player_url, + 'ie_key': 'CondeNast', } diff --git a/youtube_dl/extractor/toggle.py b/youtube_dl/extractor/toggle.py index c54b876d3..348d6ecdf 100644 --- a/youtube_dl/extractor/toggle.py +++ b/youtube_dl/extractor/toggle.py @@ -17,7 +17,7 @@ from ..utils import ( class ToggleIE(InfoExtractor): IE_NAME = 'toggle' - _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:series|clips|movies)/(?:[^/]+/)+(?P<id>[0-9]+)' + 
_VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://video.toggle.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', 'info_dict': { @@ -73,6 +73,12 @@ class ToggleIE(InfoExtractor): }, { 'url': 'http://video.toggle.sg/en/movies/seven-days/321936', 'only_matching': True, + }, { + 'url': 'https://video.toggle.sg/en/tv-show/news/may-2017-cna-singapore-tonight/fri-19-may-2017/512456', + 'only_matching': True, + }, { + 'url': 'http://video.toggle.sg/en/channels/eleven-plus/401585', + 'only_matching': True, }] _FORMAT_PREFERENCES = { diff --git a/youtube_dl/extractor/toypics.py b/youtube_dl/extractor/toypics.py index 938e05076..f705a06c9 100644 --- a/youtube_dl/extractor/toypics.py +++ b/youtube_dl/extractor/toypics.py @@ -6,42 +6,48 @@ import re class ToypicsIE(InfoExtractor): - IE_DESC = 'Toypics user profile' - _VALID_URL = r'https?://videos\.toypics\.net/view/(?P<id>[0-9]+)/.*' + IE_DESC = 'Toypics video' + _VALID_URL = r'https?://videos\.toypics\.net/view/(?P<id>[0-9]+)' _TEST = { 'url': 'http://videos.toypics.net/view/514/chancebulged,-2-1/', 'md5': '16e806ad6d6f58079d210fe30985e08b', 'info_dict': { 'id': '514', 'ext': 'mp4', - 'title': 'Chance-Bulge\'d, 2', + 'title': "Chance-Bulge'd, 2", 'age_limit': 18, 'uploader': 'kidsune', } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - page = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'src:\s+"(http://static[0-9]+\.toypics\.net/flvideo/[^"]+)"', page, 'video URL') - title = self._html_search_regex( - r'<title>Toypics - ([^<]+)</title>', page, 'title') - username = self._html_search_regex( - r'toypics.net/([^/"]+)" class="user-name">', page, 'username') + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + formats = self._parse_html5_media_entries( + url, webpage, video_id)[0]['formats'] + title = self._html_search_regex([ + 
r'<h1[^>]+class=["\']view-video-title[^>]+>([^<]+)</h', + r'<title>([^<]+) - Toypics</title>', + ], webpage, 'title') + + uploader = self._html_search_regex( + r'More videos from <strong>([^<]+)</strong>', webpage, 'uploader', + fatal=False) + return { 'id': video_id, - 'url': video_url, + 'formats': formats, 'title': title, - 'uploader': username, + 'uploader': uploader, 'age_limit': 18, } class ToypicsUserIE(InfoExtractor): IE_DESC = 'Toypics user profile' - _VALID_URL = r'https?://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])' + _VALID_URL = r'https?://videos\.toypics\.net/(?!view)(?P<id>[^/?#&]+)' _TEST = { 'url': 'http://videos.toypics.net/Mikey', 'info_dict': { @@ -51,8 +57,7 @@ class ToypicsUserIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - username = mobj.group('username') + username = self._match_id(url) profile_page = self._download_webpage( url, username, note='Retrieving profile page') @@ -71,7 +76,7 @@ class ToypicsUserIE(InfoExtractor): note='Downloading page %d/%d' % (n, page_count)) urls.extend( re.findall( - r'<p class="video-entry-title">\s+<a href="(https?://videos.toypics.net/view/[^"]+)">', + r'<div[^>]+class=["\']preview[^>]+>\s*<a[^>]+href="(https?://videos\.toypics\.net/view/[^"]+)"', lpage)) return { diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index 1c0be9fc6..efeb677ee 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -13,6 +13,7 @@ from ..utils import ( xpath_attr, update_url_query, ExtractorError, + strip_or_none, ) @@ -163,17 +164,21 @@ class TurnerBaseIE(AdobePassIE): 'height': int_or_none(image.get('height')), } for image in video_data.findall('images/image')] + is_live = xpath_text(video_data, 'isLive') == 'true' + return { 'id': video_id, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'formats': formats, 'subtitles': subtitles, 'thumbnails': thumbnails, - 'description': 
xpath_text(video_data, 'description'), + 'thumbnail': xpath_text(video_data, 'poster'), + 'description': strip_or_none(xpath_text(video_data, 'description')), 'duration': parse_duration(xpath_text(video_data, 'length') or xpath_text(video_data, 'trt')), 'timestamp': self._extract_timestamp(video_data), 'upload_date': xpath_attr(video_data, 'metas', 'version'), 'series': xpath_text(video_data, 'showTitle'), 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), + 'is_live': is_live, } diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 06ea2b40a..c5b3288ad 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -150,8 +150,7 @@ class TVPEmbedIE(InfoExtractor): 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) self._sort_formats(m3u8_formats) m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - m3u8_formats)) + lambda f: f.get('vcodec') != 'none', m3u8_formats)) formats.extend(m3u8_formats) for i, m3u8_format in enumerate(m3u8_formats, 2): http_url = '%s-%d.mp4' % (video_url_base, i) diff --git a/youtube_dl/extractor/tvplayer.py b/youtube_dl/extractor/tvplayer.py index b6537141a..ebde6053f 100644 --- a/youtube_dl/extractor/tvplayer.py +++ b/youtube_dl/extractor/tvplayer.py @@ -2,9 +2,13 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( extract_attributes, + try_get, urlencode_postdata, ExtractorError, ) @@ -34,25 +38,32 @@ class TVPlayerIE(InfoExtractor): webpage, 'channel element')) title = current_channel['data-name'] - resource_id = self._search_regex( - r'resourceId\s*=\s*"(\d+)"', webpage, 'resource id') - platform = self._search_regex( - r'platform\s*=\s*"([^"]+)"', webpage, 'platform') + resource_id = current_channel['data-id'] + 
token = self._search_regex( - r'token\s*=\s*"([^"]+)"', webpage, 'token', default='null') - validate = self._search_regex( - r'validate\s*=\s*"([^"]+)"', webpage, 'validate', default='null') + r'data-token=(["\'])(?P<token>(?!\1).+)\1', webpage, + 'token', group='token') + + context = self._download_json( + 'https://tvplayer.com/watch/context', display_id, + 'Downloading JSON context', query={ + 'resource': resource_id, + 'nonce': token, + }) + + validate = context['validate'] + platform = try_get( + context, lambda x: x['platform']['key'], compat_str) or 'firefox' try: response = self._download_json( 'http://api.tvplayer.com/api/v2/stream/live', - resource_id, headers={ + display_id, 'Downloading JSON stream', headers={ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', }, data=urlencode_postdata({ + 'id': resource_id, 'service': 1, 'platform': platform, - 'id': resource_id, - 'token': token, 'validate': validate, }))['tvplayer']['response'] except ExtractorError as e: @@ -63,7 +74,7 @@ class TVPlayerIE(InfoExtractor): '%s said: %s' % (self.IE_NAME, response['error']), expected=True) raise - formats = self._extract_m3u8_formats(response['stream'], resource_id, 'mp4') + formats = self._extract_m3u8_formats(response['stream'], display_id, 'mp4') self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/upskill.py b/youtube_dl/extractor/upskill.py new file mode 100644 index 000000000..30297b4dd --- /dev/null +++ b/youtube_dl/extractor/upskill.py @@ -0,0 +1,176 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .wistia import WistiaIE +from ..compat import compat_str +from ..utils import ( + clean_html, + ExtractorError, + get_element_by_class, + urlencode_postdata, + urljoin, +) + + +class UpskillBaseIE(InfoExtractor): + _LOGIN_URL = 'http://upskillcourses.com/sign_in' + _NETRC_MACHINE = 'upskill' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = 
self._get_login_info() + if username is None: + return + + login_page, urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Downloading login page') + + login_url = compat_str(urlh.geturl()) + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'user[email]': username, + 'user[password]': password, + }) + + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>(?:(?!\1).)+)\1', login_page, + 'post url', default=login_url, group='url') + + if not post_url.startswith('http'): + post_url = urljoin(login_url, post_url) + + response = self._download_webpage( + post_url, None, 'Logging in', + data=urlencode_postdata(login_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': login_url, + }) + + # Successful login + if any(re.search(p, response) for p in ( + r'class=["\']user-signout', + r'<a[^>]+\bhref=["\']/sign_out', + r'>\s*Log out\s*<')): + return + + message = get_element_by_class('alert', response) + if message is not None: + raise ExtractorError( + 'Unable to login: %s' % clean_html(message), expected=True) + + raise ExtractorError('Unable to log in') + + +class UpskillIE(UpskillBaseIE): + _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/[^/]+/lectures/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', + 'info_dict': { + 'id': 'uzw6zw58or', + 'ext': 'mp4', + 'title': 'Welcome to the Course!', + 'description': 'md5:8d66c13403783370af62ca97a7357bdd', + 'duration': 138.763, + 'timestamp': 1479846621, + 'upload_date': '20161122', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://upskillcourses.com/courses/119763/lectures/1747100', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + wistia_url = WistiaIE._extract_url(webpage) + if not wistia_url: + if any(re.search(p, webpage) for p in ( 
+ r'class=["\']lecture-contents-locked', + r'>\s*Lecture contents locked', + r'id=["\']lecture-locked')): + self.raise_login_required('Lecture contents locked') + + title = self._og_search_title(webpage, default=None) + + return { + '_type': 'url_transparent', + 'url': wistia_url, + 'ie_key': WistiaIE.ie_key(), + 'title': title, + } + + +class UpskillCourseIE(UpskillBaseIE): + _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/(?:enrolled/)?(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/', + 'info_dict': { + 'id': '119763', + 'title': 'The Essential Web Developer Course (Free)', + }, + 'playlist_count': 192, + }, { + 'url': 'http://upskillcourses.com/courses/119763/', + 'only_matching': True, + }, { + 'url': 'http://upskillcourses.com/courses/enrolled/119763', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if UpskillIE.suitable(url) else super( + UpskillCourseIE, cls).suitable(url) + + def _real_extract(self, url): + course_id = self._match_id(url) + + webpage = self._download_webpage(url, course_id) + + course_id = self._search_regex( + r'data-course-id=["\'](\d+)', webpage, 'course id', + default=course_id) + + entries = [] + + for mobj in re.finditer( + r'(?s)(?P<li><li[^>]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?</li>)', + webpage): + li = mobj.group('li') + if 'fa-youtube-play' not in li: + continue + lecture_url = self._search_regex( + r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', li, + 'lecture url', default=None, group='url') + if not lecture_url: + continue + lecture_id = self._search_regex( + r'/lectures/(\d+)', lecture_url, 'lecture id', default=None) + title = self._html_search_regex( + r'<span[^>]+class=["\']lecture-name[^>]+>([^<]+)', li, + 'title', default=None) + entries.append( + self.url_result( + urljoin('http://upskillcourses.com/', lecture_url), + ie=UpskillIE.ie_key(), video_id=lecture_id, + video_title=clean_html(title))) + 
+ course_title = self._html_search_regex( + (r'(?s)<img[^>]+class=["\']course-image[^>]+>\s*<h\d>(.+?)</h', + r'(?s)<h\d[^>]+class=["\']course-title[^>]+>(.+?)</h'), + webpage, 'course title', fatal=False) + + return self.playlist_result(entries, course_id, course_title) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 9aa38bc5a..890a149ea 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals import re +import json from .common import InfoExtractor from ..compat import ( @@ -11,7 +12,6 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, - sanitized_Request, parse_iso8601, ) @@ -154,19 +154,24 @@ class VevoIE(VevoBaseIE): } def _initialize_api(self, video_id): - req = sanitized_Request( - 'http://www.vevo.com/auth', data=b'') webpage = self._download_webpage( - req, None, + 'https://accounts.vevo.com/token', None, note='Retrieving oauth token', - errnote='Unable to retrieve oauth token') + errnote='Unable to retrieve oauth token', + data=json.dumps({ + 'client_id': 'SPupX1tvqFEopQ1YS6SS', + 'grant_type': 'urn:vevo:params:oauth:grant-type:anonymous', + }).encode('utf-8'), + headers={ + 'Content-Type': 'application/json', + }) if re.search(r'(?i)THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION', webpage): self.raise_geo_restricted( '%s said: This page is currently unavailable in your region' % self.IE_NAME) auth_info = self._parse_json(webpage, video_id) - self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['access_token'] + self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['legacy_token'] def _call_api(self, path, *args, **kwargs): try: diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index f0a7fd739..54e207b39 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -20,7 +20,7 @@ from ..utils import ( class 
ViceBaseIE(AdobePassIE): - def _extract_preplay_video(self, url, webpage): + def _extract_preplay_video(self, url, locale, webpage): watch_hub_data = extract_attributes(self._search_regex( r'(?s)(<watch-hub\s*.+?</watch-hub>)', webpage, 'watch hub')) video_id = watch_hub_data['vms-id'] @@ -32,7 +32,8 @@ class ViceBaseIE(AdobePassIE): resource = self._get_mvpd_resource( 'VICELAND', title, video_id, watch_hub_data.get('video-rating')) - query['tvetoken'] = self._extract_mvpd_auth(url, video_id, 'VICELAND', resource) + query['tvetoken'] = self._extract_mvpd_auth( + url, video_id, 'VICELAND', resource) # signature generation algorithm is reverse engineered from signatureGenerator in # webpack:///../shared/~/vice-player/dist/js/vice-player.js in @@ -45,11 +46,14 @@ class ViceBaseIE(AdobePassIE): try: host = 'www.viceland' if is_locked else self._PREPLAY_HOST - preplay = self._download_json('https://%s.com/en_us/preplay/%s' % (host, video_id), video_id, query=query) + preplay = self._download_json( + 'https://%s.com/%s/preplay/%s' % (host, locale, video_id), + video_id, query=query) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: error = json.loads(e.cause.read().decode()) - raise ExtractorError('%s said: %s' % (self.IE_NAME, error['details']), expected=True) + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, error['details']), expected=True) raise video_data = preplay['video'] @@ -88,41 +92,30 @@ class ViceBaseIE(AdobePassIE): class ViceIE(ViceBaseIE): - _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?videos?/(?P<id>[^/?#&]+)' + IE_NAME = 'vice' + _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:(?P<locale>[^/]+)/)?videos?/(?P<id>[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.vice.com/video/cowboy-capitalists-part-1', - 'md5': 'e9d77741f9e42ba583e683cd170660f7', + 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', + 'md5': '7d3ae2f9ba5f196cdd9f9efd43657ac2', 'info_dict': { - 'id': 
'43cW1mYzpia9IlestBjVpd23Yu3afAfp', + 'id': 'N2bzkydjraWDGwnt8jAttCF6Y0PDv4Zj', 'ext': 'flv', - 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', - 'duration': 725.983, + 'title': 'Monkey Labs of Holland', + 'description': 'md5:92b3c7dcbfe477f772dd4afa496c9149', }, 'add_ie': ['Ooyala'], }, { - 'url': 'http://www.vice.com/video/how-to-hack-a-car', - 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', - 'info_dict': { - 'id': '3jstaBeXgAs', - 'ext': 'mp4', - 'title': 'How to Hack a Car: Phreaked Out (Episode 2)', - 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30', - 'uploader_id': 'MotherboardTV', - 'uploader': 'Motherboard', - 'upload_date': '20140529', - }, - 'add_ie': ['Youtube'], - }, { 'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56', - 'md5': '', 'info_dict': { 'id': '5816510690b70e6c5fd39a56', 'ext': 'mp4', 'uploader': 'Waypoint', 'title': 'The Signal From Tölva', + 'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5', 'uploader_id': '57f7d621e05ca860fa9ccaf9', - 'timestamp': 1477941983938, + 'timestamp': 1477941983, + 'upload_date': '20161031', }, 'params': { # m3u8 download @@ -130,19 +123,31 @@ class ViceIE(ViceBaseIE): }, 'add_ie': ['UplynkPreplay'], }, { - 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', - 'only_matching': True, - }, { - 'url': 'http://www.vice.com/ru/video/big-night-out-ibiza-clive-martin-229', - 'only_matching': True, + 'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f', + 'info_dict': { + 'id': '581b12b60a0e1f4c0fb6ea2f', + 'ext': 'mp4', + 'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1', + 'description': '<p>Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.</p>', + 'uploader': 'VICE', + 'uploader_id': '57a204088cb727dec794c67b', + 'timestamp': 1485368119, + 'upload_date': '20170125', + 'age_limit': 14, + 
}, + 'params': { + # AES-encrypted m3u8 + 'skip_download': True, + }, + 'add_ie': ['UplynkPreplay'], }, { - 'url': 'https://munchies.vice.com/en/videos/watch-the-trailer-for-our-new-series-the-pizza-show', + 'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4', 'only_matching': True, }] _PREPLAY_HOST = 'video.vice' def _real_extract(self, url): - video_id = self._match_id(url) + locale, video_id = re.match(self._VALID_URL, url).groups() webpage, urlh = self._download_webpage_handle(url, video_id) embed_code = self._search_regex( r'embedCode=([^&\'"]+)', webpage, @@ -153,10 +158,11 @@ class ViceIE(ViceBaseIE): r'data-youtube-id="([^"]+)"', webpage, 'youtube id', default=None) if youtube_id: return self.url_result(youtube_id, 'Youtube') - return self._extract_preplay_video(urlh.geturl(), webpage) + return self._extract_preplay_video(urlh.geturl(), locale, webpage) class ViceShowIE(InfoExtractor): + IE_NAME = 'vice:show' _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)' _TEST = { @@ -183,6 +189,86 @@ class ViceShowIE(InfoExtractor): r'<title>(.+?)</title>', webpage, 'title', default=None) if title: title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip() - description = self._html_search_meta('description', webpage, 'description') + description = self._html_search_meta( + 'description', webpage, 'description') return self.playlist_result(entries, show_id, title, description) + + +class ViceArticleIE(InfoExtractor): + IE_NAME = 'vice:article' + _VALID_URL = r'https://www.vice.com/[^/]+/article/(?P<id>[^?#]+)' + + _TESTS = [{ + 'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah', + 'info_dict': { + 'id': '58dc0a3dee202d2a0ccfcbd8', + 'ext': 'mp4', + 'title': 'Mormon War on Porn ', + 'description': 'md5:ad396a2481e7f8afb5ed486878421090', + 'uploader': 'VICE', + 'uploader_id': '57a204088cb727dec794c693', + 'timestamp': 1489160690, + 'upload_date': '20170310', + }, + 
'params': { + # AES-encrypted m3u8 + 'skip_download': True, + }, + 'add_ie': ['UplynkPreplay'], + }, { + 'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car', + 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', + 'info_dict': { + 'id': '3jstaBeXgAs', + 'ext': 'mp4', + 'title': 'How to Hack a Car: Phreaked Out (Episode 2)', + 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30', + 'uploader_id': 'MotherboardTV', + 'uploader': 'Motherboard', + 'upload_date': '20140529', + }, + 'add_ie': ['Youtube'], + }, { + 'url': 'https://www.vice.com/en_us/article/cowboy-capitalists-part-1', + 'only_matching': True, + }, { + 'url': 'https://www.vice.com/ru/article/big-night-out-ibiza-clive-martin-229', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + prefetch_data = self._parse_json(self._search_regex( + r'window\.__PREFETCH_DATA\s*=\s*({.*});', + webpage, 'prefetch data'), display_id) + body = prefetch_data['body'] + + def _url_res(video_url, ie_key): + return { + '_type': 'url_transparent', + 'url': video_url, + 'display_id': display_id, + 'ie_key': ie_key, + } + + embed_code = self._search_regex( + r'embedCode=([^&\'"]+)', body, + 'ooyala embed code', default=None) + if embed_code: + return _url_res('ooyala:%s' % embed_code, 'Ooyala') + + youtube_url = self._html_search_regex( + r'<iframe[^>]+src="(.*youtube\.com/.*)"', + body, 'YouTube URL', default=None) + if youtube_url: + return _url_res(youtube_url, 'Youtube') + + video_url = self._html_search_regex( + r'data-video-url="([^"]+)"', + prefetch_data['embed_code'], 'video URL') + + return _url_res(video_url, ViceIE.ie_key()) diff --git a/youtube_dl/extractor/viceland.py b/youtube_dl/extractor/viceland.py index 87f9216b5..bd60235c8 100644 --- a/youtube_dl/extractor/viceland.py +++ b/youtube_dl/extractor/viceland.py @@ -1,11 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .vice import 
ViceBaseIE class VicelandIE(ViceBaseIE): - _VALID_URL = r'https?://(?:www\.)?viceland\.com/[^/]+/video/[^/]+/(?P<id>[a-f0-9]+)' + _VALID_URL = r'https?://(?:www\.)?viceland\.com/(?P<locale>[^/]+)/video/[^/]+/(?P<id>[a-f0-9]+)' _TEST = { 'url': 'https://www.viceland.com/en_us/video/trapped/588a70d0dba8a16007de7316', 'info_dict': { @@ -24,10 +26,13 @@ class VicelandIE(ViceBaseIE): 'skip_download': True, }, 'add_ie': ['UplynkPreplay'], + 'skip': '404', } _PREPLAY_HOST = 'www.viceland' def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + locale = mobj.group('locale') webpage = self._download_webpage(url, video_id) - return self._extract_preplay_video(url, webpage) + return self._extract_preplay_video(url, locale, webpage) diff --git a/youtube_dl/extractor/videopress.py b/youtube_dl/extractor/videopress.py index 049db25a5..e5f964d39 100644 --- a/youtube_dl/extractor/videopress.py +++ b/youtube_dl/extractor/videopress.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import random import re from .common import InfoExtractor @@ -11,6 +10,7 @@ from ..utils import ( float_or_none, parse_age_limit, qualities, + random_birthday, try_get, unified_timestamp, urljoin, @@ -47,13 +47,10 @@ class VideoPressIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + query = random_birthday('birth_year', 'birth_month', 'birth_day') video = self._download_json( 'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id, - video_id, query={ - 'birth_month': random.randint(1, 12), - 'birth_day': random.randint(1, 31), - 'birth_year': random.randint(1950, 1995), - }) + video_id, query=query) title = video['title'] diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py index 5ef7635b6..3e67eb8c2 100644 --- a/youtube_dl/extractor/vier.py +++ b/youtube_dl/extractor/vier.py @@ -5,24 +5,30 @@ import re import itertools from .common import 
InfoExtractor +from ..utils import ( + urlencode_postdata, + int_or_none, + unified_strdate, +) class VierIE(InfoExtractor): IE_NAME = 'vier' IE_DESC = 'vier.be and vijf.be' _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))' + _NETRC_MACHINE = 'vier' _TESTS = [{ 'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129', + 'md5': 'e4ae2054a6b040ef1e289e20d111b46e', 'info_dict': { 'id': '16129', 'display_id': 'het-wordt-warm-de-moestuin', 'ext': 'mp4', 'title': 'Het wordt warm in De Moestuin', 'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...', - }, - 'params': { - # m3u8 download - 'skip_download': True, + 'upload_date': '20121025', + 'series': 'Plan B', + 'tags': ['De Moestuin', 'Moestuin', 'meisjes', 'Tomaat', 'Wim', 'Droom'], }, }, { 'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614', @@ -30,32 +36,103 @@ class VierIE(InfoExtractor): 'id': '2561614', 'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas', 'ext': 'mp4', - 'title': 'ZO grappig: Temptation Island hosts moeten kiezen tussen onmogelijke dilemma\'s', - 'description': 'Het spel is simpel: Annelien Coorevits en Rick Brandsteder krijgen telkens 2 dilemma\'s voorgeschoteld en ze MOETEN een keuze maken.', + 'title': 'md5:84f45fe48b8c1fa296a7f6d208d080a7', + 'description': 'md5:0356d4981e58b8cbee19355cbd51a8fe', + 'upload_date': '20170228', + 'series': 'Temptation Island', + 'tags': list, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', + 'info_dict': { + 'id': '2674839', + 'display_id': 'jani-gaat-naar-tokio-aflevering-4', + 'ext': 'mp4', + 'title': 'Jani gaat naar Tokio - Aflevering 4', + 'description': 'md5:aa8d611541db6ae9e863125704511f88', + 
'upload_date': '20170501', + 'series': 'Jani gaat', + 'episode_number': 4, + 'tags': ['Jani Gaat', 'Volledige Aflevering'], + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires account credentials', + }, { + # Requires account credentials but bypassed extraction via v3/embed page + # without metadata + 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', + 'info_dict': { + 'id': '2674839', + 'display_id': 'jani-gaat-naar-tokio-aflevering-4', + 'ext': 'mp4', + 'title': 'jani-gaat-naar-tokio-aflevering-4', }, 'params': { - # m3u8 download 'skip_download': True, }, + 'expected_warnings': ['Log in to extract metadata'], }, { - 'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen', + # Without video id in URL + 'url': 'http://www.vier.be/planb/videos/dit-najaar-plan-b', 'only_matching': True, }, { 'url': 'http://www.vier.be/video/v3/embed/16129', 'only_matching': True, }] + def _real_initialize(self): + self._logged_in = False + + def _login(self, site): + username, password = self._get_login_info() + if username is None or password is None: + return + + login_page = self._download_webpage( + 'http://www.%s.be/user/login' % site, + None, note='Logging in', errnote='Unable to log in', + data=urlencode_postdata({ + 'form_id': 'user_login', + 'name': username, + 'pass': password, + }), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + login_error = self._html_search_regex( + r'(?s)<div class="messages error">\s*<div>\s*<h2.+?</h2>(.+?)<', + login_page, 'login error', default=None) + if login_error: + self.report_warning('Unable to log in: %s' % login_error) + else: + self._logged_in = True + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) embed_id = mobj.group('embed_id') display_id = mobj.group('display_id') or embed_id + video_id = mobj.group('id') or embed_id site = mobj.group('site') + if not self._logged_in: + self._login(site) + webpage = 
self._download_webpage(url, display_id) + if r'id="user-login"' in webpage: + self.report_warning( + 'Log in to extract metadata', video_id=display_id) + webpage = self._download_webpage( + 'http://www.%s.be/video/v3/embed/%s' % (site, video_id), + display_id) + video_id = self._search_regex( [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'], - webpage, 'video id') + webpage, 'video id', default=video_id or display_id) application = self._search_regex( [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'], webpage, 'application', default=site + '_vod') @@ -64,12 +141,25 @@ class VierIE(InfoExtractor): webpage, 'filename') playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename) - formats = self._extract_wowza_formats(playlist_url, display_id, skip_protocols=['dash']) + formats = self._extract_wowza_formats( + playlist_url, display_id, skip_protocols=['dash']) self._sort_formats(formats) title = self._og_search_title(webpage, default=display_id) - description = self._og_search_description(webpage, default=None) + description = self._html_search_regex( + r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-type-text-with-summary\b[^>]*?\1[^>]*>.*?<p>(?P<value>.+?)</p>', + webpage, 'description', default=None, group='value') thumbnail = self._og_search_thumbnail(webpage, default=None) + upload_date = unified_strdate(self._html_search_regex( + r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-name-post-date\b[^>]*?\1[^>]*>.*?(?P<value>\d{2}/\d{2}/\d{4})', + webpage, 'upload date', default=None, group='value')) + + series = self._search_regex( + r'data-program=(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, + 'series', default=None, group='value') + episode_number = int_or_none(self._search_regex( + r'(?i)aflevering (\d+)', title, 'episode number', default=None)) + tags = re.findall(r'<a\b[^>]+\bhref=["\']/tags/[^>]+>([^<]+)<', webpage) return { 'id': video_id, @@ -77,6 +167,10 @@ class VierIE(InfoExtractor): 'title': title, 
'description': description, 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'series': series, + 'episode_number': episode_number, + 'tags': tags, 'formats': formats, } diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index fcf0cb100..d5d5b4c69 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -176,8 +176,7 @@ class ViewsterIE(InfoExtractor): if m3u8_formats: self._sort_formats(m3u8_formats) m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - m3u8_formats)) + lambda f: f.get('vcodec') != 'none', m3u8_formats)) if len(qualities) == len(m3u8_formats): for q, m3u8_format in zip(qualities, m3u8_formats): f = m3u8_format.copy() diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py index 487047fd7..9959627c0 100644 --- a/youtube_dl/extractor/vrv.py +++ b/youtube_dl/extractor/vrv.py @@ -112,21 +112,41 @@ class VRVIE(VRVBaseIE): audio_locale = streams_json.get('audio_locale') formats = [] - for stream_id, stream in streams_json.get('streams', {}).get('adaptive_hls', {}).items(): - stream_url = stream.get('url') - if not stream_url: - continue - stream_id = stream_id or audio_locale - m3u8_formats = self._extract_m3u8_formats( - stream_url, video_id, 'mp4', m3u8_id=stream_id, - note='Downloading %s m3u8 information' % stream_id, - fatal=False) - if audio_locale: - for f in m3u8_formats: - f['language'] = audio_locale - formats.extend(m3u8_formats) + for stream_type, streams in streams_json.get('streams', {}).items(): + if stream_type in ('adaptive_hls', 'adaptive_dash'): + for stream in streams.values(): + stream_url = stream.get('url') + if not stream_url: + continue + stream_id = stream.get('hardsub_locale') or audio_locale + format_id = '%s-%s' % (stream_type.split('_')[1], stream_id) + if stream_type == 'adaptive_hls': + adaptive_formats = self._extract_m3u8_formats( + stream_url, video_id, 'mp4', m3u8_id=format_id, + 
note='Downloading %s m3u8 information' % stream_id, + fatal=False) + else: + adaptive_formats = self._extract_mpd_formats( + stream_url, video_id, mpd_id=format_id, + note='Downloading %s MPD information' % stream_id, + fatal=False) + if audio_locale: + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = audio_locale + formats.extend(adaptive_formats) self._sort_formats(formats) + subtitles = {} + for subtitle in streams_json.get('subtitles', {}).values(): + subtitle_url = subtitle.get('url') + if not subtitle_url: + continue + subtitles.setdefault(subtitle.get('locale', 'en-US'), []).append({ + 'url': subtitle_url, + 'ext': subtitle.get('format', 'ass'), + }) + thumbnails = [] for thumbnail in video_data.get('images', {}).get('thumbnails', []): thumbnail_url = thumbnail.get('source') @@ -142,6 +162,7 @@ class VRVIE(VRVBaseIE): 'id': video_id, 'title': title, 'formats': formats, + 'subtitles': subtitles, 'thumbnails': thumbnails, 'description': video_data.get('description'), 'duration': float_or_none(video_data.get('duration_ms'), 1000), diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py index 839cad986..625d0a1cc 100644 --- a/youtube_dl/extractor/washingtonpost.py +++ b/youtube_dl/extractor/washingtonpost.py @@ -13,6 +13,7 @@ from ..utils import ( class WashingtonPostIE(InfoExtractor): IE_NAME = 'washingtonpost' _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' _TEST = { 'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d', 'md5': '6f537e1334b714eb15f9563bd4b9cdfa', @@ -27,6 +28,11 @@ class WashingtonPostIE(InfoExtractor): }, } + @classmethod + def _extract_urls(cls, webpage): + return re.findall( + r'<iframe[^>]+\bsrc=["\'](%s)' % 
cls._EMBED_URL, webpage) + def _real_extract(self, url): video_id = self._match_id(url) video_data = self._download_json( diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index c634b8dec..2182d6fd4 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -1,10 +1,13 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, float_or_none, + unescapeHTML, ) @@ -34,6 +37,25 @@ class WistiaIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_url(webpage): + match = re.search( + r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) + if match: + return unescapeHTML(match.group('url')) + + match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) + if match: + return 'wistia:%s' % match.group('id') + + match = re.search( + r'''(?sx) + <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? 
+ <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2 + ''', webpage) + if match: + return 'wistia:%s' % match.group('id') + def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 5584674a0..bea9b87ad 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( int_or_none, + js_to_json, orderedSet, parse_duration, sanitized_Request, @@ -38,6 +39,22 @@ class XTubeIE(InfoExtractor): 'age_limit': 18, } }, { + # FLV videos with duplicated formats + 'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752', + 'md5': 'a406963eb349dd43692ec54631efd88b', + 'info_dict': { + 'id': '9299752', + 'display_id': 'A-Super-Run-Part-1-YT', + 'ext': 'flv', + 'title': 'A Super Run - Part 1 (YT)', + 'description': 'md5:ca0d47afff4a9b2942e4b41aa970fd93', + 'uploader': 'tshirtguy59', + 'duration': 579, + 'view_count': int, + 'comment_count': int, + 'age_limit': 18, + }, + }, { # new URL schema 'url': 'http://www.xtube.com/video-watch/strange-erotica-625837', 'only_matching': True, @@ -68,8 +85,9 @@ class XTubeIE(InfoExtractor): }) sources = self._parse_json(self._search_regex( - r'(["\'])sources\1\s*:\s*(?P<sources>{.+?}),', - webpage, 'sources', group='sources'), video_id) + r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),', + webpage, 'sources', group='sources'), video_id, + transform_source=js_to_json) formats = [] for format_id, format_url in sources.items(): @@ -78,6 +96,7 @@ class XTubeIE(InfoExtractor): 'format_id': format_id, 'height': int_or_none(format_id), }) + self._remove_duplicate_formats(formats) self._sort_formats(formats) title = self._search_regex( diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 30825daae..eca603028 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -6,8 +6,10 @@ from 
.common import InfoExtractor from ..compat import compat_urllib_parse_unquote from ..utils import ( clean_html, - ExtractorError, determine_ext, + ExtractorError, + int_or_none, + parse_duration, ) @@ -20,6 +22,7 @@ class XVideosIE(InfoExtractor): 'id': '4588838', 'ext': 'mp4', 'title': 'Biker Takes his Girl', + 'duration': 108, 'age_limit': 18, } } @@ -36,6 +39,11 @@ class XVideosIE(InfoExtractor): r'<title>(.*?)\s+-\s+XVID', webpage, 'title') video_thumbnail = self._search_regex( r'url_bigthumb=(.+?)&', webpage, 'thumbnail', fatal=False) + video_duration = int_or_none(self._og_search_property( + 'duration', webpage, default=None)) or parse_duration( + self._search_regex( + r'<span[^>]+class=["\']duration["\'][^>]*>.*?(\d[^<]+)', + webpage, 'duration', fatal=False)) formats = [] @@ -67,6 +75,7 @@ class XVideosIE(InfoExtractor): 'id': video_id, 'formats': formats, 'title': video_title, + 'duration': video_duration, 'thumbnail': video_thumbnail, 'age_limit': 18, } diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index fd6268ba4..eb1062142 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -234,7 +234,8 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'overembed': 'false', })['playlist'] - tracks, track_ids = playlist['tracks'], map(compat_str, playlist['trackIds']) + tracks = playlist['tracks'] + track_ids = [compat_str(track_id) for track_id in playlist['trackIds']] # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks, # missing tracks should be retrieved manually. 
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9e2b9115c..d66693c0c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -38,7 +38,6 @@ from ..utils import ( parse_duration, remove_quotes, remove_start, - sanitized_Request, smuggle_url, str_to_int, try_get, @@ -54,7 +53,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' - _PASSWORD_CHALLENGE_URL = 'https://accounts.google.com/signin/challenge/sl/password' + + _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup' + _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge' + _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' + _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False @@ -96,72 +99,150 @@ class YoutubeBaseInfoExtractor(InfoExtractor): login_form = self._hidden_inputs(login_page) - login_form.update({ - 'checkConnection': 'youtube', - 'Email': username, - 'Passwd': password, - }) + def req(url, f_req, note, errnote): + data = login_form.copy() + data.update({ + 'pstMsg': 1, + 'checkConnection': 'youtube', + 'checkedDomains': 'youtube', + 'hl': 'en', + 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]', + 'f.req': json.dumps(f_req), + 'flowName': 'GlifWebSignIn', + 'flowEntry': 'ServiceLogin', + }) + return self._download_json( + url, None, note=note, errnote=errnote, + transform_source=lambda s: re.sub(r'^[^[]*', '', s), + fatal=False, + data=urlencode_postdata(data), headers={ + 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8', + 'Google-Accounts-XSRF': 1, + }) - login_results = self._download_webpage( - self._PASSWORD_CHALLENGE_URL, None, - note='Logging in', errnote='unable to log in', fatal=False, - 
data=urlencode_postdata(login_form)) - if login_results is False: - return False + def warn(message): + self._downloader.report_warning(message) + + lookup_req = [ + username, + None, [], None, 'US', None, None, 2, False, True, + [ + None, None, + [2, 1, None, 1, + 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', + None, [], 4], + 1, [None, None, []], None, None, None, True + ], + username, + ] - error_msg = self._html_search_regex( - r'<[^>]+id="errormsg_0_Passwd"[^>]*>([^<]+)<', - login_results, 'error message', default=None) - if error_msg: - raise ExtractorError('Unable to login: %s' % error_msg, expected=True) + lookup_results = req( + self._LOOKUP_URL, lookup_req, + 'Looking up account info', 'Unable to look up account info') - if re.search(r'id="errormsg_0_Passwd"', login_results) is not None: - raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True) + if lookup_results is False: + return False - # Two-Factor - # TODO add SMS and phone call support - these require making a request and then prompting the user + user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str) + if not user_hash: + warn('Unable to extract user hash') + return False - if re.search(r'(?i)<form[^>]+id="challenge"', login_results) is not None: - tfa_code = self._get_tfa_info('2-step verification code') + challenge_req = [ + user_hash, + None, 1, None, [1, None, None, None, [password, None, True]], + [ + None, None, [2, 1, None, 1, 
'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4], + 1, [None, None, []], None, None, None, True + ]] - if not tfa_code: - self._downloader.report_warning( - 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>' - '(Note that only TOTP (Google Authenticator App) codes work at this time.)') - return False + challenge_results = req( + self._CHALLENGE_URL, challenge_req, + 'Logging in', 'Unable to log in') - tfa_code = remove_start(tfa_code, 'G-') + if challenge_results is False: + return - tfa_form_strs = self._form_hidden_inputs('challenge', login_results) + login_res = try_get(challenge_results, lambda x: x[0][5], list) + if login_res: + login_msg = try_get(login_res, lambda x: x[5], compat_str) + warn( + 'Unable to login: %s' % 'Invalid password' + if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg) + return False - tfa_form_strs.update({ - 'Pin': tfa_code, - 'TrustDevice': 'on', - }) + res = try_get(challenge_results, lambda x: x[0][-1], list) + if not res: + warn('Unable to extract result entry') + return False - tfa_data = urlencode_postdata(tfa_form_strs) + tfa = try_get(res, lambda x: x[0][0], list) + if tfa: + tfa_str = try_get(tfa, lambda x: x[2], compat_str) + if tfa_str == 'TWO_STEP_VERIFICATION': + # SEND_SUCCESS - TFA code has been successfully sent to phone + # QUOTA_EXCEEDED - reached the limit of TFA codes + status = try_get(tfa, lambda x: x[5], compat_str) + if status == 'QUOTA_EXCEEDED': + warn('Exceeded the limit of TFA codes, try later') + return False + + tl = try_get(challenge_results, lambda x: x[1][2], compat_str) + if not tl: + warn('Unable to extract TL') + return False + + tfa_code = self._get_tfa_info('2-step verification code') + + if not 
tfa_code: + warn( + 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>' + '(Note that only TOTP (Google Authenticator App) codes work at this time.)') + return False + + tfa_code = remove_start(tfa_code, 'G-') + + tfa_req = [ + user_hash, None, 2, None, + [ + 9, None, None, None, None, None, None, None, + [None, tfa_code, True, 2] + ]] + + tfa_results = req( + self._TFA_URL.format(tl), tfa_req, + 'Submitting TFA code', 'Unable to submit TFA code') + + if tfa_results is False: + return False + + tfa_res = try_get(tfa_results, lambda x: x[0][5], list) + if tfa_res: + tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str) + warn( + 'Unable to finish TFA: %s' % 'Invalid TFA code' + if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg) + return False + + check_cookie_url = try_get( + tfa_results, lambda x: x[0][-1][2], compat_str) + else: + check_cookie_url = try_get(res, lambda x: x[2], compat_str) - tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data) - tfa_results = self._download_webpage( - tfa_req, None, - note='Submitting TFA code', errnote='unable to submit tfa', fatal=False) + if not check_cookie_url: + warn('Unable to extract CheckCookie URL') + return False - if tfa_results is False: - return False + check_cookie_results = self._download_webpage( + check_cookie_url, None, 'Checking cookie', fatal=False) - if re.search(r'(?i)<form[^>]+id="challenge"', tfa_results) is not None: - self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.') - return False - if re.search(r'(?i)<form[^>]+id="gaia_loginform"', tfa_results) is not None: - self._downloader.report_warning('unable to log in - did the page structure change?') - return False - if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None: - self._downloader.report_warning('Your Google account has a security notice. 
Please log in on your web browser, resolve the notice, and try again.') - return False + if check_cookie_results is False: + return False - if re.search(r'(?i)<form[^>]+id="gaia_loginform"', login_results) is not None: - self._downloader.report_warning('unable to log in: bad username or password') + if 'https://myaccount.google.com/' not in check_cookie_results: + warn('Unable to log in') return False + return True def _real_initialize(self): @@ -963,7 +1044,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_signature_function(self, video_id, player_url, example_sig): id_m = re.match( - r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$', + r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$', player_url) if not id_m: raise ExtractorError('Cannot identify player %r' % player_url) @@ -1257,6 +1338,35 @@ class YoutubeIE(YoutubeBaseInfoExtractor): url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.') + @staticmethod + def _extract_chapters(description, duration): + if not description: + return None + chapter_lines = re.findall( + r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)', + description) + if not chapter_lines: + return None + chapters = [] + for next_num, (chapter_line, time_point) in enumerate( + chapter_lines, start=1): + start_time = parse_duration(time_point) + if start_time is None: + continue + end_time = (duration if next_num == len(chapter_lines) + else parse_duration(chapter_lines[next_num][1])) + if end_time is None: + continue + chapter_title = re.sub( + r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-') + chapter_title = re.sub(r'\s+', ' ', chapter_title) + chapters.append({ + 
'start_time': start_time, + 'end_time': end_time, + 'title': chapter_title, + }) + return chapters + def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -1325,6 +1435,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: age_gate = False video_info = None + sts = None # Try looking directly into the video webpage ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) if ytplayer_config: @@ -1341,6 +1452,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) if args.get('livestream') == '1' or args.get('live_playback') == 1: is_live = True + sts = ytplayer_config.get('sts') if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): # We also try looking in get_video_info since it may contain different dashmpd # URL that points to a DASH manifest with possibly different itag set (some itags @@ -1349,14 +1461,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # The general idea is to take a union of itags of both DASH manifests (for example # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093) self.report_video_info_webpage_download(video_id) - for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']: - video_info_url = ( - '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' - % (proto, video_id, el_type)) + for el in ('info', 'embedded', 'detailpage', 'vevo', ''): + query = { + 'video_id': video_id, + 'ps': 'default', + 'eurl': '', + 'gl': 'US', + 'hl': 'en', + } + if el: + query['el'] = el + if sts: + query['sts'] = sts video_info_webpage = self._download_webpage( - video_info_url, + '%s://www.youtube.com/get_video_info' % proto, video_id, note=False, - errnote='unable to download video info webpage') + errnote='unable to download video info webpage', + fatal=False, query=query) + if not video_info_webpage: + continue get_video_info = 
compat_parse_qs(video_info_webpage) if get_video_info.get('use_cipher_signature') != ['True']: add_dash_mpd(get_video_info) @@ -1399,9 +1522,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_title = '_' # description - video_description = get_element_by_id("eow-description", video_webpage) + description_original = video_description = get_element_by_id("eow-description", video_webpage) if video_description: - video_description = re.sub(r'''(?x) + description_original = video_description = re.sub(r'''(?x) <a\s+ (?:[a-zA-Z-]+="[^"]*"\s+)*? (?:title|href)="([^"]+)"\s+ @@ -1558,6 +1681,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if self._downloader.params.get('writeannotations', False): video_annotations = self._extract_annotations(video_id) + chapters = self._extract_chapters(description_original, video_duration) + if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): self.report_rtmp_download() formats = [{ @@ -1629,7 +1754,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_desc = 'flash player %s' % player_version else: player_version = self._search_regex( - [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'], + [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', + r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'], player_url, 'html5 player', fatal=False) player_desc = 'html5 player %s' % player_version @@ -1789,6 +1915,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'duration': video_duration, 'age_limit': 18 if age_gate else 0, 'annotations': video_annotations, + 'chapters': chapters, 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id, 'view_count': view_count, 'like_count': like_count, diff --git a/youtube_dl/extractor/zaq1.py b/youtube_dl/extractor/zaq1.py new file mode 100644 index 000000000..889aff5d8 --- /dev/null +++ b/youtube_dl/extractor/zaq1.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from 
from ..utils import (
    int_or_none,
    unified_timestamp,
)


class Zaq1IE(InfoExtractor):
    """Extractor for single video pages on zaq1.pl.

    The media URL is read from the page's ``data-video-url`` attribute.
    Metadata is taken from the page's JSON-LD when available; each field
    that JSON-LD did not provide falls back to a ``data-*`` attribute or
    an HTML meta tag.
    """

    _VALID_URL = r'https?://(?:www\.)?zaq1\.pl/video/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'http://zaq1.pl/video/xev0e',
        'md5': '24a5eb3f052e604ae597c4d0d19b351e',
        'info_dict': {
            'id': 'xev0e',
            'title': 'DJ NA WESELE. TANIEC Z FIGURAMI.węgrów/sokołów podlaski/siedlce/mińsk mazowiecki/warszawa',
            'description': 'www.facebook.com/weseledjKontakt: 728 448 199 / 505 419 147',
            'ext': 'mp4',
            'duration': 511,
            'timestamp': 1490896361,
            'uploader': 'Anonim',
            'upload_date': '20170330',
            'view_count': int,
        }
    }, {
        # malformed JSON-LD
        'url': 'http://zaq1.pl/video/x81vn',
        'info_dict': {
            'id': 'x81vn',
            'title': 'SEKRETNE ŻYCIE WALTERA MITTY',
            'ext': 'mp4',
            'duration': 6234,
            'timestamp': 1493494860,
            'uploader': 'Anonim',
            'upload_date': '20170429',
            'view_count': int,
        },
        'params': {
            'skip_download': True,
        },
        'expected_warnings': ['Failed to parse JSON'],
    }]

    def _real_extract(self, url):
        """Build the info dict for the zaq1.pl video page at *url*."""
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        # Unlike the optional fields below, the media URL lookup is not
        # marked non-fatal: nothing can be extracted without it.
        video_url = self._search_regex(
            r'data-video-url=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
            'video url', group='url')

        # JSON-LD is optional and may be malformed (see the second test),
        # so it is looked up non-fatally. `or {}` is defensive: the result
        # is used as a dict right away, so a falsy return must not crash.
        info = self._search_json_ld(webpage, video_id, fatal=False) or {}

        def extract_data(field, name, fatal=False):
            # Read the quoted value of the page's `data-<field>` attribute;
            # `name` is the human-readable label used in messages.
            return self._search_regex(
                r'data-%s=(["\'])(?P<field>(?:(?!\1).)+)\1' % field,
                webpage, name, fatal=fatal, group='field')

        # Fill in only what JSON-LD did not provide.
        if not info.get('title'):
            info['title'] = extract_data('file-name', 'title', fatal=True)

        if not info.get('duration'):
            info['duration'] = int_or_none(extract_data('duration', 'duration'))

        if not info.get('thumbnail'):
            info['thumbnail'] = extract_data('photo-url', 'thumbnail')

        if not info.get('timestamp'):
            info['timestamp'] = unified_timestamp(self._html_search_meta(
                'uploadDate', webpage, 'timestamp'))

        # Guard on the key that is actually written ('view_count'), like
        # the other fallbacks; 'interactionCount' is only the meta tag name.
        if not info.get('view_count'):
            info['view_count'] = int_or_none(self._html_search_meta(
                'interactionCount', webpage, 'view count'))

        uploader = self._html_search_regex(
            r'Wideo dodał:\s*<a[^>]*>([^<]+)</a>', webpage, 'uploader',
            fatal=False)

        width = int_or_none(self._html_search_meta(
            'width', webpage, fatal=False))
        height = int_or_none(self._html_search_meta(
            'height', webpage, fatal=False))

        info.update({
            'id': video_id,
            'formats': [{
                'url': video_url,
                'width': width,
                'height': height,
                # The page URL is sent as Referer when fetching the media.
                'http_headers': {
                    'Referer': url,
                },
            }],
            'uploader': uploader,
        })

        return info