diff options
Diffstat (limited to 'youtube_dl')
66 files changed, 2158 insertions, 938 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index f7254560c..21586f0f4 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -29,7 +29,6 @@ import random from .compat import ( compat_basestring, compat_cookiejar, - compat_expanduser, compat_get_terminal_size, compat_http_client, compat_kwargs, @@ -54,6 +53,7 @@ from .utils import ( encode_compat_str, encodeFilename, error_to_compat_str, + expand_path, ExtractorError, format_bytes, formatSeconds, @@ -616,7 +616,7 @@ class YoutubeDL(object): sanitize = lambda k, v: sanitize_filename( compat_str(v), restricted=self.params.get('restrictfilenames'), - is_id=(k == 'id')) + is_id=(k == 'id' or k.endswith('_id'))) template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v)) for k, v in template_dict.items() if v is not None and not isinstance(v, (list, tuple, dict))) @@ -672,7 +672,7 @@ class YoutubeDL(object): FORMAT_RE.format(numeric_field), r'%({0})s'.format(numeric_field), outtmpl) - tmpl = compat_expanduser(outtmpl) + tmpl = expand_path(outtmpl) filename = tmpl % template_dict # Temporary fix for #4787 # 'Treat' all problem characters by passing filename through preferredencoding @@ -1872,6 +1872,7 @@ class YoutubeDL(object): """Download a given list of URLs.""" outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) if (len(url_list) > 1 and + outtmpl != '-' and '%' not in outtmpl and self.params.get('max_downloads') != 1): raise SameFileError(outtmpl) @@ -2169,7 +2170,7 @@ class YoutubeDL(object): if opts_cookiefile is None: self.cookiejar = compat_cookiejar.CookieJar() else: - opts_cookiefile = compat_expanduser(opts_cookiefile) + opts_cookiefile = expand_path(opts_cookiefile) self.cookiejar = compat_cookiejar.MozillaCookieJar( opts_cookiefile) if os.access(opts_cookiefile, os.R_OK): diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 0c401baa6..f15606568 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -16,7 +16,6 @@ from .options import ( parseOpts, ) from .compat import ( - compat_expanduser, compat_getpass, compat_shlex_split, workaround_optparse_bug9161, @@ -26,6 +25,7 @@ from .utils import ( decodeOption, DEFAULT_OUTTMPL, DownloadError, + expand_path, match_filter_func, MaxDownloadsReached, preferredencoding, @@ -88,7 +88,7 @@ def _real_main(argv=None): batchfd = sys.stdin else: batchfd = io.open( - compat_expanduser(opts.batchfile), + expand_path(opts.batchfile), 'r', encoding='utf-8', errors='ignore') batch_urls = read_batch_urls(batchfd) if opts.verbose: @@ -196,7 +196,7 @@ def _real_main(argv=None): if opts.playlistend not in (-1, None) and opts.playlistend < opts.playliststart: raise ValueError('Playlist end must be greater than playlist start') if opts.extractaudio: - if opts.audioformat not in ['best', 'aac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']: + if opts.audioformat not in ['best', 'aac', 'flac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']: parser.error('invalid audio format specified') if opts.audioquality: opts.audioquality = opts.audioquality.strip('k').strip('K') @@ -238,18 +238,15 @@ def _real_main(argv=None): any_getting = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json any_printing = opts.print_json - download_archive_fn = compat_expanduser(opts.download_archive) if opts.download_archive is not None else opts.download_archive + download_archive_fn = expand_path(opts.download_archive) if opts.download_archive is not None else opts.download_archive # PostProcessors postprocessors = [] - # Add the metadata pp first, the other pps will copy it if opts.metafromtitle: postprocessors.append({ 'key': 'MetadataFromTitle', 'titleformat': opts.metafromtitle }) - if opts.addmetadata: - postprocessors.append({'key': 'FFmpegMetadata'}) if opts.extractaudio: postprocessors.append({ 'key': 'FFmpegExtractAudio', @@ -262,6 +259,16 @@ def _real_main(argv=None): 'key': 'FFmpegVideoConvertor', 'preferedformat': opts.recodevideo, }) + # FFmpegMetadataPP should be run after FFmpegVideoConvertorPP and + # FFmpegExtractAudioPP as containers before conversion may not support + # metadata (3gp, webm, etc.) + # And this post-processor should be placed before other metadata + # manipulating post-processors (FFmpegEmbedSubtitle) to prevent loss of + # extra metadata. By default ffmpeg preserves metadata applicable for both + # source and target containers. From this point the container won't change, + # so metadata can be added here. + if opts.addmetadata: + postprocessors.append({'key': 'FFmpegMetadata'}) if opts.convertsubtitles: postprocessors.append({ 'key': 'FFmpegSubtitlesConvertor', @@ -442,7 +449,7 @@ def _real_main(argv=None): try: if opts.load_info_filename is not None: - retcode = ydl.download_with_info_file(compat_expanduser(opts.load_info_filename)) + retcode = ydl.download_with_info_file(expand_path(opts.load_info_filename)) else: retcode = ydl.download(all_urls) except MaxDownloadsReached: diff --git a/youtube_dl/cache.py b/youtube_dl/cache.py index 5fe839eb1..7bdade1bd 100644 --- a/youtube_dl/cache.py +++ b/youtube_dl/cache.py @@ -8,8 +8,11 @@ import re import shutil import traceback -from .compat import compat_expanduser, compat_getenv -from .utils import write_json_file +from .compat import compat_getenv +from .utils import ( + expand_path, + write_json_file, +) class Cache(object): @@ -21,7 +24,7 @@ class Cache(object): if res is None: cache_root = compat_getenv('XDG_CACHE_HOME', '~/.cache') res = os.path.join(cache_root, 'youtube-dl') - return compat_expanduser(res) + return expand_path(res) def _get_cache_fn(self, section, key, dtype): assert re.match(r'^[a-zA-Z0-9_.-]+$', section), \ diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index bdd3545a2..e13cf547d 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -6,7 +6,10 @@ import sys import re from .common import FileDownloader -from ..compat import compat_setenv +from ..compat import ( + compat_setenv, + compat_str, +) from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS from ..utils import ( cli_option, @@ -270,6 +273,10 @@ class FFmpegFD(ExternalFD): args += ['-rtmp_live', 'live'] args += ['-i', url, '-c', 'copy'] + + if self.params.get('test', False): + args += ['-fs', compat_str(self._TEST_FILE_SIZE)] + if protocol in ('m3u8', 'm3u8_native'): if self.params.get('hls_use_mpegts', False) or tmpfilename == '-': args += ['-f', 'mpegts'] diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 4989abce1..7534e4da5 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -30,6 +30,15 @@ class HlsFD(FragmentFD): FD_NAME = 'hlsnative' + def _delegate_to_ffmpeg(self, filename, info_dict): + self.report_warning( + 'hlsnative has detected features it does not support, ' + 'extraction will be delegated to ffmpeg') + fd = FFmpegFD(self.ydl, self.params) + for ph in self._progress_hooks: + fd.add_progress_hook(ph) + return fd.real_download(filename, info_dict) + @staticmethod def can_download(manifest, info_dict): UNSUPPORTED_FEATURES = ( @@ -53,10 +62,12 @@ class HlsFD(FragmentFD): ) check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES] check_results.append(can_decrypt_frag or '#EXT-X-KEY:METHOD=AES-128' not in manifest) - check_results.append(not info_dict.get('is_live')) return all(check_results) def real_download(self, filename, info_dict): + if info_dict.get('is_live'): + return self._delegate_to_ffmpeg(filename, info_dict) + man_url = info_dict['url'] self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) @@ -68,13 +79,7 @@ class HlsFD(FragmentFD): if info_dict.get('extra_param_to_segment_url'): self.report_error('pycrypto not found. Please install it.') return False - self.report_warning( - 'hlsnative has detected features it does not support, ' - 'extraction will be delegated to ffmpeg') - fd = FFmpegFD(self.ydl, self.params) - for ph in self._progress_hooks: - fd.add_progress_hook(ph) - return fd.real_download(filename, info_dict) + return self._delegate_to_ffmpeg(filename, info_dict) total_frags = 0 for line in s.splitlines(): diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index 55a9322a7..9f8a71262 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -25,7 +25,8 @@ class AddAnimeIE(InfoExtractor): 'ext': 'mp4', 'description': 'One Piece 606', 'title': 'One Piece 606', - } + }, + 'skip': 'Video is gone', }, { 'url': 'http://add-anime.net/video/MDUGWYKNGBD8/One-Piece-687', 'only_matching': True, diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py index 4d655bd5e..1b2d364ca 100644 --- a/youtube_dl/extractor/adobepass.py +++ b/youtube_dl/extractor/adobepass.py @@ -36,6 +36,11 @@ MSO_INFO = { 'username_field': 'Ecom_User_ID', 'password_field': 'Ecom_Password', }, + 'Charter_Direct': { + 'name': 'Charter Spectrum', + 'username_field': 'IDToken1', + 'password_field': 'IDToken2', + }, 'thr030': { 'name': '3 Rivers Communications' }, @@ -1453,6 +1458,8 @@ class AdobePassIE(InfoExtractor): self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {}) count += 1 continue + if '<error' in authorize: + raise ExtractorError(xml_text(authorize, 'details'), expected=True) authz_token = unescapeHTML(xml_text(authorize, 'authzToken')) requestor_info[guid] = authz_token self._downloader.cache.store(self._MVPD_CACHE, requestor_id, requestor_info) diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index e0a0f7c57..b774d6db8 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -4,15 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlparse, - compat_urlparse, -) +from ..compat import compat_xpath from ..utils import ( ExtractorError, int_or_none, - update_url_query, - xpath_element, xpath_text, ) @@ -43,7 +38,8 @@ class AfreecaTVIE(InfoExtractor): 'uploader': 'dailyapril', 'uploader_id': 'dailyapril', 'upload_date': '20160503', - } + }, + 'skip': 'Video is gone', }, { 'url': 'http://afbbs.afreecatv.com:8080/app/read_ucc_bbs.cgi?nStationNo=16711924&nTitleNo=36153164&szBjId=dailyapril&nBbsNo=18605867', 'info_dict': { @@ -71,6 +67,19 @@ class AfreecaTVIE(InfoExtractor): 'upload_date': '20160502', }, }], + 'skip': 'Video is gone', + }, { + 'url': 'http://vod.afreecatv.com/PLAYER/STATION/18650793', + 'info_dict': { + 'id': '18650793', + 'ext': 'flv', + 'uploader': '윈아디', + 'uploader_id': 'badkids', + 'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!', + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, }, { 'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652', 'only_matching': True, @@ -90,40 +99,33 @@ class AfreecaTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - parsed_url = compat_urllib_parse_urlparse(url) - info_url = compat_urlparse.urlunparse(parsed_url._replace( - netloc='afbbs.afreecatv.com:8080', - path='/api/video/get_video_info.php')) video_xml = self._download_xml( - update_url_query(info_url, {'nTitleNo': video_id}), video_id) + 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php', + video_id, query={'nTitleNo': video_id}) - if xpath_element(video_xml, './track/video/file') is None: + video_element = video_xml.findall(compat_xpath('./track/video'))[1] + if video_element is None or video_element.text is None: raise ExtractorError('Specified AfreecaTV video does not exist', expected=True) - title = xpath_text(video_xml, './track/title', 'title') + video_url_raw = video_element.text + + app, playpath = video_url_raw.split('mp4:') + + title = xpath_text(video_xml, './track/title', 'title', fatal=True) uploader = xpath_text(video_xml, './track/nickname', 'uploader') uploader_id = xpath_text(video_xml, './track/bj_id', 'uploader id') duration = int_or_none(xpath_text(video_xml, './track/duration', 'duration')) thumbnail = xpath_text(video_xml, './track/titleImage', 'thumbnail') - entries = [] - for i, video_file in enumerate(video_xml.findall('./track/video/file')): - video_key = self.parse_video_key(video_file.get('key', '')) - if not video_key: - continue - entries.append({ - 'id': '%s_%s' % (video_id, video_key.get('part', i + 1)), - 'title': title, - 'upload_date': video_key.get('upload_date'), - 'duration': int_or_none(video_file.get('duration')), - 'url': video_file.text, - }) - - info = { + return { 'id': video_id, + 'url': app, + 'ext': 'flv', + 'play_path': 'mp4:' + playpath, + 'rtmp_live': True, # downloading won't end without this 'title': title, 'uploader': uploader, 'uploader_id': uploader_id, @@ -131,20 +133,6 @@ class AfreecaTVIE(InfoExtractor): 'thumbnail': thumbnail, } - if len(entries) > 1: - info['_type'] = 'multi_video' - info['entries'] = entries - elif len(entries) == 1: - info['url'] = entries[0]['url'] - info['upload_date'] = entries[0].get('upload_date') - else: - raise ExtractorError( - 'No files found for the specified AfreecaTV video, either' - ' the URL is incorrect or the video has been made private.', - expected=True) - - return info - class AfreecaTVGlobalIE(AfreecaTVIE): IE_NAME = 'afreecatv:global' diff --git a/youtube_dl/extractor/arkena.py b/youtube_dl/extractor/arkena.py index 50ffb442d..4495ddbb0 100644 --- a/youtube_dl/extractor/arkena.py +++ b/youtube_dl/extractor/arkena.py @@ -93,8 +93,7 @@ class ArkenaIE(InfoExtractor): exts = (mimetype2ext(f.get('Type')), determine_ext(f_url, None)) if kind == 'm3u8' or 'm3u8' in exts: formats.extend(self._extract_m3u8_formats( - f_url, video_id, 'mp4', - entry_protocol='m3u8' if is_live else 'm3u8_native', + f_url, video_id, 'mp4', 'm3u8_native', m3u8_id=kind, fatal=False, live=is_live)) elif kind == 'flash' or 'f4m' in exts: formats.extend(self._extract_f4m_formats( diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index e3c669830..99af6dc5a 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -90,7 +90,8 @@ class AtresPlayerIE(InfoExtractor): request, None, 'Logging in as %s' % username) error = self._html_search_regex( - r'(?s)<ul class="list_error">(.+?)</ul>', response, 'error', default=None) + r'(?s)<ul[^>]+class="[^"]*\blist_error\b[^"]*">(.+?)</ul>', + response, 'error', default=None) if error: raise ExtractorError( 'Unable to login: %s' % error, expected=True) @@ -155,13 +156,17 @@ class AtresPlayerIE(InfoExtractor): if format_id == 'token' or not video_url.startswith('http'): continue if 'geodeswowsmpra3player' in video_url: - f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0] - f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path) + # f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0] + # f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path) # this videos are protected by DRM, the f4m downloader doesn't support them continue - else: - f4m_url = video_url[:-9] + '/manifest.f4m' - formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) + video_url_hd = video_url.replace('free_es', 'es') + formats.extend(self._extract_f4m_formats( + video_url_hd[:-9] + '/manifest.f4m', video_id, f4m_id='hds', + fatal=False)) + formats.extend(self._extract_mpd_formats( + video_url_hd[:-9] + '/manifest.mpd', video_id, mpd_id='dash', + fatal=False)) self._sort_formats(formats) path_data = player.get('pathData') diff --git a/youtube_dl/extractor/atvat.py b/youtube_dl/extractor/atvat.py new file mode 100644 index 000000000..1584d53fc --- /dev/null +++ b/youtube_dl/extractor/atvat.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + unescapeHTML, +) + + +class ATVAtIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?atv\.at/(?:[^/]+/){2}(?P<id>[dv]\d+)' + _TESTS = [{ + 'url': 'http://atv.at/aktuell/di-210317-2005-uhr/v1698449/', + 'md5': 'c3b6b975fb3150fc628572939df205f2', + 'info_dict': { + 'id': '1698447', + 'ext': 'mp4', + 'title': 'DI, 21.03.17 | 20:05 Uhr 1/1', + } + }, { + 'url': 'http://atv.at/aktuell/meinrad-knapp/d8416/', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_data = self._parse_json(unescapeHTML(self._search_regex( + r'class="[^"]*jsb_video/FlashPlayer[^"]*"[^>]+data-jsb="([^"]+)"', + webpage, 'player data')), display_id)['config']['initial_video'] + + video_id = video_data['id'] + video_title = video_data['title'] + + parts = [] + for part in video_data.get('parts', []): + part_id = part['id'] + part_title = part['title'] + + formats = [] + for source in part.get('sources', []): + source_url = source.get('src') + if not source_url: + continue + ext = determine_ext(source_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, part_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'format_id': source.get('delivery'), + 'url': source_url, + }) + self._sort_formats(formats) + + parts.append({ + 'id': part_id, + 'title': part_title, + 'thumbnail': part.get('preview_image_url'), + 'duration': int_or_none(part.get('duration')), + 'is_live': part.get('is_livestream'), + 'formats': formats, + }) + + return { + '_type': 'multi_video', + 'id': video_id, + 'title': video_title, + 'entries': parts, + } diff --git a/youtube_dl/extractor/bellmedia.py b/youtube_dl/extractor/bellmedia.py index 1f5b6ed92..8820a3914 100644 --- a/youtube_dl/extractor/bellmedia.py +++ b/youtube_dl/extractor/bellmedia.py @@ -21,10 +21,11 @@ class BellMediaIE(InfoExtractor): animalplanet| bravo| mtv| - space + space| + etalk )\.ca| much\.com - )/.*?(?:\bvid=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})''' + )/.*?(?:\bvid(?:eoid)?=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})''' _TESTS = [{ 'url': 'http://www.ctv.ca/video/player?vid=706966', 'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0', @@ -58,6 +59,9 @@ class BellMediaIE(InfoExtractor): }, { 'url': 'http://www.ctv.ca/DCs-Legends-of-Tomorrow/Video/S2E11-Turncoat-vid1051430', 'only_matching': True, + }, { + 'url': 'http://www.etalk.ca/video?videoid=663455', + 'only_matching': True, }] _DOMAINS = { 'thecomedynetwork': 'comedy', @@ -65,6 +69,7 @@ class BellMediaIE(InfoExtractor): 'sciencechannel': 'discsci', 'investigationdiscovery': 'invdisc', 'animalplanet': 'aniplan', + 'etalk': 'ctv', } def _real_extract(self, url): diff --git a/youtube_dl/extractor/bostonglobe.py b/youtube_dl/extractor/bostonglobe.py new file mode 100644 index 000000000..57882fbee --- /dev/null +++ b/youtube_dl/extractor/bostonglobe.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +from ..utils import ( + extract_attributes, +) + + +class BostonGlobeIE(InfoExtractor): + _VALID_URL = r'(?i)https?://(?:www\.)?bostonglobe\.com/.*/(?P<id>[^/]+)/\w+(?:\.html)?' + _TESTS = [ + { + 'url': 'http://www.bostonglobe.com/metro/2017/02/11/tree-finally-succumbs-disease-leaving-hole-neighborhood/h1b4lviqzMTIn9sVy8F3gP/story.html', + 'md5': '0a62181079c85c2d2b618c9a738aedaf', + 'info_dict': { + 'title': 'A tree finally succumbs to disease, leaving a hole in a neighborhood', + 'id': '5320421710001', + 'ext': 'mp4', + 'description': 'It arrived as a sapling when the Back Bay was in its infancy, a spindly American elm tamped down into a square of dirt cut into the brick sidewalk of 1880s Marlborough Street, no higher than the first bay window of the new brownstone behind it.', + 'timestamp': 1486877593, + 'upload_date': '20170212', + 'uploader_id': '245991542', + }, + }, + { + # Embedded youtube video; we hand it off to the Generic extractor. + 'url': 'https://www.bostonglobe.com/lifestyle/names/2017/02/17/does-ben-affleck-play-matt-damon-favorite-version-batman/ruqkc9VxKBYmh5txn1XhSI/story.html', + 'md5': '582b40327089d5c0c949b3c54b13c24b', + 'info_dict': { + 'title': "Who Is Matt Damon's Favorite Batman?", + 'id': 'ZW1QCnlA6Qc', + 'ext': 'mp4', + 'upload_date': '20170217', + 'description': 'md5:3b3dccb9375867e0b4d527ed87d307cb', + 'uploader': 'The Late Late Show with James Corden', + 'uploader_id': 'TheLateLateShow', + }, + 'expected_warnings': ['404'], + }, + ] + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + + page_title = self._og_search_title(webpage, default=None) + + # <video data-brightcove-video-id="5320421710001" data-account="245991542" data-player="SJWAiyYWg" data-embed="default" class="video-js" controls itemscope itemtype="http://schema.org/VideoObject"> + entries = [] + for video in re.findall(r'(?i)(<video[^>]+>)', webpage): + attrs = extract_attributes(video) + + video_id = attrs.get('data-brightcove-video-id') + account_id = attrs.get('data-account') + player_id = attrs.get('data-player') + embed = attrs.get('data-embed') + + if video_id and account_id and player_id and embed: + entries.append( + 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' + % (account_id, player_id, embed, video_id)) + + if len(entries) == 0: + return self.url_result(url, 'Generic') + elif len(entries) == 1: + return self.url_result(entries[0], 'BrightcoveNew') + else: + return self.playlist_from_matches(entries, page_id, page_title, ie='BrightcoveNew') diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 27685eed0..46ef8e605 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -193,7 +193,13 @@ class BrightcoveLegacyIE(InfoExtractor): if videoPlayer is not None: if isinstance(videoPlayer, list): videoPlayer = videoPlayer[0] - if not (videoPlayer.isdigit() or videoPlayer.startswith('ref:')): + videoPlayer = videoPlayer.strip() + # UUID is also possible for videoPlayer (e.g. + # http://www.popcornflix.com/hoodies-vs-hooligans/7f2d2b87-bbf2-4623-acfb-ea942b4f01dd + # or http://www8.hp.com/cn/zh/home.html) + if not (re.match( + r'^(?:\d+|[\da-fA-F]{8}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{12})$', + videoPlayer) or videoPlayer.startswith('ref:')): return None params['@videoPlayer'] = videoPlayer linkBase = find_param('linkBaseURL') @@ -515,6 +521,9 @@ class BrightcoveNewIE(InfoExtractor): return entries def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage( @@ -544,8 +553,10 @@ class BrightcoveNewIE(InfoExtractor): except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: json_data = self._parse_json(e.cause.read().decode(), video_id)[0] - raise ExtractorError( - json_data.get('message') or json_data['error_code'], expected=True) + message = json_data.get('message') or json_data['error_code'] + if json_data.get('error_subcode') == 'CLIENT_GEO': + self.raise_geo_restricted(msg=message) + raise ExtractorError(message, expected=True) raise title = json_data['name'].strip() diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index b1dfacf80..dd2529a6d 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -160,8 +160,7 @@ class CeskaTelevizeIE(InfoExtractor): for format_id, stream_url in item.get('streamUrls', {}).items(): if 'playerType=flash' in stream_url: stream_formats = self._extract_m3u8_formats( - stream_url, playlist_id, 'mp4', - entry_protocol='m3u8' if is_live else 'm3u8_native', + stream_url, playlist_id, 'mp4', 'm3u8_native', m3u8_id='hls-%s' % format_id, fatal=False) else: stream_formats = self._extract_mpd_formats( diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 865dbcaba..e92894246 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -4,62 +4,62 @@ import re from .common import InfoExtractor from ..utils import ( + clean_html, ExtractorError, - parse_filesize, + int_or_none, + parse_iso8601, qualities, + unescapeHTML, ) class Channel9IE(InfoExtractor): - ''' - Common extractor for channel9.msdn.com. - - The type of provided URL (video or playlist) is determined according to - meta Search.PageType from web page HTML rather than URL itself, as it is - not always possible to do. - ''' IE_DESC = 'Channel 9' IE_NAME = 'channel9' - _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)' + _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)' _TESTS = [{ 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', - 'md5': 'bbd75296ba47916b754e73c3a4bbdf10', + 'md5': '32083d4eaf1946db6d454313f44510ca', 'info_dict': { - 'id': 'Events/TechEd/Australia/2013/KOS002', - 'ext': 'mp4', + 'id': '6c413323-383a-49dc-88f9-a22800cab024', + 'ext': 'wmv', 'title': 'Developer Kick-Off Session: Stuff We Love', - 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', + 'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731', 'duration': 4576, - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', + 'timestamp': 1377717420, + 'upload_date': '20130828', 'session_code': 'KOS002', - 'session_day': 'Day 1', 'session_room': 'Arena 1A', - 'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', - 'Mads Kristensen'], + 'session_speakers': ['Andrew Coates', 'Brady Gaster', 'Mads Kristensen', 'Ed Blankenship', 'Patrick Klug'], }, }, { 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', - 'md5': 'b43ee4529d111bc37ba7ee4f34813e68', + 'md5': 'dcf983ee6acd2088e7188c3cf79b46bc', 'info_dict': { - 'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing', - 'ext': 'mp4', + 'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024', + 'ext': 'wmv', 'title': 'Self-service BI with Power BI - nuclear testing', - 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', + 'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54', 'duration': 1540, - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', + 'timestamp': 1386381991, + 'upload_date': '20131207', 'authors': ['Mike Wilmot'], }, }, { # low quality mp4 is best 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', 'info_dict': { - 'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', + 'id': '33ad69d2-6a4e-4172-83a1-a523013dec76', 'ext': 'mp4', 'title': 'Ranges for the Standard Library', - 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d', + 'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372', 'duration': 5646, - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', + 'upload_date': '20150930', + 'timestamp': 1443640735, }, 'params': { 'skip_download': True, @@ -70,7 +70,7 @@ class Channel9IE(InfoExtractor): 'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b', 'title': 'Channel 9', }, - 'playlist_count': 2, + 'playlist_mincount': 100, }, { 'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS', 'only_matching': True, @@ -81,189 +81,6 @@ class Channel9IE(InfoExtractor): _RSS_URL = 'http://channel9.msdn.com/%s/RSS' - def _formats_from_html(self, html): - FORMAT_REGEX = r''' - (?x) - <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s* - <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s* - (?:<div\s+class="popup\s+rounded">\s* - <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s* - </div>)? # File size part may be missing - ''' - quality = qualities(( - 'MP3', 'MP4', - 'Low Quality WMV', 'Low Quality MP4', - 'Mid Quality WMV', 'Mid Quality MP4', - 'High Quality WMV', 'High Quality MP4')) - formats = [{ - 'url': x.group('url'), - 'format_id': x.group('quality'), - 'format_note': x.group('note'), - 'format': '%s (%s)' % (x.group('quality'), x.group('note')), - 'filesize_approx': parse_filesize(x.group('filesize')), - 'quality': quality(x.group('quality')), - 'vcodec': 'none' if x.group('note') == 'Audio only' else None, - } for x in list(re.finditer(FORMAT_REGEX, html))] - - self._sort_formats(formats) - - return formats - - def _extract_title(self, html): - title = self._html_search_meta('title', html, 'title') - if title is None: - title = self._og_search_title(html) - TITLE_SUFFIX = ' (Channel 9)' - if title is not None and title.endswith(TITLE_SUFFIX): - title = title[:-len(TITLE_SUFFIX)] - return title - - def _extract_description(self, html): - DESCRIPTION_REGEX = r'''(?sx) - <div\s+class="entry-content">\s* - <div\s+id="entry-body">\s* - (?P<description>.+?)\s* - </div>\s* - </div> - ''' - m = re.search(DESCRIPTION_REGEX, html) - if m is not None: - return m.group('description') - return self._html_search_meta('description', html, 'description') - - def _extract_duration(self, html): - m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html) - return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None - - def _extract_slides(self, html): - m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html) - return m.group('slidesurl') if m is not None else None - - def _extract_zip(self, html): - m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html) - return m.group('zipurl') if m is not None else None - - def _extract_avg_rating(self, html): - m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html) - return float(m.group('avgrating')) if m is not None else 0 - - def _extract_rating_count(self, html): - m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html) - return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0 - - def _extract_view_count(self, html): - m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html) - return int(self._fix_count(m.group('viewcount'))) if m is not None else 0 - - def _extract_comment_count(self, html): - m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html) - return int(self._fix_count(m.group('commentcount'))) if m is not None else 0 - - def _fix_count(self, count): - return int(str(count).replace(',', '')) if count is not None else None - - def _extract_authors(self, html): - m = re.search(r'(?s)<li class="author">(.*?)</li>', html) - if m is None: - return None - return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1)) - - def _extract_session_code(self, html): - m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html) - return m.group('code') if m is not None else None - - def _extract_session_day(self, html): - m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html) - return m.group('day').strip() if m is not None else None - - def _extract_session_room(self, html): - m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html) - return m.group('room') if m is not None else None - - def _extract_session_speakers(self, html): - return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html) - - def _extract_content(self, html, content_path): - # Look for downloadable content - formats = self._formats_from_html(html) - slides = self._extract_slides(html) - zip_ = self._extract_zip(html) - - # Nothing to download - if len(formats) == 0 and slides is None and zip_ is None: - self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path) - return - - # Extract meta - title = self._extract_title(html) - description = self._extract_description(html) - thumbnail = self._og_search_thumbnail(html) - duration = self._extract_duration(html) - avg_rating = self._extract_avg_rating(html) - rating_count = self._extract_rating_count(html) - view_count = self._extract_view_count(html) - comment_count = self._extract_comment_count(html) - - common = { - '_type': 'video', - 'id': content_path, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'avg_rating': avg_rating, - 'rating_count': rating_count, - 'view_count': view_count, - 'comment_count': comment_count, - } - - result = [] - - if slides is not None: - d = common.copy() - d.update({'title': title + '-Slides', 'url': slides}) - result.append(d) - - if zip_ is not None: - d = common.copy() - d.update({'title': title + '-Zip', 'url': zip_}) - result.append(d) - - if len(formats) > 0: - d = common.copy() - d.update({'title': title, 'formats': formats}) - result.append(d) - - return result - - def _extract_entry_item(self, html, content_path): - contents = self._extract_content(html, content_path) - if contents is None: - return contents - - if len(contents) > 1: - raise ExtractorError('Got more than one entry') - result = contents[0] - result['authors'] = self._extract_authors(html) - - return result - - def _extract_session(self, html, content_path): - contents = self._extract_content(html, content_path) - if contents is None: - return contents - - session_meta = { - 'session_code': self._extract_session_code(html), - 'session_day': self._extract_session_day(html), - 'session_room': self._extract_session_room(html), - 'session_speakers': self._extract_session_speakers(html), - } - - for content in contents: - content.update(session_meta) - - return self.playlist_result(contents) - def _extract_list(self, video_id, rss_url=None): if not rss_url: rss_url = self._RSS_URL % video_id @@ -274,9 +91,7 @@ class Channel9IE(InfoExtractor): return self.playlist_result(entries, video_id, title_text) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - content_path = mobj.group('contentpath') - rss = mobj.group('rss') + content_path, rss = re.match(self._VALID_URL, url).groups() if rss: return self._extract_list(content_path, url) @@ -284,17 +99,158 @@ class Channel9IE(InfoExtractor): webpage = self._download_webpage( url, content_path, 'Downloading web page') - page_type = self._search_regex( - r'<meta[^>]+name=(["\'])WT\.entryid\1[^>]+content=(["\'])(?P<pagetype>[^:]+).+?\2', - webpage, 'page type', default=None, group='pagetype') - if page_type: - if page_type == 'Entry': # Any 'item'-like page, may contain downloadable content - return self._extract_entry_item(webpage, content_path) - elif page_type == 'Session': # Event session page, may contain downloadable content - return self._extract_session(webpage, content_path) - elif page_type == 'Event': - return self._extract_list(content_path) + episode_data = self._search_regex( + r"data-episode='([^']+)'", webpage, 'episode data', default=None) + if episode_data: + episode_data = self._parse_json(unescapeHTML( + episode_data), content_path) + content_id = episode_data['contentId'] + is_session = '/Sessions(' in episode_data['api'] + content_url = 'https://channel9.msdn.com/odata' + episode_data['api'] + if is_session: + content_url += '?$expand=Speakers' + else: + content_url += '?$expand=Authors' + content_data = self._download_json(content_url, content_id) + title = content_data['Title'] + + QUALITIES = ( + 'mp3', + 'wmv', 'mp4', + 'wmv-low', 'mp4-low', + 'wmv-mid', 'mp4-mid', + 'wmv-high', 'mp4-high', + ) + + quality_key = qualities(QUALITIES) + + def quality(quality_id, format_url): + return (len(QUALITIES) if '_Source.' in format_url + else quality_key(quality_id)) + + formats = [] + urls = set() + + SITE_QUALITIES = { + 'MP3': 'mp3', + 'MP4': 'mp4', + 'Low Quality WMV': 'wmv-low', + 'Low Quality MP4': 'mp4-low', + 'Mid Quality WMV': 'wmv-mid', + 'Mid Quality MP4': 'mp4-mid', + 'High Quality WMV': 'wmv-high', + 'High Quality MP4': 'mp4-high', + } + + formats_select = self._search_regex( + r'(?s)<select[^>]+name=["\']format[^>]+>(.+?)</select', webpage, + 'formats select', default=None) + if formats_select: + for mobj in re.finditer( + r'<option\b[^>]+\bvalue=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*(?P<format>[^<]+?)\s*<', + formats_select): + format_url = mobj.group('url') + if format_url in urls: + continue + urls.add(format_url) + format_id = mobj.group('format') + quality_id = SITE_QUALITIES.get(format_id, format_id) + formats.append({ + 'url': format_url, + 'format_id': quality_id, + 'quality': quality(quality_id, format_url), + 'vcodec': 'none' if quality_id == 'mp3' else None, + }) + + API_QUALITIES = { + 'VideoMP4Low': 'mp4-low', + 'VideoWMV': 'wmv-mid', + 'VideoMP4Medium': 'mp4-mid', + 'VideoMP4High': 'mp4-high', + 'VideoWMVHQ': 'wmv-hq', + } + + for format_id, q in API_QUALITIES.items(): + q_url = content_data.get(format_id) + if not q_url or q_url in urls: + continue + urls.add(q_url) + formats.append({ + 'url': q_url, + 'format_id': q, + 'quality': quality(q, q_url), + }) + + self._sort_formats(formats) + + slides = content_data.get('Slides') + zip_file = content_data.get('ZipFile') + + if not formats and not slides and not zip_file: + raise ExtractorError( + 'None of recording, slides or zip are available for %s' % content_path) + + subtitles = {} + for caption in content_data.get('Captions', []): + caption_url = caption.get('Url') + if not caption_url: + continue + subtitles.setdefault(caption.get('Language', 'en'), []).append({ + 'url': caption_url, + 'ext': 'vtt', + }) + + common = { + 'id': content_id, + 'title': title, + 'description': clean_html(content_data.get('Description') or content_data.get('Body')), + 'thumbnail': content_data.get('Thumbnail') or content_data.get('VideoPlayerPreviewImage'), + 'duration': int_or_none(content_data.get('MediaLengthInSeconds')), + 'timestamp': parse_iso8601(content_data.get('PublishedDate')), + 'avg_rating': int_or_none(content_data.get('Rating')), + 'rating_count': int_or_none(content_data.get('RatingCount')), + 'view_count': int_or_none(content_data.get('Views')), + 'comment_count': int_or_none(content_data.get('CommentCount')), + 'subtitles': subtitles, + } + if is_session: + speakers = [] + for s in content_data.get('Speakers', []): + speaker_name = s.get('FullName') + if not speaker_name: + continue + speakers.append(speaker_name) + + common.update({ + 'session_code': content_data.get('Code'), + 'session_room': content_data.get('Room'), + 'session_speakers': speakers, + }) else: - raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True) - else: # Assuming list + authors = [] + for a in content_data.get('Authors', []): + author_name = a.get('DisplayName') + if not author_name: + continue + authors.append(author_name) + common['authors'] = authors + + contents = [] + + if slides: + d = common.copy() + d.update({'title': title + '-Slides', 'url': slides}) + contents.append(d) + + if zip_file: + d = common.copy() + d.update({'title': title + '-Zip', 'url': zip_file}) + contents.append(d) + + if formats: + d = common.copy() + d.update({'title': title, 'formats': formats}) + contents.append(d) + return self.playlist_result(contents) + else: return self._extract_list(content_path) diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index ae5ba0015..9bc8dbea4 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -1,97 +1,56 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_HTTPError, -) from ..utils import ( - ExtractorError, - HEADRequest, - remove_end, + str_to_int, + unified_strdate, ) class CloudyIE(InfoExtractor): _IE_DESC = 'cloudy.ec' - _VALID_URL = r'''(?x) - https?://(?:www\.)?cloudy\.ec/ - (?:v/|embed\.php\?id=) - (?P<id>[A-Za-z0-9]+) - ''' - _EMBED_URL = 'http://www.cloudy.ec/embed.php?id=%s' - _API_URL = 'http://www.cloudy.ec/api/player.api.php' - _MAX_TRIES = 2 - _TEST = { + _VALID_URL = r'https?://(?:www\.)?cloudy\.ec/(?:v/|embed\.php\?.*?\bid=)(?P<id>[A-Za-z0-9]+)' + _TESTS = [{ 'url': 'https://www.cloudy.ec/v/af511e2527aac', - 'md5': '5cb253ace826a42f35b4740539bedf07', + 'md5': '29832b05028ead1b58be86bf319397ca', 'info_dict': { 'id': 'af511e2527aac', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Funny Cats and Animals Compilation june 2013', + 'upload_date': '20130913', + 'view_count': int, } - } - - def _extract_video(self, video_id, file_key, error_url=None, try_num=0): - - if try_num > self._MAX_TRIES - 1: - raise ExtractorError('Unable to extract video URL', expected=True) - - form = { - 'file': video_id, - 'key': file_key, - } - - if error_url: - form.update({ - 'numOfErrors': try_num, - 'errorCode': '404', - 'errorUrl': error_url, - }) + }, { + 'url': 'http://www.cloudy.ec/embed.php?autoplay=1&id=af511e2527aac', + 'only_matching': True, + }] - player_data = self._download_webpage( - self._API_URL, video_id, 'Downloading player data', query=form) - data = compat_parse_qs(player_data) - - try_num += 1 - - if 'error' in data: - raise ExtractorError( - '%s error: %s' % (self.IE_NAME, ' '.join(data['error_msg'])), - expected=True) + def _real_extract(self, url): + video_id = self._match_id(url) - title = data.get('title', [None])[0] - if title: - title = remove_end(title, '&asdasdas').strip() + webpage = self._download_webpage( + 'http://www.cloudy.ec/embed.php?id=%s' % video_id, video_id) - video_url = data.get('url', [None])[0] + info = self._parse_html5_media_entries(url, webpage, video_id)[0] - if video_url: - try: - self._request_webpage(HEADRequest(video_url), video_id, 'Checking video URL') - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in [404, 410]: - self.report_warning('Invalid video URL, requesting another', video_id) - return self._extract_video(video_id, file_key, video_url, try_num) + webpage = self._download_webpage( + 'https://www.cloudy.ec/v/%s' % video_id, video_id, fatal=False) - return { - 'id': video_id, - 'url': video_url, - 'title': title, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + if webpage: + info.update({ + 'title': self._search_regex( + r'<h\d[^>]*>([^<]+)<', webpage, 'title'), + 'upload_date': unified_strdate(self._search_regex( + r'>Published at (\d{4}-\d{1,2}-\d{1,2})', webpage, + 'upload date', fatal=False)), + 'view_count': str_to_int(self._search_regex( + r'([\d,.]+) views<', webpage, 'view count', fatal=False)), + }) - url = self._EMBED_URL % video_id - webpage = self._download_webpage(url, video_id) + if not info.get('title'): + info['title'] = video_id - file_key = self._search_regex( - [r'key\s*:\s*"([^"]+)"', r'filekey\s*=\s*"([^"]+)"'], - webpage, 'file_key') + info['id'] = video_id - return self._extract_video(video_id, file_key) + return info diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index c2ca73ee1..6c3c095f7 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -36,34 +36,35 @@ from ..utils import ( clean_html, compiled_regex_type, determine_ext, + determine_protocol, error_to_compat_str, ExtractorError, + extract_attributes, fix_xml_ampersands, float_or_none, GeoRestrictedError, GeoUtils, int_or_none, js_to_json, + mimetype2ext, + orderedSet, + parse_codecs, + parse_duration, parse_iso8601, + parse_m3u8_attributes, RegexNotFoundError, - sanitize_filename, sanitized_Request, + sanitize_filename, unescapeHTML, unified_strdate, unified_timestamp, + update_Request, + update_url_query, + urljoin, url_basename, xpath_element, xpath_text, xpath_with_ns, - determine_protocol, - parse_duration, - mimetype2ext, - update_Request, - update_url_query, - parse_m3u8_attributes, - extract_attributes, - parse_codecs, - urljoin, ) @@ -714,6 +715,13 @@ class InfoExtractor(object): video_info['title'] = video_title return video_info + def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None): + urlrs = orderedSet( + self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) + for m in matches) + return self.playlist_result( + urlrs, playlist_id=video_id, playlist_title=video_title) + @staticmethod def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): """Returns a playlist""" @@ -2204,56 +2212,9 @@ class InfoExtractor(object): this_video_id = video_id or video_data['mediaid'] - formats = [] - for source in video_data['sources']: - source_url = self._proto_relative_url(source['file']) - if base_url: - source_url = compat_urlparse.urljoin(base_url, source_url) - source_type = source.get('type') or '' - ext = mimetype2ext(source_type) or determine_ext(source_url) - if source_type == 'hls' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - source_url, this_video_id, mpd_id=mpd_id, fatal=False)) - # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 - elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'): - formats.append({ - 'url': source_url, - 'vcodec': 'none', - 'ext': ext, - }) - else: - height = int_or_none(source.get('height')) - if height is None: - # Often no height is provided but there is a label in - # format like 1080p. - height = int_or_none(self._search_regex( - r'^(\d{3,})[pP]$', source.get('label') or '', - 'height', default=None)) - a_format = { - 'url': source_url, - 'width': int_or_none(source.get('width')), - 'height': height, - 'ext': ext, - } - if source_url.startswith('rtmp'): - a_format['ext'] = 'flv' - - # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as - # of jwplayer.flash.swf - rtmp_url_parts = re.split( - r'((?:mp4|mp3|flv):)', source_url, 1) - if len(rtmp_url_parts) == 3: - rtmp_url, prefix, play_path = rtmp_url_parts - a_format.update({ - 'url': rtmp_url, - 'play_path': prefix + play_path, - }) - if rtmp_params: - a_format.update(rtmp_params) - formats.append(a_format) + formats = self._parse_jwplayer_formats( + video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id, + mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url) self._sort_formats(formats) subtitles = {} @@ -2284,6 +2245,65 @@ class InfoExtractor(object): else: return self.playlist_result(entries) + def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, + m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): + formats = [] + for source in jwplayer_sources_data: + source_url = self._proto_relative_url(source['file']) + if base_url: + source_url = compat_urlparse.urljoin(base_url, source_url) + source_type = source.get('type') or '' + ext = mimetype2ext(source_type) or determine_ext(source_url) + if source_type == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=m3u8_id, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + source_url, video_id, mpd_id=mpd_id, fatal=False)) + elif ext == 'smil': + formats.extend(self._extract_smil_formats( + source_url, video_id, fatal=False)) + # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 + elif source_type.startswith('audio') or ext in ( + 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'): + formats.append({ + 'url': source_url, + 'vcodec': 'none', + 'ext': ext, + }) + else: + height = int_or_none(source.get('height')) + if height is None: + # Often no height is provided but there is a label in + # format like "1080p", "720p SD", or 1080. + height = int_or_none(self._search_regex( + r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''), + 'height', default=None)) + a_format = { + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'height': height, + 'tbr': int_or_none(source.get('bitrate')), + 'ext': ext, + } + if source_url.startswith('rtmp'): + a_format['ext'] = 'flv' + # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as + # of jwplayer.flash.swf + rtmp_url_parts = re.split( + r'((?:mp4|mp3|flv):)', source_url, 1) + if len(rtmp_url_parts) == 3: + rtmp_url, prefix, play_path = rtmp_url_parts + a_format.update({ + 'url': rtmp_url, + 'play_path': prefix + play_path, + }) + if rtmp_params: + a_format.update(rtmp_params) + formats.append(a_format) + return formats + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 8d8f60598..d3463b874 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -9,13 +9,14 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( - orderedSet, - remove_end, - extract_attributes, - mimetype2ext, determine_ext, + extract_attributes, int_or_none, + js_to_json, + mimetype2ext, + orderedSet, parse_iso8601, + remove_end, ) @@ -67,6 +68,16 @@ class CondeNastIE(InfoExtractor): 'timestamp': 1363219200, } }, { + 'url': 'http://video.gq.com/watch/the-closer-with-keith-olbermann-the-only-true-surprise-trump-s-an-idiot?c=series', + 'info_dict': { + 'id': '58d1865bfd2e6126e2000015', + 'ext': 'mp4', + 'title': 'The Only True Surprise? Trump’s an Idiot', + 'uploader': 'gq', + 'upload_date': '20170321', + 'timestamp': 1490126427, + }, + }, { # JS embed 'url': 'http://player.cnevids.com/embedjs/55f9cf8b61646d1acf00000c/5511d76261646d5566020000.js', 'md5': 'f1a6f9cafb7083bab74a710f65d08999', @@ -114,26 +125,33 @@ class CondeNastIE(InfoExtractor): }) video_id = query['videoId'] video_info = None - info_page = self._download_webpage( + info_page = self._download_json( 'http://player.cnevids.com/player/video.js', - video_id, 'Downloading video info', query=query, fatal=False) + video_id, 'Downloading video info', fatal=False, query=query) if info_page: - video_info = self._parse_json(self._search_regex( - r'loadCallback\(({.+})\)', info_page, 'video info'), video_id)['video'] - else: + video_info = info_page.get('video') + if not video_info: info_page = self._download_webpage( 'http://player.cnevids.com/player/loader.js', video_id, 'Downloading loader info', query=query) - video_info = self._parse_json(self._search_regex( - r'var\s+video\s*=\s*({.+?});', info_page, 'video info'), video_id) + video_info = self._parse_json( + self._search_regex( + r'(?s)var\s+config\s*=\s*({.+?});', info_page, 'config'), + video_id, transform_source=js_to_json)['video'] + title = video_info['title'] formats = [] - for fdata in video_info.get('sources', [{}])[0]: + for fdata in video_info['sources']: src = fdata.get('src') if not src: continue ext = mimetype2ext(fdata.get('type')) or determine_ext(src) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue quality = fdata.get('quality') formats.append({ 'format_id': ext + ('-%s' % quality if quality else ''), @@ -169,7 +187,6 @@ class CondeNastIE(InfoExtractor): path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/'))) url_type = 'embed' - self.to_screen('Extracting from %s with the Condé Nast extractor' % self._SITES[site]) webpage = self._download_webpage(url, item_id) if url_type == 'series': diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 9c6cf00ca..d15fd3744 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -177,6 +177,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'uploader': 'Kadokawa Pictures Inc.', 'upload_date': '20170118', 'series': "KONOSUBA -God's blessing on this wonderful world!", + 'season': "KONOSUBA -God's blessing on this wonderful world! 2", 'season_number': 2, 'episode': 'Give Me Deliverance from this Judicial Injustice!', 'episode_number': 1, @@ -222,6 +223,23 @@ class CrunchyrollIE(CrunchyrollBaseIE): # just test metadata extraction 'skip_download': True, }, + }, { + # A video with a vastly different season name compared to the series name + 'url': 'http://www.crunchyroll.com/nyarko-san-another-crawling-chaos/episode-1-test-590532', + 'info_dict': { + 'id': '590532', + 'ext': 'mp4', + 'title': 'Haiyoru! Nyaruani (ONA) Episode 1 – Test', + 'description': 'Mahiro and Nyaruko talk about official certification.', + 'uploader': 'TV TOKYO', + 'upload_date': '20120305', + 'series': 'Nyarko-san: Another Crawling Chaos', + 'season': 'Haiyoru! Nyaruani (ONA)', + }, + 'params': { + # Just test metadata extraction + 'skip_download': True, + }, }] _FORMAT_IDS = { @@ -491,7 +509,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text # webpage provide more accurate data than series_title from XML series = self._html_search_regex( r'id=["\']showmedia_about_episode_num[^>]+>\s*<a[^>]+>([^<]+)', - webpage, 'series', default=xpath_text(metadata, 'series_title')) + webpage, 'series', fatal=False) + season = xpath_text(metadata, 'series_title') episode = xpath_text(metadata, 'episode_title') episode_number = int_or_none(xpath_text(metadata, 'episode_number')) @@ -508,6 +527,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'uploader': video_uploader, 'upload_date': video_upload_date, 'series': series, + 'season': season, 'season_number': season_number, 'episode': episode, 'episode_number': episode_number, diff --git a/youtube_dl/extractor/discoverygo.py b/youtube_dl/extractor/discoverygo.py index 2042493a8..7cd5d4291 100644 --- a/youtube_dl/extractor/discoverygo.py +++ b/youtube_dl/extractor/discoverygo.py @@ -1,17 +1,21 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import compat_str from ..utils import ( extract_attributes, + ExtractorError, int_or_none, parse_age_limit, - ExtractorError, + remove_end, + unescapeHTML, ) -class DiscoveryGoIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)?(?: +class DiscoveryGoBaseIE(InfoExtractor): + _VALID_URL_TEMPLATE = r'''(?x)https?://(?:www\.)?(?: discovery| investigationdiscovery| discoverylife| @@ -21,18 +25,23 @@ class DiscoveryGoIE(InfoExtractor): sciencechannel| tlc| velocitychannel - )go\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)''' + )go\.com/%s(?P<id>[^/?#&]+)''' + + +class DiscoveryGoIE(DiscoveryGoBaseIE): + _VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % r'(?:[^/]+/)+' + _GEO_COUNTRIES = ['US'] _TEST = { - 'url': 'https://www.discoverygo.com/love-at-first-kiss/kiss-first-ask-questions-later/', + 'url': 'https://www.discoverygo.com/bering-sea-gold/reaper-madness/', 'info_dict': { - 'id': '57a33c536b66d1cd0345eeb1', + 'id': '58c167d86b66d12f2addeb01', 'ext': 'mp4', - 'title': 'Kiss First, Ask Questions Later!', - 'description': 'md5:fe923ba34050eae468bffae10831cb22', - 'duration': 2579, - 'series': 'Love at First Kiss', - 'season_number': 1, - 'episode_number': 1, + 'title': 'Reaper Madness', + 'description': 'md5:09f2c625c99afb8946ed4fb7865f6e78', + 'duration': 2519, + 'series': 'Bering Sea Gold', + 'season_number': 8, + 'episode_number': 6, 'age_limit': 14, }, } @@ -113,3 +122,46 @@ class DiscoveryGoIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } + + +class DiscoveryGoPlaylistIE(DiscoveryGoBaseIE): + _VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % '' + _TEST = { + 'url': 'https://www.discoverygo.com/bering-sea-gold/', + 'info_dict': { + 'id': 'bering-sea-gold', + 'title': 'Bering Sea Gold', + 'description': 'md5:cc5c6489835949043c0cc3ad66c2fa0e', + }, + 'playlist_mincount': 6, + } + + @classmethod + def suitable(cls, url): + return False if DiscoveryGoIE.suitable(url) else super( + DiscoveryGoPlaylistIE, cls).suitable(url) + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + entries = [] + for mobj in re.finditer(r'data-json=(["\'])(?P<json>{.+?})\1', webpage): + data = self._parse_json( + mobj.group('json'), display_id, + transform_source=unescapeHTML, fatal=False) + if not isinstance(data, dict) or data.get('type') != 'episode': + continue + episode_url = data.get('socialUrl') + if not episode_url: + continue + entries.append(self.url_result( + episode_url, ie=DiscoveryGoIE.ie_key(), + video_id=data.get('id'))) + + return self.playlist_result( + entries, display_id, + remove_end(self._og_search_title( + webpage, fatal=False), ' | Discovery GO'), + self._og_search_description(webpage)) diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/discoverynetworks.py index fd145ba42..b6653784c 100644 --- a/youtube_dl/extractor/tlc.py +++ b/youtube_dl/extractor/discoverynetworks.py @@ -9,13 +9,13 @@ from ..compat import ( compat_parse_qs, compat_urlparse, ) +from ..utils import smuggle_url -class TlcDeIE(InfoExtractor): - IE_NAME = 'tlc.de' - _VALID_URL = r'https?://(?:www\.)?tlc\.de/(?:[^/]+/)*videos/(?P<title>[^/?#]+)?(?:.*#(?P<id>\d+))?' +class DiscoveryNetworksDeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:discovery|tlc|animalplanet|dmax)\.de/(?:.*#(?P<id>\d+)|(?:[^/]+/)*videos/(?P<title>[^/?#]+))' - _TEST = { + _TESTS = [{ 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001', 'info_dict': { 'id': '3235167922001', @@ -29,7 +29,13 @@ class TlcDeIE(InfoExtractor): 'upload_date': '20140404', 'uploader_id': '1659832546', }, - } + }, { + 'url': 'http://www.dmax.de/programme/storage-hunters-uk/videos/storage-hunters-uk-episode-6/', + 'only_matching': True, + }, { + 'url': 'http://www.discovery.de/#5332316765001', + 'only_matching': True, + }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1659832546/default_default/index.html?videoId=%s' def _real_extract(self, url): @@ -39,5 +45,8 @@ class TlcDeIE(InfoExtractor): title = mobj.group('title') webpage = self._download_webpage(url, title) brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - brightcove_id = compat_parse_qs(compat_urlparse.urlparse(brightcove_legacy_url).query)['@videoPlayer'][0] - return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) + brightcove_id = compat_parse_qs(compat_urlparse.urlparse( + brightcove_legacy_url).query)['@videoPlayer'][0] + return self.url_result(smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, {'geo_countries': ['DE']}), + 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index 9a83fb31a..82d8a042f 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -1,6 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals +import time +import hashlib + from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -16,7 +19,7 @@ class DouyuTVIE(InfoExtractor): 'info_dict': { 'id': '17732', 'display_id': 'iseven', - 'ext': 'mp4', + 'ext': 'flv', 'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': r're:.*m7show@163\.com.*', 'thumbnail': r're:^https?://.*\.jpg$', @@ -31,7 +34,7 @@ class DouyuTVIE(InfoExtractor): 'info_dict': { 'id': '85982', 'display_id': '85982', - 'ext': 'mp4', + 'ext': 'flv', 'title': 're:^小漠从零单排记!——CSOL2躲猫猫 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': 'md5:746a2f7a253966a06755a912f0acc0d2', 'thumbnail': r're:^https?://.*\.jpg$', @@ -47,7 +50,7 @@ class DouyuTVIE(InfoExtractor): 'info_dict': { 'id': '17732', 'display_id': '17732', - 'ext': 'mp4', + 'ext': 'flv', 'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': r're:.*m7show@163\.com.*', 'thumbnail': r're:^https?://.*\.jpg$', @@ -66,10 +69,6 @@ class DouyuTVIE(InfoExtractor): 'only_matching': True, }] - # Decompile core.swf in webpage by ffdec "Search SWFs in memory". core.swf - # is encrypted originally, but ffdec can dump memory to get the decrypted one. - _API_KEY = 'A12Svb&%1UUmf@hC' - def _real_extract(self, url): video_id = self._match_id(url) @@ -80,6 +79,7 @@ class DouyuTVIE(InfoExtractor): room_id = self._html_search_regex( r'"room_id\\?"\s*:\s*(\d+),', page, 'room id') + # Grab metadata from mobile API room = self._download_json( 'http://m.douyu.com/html5/live?roomId=%s' % room_id, video_id, note='Downloading room info')['data'] @@ -88,8 +88,19 @@ class DouyuTVIE(InfoExtractor): if room.get('show_status') == '2': raise ExtractorError('Live stream is offline', expected=True) - formats = self._extract_m3u8_formats( - room['hls_url'], video_id, ext='mp4') + # Grab the URL from PC client API + # The m3u8 url from mobile API requires re-authentication every 5 minutes + tt = int(time.time()) + signContent = 'lapi/live/thirdPart/getPlay/%s?aid=pcclient&rate=0&time=%d9TUk5fjjUjg9qIMH3sdnh' % (room_id, tt) + sign = hashlib.md5(signContent.encode('ascii')).hexdigest() + video_url = self._download_json( + 'http://coapi.douyucdn.cn/lapi/live/thirdPart/getPlay/' + room_id, + video_id, note='Downloading video URL info', + query={'rate': 0}, headers={ + 'auth': sign, + 'time': str(tt), + 'aid': 'pcclient' + })['data']['live_url'] title = self._live_title(unescapeHTML(room['room_name'])) description = room.get('show_details') @@ -99,7 +110,7 @@ class DouyuTVIE(InfoExtractor): return { 'id': room_id, 'display_id': video_id, - 'formats': formats, + 'url': video_url, 'title': title, 'description': description, 'thumbnail': thumbnail, diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 32028bc3b..87c5dd63e 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -6,37 +6,24 @@ import re import time from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_urlparse, + compat_HTTPError, +) from ..utils import ( USER_AGENTS, + ExtractorError, int_or_none, + unified_strdate, + remove_end, update_url_query, ) class DPlayIE(InfoExtractor): - _VALID_URL = r'https?://(?P<domain>it\.dplay\.com|www\.dplay\.(?:dk|se|no))/[^/]+/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?P<domain>www\.dplay\.(?:dk|se|no))/[^/]+/(?P<id>[^/?#]+)' _TESTS = [{ - # geo restricted, via direct unsigned hls URL - 'url': 'http://it.dplay.com/take-me-out/stagione-1-episodio-25/', - 'info_dict': { - 'id': '1255600', - 'display_id': 'stagione-1-episodio-25', - 'ext': 'mp4', - 'title': 'Episodio 25', - 'description': 'md5:cae5f40ad988811b197d2d27a53227eb', - 'duration': 2761, - 'timestamp': 1454701800, - 'upload_date': '20160205', - 'creator': 'RTIT', - 'series': 'Take me out', - 'season_number': 1, - 'episode_number': 25, - 'age_limit': 0, - }, - 'expected_warnings': ['Unable to download f4m manifest'], - }, { # non geo restricted, via secure api, unsigned download hls URL 'url': 'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/', 'info_dict': { @@ -168,3 +155,90 @@ class DPlayIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } + + +class DPlayItIE(InfoExtractor): + _VALID_URL = r'https?://it\.dplay\.com/[^/]+/[^/]+/(?P<id>[^/?#]+)' + _GEO_COUNTRIES = ['IT'] + _TEST = { + 'url': 'http://it.dplay.com/nove/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij/', + 'md5': '2b808ffb00fc47b884a172ca5d13053c', + 'info_dict': { + 'id': '6918', + 'display_id': 'luigi-di-maio-la-psicosi-di-stanislawskij', + 'ext': 'mp4', + 'title': 'Biografie imbarazzanti: Luigi Di Maio: la psicosi di Stanislawskij', + 'description': 'md5:3c7a4303aef85868f867a26f5cc14813', + 'thumbnail': r're:^https?://.*\.jpe?g', + 'upload_date': '20160524', + 'series': 'Biografie imbarazzanti', + 'season_number': 1, + 'episode': 'Luigi Di Maio: la psicosi di Stanislawskij', + 'episode_number': 1, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + info_url = self._search_regex( + r'url\s*:\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)', + webpage, 'video id') + + title = remove_end(self._og_search_title(webpage), ' | Dplay') + + try: + info = self._download_json( + info_url, display_id, headers={ + 'Authorization': 'Bearer %s' % self._get_cookies(url).get( + 'dplayit_token').value, + 'Referer': url, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403): + info = self._parse_json(e.cause.read().decode('utf-8'), display_id) + error = info['errors'][0] + if error.get('code') == 'access.denied.geoblocked': + self.raise_geo_restricted( + msg=error.get('detail'), countries=self._GEO_COUNTRIES) + raise ExtractorError(info['errors'][0]['detail'], expected=True) + raise + + hls_url = info['data']['attributes']['streaming']['hls']['url'] + + formats = self._extract_m3u8_formats( + hls_url, display_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + + series = self._html_search_regex( + r'(?s)<h1[^>]+class=["\'].*?\bshow_title\b.*?["\'][^>]*>(.+?)</h1>', + webpage, 'series', fatal=False) + episode = self._search_regex( + r'<p[^>]+class=["\'].*?\bdesc_ep\b.*?["\'][^>]*>\s*<br/>\s*<b>([^<]+)', + webpage, 'episode', fatal=False) + + mobj = re.search( + r'(?s)<span[^>]+class=["\']dates["\'][^>]*>.+?\bS\.(?P<season_number>\d+)\s+E\.(?P<episode_number>\d+)\s*-\s*(?P<upload_date>\d{2}/\d{2}/\d{4})', + webpage) + if mobj: + season_number = int(mobj.group('season_number')) + episode_number = int(mobj.group('episode_number')) + upload_date = unified_strdate(mobj.group('upload_date')) + else: + season_number = episode_number = upload_date = None + + return { + 'id': info_url.rpartition('/')[-1], + 'display_id': display_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + 'upload_date': upload_date, + 'formats': formats, + } diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index e966d7483..e4917014a 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -15,6 +15,8 @@ from ..utils import ( class DRTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv/se|nyheder|radio/ondemand)/(?:[^/]+/)*(?P<id>[\da-z-]+)(?:[/#?]|$)' + _GEO_BYPASS = False + _GEO_COUNTRIES = ['DK'] IE_NAME = 'drtv' _TESTS = [{ 'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10', @@ -137,7 +139,7 @@ class DRTVIE(InfoExtractor): if not formats and restricted_to_denmark: self.raise_geo_restricted( 'Unfortunately, DR is not allowed to show this program outside Denmark.', - expected=True) + countries=self._GEO_COUNTRIES) self._sort_formats(formats) @@ -156,6 +158,7 @@ class DRTVIE(InfoExtractor): class DRTVLiveIE(InfoExtractor): IE_NAME = 'drtv:live' _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv|TV)/live/(?P<id>[\da-z-]+)' + _GEO_COUNTRIES = ['DK'] _TEST = { 'url': 'https://www.dr.dk/tv/live/dr1', 'info_dict': { diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b1613a9d3..6a7028a4d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -71,6 +71,7 @@ from .arte import ( ) from .atresplayer import AtresPlayerIE from .atttechchannel import ATTTechChannelIE +from .atvat import ATVAtIE from .audimedia import AudiMediaIE from .audioboom import AudioBoomIE from .audiomack import AudiomackIE, AudiomackAlbumIE @@ -117,6 +118,7 @@ from .bleacherreport import ( from .blinkx import BlinkxIE from .bloomberg import BloombergIE from .bokecc import BokeCCIE +from .bostonglobe import BostonGlobeIE from .bpb import BpbIE from .br import BRIE from .bravotv import BravoTVIE @@ -246,7 +248,10 @@ from .dfb import DFBIE from .dhm import DHMIE from .dotsub import DotsubIE from .douyutv import DouyuTVIE -from .dplay import DPlayIE +from .dplay import ( + DPlayIE, + DPlayItIE, +) from .dramafever import ( DramaFeverIE, DramaFeverSeriesIE, @@ -262,7 +267,11 @@ from .dvtv import DVTVIE from .dumpert import DumpertIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE -from .discoverygo import DiscoveryGoIE +from .discoverygo import ( + DiscoveryGoIE, + DiscoveryGoPlaylistIE, +) +from .discoverynetworks import DiscoveryNetworksDeIE from .disney import DisneyIE from .dispeak import DigitallySpeakingIE from .dropbox import DropboxIE @@ -793,6 +802,7 @@ from .rai import ( ) from .rbmaradio import RBMARadioIE from .rds import RDSIE +from .redbulltv import RedBullTVIE from .redtube import RedTubeIE from .regiotv import RegioTVIE from .rentv import ( @@ -966,7 +976,6 @@ from .thisav import ThisAVIE from .thisoldhouse import ThisOldHouseIE from .threeqsdn import ThreeQSDNIE from .tinypic import TinyPicIE -from .tlc import TlcDeIE from .tmz import ( TMZIE, TMZArticleIE, @@ -979,6 +988,7 @@ from .tnaflix import ( ) from .toggle import ToggleIE from .tonline import TOnlineIE +from .toongoggles import ToonGogglesIE from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE @@ -999,6 +1009,7 @@ from .tunein import ( TuneInTopicIE, TuneInShortenerIE, ) +from .tunepk import TunePkIE from .turbo import TurboIE from .tutv import TutvIE from .tv2 import ( @@ -1165,6 +1176,8 @@ from .voicerepublic import VoiceRepublicIE from .voxmedia import VoxMediaIE from .vporn import VpornIE from .vrt import VRTIE +from .vrak import VrakIE +from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE from .vvvvid import VVVVIDIE diff --git a/youtube_dl/extractor/eyedotv.py b/youtube_dl/extractor/eyedotv.py index 2f3035147..f62ddebae 100644 --- a/youtube_dl/extractor/eyedotv.py +++ b/youtube_dl/extractor/eyedotv.py @@ -54,7 +54,7 @@ class EyedoTVIE(InfoExtractor): 'id': video_id, 'title': title, 'formats': self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8' if is_live else 'm3u8_native'), + m3u8_url, video_id, 'mp4', 'm3u8_native'), 'description': xpath_text(video_data, _add_ns('Description')), 'duration': parse_duration(xpath_text(video_data, _add_ns('Duration'))), 'uploader': xpath_text(video_data, _add_ns('Createur')), diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 70b8c95c5..b69c1ede0 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -196,6 +196,10 @@ class FacebookIE(InfoExtractor): }, { 'url': 'https://www.facebookcorewwwi.onion/video.php?v=274175099429670', 'only_matching': True, + }, { + # no title + 'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/', + 'only_matching': True, }] @staticmethod @@ -303,7 +307,7 @@ class FacebookIE(InfoExtractor): if not video_data: server_js_data = self._parse_json( self._search_regex( - r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall)', + r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall|permalink_video_pagelet)', webpage, 'js data', default='{}'), video_id, transform_source=js_to_json, fatal=False) if server_js_data: @@ -353,15 +357,15 @@ class FacebookIE(InfoExtractor): self._sort_formats(formats) video_title = self._html_search_regex( - r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title', - default=None) + r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, + 'title', default=None) if not video_title: video_title = self._html_search_regex( r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>', webpage, 'alternative title', default=None) if not video_title: video_title = self._html_search_meta( - 'description', webpage, 'title') + 'description', webpage, 'title', default=None) if video_title: video_title = limit_length(video_title, 80) else: diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index 9f2e5d065..159fdf9c4 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -47,9 +47,12 @@ class FOXIE(AdobePassIE): resource = self._get_mvpd_resource('fbc-fox', None, ap_p['videoGUID'], rating) query['auth'] = self._extract_mvpd_auth(url, video_id, 'fbc-fox', resource) - return { + info = self._search_json_ld(webpage, video_id, fatal=False) + info.update({ '_type': 'url_transparent', 'ie_key': 'ThePlatform', 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), 'id': video_id, - } + }) + + return info diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py index b98da692c..b8fa17588 100644 --- a/youtube_dl/extractor/franceculture.py +++ b/youtube_dl/extractor/franceculture.py @@ -4,7 +4,8 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( determine_ext, - unified_strdate, + extract_attributes, + int_or_none, ) @@ -19,6 +20,7 @@ class FranceCultureIE(InfoExtractor): 'title': 'Rendez-vous au pays des geeks', 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20140301', + 'timestamp': 1393642916, 'vcodec': 'none', } } @@ -28,30 +30,34 @@ class FranceCultureIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - video_url = self._search_regex( - r'(?s)<div[^>]+class="[^"]*?title-zone-diffusion[^"]*?"[^>]*>.*?<button[^>]+data-asset-source="([^"]+)"', - webpage, 'video path') + video_data = extract_attributes(self._search_regex( + r'(?s)<div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*>.*?(<button[^>]+data-asset-source="[^"]+"[^>]+>)', + webpage, 'video data')) - title = self._og_search_title(webpage) + video_url = video_data['data-asset-source'] + title = video_data.get('data-asset-title') or self._og_search_title(webpage) - upload_date = unified_strdate(self._search_regex( - '(?s)<div[^>]+class="date"[^>]*>.*?<span[^>]+class="inner"[^>]*>([^<]+)<', - webpage, 'upload date', fatal=False)) + description = self._html_search_regex( + r'(?s)<div[^>]+class="intro"[^>]*>.*?<h2>(.+?)</h2>', + webpage, 'description', default=None) thumbnail = self._search_regex( - r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+data-dejavu-src="([^"]+)"', + r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+(?:data-dejavu-)?src="([^"]+)"', webpage, 'thumbnail', fatal=False) uploader = self._html_search_regex( - r'(?s)<div id="emission".*?<span class="author">(.*?)</span>', + r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None) - vcodec = 'none' if determine_ext(video_url.lower()) == 'mp3' else None + ext = determine_ext(video_url.lower()) return { 'id': display_id, 'display_id': display_id, 'url': video_url, 'title': title, + 'description': description, 'thumbnail': thumbnail, - 'vcodec': vcodec, + 'ext': ext, + 'vcodec': 'none' if ext == 'mp3' else None, 'uploader': uploader, - 'upload_date': upload_date, + 'timestamp': int_or_none(video_data.get('data-asset-created-date')), + 'duration': int_or_none(video_data.get('data-duration')), } diff --git a/youtube_dl/extractor/freshlive.py b/youtube_dl/extractor/freshlive.py index a90f9156c..72a845945 100644 --- a/youtube_dl/extractor/freshlive.py +++ b/youtube_dl/extractor/freshlive.py @@ -56,9 +56,8 @@ class FreshLiveIE(InfoExtractor): is_live = info.get('liveStreamUrl') is not None formats = self._extract_m3u8_formats( - stream_url, video_id, ext='mp4', - entry_protocol='m3u8' if is_live else 'm3u8_native', - m3u8_id='hls') + stream_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls') if is_live: title = self._live_title(title) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 3fe0237b6..274f81738 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -84,6 +84,7 @@ from .twentymin import TwentyMinutenIE from .ustream import UstreamIE from .openload import OpenloadIE from .videopress import VideoPressIE +from .rutube import RutubeIE class GenericIE(InfoExtractor): @@ -448,6 +449,23 @@ class GenericIE(InfoExtractor): }, }], }, + { + # Brightcove with UUID in videoPlayer + 'url': 'http://www8.hp.com/cn/zh/home.html', + 'info_dict': { + 'id': '5255815316001', + 'ext': 'mp4', + 'title': 'Sprocket Video - China', + 'description': 'Sprocket Video - China', + 'uploader': 'HP-Video Gallery', + 'timestamp': 1482263210, + 'upload_date': '20161220', + 'uploader_id': '1107601872001', + }, + 'params': { + 'skip_download': True, # m3u8 download + }, + }, # ooyala video { 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', @@ -884,12 +902,13 @@ class GenericIE(InfoExtractor): }, # LazyYT { - 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986', + 'url': 'https://skiplagged.com/', 'info_dict': { - 'id': '1986', - 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse', + 'id': 'skiplagged', + 'title': 'Skiplagged: The smart way to find cheap flights', }, - 'playlist_mincount': 2, + 'playlist_mincount': 1, + 'add_ie': ['Youtube'], }, # Cinchcast embed { @@ -1517,10 +1536,38 @@ class GenericIE(InfoExtractor): 'add_ie': [VideoPressIE.ie_key()], }, { + # Rutube embed + 'url': 'http://magazzino.friday.ru/videos/vipuski/kazan-2', + 'info_dict': { + 'id': '9b3d5bee0a8740bf70dfd29d3ea43541', + 'ext': 'flv', + 'title': 'Магаззино: Казань 2', + 'description': 'md5:99bccdfac2269f0e8fdbc4bbc9db184a', + 'uploader': 'Магаззино', + 'upload_date': '20170228', + 'uploader_id': '996642', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [RutubeIE.ie_key()], + }, + { # ThePlatform embedded with whitespaces in URLs 'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm', 'only_matching': True, }, + { + # Senate ISVP iframe https + 'url': 'https://www.hsgac.senate.gov/hearings/canadas-fast-track-refugee-plan-unanswered-questions-and-implications-for-us-national-security', + 'md5': 'fb8c70b0b515e5037981a2492099aab8', + 'info_dict': { + 'id': 'govtaff020316', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player', + }, + 'add_ie': [SenateISVPIE.ie_key()], + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -1820,14 +1867,6 @@ class GenericIE(InfoExtractor): video_description = self._og_search_description(webpage, default=None) video_thumbnail = self._og_search_thumbnail(webpage, default=None) - # Helper method - def _playlist_from_matches(matches, getter=None, ie=None): - urlrs = orderedSet( - self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) - for m in matches) - return self.playlist_result( - urlrs, playlist_id=video_id, playlist_title=video_title) - # Look for Brightcove Legacy Studio embeds bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) if bc_urls: @@ -1848,28 +1887,28 @@ class GenericIE(InfoExtractor): # Look for Brightcove New Studio embeds bc_urls = BrightcoveNewIE._extract_urls(webpage) if bc_urls: - return _playlist_from_matches(bc_urls, ie='BrightcoveNew') + return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew') # Look for ThePlatform embeds tp_urls = ThePlatformIE._extract_urls(webpage) if tp_urls: - return _playlist_from_matches(tp_urls, ie='ThePlatform') + return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform') # Look for Vessel embeds vessel_urls = VesselIE._extract_urls(webpage) if vessel_urls: - return _playlist_from_matches(vessel_urls, ie=VesselIE.ie_key()) + return self.playlist_from_matches(vessel_urls, video_id, video_title, ie=VesselIE.ie_key()) # Look for embedded rtl.nl player matches = re.findall( r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', webpage) if matches: - return _playlist_from_matches(matches, ie='RtlNl') + return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl') vimeo_urls = VimeoIE._extract_urls(url, webpage) if vimeo_urls: - return _playlist_from_matches(vimeo_urls, ie=VimeoIE.ie_key()) + return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key()) vid_me_embed_url = self._search_regex( r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', @@ -1891,25 +1930,25 @@ class GenericIE(InfoExtractor): (?:embed|v|p)/.+?) \1''', webpage) if matches: - return _playlist_from_matches( - matches, lambda m: unescapeHTML(m[1])) + return self.playlist_from_matches( + matches, video_id, video_title, lambda m: unescapeHTML(m[1])) # Look for lazyYT YouTube embed matches = re.findall( r'class="lazyYT" data-youtube-id="([^"]+)"', webpage) if matches: - return _playlist_from_matches(matches, lambda m: unescapeHTML(m)) + return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m)) # Look for Wordpress "YouTube Video Importer" plugin matches = re.findall(r'''(?x)<div[^>]+ class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) if matches: - return _playlist_from_matches(matches, lambda m: m[-1]) + return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1]) matches = DailymotionIE._extract_urls(webpage) if matches: - return _playlist_from_matches(matches) + return self.playlist_from_matches(matches, video_id, video_title) # Look for embedded Dailymotion playlist player (#3822) m = re.search( @@ -1918,8 +1957,8 @@ class GenericIE(InfoExtractor): playlists = re.findall( r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) if playlists: - return _playlist_from_matches( - playlists, lambda p: '//dailymotion.com/playlist/%s' % p) + return self.playlist_from_matches( + playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) # Look for embedded Wistia player match = re.search( @@ -2026,8 +2065,9 @@ class GenericIE(InfoExtractor): if mobj is not None: embeds = self._parse_json(mobj.group(1), video_id, fatal=False) if embeds: - return _playlist_from_matches( - embeds, getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') + return self.playlist_from_matches( + embeds, video_id, video_title, + getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') # Look for Aparat videos mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage) @@ -2089,13 +2129,13 @@ class GenericIE(InfoExtractor): # Look for funnyordie embed matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage) if matches: - return _playlist_from_matches( - matches, getter=unescapeHTML, ie='FunnyOrDie') + return self.playlist_from_matches( + matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie') # Look for BBC iPlayer embed matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) if matches: - return _playlist_from_matches(matches, ie='BBCCoUk') + return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk') # Look for embedded RUTV player rutv_url = RUTVIE._extract_url(webpage) @@ -2110,32 +2150,32 @@ class GenericIE(InfoExtractor): # Look for embedded SportBox player sportbox_urls = SportBoxEmbedIE._extract_urls(webpage) if sportbox_urls: - return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed') + return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie='SportBoxEmbed') # Look for embedded XHamster player xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) if xhamster_urls: - return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed') + return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed') # Look for embedded TNAFlixNetwork player tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage) if tnaflix_urls: - return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key()) + return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key()) # Look for embedded PornHub player pornhub_urls = PornHubIE._extract_urls(webpage) if pornhub_urls: - return _playlist_from_matches(pornhub_urls, ie=PornHubIE.ie_key()) + return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key()) # Look for embedded DrTuber player drtuber_urls = DrTuberIE._extract_urls(webpage) if drtuber_urls: - return _playlist_from_matches(drtuber_urls, ie=DrTuberIE.ie_key()) + return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key()) # Look for embedded RedTube player redtube_urls = RedTubeIE._extract_urls(webpage) if redtube_urls: - return _playlist_from_matches(redtube_urls, ie=RedTubeIE.ie_key()) + return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key()) # Look for embedded Tvigle player mobj = re.search( @@ -2181,12 +2221,12 @@ class GenericIE(InfoExtractor): # Look for embedded soundcloud player soundcloud_urls = SoundcloudIE._extract_urls(webpage) if soundcloud_urls: - return _playlist_from_matches(soundcloud_urls, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) + return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) # Look for tunein player tunein_urls = TuneInBaseIE._extract_urls(webpage) if tunein_urls: - return _playlist_from_matches(tunein_urls) + return self.playlist_from_matches(tunein_urls, video_id, video_title) # Look for embedded mtvservices player mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) @@ -2469,30 +2509,36 @@ class GenericIE(InfoExtractor): # Look for DBTV embeds dbtv_urls = DBTVIE._extract_urls(webpage) if dbtv_urls: - return _playlist_from_matches(dbtv_urls, ie=DBTVIE.ie_key()) + return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key()) # Look for Videa embeds videa_urls = VideaIE._extract_urls(webpage) if videa_urls: - return _playlist_from_matches(videa_urls, ie=VideaIE.ie_key()) + return self.playlist_from_matches(videa_urls, video_id, video_title, ie=VideaIE.ie_key()) # Look for 20 minuten embeds twentymin_urls = TwentyMinutenIE._extract_urls(webpage) if twentymin_urls: - return _playlist_from_matches( - twentymin_urls, ie=TwentyMinutenIE.ie_key()) + return self.playlist_from_matches( + twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key()) # Look for Openload embeds openload_urls = OpenloadIE._extract_urls(webpage) if openload_urls: - return _playlist_from_matches( - openload_urls, ie=OpenloadIE.ie_key()) + return self.playlist_from_matches( + openload_urls, video_id, video_title, ie=OpenloadIE.ie_key()) # Look for VideoPress embeds videopress_urls = VideoPressIE._extract_urls(webpage) if videopress_urls: - return _playlist_from_matches( - videopress_urls, ie=VideoPressIE.ie_key()) + return self.playlist_from_matches( + videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key()) + + # Look for Rutube embeds + rutube_urls = RutubeIE._extract_urls(webpage) + if rutube_urls: + return self.playlist_from_matches( + rutube_urls, ie=RutubeIE.ie_key()) # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( @@ -2521,7 +2567,11 @@ class GenericIE(InfoExtractor): jwplayer_data = self._find_jwplayer_data( webpage, video_id, transform_source=js_to_json) if jwplayer_data: - return self._parse_jwplayer_data(jwplayer_data, video_id) + info = self._parse_jwplayer_data( + jwplayer_data, video_id, require_title=False) + if not info.get('title'): + info['title'] = video_title + return info def check_video(vurl): if YoutubeIE.suitable(vurl): @@ -2596,11 +2646,14 @@ class GenericIE(InfoExtractor): found = re.search(REDIRECT_REGEX, refresh_header) if found: new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1))) - self.report_following_redirect(new_url) - return { - '_type': 'url', - 'url': new_url, - } + if new_url != url: + self.report_following_redirect(new_url) + return { + '_type': 'url', + 'url': new_url, + } + else: + found = None if not found: # twitter:player is a https URL to iframe player that may or may not diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 21ed846b2..4c9be47b4 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -36,7 +36,7 @@ class GoIE(AdobePassIE): 'requestor_id': 'DisneyXD', } } - _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|season-\d+/\d+-(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys()) + _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys()) _TESTS = [{ 'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx', 'info_dict': { @@ -52,6 +52,12 @@ class GoIE(AdobePassIE): }, { 'url': 'http://abc.go.com/shows/after-paradise/video/most-recent/vdka3335601', 'only_matching': True, + }, { + 'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding', + 'only_matching': True, + }, { + 'url': 'http://abc.go.com/shows/world-news-tonight/episode-guide/2017-02/17-021717-intense-stand-off-between-man-with-rifle-and-police-in-oakland', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/hbo.py b/youtube_dl/extractor/hbo.py index 8116ad9bd..931f71a5a 100644 --- a/youtube_dl/extractor/hbo.py +++ b/youtube_dl/extractor/hbo.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( xpath_text, xpath_element, @@ -14,14 +15,26 @@ from ..utils import ( class HBOBaseIE(InfoExtractor): _FORMATS_INFO = { + 'pro7': { + 'width': 1280, + 'height': 720, + }, '1920': { 'width': 1280, 'height': 720, }, + 'pro6': { + 'width': 768, + 'height': 432, + }, '640': { 'width': 768, 'height': 432, }, + 'pro5': { + 'width': 640, + 'height': 360, + }, 'highwifi': { 'width': 640, 'height': 360, @@ -78,6 +91,17 @@ class HBOBaseIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( video_url.replace('.tar', '/base_index_w8.m3u8'), video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + elif source.tag == 'hls': + # #EXT-X-BYTERANGE is not supported by native hls downloader + # and ffmpeg (#10955) + # formats.extend(self._extract_m3u8_formats( + # video_url.replace('.tar', '/base_index.m3u8'), + # video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + continue + elif source.tag == 'dash': + formats.extend(self._extract_mpd_formats( + video_url.replace('.tar', '/manifest.mpd'), + video_id, mpd_id='dash', fatal=False)) else: format_info = self._FORMATS_INFO.get(source.tag, {}) formats.append({ @@ -112,10 +136,11 @@ class HBOBaseIE(InfoExtractor): class HBOIE(HBOBaseIE): + IE_NAME = 'hbo' _VALID_URL = r'https?://(?:www\.)?hbo\.com/video/video\.html\?.*vid=(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.hbo.com/video/video.html?autoplay=true&g=u&vid=1437839', - 'md5': '1c33253f0c7782142c993c0ba62a8753', + 'md5': '2c6a6bc1222c7e91cb3334dad1746e5a', 'info_dict': { 'id': '1437839', 'ext': 'mp4', @@ -131,11 +156,12 @@ class HBOIE(HBOBaseIE): class HBOEpisodeIE(HBOBaseIE): - _VALID_URL = r'https?://(?:www\.)?hbo\.com/(?!video)([^/]+/)+video/(?P<id>[0-9a-z-]+)\.html' + IE_NAME = 'hbo:episode' + _VALID_URL = r'https?://(?:www\.)?hbo\.com/(?P<path>(?!video)(?:(?:[^/]+/)+video|watch-free-episodes)/(?P<id>[0-9a-z-]+))(?:\.html)?' _TESTS = [{ 'url': 'http://www.hbo.com/girls/episodes/5/52-i-love-you-baby/video/ep-52-inside-the-episode.html?autoplay=true', - 'md5': '689132b253cc0ab7434237fc3a293210', + 'md5': '61ead79b9c0dfa8d3d4b07ef4ac556fb', 'info_dict': { 'id': '1439518', 'display_id': 'ep-52-inside-the-episode', @@ -147,16 +173,19 @@ class HBOEpisodeIE(HBOBaseIE): }, { 'url': 'http://www.hbo.com/game-of-thrones/about/video/season-5-invitation-to-the-set.html?autoplay=true', 'only_matching': True, + }, { + 'url': 'http://www.hbo.com/watch-free-episodes/last-week-tonight-with-john-oliver', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) + path, display_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, display_id) + content = self._download_json( + 'http://www.hbo.com/api/content/' + path, display_id)['content'] - video_id = self._search_regex( - r'(?P<q1>[\'"])videoId(?P=q1)\s*:\s*(?P<q2>[\'"])(?P<video_id>\d+)(?P=q2)', - webpage, 'video ID', group='video_id') + video_id = compat_str((content.get('parsed', {}).get( + 'common:FullBleedVideo', {}) or content['selectedEpisode'])['videoId']) info_dict = self._extract_from_id(video_id) info_dict['display_id'] = display_id diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index c863413bf..7f946c6ed 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -119,7 +119,8 @@ class LivestreamIE(InfoExtractor): m3u8_url = video_data.get('m3u8_url') if m3u8_url: formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) f4m_url = video_data.get('f4m_url') if f4m_url: @@ -158,11 +159,11 @@ class LivestreamIE(InfoExtractor): if smil_url: formats.extend(self._extract_smil_formats(smil_url, broadcast_id)) - entry_protocol = 'm3u8' if is_live else 'm3u8_native' m3u8_url = stream_info.get('m3u8_url') if m3u8_url: formats.extend(self._extract_m3u8_formats( - m3u8_url, broadcast_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=False)) + m3u8_url, broadcast_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) rtsp_url = stream_info.get('rtsp_url') if rtsp_url: @@ -276,7 +277,7 @@ class LivestreamOriginalIE(InfoExtractor): 'view_count': view_count, } - def _extract_video_formats(self, video_data, video_id, entry_protocol): + def _extract_video_formats(self, video_data, video_id): formats = [] progressive_url = video_data.get('progressiveUrl') @@ -289,7 +290,8 @@ class LivestreamOriginalIE(InfoExtractor): m3u8_url = video_data.get('httpUrl') if m3u8_url: formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=False)) + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) rtsp_url = video_data.get('rtspUrl') if rtsp_url: @@ -340,11 +342,10 @@ class LivestreamOriginalIE(InfoExtractor): } video_data = self._download_json(stream_url, content_id) is_live = video_data.get('isLive') - entry_protocol = 'm3u8' if is_live else 'm3u8_native' info.update({ 'id': content_id, 'title': self._live_title(info['title']) if is_live else info['title'], - 'formats': self._extract_video_formats(video_data, content_id, entry_protocol), + 'formats': self._extract_video_formats(video_data, content_id), 'is_live': is_live, }) return info diff --git a/youtube_dl/extractor/medialaan.py b/youtube_dl/extractor/medialaan.py new file mode 100644 index 000000000..6e067474b --- /dev/null +++ b/youtube_dl/extractor/medialaan.py @@ -0,0 +1,259 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + parse_duration, + try_get, + unified_timestamp, + urlencode_postdata, +) + + +class MedialaanIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + (?: + (?P<site_id>vtm|q2|vtmkzoom)\.be/ + (?: + video(?:/[^/]+/id/|/?\?.*?\baid=)| + (?:[^/]+/)* + ) + ) + (?P<id>[^/?#&]+) + ''' + _NETRC_MACHINE = 'medialaan' + _APIKEY = '3_HZ0FtkMW_gOyKlqQzW5_0FHRC7Nd5XpXJZcDdXY4pk5eES2ZWmejRW5egwVm4ug-' + _SITE_TO_APP_ID = { + 'vtm': 'vtm_watch', + 'q2': 'q2', + 'vtmkzoom': 'vtmkzoom', + } + _TESTS = [{ + # vod + 'url': 'http://vtm.be/video/volledige-afleveringen/id/vtm_20170219_VM0678361_vtmwatch', + 'info_dict': { + 'id': 'vtm_20170219_VM0678361_vtmwatch', + 'ext': 'mp4', + 'title': 'Allemaal Chris afl. 6', + 'description': 'md5:4be86427521e7b07e0adb0c9c554ddb2', + 'timestamp': 1487533280, + 'upload_date': '20170219', + 'duration': 2562, + 'series': 'Allemaal Chris', + 'season': 'Allemaal Chris', + 'season_number': 1, + 'season_id': '256936078124527', + 'episode': 'Allemaal Chris afl. 6', + 'episode_number': 6, + 'episode_id': '256936078591527', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires account credentials', + }, { + # clip + 'url': 'http://vtm.be/video?aid=168332', + 'info_dict': { + 'id': '168332', + 'ext': 'mp4', + 'title': '"Veronique liegt!"', + 'description': 'md5:1385e2b743923afe54ba4adc38476155', + 'timestamp': 1489002029, + 'upload_date': '20170308', + 'duration': 96, + }, + }, { + # vod + 'url': 'http://vtm.be/video/volledige-afleveringen/id/257107153551000', + 'only_matching': True, + }, { + # vod + 'url': 'http://vtm.be/video?aid=163157', + 'only_matching': True, + }, { + # vod + 'url': 'http://www.q2.be/video/volledige-afleveringen/id/2be_20170301_VM0684442_q2', + 'only_matching': True, + }, { + # clip + 'url': 'http://vtmkzoom.be/k3-dansstudio/een-nieuw-seizoen-van-k3-dansstudio', + 'only_matching': True, + }] + + def _real_initialize(self): + self._logged_in = False + + def _login(self): + username, password = self._get_login_info() + if username is None: + self.raise_login_required() + + auth_data = { + 'APIKey': self._APIKEY, + 'sdk': 'js_6.1', + 'format': 'json', + 'loginID': username, + 'password': password, + } + + auth_info = self._download_json( + 'https://accounts.eu1.gigya.com/accounts.login', None, + note='Logging in', errnote='Unable to log in', + data=urlencode_postdata(auth_data)) + + error_message = auth_info.get('errorDetails') or auth_info.get('errorMessage') + if error_message: + raise ExtractorError( + 'Unable to login: %s' % error_message, expected=True) + + self._uid = auth_info['UID'] + self._uid_signature = auth_info['UIDSignature'] + self._signature_timestamp = auth_info['signatureTimestamp'] + + self._logged_in = True + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id, site_id = mobj.group('id', 'site_id') + + webpage = self._download_webpage(url, video_id) + + config = self._parse_json( + self._search_regex( + r'videoJSConfig\s*=\s*JSON\.parse\(\'({.+?})\'\);', + webpage, 'config', default='{}'), video_id, + transform_source=lambda s: s.replace( + '\\\\', '\\').replace(r'\"', '"').replace(r"\'", "'")) + + vod_id = config.get('vodId') or self._search_regex( + (r'\\"vodId\\"\s*:\s*\\"(.+?)\\"', + r'<[^>]+id=["\']vod-(\d+)'), + webpage, 'video_id', default=None) + + # clip, no authentication required + if not vod_id: + player = self._parse_json( + self._search_regex( + r'vmmaplayer\(({.+?})\);', webpage, 'vmma player', + default=''), + video_id, transform_source=lambda s: '[%s]' % s, fatal=False) + if player: + video = player[-1] + info = { + 'id': video_id, + 'url': video['videoUrl'], + 'title': video['title'], + 'thumbnail': video.get('imageUrl'), + 'timestamp': int_or_none(video.get('createdDate')), + 'duration': int_or_none(video.get('duration')), + } + else: + info = self._parse_html5_media_entries( + url, webpage, video_id, m3u8_id='hls')[0] + info.update({ + 'id': video_id, + 'title': self._html_search_meta('description', webpage), + 'duration': parse_duration(self._html_search_meta('duration', webpage)), + }) + # vod, authentication required + else: + if not self._logged_in: + self._login() + + settings = self._parse_json( + self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings', default='{}'), + video_id) + + def get(container, item): + return try_get( + settings, lambda x: x[container][item], + compat_str) or self._search_regex( + r'"%s"\s*:\s*"([^"]+)' % item, webpage, item, + default=None) + + app_id = get('vod', 'app_id') or self._SITE_TO_APP_ID.get(site_id, 'vtm_watch') + sso = get('vod', 'gigyaDatabase') or 'vtm-sso' + + data = self._download_json( + 'http://vod.medialaan.io/api/1.0/item/%s/video' % vod_id, + video_id, query={ + 'app_id': app_id, + 'user_network': sso, + 'UID': self._uid, + 'UIDSignature': self._uid_signature, + 'signatureTimestamp': self._signature_timestamp, + }) + + formats = self._extract_m3u8_formats( + data['response']['uri'], video_id, entry_protocol='m3u8_native', + ext='mp4', m3u8_id='hls') + + self._sort_formats(formats) + + info = { + 'id': vod_id, + 'formats': formats, + } + + api_key = get('vod', 'apiKey') + channel = get('medialaanGigya', 'channel') + + if api_key: + videos = self._download_json( + 'http://vod.medialaan.io/vod/v2/videos', video_id, fatal=False, + query={ + 'channels': channel, + 'ids': vod_id, + 'limit': 1, + 'apikey': api_key, + }) + if videos: + video = try_get( + videos, lambda x: x['response']['videos'][0], dict) + if video: + def get(container, item, expected_type=None): + return try_get( + video, lambda x: x[container][item], expected_type) + + def get_string(container, item): + return get(container, item, compat_str) + + info.update({ + 'series': get_string('program', 'title'), + 'season': get_string('season', 'title'), + 'season_number': int_or_none(get('season', 'number')), + 'season_id': get_string('season', 'id'), + 'episode': get_string('episode', 'title'), + 'episode_number': int_or_none(get('episode', 'number')), + 'episode_id': get_string('episode', 'id'), + 'duration': int_or_none( + video.get('duration')) or int_or_none( + video.get('durationMillis'), scale=1000), + 'title': get_string('episode', 'title'), + 'description': get_string('episode', 'text'), + 'timestamp': unified_timestamp(get_string( + 'publication', 'begin')), + }) + + if not info.get('title'): + info['title'] = try_get( + config, lambda x: x['videoConfig']['title'], + compat_str) or self._html_search_regex( + r'\\"title\\"\s*:\s*\\"(.+?)\\"', webpage, 'title', + default=None) or self._og_search_title(webpage) + + if not info.get('description'): + info['description'] = self._html_search_regex( + r'<div[^>]+class="field-item\s+even">\s*<p>(.+?)</p>', + webpage, 'description', default=None) + + return info diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py index ec1b4c4fe..40f72d66f 100644 --- a/youtube_dl/extractor/miomio.py +++ b/youtube_dl/extractor/miomio.py @@ -51,6 +51,7 @@ class MioMioIE(InfoExtractor): 'ext': 'mp4', 'title': 'マツコの知らない世界【劇的進化SP!ビニール傘&冷凍食品2016】 1_2 - 16 05 31', }, + 'skip': 'Unable to load videos', }] def _extract_mioplayer(self, webpage, video_id, title, http_headers): @@ -94,9 +95,18 @@ class MioMioIE(InfoExtractor): return entries + def _download_chinese_webpage(self, *args, **kwargs): + # Requests with English locales return garbage + headers = { + 'Accept-Language': 'zh-TW,en-US;q=0.7,en;q=0.3', + } + kwargs.setdefault('headers', {}).update(headers) + return self._download_webpage(*args, **kwargs) + def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_chinese_webpage( + url, video_id) title = self._html_search_meta( 'description', webpage, 'title', fatal=True) @@ -106,7 +116,7 @@ class MioMioIE(InfoExtractor): if '_h5' in mioplayer_path: player_url = compat_urlparse.urljoin(url, mioplayer_path) - player_webpage = self._download_webpage( + player_webpage = self._download_chinese_webpage( player_url, video_id, note='Downloading player webpage', headers={'Referer': url}) entries = self._parse_html5_media_entries(player_url, player_webpage, video_id) diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 79e0b8ada..28b743cca 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import uuid from .common import InfoExtractor +from .ooyala import OoyalaIE from ..compat import ( compat_str, compat_urllib_parse_urlencode, @@ -24,6 +25,9 @@ class MiTeleBaseIE(InfoExtractor): r'(?s)(<ms-video-player.+?</ms-video-player>)', webpage, 'ms video player')) video_id = player_data['data-media-id'] + if player_data.get('data-cms-id') == 'ooyala': + return self.url_result( + 'ooyala:%s' % video_id, ie=OoyalaIE.ie_key(), video_id=video_id) config_url = compat_urlparse.urljoin(url, player_data['data-config']) config = self._download_json( config_url, video_id, 'Downloading config JSON') diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py index d9943fc2c..8961309fd 100644 --- a/youtube_dl/extractor/ninecninemedia.py +++ b/youtube_dl/extractor/ninecninemedia.py @@ -34,12 +34,6 @@ class NineCNineMediaStackIE(NineCNineMediaBaseIE): formats.extend(self._extract_f4m_formats( stack_base_url + 'f4m', stack_id, f4m_id='hds', fatal=False)) - mp4_url = self._download_webpage(stack_base_url + 'pd', stack_id, fatal=False) - if mp4_url: - formats.append({ - 'url': mp4_url, - 'format_id': 'mp4', - }) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 50473d777..38fefe492 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -3,41 +3,27 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( + determine_ext, + ExtractorError, fix_xml_ampersands, orderedSet, parse_duration, qualities, strip_jsonp, unified_strdate, - ExtractorError, ) class NPOBaseIE(InfoExtractor): def _get_token(self, video_id): - token_page = self._download_webpage( - 'http://ida.omroep.nl/npoplayer/i.js', - video_id, note='Downloading token') - token = self._search_regex( - r'npoplayer\.token = "(.+?)"', token_page, 'token') - # Decryption algorithm extracted from http://npoplayer.omroep.nl/csjs/npoplayer-min.js - token_l = list(token) - first = second = None - for i in range(5, len(token_l) - 4): - if token_l[i].isdigit(): - if first is None: - first = i - elif second is None: - second = i - if first is None or second is None: - first = 12 - second = 13 - - token_l[first], token_l[second] = token_l[second], token_l[first] - - return ''.join(token_l) + return self._download_json( + 'http://ida.omroep.nl/app.php/auth', video_id, + note='Downloading token')['token'] class NPOIE(NPOBaseIE): @@ -58,103 +44,113 @@ class NPOIE(NPOBaseIE): (?P<id>[^/?#]+) ''' - _TESTS = [ - { - 'url': 'http://www.npo.nl/nieuwsuur/22-06-2014/VPWON_1220719', - 'md5': '4b3f9c429157ec4775f2c9cb7b911016', - 'info_dict': { - 'id': 'VPWON_1220719', - 'ext': 'm4v', - 'title': 'Nieuwsuur', - 'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.', - 'upload_date': '20140622', - }, + _TESTS = [{ + 'url': 'http://www.npo.nl/nieuwsuur/22-06-2014/VPWON_1220719', + 'md5': '4b3f9c429157ec4775f2c9cb7b911016', + 'info_dict': { + 'id': 'VPWON_1220719', + 'ext': 'm4v', + 'title': 'Nieuwsuur', + 'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.', + 'upload_date': '20140622', }, - { - 'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800', - 'md5': 'da50a5787dbfc1603c4ad80f31c5120b', - 'info_dict': { - 'id': 'VARA_101191800', - 'ext': 'm4v', - 'title': 'De Mega Mike & Mega Thomas show: The best of.', - 'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4', - 'upload_date': '20090227', - 'duration': 2400, - }, + }, { + 'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800', + 'md5': 'da50a5787dbfc1603c4ad80f31c5120b', + 'info_dict': { + 'id': 'VARA_101191800', + 'ext': 'm4v', + 'title': 'De Mega Mike & Mega Thomas show: The best of.', + 'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4', + 'upload_date': '20090227', + 'duration': 2400, }, - { - 'url': 'http://www.npo.nl/tegenlicht/25-02-2013/VPWON_1169289', - 'md5': 'f8065e4e5a7824068ed3c7e783178f2c', - 'info_dict': { - 'id': 'VPWON_1169289', - 'ext': 'm4v', - 'title': 'Tegenlicht: De toekomst komt uit Afrika', - 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', - 'upload_date': '20130225', - 'duration': 3000, - }, + }, { + 'url': 'http://www.npo.nl/tegenlicht/25-02-2013/VPWON_1169289', + 'md5': 'f8065e4e5a7824068ed3c7e783178f2c', + 'info_dict': { + 'id': 'VPWON_1169289', + 'ext': 'm4v', + 'title': 'Tegenlicht: Zwart geld. De toekomst komt uit Afrika', + 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', + 'upload_date': '20130225', + 'duration': 3000, }, - { - 'url': 'http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706', - 'info_dict': { - 'id': 'WO_VPRO_043706', - 'ext': 'wmv', - 'title': 'De nieuwe mens - Deel 1', - 'description': 'md5:518ae51ba1293ffb80d8d8ce90b74e4b', - 'duration': 4680, - }, - 'params': { - # mplayer mms download - 'skip_download': True, - } + }, { + 'url': 'http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706', + 'info_dict': { + 'id': 'WO_VPRO_043706', + 'ext': 'm4v', + 'title': 'De nieuwe mens - Deel 1', + 'description': 'md5:518ae51ba1293ffb80d8d8ce90b74e4b', + 'duration': 4680, }, + 'params': { + 'skip_download': True, + } + }, { # non asf in streams - { - 'url': 'http://www.npo.nl/hoe-gaat-europa-verder-na-parijs/10-01-2015/WO_NOS_762771', - 'md5': 'b3da13de374cbe2d5332a7e910bef97f', - 'info_dict': { - 'id': 'WO_NOS_762771', - 'ext': 'mp4', - 'title': 'Hoe gaat Europa verder na Parijs?', - }, - }, - { - 'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content', - 'md5': '01c6a2841675995da1f0cf776f03a9c3', - 'info_dict': { - 'id': 'VPWON_1233944', - 'ext': 'm4v', - 'title': 'Aap, poot, pies', - 'description': 'md5:c9c8005d1869ae65b858e82c01a91fde', - 'upload_date': '20150508', - 'duration': 599, - }, + 'url': 'http://www.npo.nl/hoe-gaat-europa-verder-na-parijs/10-01-2015/WO_NOS_762771', + 'info_dict': { + 'id': 'WO_NOS_762771', + 'ext': 'mp4', + 'title': 'Hoe gaat Europa verder na Parijs?', }, - { - 'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698', - 'md5': 'd30cd8417b8b9bca1fdff27428860d08', - 'info_dict': { - 'id': 'POW_00996502', - 'ext': 'm4v', - 'title': '''"Dit is wel een 'landslide'..."''', - 'description': 'md5:f8d66d537dfb641380226e31ca57b8e8', - 'upload_date': '20150508', - 'duration': 462, - }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content', + 'info_dict': { + 'id': 'VPWON_1233944', + 'ext': 'm4v', + 'title': 'Aap, poot, pies', + 'description': 'md5:c9c8005d1869ae65b858e82c01a91fde', + 'upload_date': '20150508', + 'duration': 599, }, - { - 'url': 'http://www.zapp.nl/de-bzt-show/gemist/KN_1687547', - 'only_matching': True, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698', + 'info_dict': { + 'id': 'POW_00996502', + 'ext': 'm4v', + 'title': '''"Dit is wel een 'landslide'..."''', + 'description': 'md5:f8d66d537dfb641380226e31ca57b8e8', + 'upload_date': '20150508', + 'duration': 462, }, - { - 'url': 'http://www.zapp.nl/de-bzt-show/filmpjes/POMS_KN_7315118', - 'only_matching': True, + 'params': { + 'skip_download': True, + } + }, { + # audio + 'url': 'http://www.npo.nl/jouw-stad-rotterdam/29-01-2017/RBX_FUNX_6683215/RBX_FUNX_7601437', + 'info_dict': { + 'id': 'RBX_FUNX_6683215', + 'ext': 'mp3', + 'title': 'Jouw Stad Rotterdam', + 'description': 'md5:db251505244f097717ec59fabc372d9f', }, - { - 'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990', - 'only_matching': True, + 'params': { + 'skip_download': True, } - ] + }, { + 'url': 'http://www.zapp.nl/de-bzt-show/gemist/KN_1687547', + 'only_matching': True, + }, { + 'url': 'http://www.zapp.nl/de-bzt-show/filmpjes/POMS_KN_7315118', + 'only_matching': True, + }, { + 'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990', + 'only_matching': True, + }, { + # live stream + 'url': 'npo:LI_NL1_4188102', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -183,70 +179,115 @@ class NPOIE(NPOBaseIE): token = self._get_token(video_id) formats = [] + urls = set() + + quality = qualities(['adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std']) + items = self._download_json( + 'http://ida.omroep.nl/app.php/%s' % video_id, video_id, + 'Downloading formats JSON', query={ + 'adaptive': 'yes', + 'token': token, + })['items'][0] + for num, item in enumerate(items): + item_url = item.get('url') + if not item_url or item_url in urls: + continue + urls.add(item_url) + format_id = self._search_regex( + r'video/ida/([^/]+)', item_url, 'format id', + default=None) + + def add_format_url(format_url): + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'quality': quality(format_id), + }) + + # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706 + if item.get('contentType') in ('url', 'audio'): + add_format_url(item_url) + continue - pubopties = metadata.get('pubopties') - if pubopties: - quality = qualities(['adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std']) - for format_id in pubopties: - format_info = self._download_json( - 'http://ida.omroep.nl/odi/?prid=%s&puboptions=%s&adaptive=yes&token=%s' - % (video_id, format_id, token), - video_id, 'Downloading %s JSON' % format_id) - if format_info.get('error_code', 0) or format_info.get('errorcode', 0): + try: + stream_info = self._download_json( + item_url + '&type=json', video_id, + 'Downloading %s stream JSON' + % item.get('label') or item.get('format') or format_id or num) + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: + error = (self._parse_json( + ee.cause.read().decode(), video_id, + fatal=False) or {}).get('errorstring') + if error: + raise ExtractorError(error, expected=True) + raise + # Stream URL instead of JSON, example: npo:LI_NL1_4188102 + if isinstance(stream_info, compat_str): + if not stream_info.startswith('http'): continue - streams = format_info.get('streams') - if streams: - try: - video_info = self._download_json( - streams[0] + '&type=json', - video_id, 'Downloading %s stream JSON' % format_id) - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: - error = (self._parse_json(ee.cause.read().decode(), video_id, fatal=False) or {}).get('errorstring') - if error: - raise ExtractorError(error, expected=True) - raise - else: - video_info = format_info - video_url = video_info.get('url') - if not video_url: + video_url = stream_info + # JSON + else: + video_url = stream_info.get('url') + if not video_url or video_url in urls: + continue + urls.add(item_url) + if determine_ext(video_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + else: + add_format_url(video_url) + + is_live = metadata.get('medium') == 'live' + + if not is_live: + for num, stream in enumerate(metadata.get('streams', [])): + stream_url = stream.get('url') + if not stream_url or stream_url in urls: continue - if format_id == 'adaptive': - formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4')) - else: + urls.add(stream_url) + # smooth streaming is not supported + stream_type = stream.get('type', '').lower() + if stream_type in ['ss', 'ms']: + continue + if stream_type == 'hds': + f4m_formats = self._extract_f4m_formats( + stream_url, video_id, fatal=False) + # f4m downloader downloads only piece of live stream + for f4m_format in f4m_formats: + f4m_format['preference'] = -1 + formats.extend(f4m_formats) + elif stream_type == 'hls': + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, ext='mp4', fatal=False)) + # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706 + elif '.asf' in stream_url: + asx = self._download_xml( + stream_url, video_id, + 'Downloading stream %d ASX playlist' % num, + transform_source=fix_xml_ampersands, fatal=False) + if not asx: + continue + ref = asx.find('./ENTRY/Ref') + if ref is None: + continue + video_url = ref.get('href') + if not video_url or video_url in urls: + continue + urls.add(video_url) formats.append({ 'url': video_url, - 'format_id': format_id, - 'quality': quality(format_id), + 'ext': stream.get('formaat', 'asf'), + 'quality': stream.get('kwaliteit'), + 'preference': -10, }) - - streams = metadata.get('streams') - if streams: - for i, stream in enumerate(streams): - stream_url = stream.get('url') - if not stream_url: - continue - if '.asf' not in stream_url: + else: formats.append({ 'url': stream_url, 'quality': stream.get('kwaliteit'), }) - continue - asx = self._download_xml( - stream_url, video_id, - 'Downloading stream %d ASX playlist' % i, - transform_source=fix_xml_ampersands) - ref = asx.find('./ENTRY/Ref') - if ref is None: - continue - video_url = ref.get('href') - if not video_url: - continue - formats.append({ - 'url': video_url, - 'ext': stream.get('formaat', 'asf'), - 'quality': stream.get('kwaliteit'), - }) self._sort_formats(formats) @@ -259,28 +300,28 @@ class NPOIE(NPOBaseIE): return { 'id': video_id, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'description': metadata.get('info'), 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], 'upload_date': unified_strdate(metadata.get('gidsdatum')), 'duration': parse_duration(metadata.get('tijdsduur')), 'formats': formats, 'subtitles': subtitles, + 'is_live': is_live, } class NPOLiveIE(NPOBaseIE): IE_NAME = 'npo.nl:live' - _VALID_URL = r'https?://(?:www\.)?npo\.nl/live/(?P<id>.+)' + _VALID_URL = r'https?://(?:www\.)?npo\.nl/live/(?P<id>[^/?#&]+)' _TEST = { 'url': 'http://www.npo.nl/live/npo-1', 'info_dict': { - 'id': 'LI_NEDERLAND1_136692', + 'id': 'LI_NL1_4188102', 'display_id': 'npo-1', 'ext': 'mp4', - 'title': 're:^Nederland 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'Livestream', + 'title': 're:^NPO 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, }, 'params': { @@ -296,58 +337,12 @@ class NPOLiveIE(NPOBaseIE): live_id = self._search_regex( r'data-prid="([^"]+)"', webpage, 'live id') - metadata = self._download_json( - 'http://e.omroep.nl/metadata/%s' % live_id, - display_id, transform_source=strip_jsonp) - - token = self._get_token(display_id) - - formats = [] - - streams = metadata.get('streams') - if streams: - for stream in streams: - stream_type = stream.get('type').lower() - # smooth streaming is not supported - if stream_type in ['ss', 'ms']: - continue - stream_info = self._download_json( - 'http://ida.omroep.nl/aapi/?stream=%s&token=%s&type=jsonp' - % (stream.get('url'), token), - display_id, 'Downloading %s JSON' % stream_type) - if stream_info.get('error_code', 0) or stream_info.get('errorcode', 0): - continue - stream_url = self._download_json( - stream_info['stream'], display_id, - 'Downloading %s URL' % stream_type, - 'Unable to download %s URL' % stream_type, - transform_source=strip_jsonp, fatal=False) - if not stream_url: - continue - if stream_type == 'hds': - f4m_formats = self._extract_f4m_formats(stream_url, display_id) - # f4m downloader downloads only piece of live stream - for f4m_format in f4m_formats: - f4m_format['preference'] = -1 - formats.extend(f4m_formats) - elif stream_type == 'hls': - formats.extend(self._extract_m3u8_formats(stream_url, display_id, 'mp4')) - else: - formats.append({ - 'url': stream_url, - 'preference': -10, - }) - - self._sort_formats(formats) - return { + '_type': 'url_transparent', + 'url': 'npo:%s' % live_id, + 'ie_key': NPOIE.ie_key(), 'id': live_id, 'display_id': display_id, - 'title': self._live_title(metadata['titel']), - 'description': metadata['info'], - 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], - 'formats': formats, - 'is_live': True, } diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index fc7ff43a6..58ffde541 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -75,22 +75,51 @@ class OpenloadIE(InfoExtractor): '<span[^>]+id="[^"]+"[^>]*>([0-9A-Za-z]+)</span>', webpage, 'openload ID') - first_char = int(ol_id[0]) - urlcode = [] - num = 1 - - while num < len(ol_id): - i = ord(ol_id[num]) - key = 0 - if i <= 90: - key = i - 65 - elif i >= 97: - key = 25 + i - 97 - urlcode.append((key, compat_chr(int(ol_id[num + 2:num + 5]) // int(ol_id[num + 1]) - first_char))) - num += 5 - - video_url = 'https://openload.co/stream/' + ''.join( - [value for _, value in sorted(urlcode, key=lambda x: x[0])]) + video_url_chars = [] + + first_char = ord(ol_id[0]) + key = first_char - 55 + maxKey = max(2, key) + key = min(maxKey, len(ol_id) - 38) + t = ol_id[key:key + 36] + + hashMap = {} + v = ol_id.replace(t, '') + h = 0 + + while h < len(t): + f = t[h:h + 3] + i = int(f, 8) + hashMap[h / 3] = i + h += 3 + + h = 0 + H = 0 + while h < len(v): + B = '' + C = '' + if len(v) >= h + 2: + B = v[h:h + 2] + if len(v) >= h + 3: + C = v[h:h + 3] + i = int(B, 16) + h += 2 + if H % 3 == 0: + i = int(C, 8) + h += 1 + elif H % 2 == 0 and H != 0 and ord(v[H - 1]) < 60: + i = int(C, 10) + h += 1 + index = H % 7 + + A = hashMap[index] + i ^= 213 + i ^= A + video_url_chars.append(compat_chr(i)) + H += 1 + + video_url = 'https://openload.co/stream/%s?mime=true' + video_url = video_url % (''.join(video_url_chars)) title = self._og_search_title(webpage, default=None) or self._search_regex( r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage, diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index e0cbd045e..e45d9fe55 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -40,7 +40,7 @@ class PluralsightIE(PluralsightBaseIE): 'info_dict': { 'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04', 'ext': 'mp4', - 'title': 'Management of SQL Server - Demo Monitoring', + 'title': 'Demo Monitoring', 'duration': 338, }, 'skip': 'Requires pluralsight account credentials', @@ -169,11 +169,10 @@ class PluralsightIE(PluralsightBaseIE): collection = course['modules'] - module, clip = None, None + clip = None for module_ in collection: if name in (module_.get('moduleName'), module_.get('name')): - module = module_ for clip_ in module_.get('clips', []): clip_index = clip_.get('clipIndex') if clip_index is None: @@ -187,7 +186,7 @@ class PluralsightIE(PluralsightBaseIE): if not clip: raise ExtractorError('Unable to resolve clip') - title = '%s - %s' % (module['title'], clip['title']) + title = clip['title'] QUALITIES = { 'low': {'width': 640, 'height': 480}, diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 9b413590a..b25f1f193 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -1,7 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import itertools +import operator # import os import re @@ -18,6 +20,7 @@ from ..utils import ( js_to_json, orderedSet, # sanitized_Request, + remove_quotes, str_to_int, ) # from ..aes import ( @@ -129,9 +132,32 @@ class PornHubIE(InfoExtractor): tv_webpage = dl_webpage('tv') - video_url = self._search_regex( - r'<video[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//.+?)\1', tv_webpage, - 'video url', group='url') + assignments = self._search_regex( + r'(var.+?mediastring.+?)</script>', tv_webpage, + 'encoded url').split(';') + + js_vars = {} + + def parse_js_value(inp): + inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) + if '+' in inp: + inps = inp.split('+') + return functools.reduce( + operator.concat, map(parse_js_value, inps)) + inp = inp.strip() + if inp in js_vars: + return js_vars[inp] + return remove_quotes(inp) + + for assn in assignments: + assn = assn.strip() + if not assn: + continue + assn = re.sub(r'var\s+', '', assn) + vname, value = assn.split('=', 1) + js_vars[vname] = parse_js_value(value) + + video_url = js_vars['mediastring'] title = self._search_regex( r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 1245309a7..d8a4bd244 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -301,6 +301,21 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): }, }, { + # title in <h2 class="subtitle"> + 'url': 'http://www.prosieben.de/stars/oscar-award/videos/jetzt-erst-enthuellt-das-geheimnis-von-emma-stones-oscar-robe-clip', + 'info_dict': { + 'id': '4895826', + 'ext': 'mp4', + 'title': 'Jetzt erst enthüllt: Das Geheimnis von Emma Stones Oscar-Robe', + 'description': 'md5:e5ace2bc43fadf7b63adc6187e9450b9', + 'upload_date': '20170302', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'geo restricted to Germany', + }, + { # geo restricted to Germany 'url': 'http://www.kabeleinsdoku.de/tv/mayday-alarm-im-cockpit/video/102-notlandung-im-hudson-river-ganze-folge', 'only_matching': True, @@ -338,6 +353,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>', r'<h2 class="video-title" itemprop="name">\s*(.+?)</h2>', r'<div[^>]+id="veeseoTitle"[^>]*>(.+?)</div>', + r'<h2[^>]+class="subtitle"[^>]*>([^<]+)</h2>', ] _DESCRIPTION_REGEXES = [ r'<p itemprop="description">\s*(.+?)</p>', @@ -369,7 +385,9 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): def _extract_clip(self, url, webpage): clip_id = self._html_search_regex( self._CLIPID_REGEXES, webpage, 'clip id') - title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title') + title = self._html_search_regex( + self._TITLE_REGEXES, webpage, 'title', + default=None) or self._og_search_title(webpage) info = self._extract_video_info(url, clip_id) description = self._html_search_regex( self._DESCRIPTION_REGEXES, webpage, 'description', default=None) diff --git a/youtube_dl/extractor/redbulltv.py b/youtube_dl/extractor/redbulltv.py new file mode 100644 index 000000000..afab62426 --- /dev/null +++ b/youtube_dl/extractor/redbulltv.py @@ -0,0 +1,122 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + float_or_none, + int_or_none, + try_get, + # unified_timestamp, + ExtractorError, +) + + +class RedBullTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?redbull\.tv/(?:video|film)/(?P<id>AP-\w+)' + _TESTS = [{ + # film + 'url': 'https://www.redbull.tv/video/AP-1Q756YYX51W11/abc-of-wrc', + 'md5': 'fb0445b98aa4394e504b413d98031d1f', + 'info_dict': { + 'id': 'AP-1Q756YYX51W11', + 'ext': 'mp4', + 'title': 'ABC of...WRC', + 'description': 'md5:5c7ed8f4015c8492ecf64b6ab31e7d31', + 'duration': 1582.04, + # 'timestamp': 1488405786, + # 'upload_date': '20170301', + }, + }, { + # episode + 'url': 'https://www.redbull.tv/video/AP-1PMT5JCWH1W11/grime?playlist=shows:shows-playall:web', + 'info_dict': { + 'id': 'AP-1PMT5JCWH1W11', + 'ext': 'mp4', + 'title': 'Grime - Hashtags S2 E4', + 'description': 'md5:334b741c8c1ce65be057eab6773c1cf5', + 'duration': 904.6, + # 'timestamp': 1487290093, + # 'upload_date': '20170217', + 'series': 'Hashtags', + 'season_number': 2, + 'episode_number': 4, + }, + }, { + 'url': 'https://www.redbull.tv/film/AP-1MSKKF5T92111/in-motion', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + session = self._download_json( + 'https://api-v2.redbull.tv/session', video_id, + note='Downloading access token', query={ + 'build': '4.370.0', + 'category': 'personal_computer', + 'os_version': '1.0', + 'os_family': 'http', + }) + if session.get('code') == 'error': + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, session['message'])) + auth = '%s %s' % (session.get('token_type', 'Bearer'), session['access_token']) + + try: + info = self._download_json( + 'https://api-v2.redbull.tv/content/%s' % video_id, + video_id, note='Downloading video information', + headers={'Authorization': auth} + ) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + error_message = self._parse_json( + e.cause.read().decode(), video_id)['message'] + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, error_message), expected=True) + raise + + video = info['video_product'] + + title = info['title'].strip() + + formats = self._extract_m3u8_formats( + video['url'], video_id, 'mp4', 'm3u8_native') + self._sort_formats(formats) + + subtitles = {} + for _, captions in (try_get( + video, lambda x: x['attachments']['captions'], + dict) or {}).items(): + if not captions or not isinstance(captions, list): + continue + for caption in captions: + caption_url = caption.get('url') + if not caption_url: + continue + ext = caption.get('format') + if ext == 'xml': + ext = 'ttml' + subtitles.setdefault(caption.get('lang') or 'en', []).append({ + 'url': caption_url, + 'ext': ext, + }) + + subheading = info.get('subheading') + if subheading: + title += ' - %s' % subheading + + return { + 'id': video_id, + 'title': title, + 'description': info.get('long_description') or info.get( + 'short_description'), + 'duration': float_or_none(video.get('duration'), scale=1000), + # 'timestamp': unified_timestamp(info.get('published')), + 'series': info.get('show_title'), + 'season_number': int_or_none(info.get('season_number')), + 'episode_number': int_or_none(info.get('episode_number')), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index fd1df925b..889fa7628 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -17,7 +17,7 @@ from ..utils import ( class RutubeIE(InfoExtractor): IE_NAME = 'rutube' IE_DESC = 'Rutube videos' - _VALID_URL = r'https?://rutube\.ru/(?:video|play/embed)/(?P<id>[\da-z]{32})' + _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})' _TESTS = [{ 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', @@ -39,8 +39,17 @@ class RutubeIE(InfoExtractor): }, { 'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', 'only_matching': True, + }, { + 'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661', + 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return [mobj.group('url') for mobj in re.finditer( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/embed/[\da-z]{32}.*?)\1', + webpage)] + def _real_extract(self, url): video_id = self._match_id(url) video = self._download_json( diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py index 20d01754a..6c09df25a 100644 --- a/youtube_dl/extractor/ruutu.py +++ b/youtube_dl/extractor/ruutu.py @@ -82,6 +82,9 @@ class RuutuIE(InfoExtractor): formats.extend(self._extract_f4m_formats( video_url, video_id, f4m_id='hds', fatal=False)) elif ext == 'mpd': + # video-only and audio-only streams are of different + # duration resulting in out of sync issue + continue formats.extend(self._extract_mpd_formats( video_url, video_id, mpd_id='dash', fatal=False)) else: diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py index 387a4f7f6..db5ef8b57 100644 --- a/youtube_dl/extractor/senateisvp.py +++ b/youtube_dl/extractor/senateisvp.py @@ -89,7 +89,7 @@ class SenateISVPIE(InfoExtractor): @staticmethod def _search_iframe_url(webpage): mobj = re.search( - r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]", + r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]", webpage) if mobj: return mobj.group('url') diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index b3aa4ce26..0ee4a8ff8 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -121,7 +121,7 @@ class SoundcloudIE(InfoExtractor): }, ] - _CLIENT_ID = 'fDoItMDbsbZz8dY16ZzARCZmzgHBPotA' + _CLIENT_ID = '2t9loNQH90kzJcsFCODdigxfp325aq4z' _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf' @staticmethod diff --git a/youtube_dl/extractor/streamable.py b/youtube_dl/extractor/streamable.py index e973c867c..9f5c237ef 100644 --- a/youtube_dl/extractor/streamable.py +++ b/youtube_dl/extractor/streamable.py @@ -65,7 +65,7 @@ class StreamableIE(InfoExtractor): # to return video info like the title properly sometimes, and doesn't # include info like the video duration video = self._download_json( - 'https://streamable.com/ajax/videos/%s' % video_id, video_id) + 'https://ajax.streamable.com/videos/%s' % video_id, video_id) # Format IDs: # 0 The video is being uploaded diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index d5abfc9e4..fdcc7d573 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -44,6 +44,10 @@ class TelecincoIE(MiTeleBaseIE): }, { 'url': 'http://www.telecinco.es/espanasinirmaslejos/Espana-gran-destino-turistico_2_1240605043.html', 'only_matching': True, + }, { + # ooyala video + 'url': 'http://www.cuatro.com/chesterinlove/a-carta/chester-chester_in_love-chester_edu_2_2331030022.html', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/telequebec.py b/youtube_dl/extractor/telequebec.py index 82d73c31d..fafaa826f 100644 --- a/youtube_dl/extractor/telequebec.py +++ b/youtube_dl/extractor/telequebec.py @@ -2,15 +2,17 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( int_or_none, smuggle_url, + try_get, ) class TeleQuebecIE(InfoExtractor): _VALID_URL = r'https?://zonevideo\.telequebec\.tv/media/(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://zonevideo.telequebec.tv/media/20984/le-couronnement-de-new-york/couronnement-de-new-york', 'md5': 'fe95a0957e5707b1b01f5013e725c90f', 'info_dict': { @@ -18,10 +20,14 @@ class TeleQuebecIE(InfoExtractor): 'ext': 'mp4', 'title': 'Le couronnement de New York', 'description': 'md5:f5b3d27a689ec6c1486132b2d687d432', - 'upload_date': '20160220', - 'timestamp': 1455965438, + 'upload_date': '20170201', + 'timestamp': 1485972222, } - } + }, { + # no description + 'url': 'http://zonevideo.telequebec.tv/media/30261', + 'only_matching': True, + }] def _real_extract(self, url): media_id = self._match_id(url) @@ -31,9 +37,13 @@ class TeleQuebecIE(InfoExtractor): return { '_type': 'url_transparent', 'id': media_id, - 'url': smuggle_url('limelight:media:' + media_data['streamInfo']['sourceId'], {'geo_countries': ['CA']}), + 'url': smuggle_url( + 'limelight:media:' + media_data['streamInfo']['sourceId'], + {'geo_countries': ['CA']}), 'title': media_data['title'], - 'description': media_data.get('descriptions', [{'text': None}])[0].get('text'), - 'duration': int_or_none(media_data.get('durationInMilliseconds'), 1000), + 'description': try_get( + media_data, lambda x: x['descriptions'][0]['text'], compat_str), + 'duration': int_or_none( + media_data.get('durationInMilliseconds'), 1000), 'ie_key': 'LimelightMedia', } diff --git a/youtube_dl/extractor/toongoggles.py b/youtube_dl/extractor/toongoggles.py new file mode 100644 index 000000000..b5ba1c01d --- /dev/null +++ b/youtube_dl/extractor/toongoggles.py @@ -0,0 +1,81 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, +) + + +class ToonGogglesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?toongoggles\.com/shows/(?P<show_id>\d+)(?:/[^/]+/episodes/(?P<episode_id>\d+))?' + _TESTS = [{ + 'url': 'http://www.toongoggles.com/shows/217143/bernard-season-2/episodes/217147/football', + 'md5': '18289fc2b951eff6b953a9d8f01e6831', + 'info_dict': { + 'id': '217147', + 'ext': 'mp4', + 'title': 'Football', + 'uploader_id': '1', + 'description': 'Bernard decides to play football in order to be better than Lloyd and tries to beat him no matter how, he even cheats.', + 'upload_date': '20160718', + 'timestamp': 1468879330, + } + }, { + 'url': 'http://www.toongoggles.com/shows/227759/om-nom-stories-around-the-world', + 'info_dict': { + 'id': '227759', + 'title': 'Om Nom Stories Around The World', + }, + 'playlist_mincount': 11, + }] + + def _call_api(self, action, page_id, query): + query.update({ + 'for_ng': 1, + 'for_web': 1, + 'show_meta': 1, + 'version': 7.0, + }) + return self._download_json('http://api.toongoggles.com/' + action, page_id, query=query) + + def _parse_episode_data(self, episode_data): + title = episode_data['episode_name'] + + return { + '_type': 'url_transparent', + 'id': episode_data['episode_id'], + 'title': title, + 'url': 'kaltura:513551:' + episode_data['entry_id'], + 'thumbnail': episode_data.get('thumbnail_url'), + 'description': episode_data.get('description'), + 'duration': parse_duration(episode_data.get('hms')), + 'series': episode_data.get('show_name'), + 'season_number': int_or_none(episode_data.get('season_num')), + 'episode_id': episode_data.get('episode_id'), + 'episode': title, + 'episode_number': int_or_none(episode_data.get('episode_num')), + 'categories': episode_data.get('categories'), + 'ie_key': 'Kaltura', + } + + def _real_extract(self, url): + show_id, episode_id = re.match(self._VALID_URL, url).groups() + if episode_id: + episode_data = self._call_api('search', episode_id, { + 'filter': 'episode', + 'id': episode_id, + })['objects'][0] + return self._parse_episode_data(episode_data) + else: + show_data = self._call_api('getepisodesbyshow', show_id, { + 'max': 1000000000, + 'showid': show_id, + }) + entries = [] + for episode_data in show_data.get('objects', []): + entries.append(self._parse_episode_data(episode_data)) + return self.playlist_result(entries, show_id, show_data.get('show_name')) diff --git a/youtube_dl/extractor/tunepk.py b/youtube_dl/extractor/tunepk.py new file mode 100644 index 000000000..9d42651ce --- /dev/null +++ b/youtube_dl/extractor/tunepk.py @@ -0,0 +1,90 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + try_get, + unified_timestamp, +) + + +class TunePkIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?tune\.pk/(?:video/|player/embed_player.php?.*?\bvid=)| + embed\.tune\.pk/play/ + ) + (?P<id>\d+) + ''' + _TESTS = [{ + 'url': 'https://tune.pk/video/6919541/maudie-2017-international-trailer-1-ft-ethan-hawke-sally-hawkins', + 'md5': '0c537163b7f6f97da3c5dd1e3ef6dd55', + 'info_dict': { + 'id': '6919541', + 'ext': 'mp4', + 'title': 'Maudie (2017) | International Trailer # 1 ft Ethan Hawke, Sally Hawkins', + 'description': 'md5:eb5a04114fafef5cec90799a93a2d09c', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1487327564, + 'upload_date': '20170217', + 'uploader': 'Movie Trailers', + 'duration': 107, + 'view_count': int, + } + }, { + 'url': 'https://tune.pk/player/embed_player.php?vid=6919541&folder=2017/02/17/&width=600&height=350&autoplay=no', + 'only_matching': True, + }, { + 'url': 'https://embed.tune.pk/play/6919541?autoplay=no&ssl=yes&inline=true', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://tune.pk/video/%s' % video_id, video_id) + + details = self._parse_json( + self._search_regex( + r'new\s+TunePlayer\(({.+?})\)\s*;\s*\n', webpage, 'tune player'), + video_id)['details'] + + video = details['video'] + title = video.get('title') or self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'title', webpage, 'title', fatal=True) + + formats = self._parse_jwplayer_formats( + details['player']['sources'], video_id) + self._sort_formats(formats) + + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'description', webpage, 'description') + + thumbnail = video.get('thumb') or self._og_search_thumbnail( + webpage, default=None) or self._html_search_meta( + 'thumbnail', webpage, 'thumbnail') + + timestamp = unified_timestamp(video.get('date_added')) + uploader = try_get( + video, lambda x: x['uploader']['name'], + compat_str) or self._html_search_meta('author', webpage, 'author') + + duration = int_or_none(video.get('duration')) + view_count = int_or_none(video.get('views')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'uploader': uploader, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index f3541b654..7af11659f 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( parse_iso8601, @@ -12,7 +14,7 @@ from ..utils import ( class TwentyFourVideoIE(InfoExtractor): IE_NAME = '24video' - _VALID_URL = r'https?://(?:www\.)?24video\.(?:net|me|xxx|sex|tube)/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P<id>\d+)' + _VALID_URL = r'https?://(?P<host>(?:www\.)?24video\.(?:net|me|xxx|sex|tube))/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.24video.net/video/view/1044982', @@ -43,10 +45,12 @@ class TwentyFourVideoIE(InfoExtractor): }] def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + host = mobj.group('host') webpage = self._download_webpage( - 'http://www.24video.sex/video/view/%s' % video_id, video_id) + 'http://%s/video/view/%s' % (host, video_id), video_id) title = self._og_search_title(webpage) description = self._html_search_regex( @@ -72,11 +76,11 @@ class TwentyFourVideoIE(InfoExtractor): # Sets some cookies self._download_xml( - r'http://www.24video.sex/video/xml/%s?mode=init' % video_id, + r'http://%s/video/xml/%s?mode=init' % (host, video_id), video_id, 'Downloading init XML') video_xml = self._download_xml( - 'http://www.24video.sex/video/xml/%s?mode=play' % video_id, + 'http://%s/video/xml/%s?mode=play' % (host, video_id), video_id, 'Downloading video XML') video = xpath_element(video_xml, './/video', 'video', fatal=True) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index bbba394b0..2daf9dfac 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -12,7 +12,6 @@ from ..compat import ( compat_str, compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, - compat_urlparse, ) from ..utils import ( clean_html, @@ -24,6 +23,7 @@ from ..utils import ( parse_iso8601, update_url_query, urlencode_postdata, + urljoin, ) @@ -32,7 +32,7 @@ class TwitchBaseIE(InfoExtractor): _API_BASE = 'https://api.twitch.tv' _USHER_BASE = 'https://usher.ttvnw.net' - _LOGIN_URL = 'http://www.twitch.tv/login' + _LOGIN_URL = 'https://www.twitch.tv/login' _CLIENT_ID = 'jzkbprff40iqj646a697cyrvl0zt2m6' _NETRC_MACHINE = 'twitch' @@ -64,6 +64,35 @@ class TwitchBaseIE(InfoExtractor): raise ExtractorError( 'Unable to login. Twitch said: %s' % message, expected=True) + def login_step(page, urlh, note, data): + form = self._hidden_inputs(page) + form.update(data) + + page_url = urlh.geturl() + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', page, + 'post url', default=page_url, group='url') + post_url = urljoin(page_url, post_url) + + headers = {'Referer': page_url} + + try: + response = self._download_json( + post_url, None, note, + data=urlencode_postdata(form), + headers=headers) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + response = self._parse_json( + e.cause.read().decode('utf-8'), None) + fail(response['message']) + raise + + redirect_url = urljoin(post_url, response['redirect']) + return self._download_webpage_handle( + redirect_url, None, 'Downloading login redirect page', + headers=headers) + login_page, handle = self._download_webpage_handle( self._LOGIN_URL, None, 'Downloading login page') @@ -71,40 +100,19 @@ class TwitchBaseIE(InfoExtractor): if 'blacklist_message' in login_page: fail(clean_html(login_page)) - login_form = self._hidden_inputs(login_page) - - login_form.update({ - 'username': username, - 'password': password, - }) - - redirect_url = handle.geturl() - - post_url = self._search_regex( - r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, - 'post url', default=redirect_url, group='url') - - if not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(redirect_url, post_url) - - headers = {'Referer': redirect_url} + redirect_page, handle = login_step( + login_page, handle, 'Logging in as %s' % username, { + 'username': username, + 'password': password, + }) - try: - response = self._download_json( - post_url, None, 'Logging in as %s' % username, - data=urlencode_postdata(login_form), - headers=headers) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: - response = self._parse_json( - e.cause.read().decode('utf-8'), None) - fail(response['message']) - raise - - if response.get('redirect'): - self._download_webpage( - response['redirect'], None, 'Downloading login redirect page', - headers=headers) + if re.search(r'(?i)<form[^>]+id="two-factor-submit"', redirect_page) is not None: + # TODO: Add mechanism to request an SMS or phone call + tfa_token = self._get_tfa_info('two-factor authentication token') + login_step(redirect_page, handle, 'Submitting TFA token', { + 'authy_token': tfa_token, + 'remember_2fa': 'true', + }) def _prefer_source(self, formats): try: diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py index d26fb49b3..5086f591e 100644 --- a/youtube_dl/extractor/vier.py +++ b/youtube_dl/extractor/vier.py @@ -9,7 +9,7 @@ from .common import InfoExtractor class VierIE(InfoExtractor): IE_NAME = 'vier' - _VALID_URL = r'https?://(?:www\.)?vier\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))' + _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))' _TESTS = [{ 'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129', 'info_dict': { @@ -24,6 +24,19 @@ class VierIE(InfoExtractor): 'skip_download': True, }, }, { + 'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614', + 'info_dict': { + 'id': '2561614', + 'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas', + 'ext': 'mp4', + 'title': 'ZO grappig: Temptation Island hosts moeten kiezen tussen onmogelijke dilemma\'s', + 'description': 'Het spel is simpel: Annelien Coorevits en Rick Brandsteder krijgen telkens 2 dilemma\'s voorgeschoteld en ze MOETEN een keuze maken.', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { 'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen', 'only_matching': True, }, { @@ -35,6 +48,7 @@ class VierIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) embed_id = mobj.group('embed_id') display_id = mobj.group('display_id') or embed_id + site = mobj.group('site') webpage = self._download_webpage(url, display_id) @@ -43,7 +57,7 @@ class VierIE(InfoExtractor): webpage, 'video id') application = self._search_regex( [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'], - webpage, 'application', default='vier_vod') + webpage, 'application', default=site + '_vod') filename = self._search_regex( [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'], webpage, 'filename') @@ -68,7 +82,7 @@ class VierIE(InfoExtractor): class VierVideosIE(InfoExtractor): IE_NAME = 'vier:videos' - _VALID_URL = r'https?://(?:www\.)?vier\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)' + _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)' _TESTS = [{ 'url': 'http://www.vier.be/demoestuin/videos', 'info_dict': { @@ -76,6 +90,12 @@ class VierVideosIE(InfoExtractor): }, 'playlist_mincount': 153, }, { + 'url': 'http://www.vijf.be/temptationisland/videos', + 'info_dict': { + 'id': 'temptationisland', + }, + 'playlist_mincount': 159, + }, { 'url': 'http://www.vier.be/demoestuin/videos?page=6', 'info_dict': { 'id': 'demoestuin-page6', @@ -92,6 +112,7 @@ class VierVideosIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) program = mobj.group('program') + site = mobj.group('site') page_id = mobj.group('page') if page_id: @@ -105,13 +126,13 @@ class VierVideosIE(InfoExtractor): entries = [] for current_page_id in itertools.count(start_page): current_page = self._download_webpage( - 'http://www.vier.be/%s/videos?page=%d' % (program, current_page_id), + 'http://www.%s.be/%s/videos?page=%d' % (site, program, current_page_id), program, 'Downloading page %d' % (current_page_id + 1)) page_entries = [ - self.url_result('http://www.vier.be' + video_url, 'Vier') + self.url_result('http://www.' + site + '.be' + video_url, 'Vier') for video_url in re.findall( - r'<h3><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)] + r'<h[23]><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)] entries.extend(page_entries) if page_id or '>Meer<' not in current_page: break diff --git a/youtube_dl/extractor/viu.py b/youtube_dl/extractor/viu.py index 3fd889c8e..db6a65d2e 100644 --- a/youtube_dl/extractor/viu.py +++ b/youtube_dl/extractor/viu.py @@ -44,7 +44,7 @@ class ViuBaseIE(InfoExtractor): class ViuIE(ViuBaseIE): - _VALID_URL = r'(?:viu:|https?://www\.viu\.com/[a-z]{2}/media/)(?P<id>\d+)' + _VALID_URL = r'(?:viu:|https?://[^/]+\.viu\.com/[a-z]{2}/media/)(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.viu.com/en/media/1116705532?containerId=playlist-22168059', 'info_dict': { @@ -69,6 +69,9 @@ class ViuIE(ViuBaseIE): 'skip_download': 'm3u8 download', }, 'skip': 'Geo-restricted to Indonesia', + }, { + 'url': 'https://india.viu.com/en/media/1126286865', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 7c42a4f54..dc2719cf9 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -432,8 +432,7 @@ class VKIE(VKBaseIE): }) elif format_id == 'hls': formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8' if is_live else 'm3u8_native', + format_url, video_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False, live=is_live)) elif format_id == 'rtmp': formats.append({ diff --git a/youtube_dl/extractor/vrak.py b/youtube_dl/extractor/vrak.py new file mode 100644 index 000000000..daa247cce --- /dev/null +++ b/youtube_dl/extractor/vrak.py @@ -0,0 +1,80 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveNewIE +from ..utils import ( + int_or_none, + parse_age_limit, + smuggle_url, + unescapeHTML, +) + + +class VrakIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vrak\.tv/videos\?.*?\btarget=(?P<id>[\d.]+)' + _TEST = { + 'url': 'http://www.vrak.tv/videos?target=1.2306782&filtre=emission&id=1.1806721', + 'info_dict': { + 'id': '5345661243001', + 'ext': 'mp4', + 'title': 'Obésité, film de hockey et Roseline Filion', + 'timestamp': 1488492126, + 'upload_date': '20170302', + 'uploader_id': '2890187628001', + 'creator': 'VRAK.TV', + 'age_limit': 8, + 'series': 'ALT (Actualité Légèrement Tordue)', + 'episode': 'Obésité, film de hockey et Roseline Filion', + 'tags': list, + }, + 'params': { + 'skip_download': True, + }, + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/2890187628001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex( + r'<h\d\b[^>]+\bclass=["\']videoTitle["\'][^>]*>([^<]+)', + webpage, 'title', default=None) or self._og_search_title(webpage) + + content = self._parse_json( + self._search_regex( + r'data-player-options-content=(["\'])(?P<content>{.+?})\1', + webpage, 'content', default='{}', group='content'), + video_id, transform_source=unescapeHTML) + + ref_id = content.get('refId') or self._search_regex( + r'refId":"([^&]+)"', webpage, 'ref id') + + brightcove_id = self._search_regex( + r'''(?x) + java\.lang\.String\s+value\s*=\s*["']brightcove\.article\.\d+\.%s + [^>]* + java\.lang\.String\s+value\s*=\s*["'](\d+) + ''' % re.escape(ref_id), webpage, 'brightcove id') + + return { + '_type': 'url_transparent', + 'ie_key': BrightcoveNewIE.ie_key(), + 'url': smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': ['CA']}), + 'id': brightcove_id, + 'description': content.get('description'), + 'creator': content.get('brand'), + 'age_limit': parse_age_limit(content.get('rating')), + 'series': content.get('showName') or content.get( + 'episodeName'), # this is intentional + 'season_number': int_or_none(content.get('seasonNumber')), + 'episode': title, + 'episode_number': int_or_none(content.get('episodeNumber')), + 'tags': content.get('tags', []), + } diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index f7e6360a3..8bb7362bb 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -19,9 +19,10 @@ class WDRBaseIE(InfoExtractor): def _extract_wdr_video(self, webpage, display_id): # for wdr.de the data-extension is in a tag with the class "mediaLink" # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" - # for wdrmaus its in a link to the page in a multiline "videoLink"-tag + # for wdrmaus, in a tag with the class "videoButton" (previously a link + # to the page in a multiline "videoLink"-tag) json_metadata = self._html_search_regex( - r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"', + r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn|videoButton)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"', webpage, 'media link', default=None, flags=re.MULTILINE) if not json_metadata: @@ -32,7 +33,7 @@ class WDRBaseIE(InfoExtractor): jsonp_url = media_link_obj['mediaObj']['url'] metadata = self._download_json( - jsonp_url, 'metadata', transform_source=strip_jsonp) + jsonp_url, display_id, transform_source=strip_jsonp) metadata_tracker_data = metadata['trackerData'] metadata_media_resource = metadata['mediaResource'] @@ -161,23 +162,23 @@ class WDRIE(WDRBaseIE): { 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5', 'info_dict': { - 'id': 'mdb-1096487', - 'ext': 'flv', + 'id': 'mdb-1323501', + 'ext': 'mp4', 'upload_date': 're:^[0-9]{8}$', 'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$', - 'description': '- Die Sendung mit der Maus -', + 'description': 'Die Seite mit der Maus -', }, 'skip': 'The id changes from week to week because of the new episode' }, { - 'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/achterbahn.php5', + 'url': 'http://www.wdrmaus.de/filme/sachgeschichten/achterbahn.php5', 'md5': '803138901f6368ee497b4d195bb164f2', 'info_dict': { 'id': 'mdb-186083', 'ext': 'mp4', 'upload_date': '20130919', 'title': 'Sachgeschichte - Achterbahn ', - 'description': '- Die Sendung mit der Maus -', + 'description': 'Die Seite mit der Maus -', }, }, { @@ -186,7 +187,7 @@ class WDRIE(WDRBaseIE): 'info_dict': { 'id': 'mdb-869971', 'ext': 'flv', - 'title': 'Funkhaus Europa Livestream', + 'title': 'COSMO Livestream', 'description': 'md5:2309992a6716c347891c045be50992e4', 'upload_date': '20160101', }, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 81c793921..ca40de522 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -59,6 +59,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False + _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}' + def _set_language(self): self._set_cookie( '.youtube.com', 'PREF', 'f1=50000000&hl=en', @@ -265,9 +267,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ) )? # all until now is optional -> you can pass the naked ID ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID - (?!.*?\blist=) # combined list/video URLs are handled by the playlist IE + (?!.*?\blist= + (?: + %(playlist_id)s| # combined list/video URLs are handled by the playlist IE + WL # WL are handled by the watch later IE + ) + ) (?(1).+)? # if we found the ID, everything can follow - $""" + $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, @@ -924,6 +931,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'sJL6WA-aGkQ', 'only_matching': True, }, + { + 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', + 'only_matching': True, + }, ] def __init__(self, *args, **kwargs): @@ -1454,7 +1465,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Check for "rental" videos if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - raise ExtractorError('"rental" videos not supported') + raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True) # Start extracting information self.report_information_extraction(video_id) @@ -1864,8 +1875,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): ) .* | - ((?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}) - )""" + (%(playlist_id)s) + )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&disable_polymer=true' _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?' IE_NAME = 'youtube:playlist' diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 8b51d3c6f..6b811535f 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -773,7 +773,7 @@ def parseOpts(overrideArguments=None): help='Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)') postproc.add_option( '--audio-format', metavar='FORMAT', dest='audioformat', default='best', - help='Specify audio format: "best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; "%default" by default; No effect without -x') + help='Specify audio format: "best", "aac", "flac", "mp3", "m4a", "opus", "vorbis", or "wav"; "%default" by default; No effect without -x') postproc.add_option( '--audio-quality', metavar='QUALITY', dest='audioquality', default='5', diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 96ddb3b36..7c162d92a 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -26,15 +26,25 @@ from ..utils import ( EXT_TO_OUT_FORMATS = { - "aac": "adts", - "m4a": "ipod", - "mka": "matroska", - "mkv": "matroska", - "mpg": "mpeg", - "ogv": "ogg", - "ts": "mpegts", - "wma": "asf", - "wmv": "asf", + 'aac': 'adts', + 'flac': 'flac', + 'm4a': 'ipod', + 'mka': 'matroska', + 'mkv': 'matroska', + 'mpg': 'mpeg', + 'ogv': 'ogg', + 'ts': 'mpegts', + 'wma': 'asf', + 'wmv': 'asf', +} +ACODECS = { + 'mp3': 'libmp3lame', + 'aac': 'aac', + 'flac': 'flac', + 'm4a': 'aac', + 'opus': 'opus', + 'vorbis': 'libvorbis', + 'wav': None, } @@ -237,7 +247,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): acodec = 'copy' extension = 'm4a' more_opts = ['-bsf:a', 'aac_adtstoasc'] - elif filecodec in ['aac', 'mp3', 'vorbis', 'opus']: + elif filecodec in ['aac', 'flac', 'mp3', 'vorbis', 'opus']: # Lossless if possible acodec = 'copy' extension = filecodec @@ -256,8 +266,8 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): else: more_opts += ['-b:a', self._preferredquality + 'k'] else: - # We convert the audio (lossy) - acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'opus': 'opus', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec] + # We convert the audio (lossy if codec is lossy) + acodec = ACODECS[self._preferredcodec] extension = self._preferredcodec more_opts = [] if self._preferredquality is not None: diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 807183f4a..2340bc306 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -39,6 +39,7 @@ from .compat import ( compat_basestring, compat_chr, compat_etree_fromstring, + compat_expanduser, compat_html_entities, compat_html_entities_html5, compat_http_client, @@ -473,7 +474,8 @@ def timeconvert(timestr): def sanitize_filename(s, restricted=False, is_id=False): """Sanitizes a string so it could be used as part of a filename. If restricted is set, use a stricter subset of allowed characters. - Set is_id if this is not an arbitrary string, but an ID that should be kept if possible + Set is_id if this is not an arbitrary string, but an ID that should be kept + if possible. """ def replace_insane(char): if restricted and char in ACCENT_CHARS: @@ -538,6 +540,11 @@ def sanitized_Request(url, *args, **kwargs): return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs) +def expand_path(s): + """Expand shell variables and ~""" + return os.path.expandvars(compat_expanduser(s)) + + def orderedSet(iterable): """ Remove all duplicates from the input iterable """ res = [] @@ -1747,11 +1754,16 @@ def base_url(url): def urljoin(base, path): + if isinstance(path, bytes): + path = path.decode('utf-8') if not isinstance(path, compat_str) or not path: return None if re.match(r'^(?:https?:)?//', path): return path - if not isinstance(base, compat_str) or not re.match(r'^(?:https?:)?//', base): + if isinstance(base, bytes): + base = base.decode('utf-8') + if not isinstance(base, compat_str) or not re.match( + r'^(?:https?:)?//', base): return None return compat_urlparse.urljoin(base, path) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 261218b80..13904c724 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.02.27' +__version__ = '2017.03.24' |