diff options
author | practicalswift <practicalswift@users.noreply.github.com> | 2021-10-01 13:53:59 +0000 |
---|---|---|
committer | practicalswift <practicalswift@users.noreply.github.com> | 2021-10-01 13:51:10 +0000 |
commit | 4747da3a5b639b5a336b737e7e3cbf060cf2efcf (patch) | |
tree | c375c8eff3a7f4b02f66247b52dfee286e7fcbae | |
parent | e69cbac628bfdca4a8e4ead821190eaf5b6b3d07 (diff) |
Add syscall sandboxing (seccomp-bpf)
-rwxr-xr-x | ci/test/00_setup_env_i686_multiprocess.sh | 1 | ||||
-rw-r--r-- | configure.ac | 37 | ||||
-rw-r--r-- | src/Makefile.am | 2 | ||||
-rw-r--r-- | src/bitcoind.cpp | 2 | ||||
-rw-r--r-- | src/checkqueue.h | 2 | ||||
-rw-r--r-- | src/httpserver.cpp | 3 | ||||
-rw-r--r-- | src/index/base.cpp | 2 | ||||
-rw-r--r-- | src/init.cpp | 36 | ||||
-rw-r--r-- | src/logging.cpp | 1 | ||||
-rw-r--r-- | src/logging.h | 1 | ||||
-rw-r--r-- | src/mapport.cpp | 2 | ||||
-rw-r--r-- | src/net.cpp | 6 | ||||
-rw-r--r-- | src/node/blockstorage.cpp | 2 | ||||
-rw-r--r-- | src/rpc/misc.cpp | 25 | ||||
-rw-r--r-- | src/scheduler.cpp | 2 | ||||
-rw-r--r-- | src/torcontrol.cpp | 2 | ||||
-rw-r--r-- | src/util/syscall_sandbox.cpp | 882 | ||||
-rw-r--r-- | src/util/syscall_sandbox.h | 57 | ||||
-rw-r--r-- | test/config.ini.in | 1 | ||||
-rwxr-xr-x | test/functional/feature_notifications.py | 3 | ||||
-rwxr-xr-x | test/functional/feature_syscall_sandbox.py | 34 | ||||
-rwxr-xr-x | test/functional/feature_versionbits_warning.py | 3 | ||||
-rwxr-xr-x | test/functional/rpc_misc.py | 2 | ||||
-rwxr-xr-x | test/functional/rpc_signer.py | 3 | ||||
-rwxr-xr-x | test/functional/test_framework/test_framework.py | 11 | ||||
-rwxr-xr-x | test/functional/test_runner.py | 1 | ||||
-rwxr-xr-x | test/functional/wallet_signer.py | 3 |
27 files changed, 1125 insertions, 1 deletions
diff --git a/ci/test/00_setup_env_i686_multiprocess.sh b/ci/test/00_setup_env_i686_multiprocess.sh index f7f65f6e3a..a25c98a004 100755 --- a/ci/test/00_setup_env_i686_multiprocess.sh +++ b/ci/test/00_setup_env_i686_multiprocess.sh @@ -14,4 +14,5 @@ export DEP_OPTS="DEBUG=1 MULTIPROCESS=1" export GOAL="install" export BITCOIN_CONFIG="--enable-debug CC='clang -m32' CXX='clang++ -m32' LDFLAGS='--rtlib=compiler-rt -lgcc_s'" export TEST_RUNNER_ENV="BITCOIND=bitcoin-node" +export TEST_RUNNER_EXTRA="--nosandbox" export PIP_PACKAGES="lief" diff --git a/configure.ac b/configure.ac index 0dc480e6c1..a0bf5136eb 100644 --- a/configure.ac +++ b/configure.ac @@ -71,6 +71,12 @@ case $host in ;; esac +AC_ARG_WITH([seccomp], + [AS_HELP_STRING([--with-seccomp], + [enable experimental syscall sandbox feature (-sandbox), default is yes if seccomp-bpf is detected under Linux x86_64])], + [seccomp_found=$withval], + [seccomp_found=auto]) + dnl Require C++17 compiler (no GNU extensions) AX_CXX_COMPILE_STDCXX([17], [noext], [mandatory]) @@ -1443,6 +1449,36 @@ if test "x$use_external_signer" != xno; then fi AM_CONDITIONAL([ENABLE_EXTERNAL_SIGNER], [test "x$use_external_signer" = "xyes"]) +dnl Do not compile with syscall sandbox support when compiling under the sanitizers. +dnl The sanitizers introduce use of syscalls that are not typically used in bitcoind +dnl (such as execve when the sanitizers execute llvm-symbolizer). +if test x$use_sanitizers != x; then + AC_MSG_WARN(Specifying --with-sanitizers forces --without-seccomp since the sanitizers introduce use of syscalls not allowed by the bitcoind syscall sandbox (-sandbox=<mode>).) + seccomp_found=no +fi +if test "x$seccomp_found" != "xno"; then + AC_MSG_CHECKING([for seccomp-bpf (Linux x86-64)]) + AC_PREPROC_IFELSE([AC_LANG_PROGRAM([[ + @%:@include <linux/seccomp.h> + ]], [[ + #if !defined(__x86_64__) + # error Syscall sandbox is an experimental feature currently available only under Linux x86-64. 
+ #endif + ]])],[ + AC_MSG_RESULT(yes) + seccomp_found="yes" + AC_DEFINE(USE_SYSCALL_SANDBOX, 1, [Define this symbol to build with syscall sandbox support.]) + ],[ + AC_MSG_RESULT(no) + seccomp_found="no" + ]) +fi +dnl Currently only enable -sandbox=<mode> feature if seccomp is found. +dnl In the future, sandboxing could also be supported with other +dnl sandboxing mechanisms besides seccomp. +use_syscall_sandbox=$seccomp_found +AM_CONDITIONAL([ENABLE_SYSCALL_SANDBOX], [test "x$use_syscall_sandbox" != "xno"]) + dnl Check for reduced exports if test x$use_reduce_exports = xyes; then AX_CHECK_COMPILE_FLAG([-fvisibility=hidden],[CXXFLAGS="$CXXFLAGS -fvisibility=hidden"], @@ -1933,6 +1969,7 @@ echo echo "Options used to compile and link:" echo " external signer = $use_external_signer" echo " multiprocess = $build_multiprocess" +echo " with experimental syscall sandbox support = $use_syscall_sandbox" echo " with libs = $build_bitcoin_libs" echo " with wallet = $enable_wallet" if test "x$enable_wallet" != "xno"; then diff --git a/src/Makefile.am b/src/Makefile.am index 52c8b85357..e038a0b132 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -261,6 +261,7 @@ BITCOIN_CORE_H = \ util/sock.h \ util/spanparsing.h \ util/string.h \ + util/syscall_sandbox.h \ util/system.h \ util/thread.h \ util/threadnames.h \ @@ -611,6 +612,7 @@ libbitcoin_util_a_SOURCES = \ util/spanparsing.cpp \ util/strencodings.cpp \ util/string.cpp \ + util/syscall_sandbox.cpp \ util/time.cpp \ util/tokenpipe.cpp \ $(BITCOIN_CORE_H) diff --git a/src/bitcoind.cpp b/src/bitcoind.cpp index 654679af27..25ec2809e9 100644 --- a/src/bitcoind.cpp +++ b/src/bitcoind.cpp @@ -19,6 +19,7 @@ #include <shutdown.h> #include <util/check.h> #include <util/strencodings.h> +#include <util/syscall_sandbox.h> #include <util/system.h> #include <util/threadnames.h> #include <util/tokenpipe.h> @@ -238,6 +239,7 @@ static bool AppInit(NodeContext& node, int argc, char* argv[]) daemon_ep.Close(); } #endif + 
SetSyscallSandboxPolicy(SyscallSandboxPolicy::SHUTOFF); if (fRet) { WaitForShutdown(); } diff --git a/src/checkqueue.h b/src/checkqueue.h index 4ceeb3600a..7c20e2013c 100644 --- a/src/checkqueue.h +++ b/src/checkqueue.h @@ -7,6 +7,7 @@ #include <sync.h> #include <tinyformat.h> +#include <util/syscall_sandbox.h> #include <util/threadnames.h> #include <algorithm> @@ -151,6 +152,7 @@ public: for (int n = 0; n < threads_num; ++n) { m_worker_threads.emplace_back([this, n]() { util::ThreadRename(strprintf("scriptch.%i", n)); + SetSyscallSandboxPolicy(SyscallSandboxPolicy::VALIDATION_SCRIPT_CHECK); Loop(false /* worker thread */); }); } diff --git a/src/httpserver.cpp b/src/httpserver.cpp index fa0379f612..b583ed323a 100644 --- a/src/httpserver.cpp +++ b/src/httpserver.cpp @@ -12,6 +12,7 @@ #include <shutdown.h> #include <sync.h> #include <util/strencodings.h> +#include <util/syscall_sandbox.h> #include <util/system.h> #include <util/threadnames.h> #include <util/translation.h> @@ -279,6 +280,7 @@ static void http_reject_request_cb(struct evhttp_request* req, void*) static bool ThreadHTTP(struct event_base* base) { util::ThreadRename("http"); + SetSyscallSandboxPolicy(SyscallSandboxPolicy::NET_HTTP_SERVER); LogPrint(BCLog::HTTP, "Entering http event loop\n"); event_base_dispatch(base); // Event loop will be interrupted by InterruptHTTPServer() @@ -332,6 +334,7 @@ static bool HTTPBindAddresses(struct evhttp* http) static void HTTPWorkQueueRun(WorkQueue<HTTPClosure>* queue, int worker_num) { util::ThreadRename(strprintf("httpworker.%i", worker_num)); + SetSyscallSandboxPolicy(SyscallSandboxPolicy::NET_HTTP_SERVER_WORKER); queue->Run(); } diff --git a/src/index/base.cpp b/src/index/base.cpp index 6fd2701e2e..3ca86a310e 100644 --- a/src/index/base.cpp +++ b/src/index/base.cpp @@ -8,6 +8,7 @@ #include <node/ui_interface.h> #include <shutdown.h> #include <tinyformat.h> +#include <util/syscall_sandbox.h> #include <util/thread.h> #include <util/translation.h> #include 
<validation.h> // For g_chainman @@ -123,6 +124,7 @@ static const CBlockIndex* NextSyncBlock(const CBlockIndex* pindex_prev, CChain& void BaseIndex::ThreadSync() { + SetSyscallSandboxPolicy(SyscallSandboxPolicy::TX_INDEX); const CBlockIndex* pindex = m_best_block_index.load(); if (!m_synced) { auto& consensus_params = Params().GetConsensus(); diff --git a/src/init.cpp b/src/init.cpp index ff36ec805c..541c0a9afc 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -60,6 +60,7 @@ #include <util/check.h> #include <util/moneystr.h> #include <util/string.h> +#include <util/syscall_sandbox.h> #include <util/system.h> #include <util/thread.h> #include <util/threadnames.h> @@ -562,6 +563,10 @@ void SetupServerArgs(ArgsManager& argsman) hidden_args.emplace_back("-daemonwait"); #endif +#if defined(USE_SYSCALL_SANDBOX) + argsman.AddArg("-sandbox=<mode>", "Use the experimental syscall sandbox in the specified mode (-sandbox=log-and-abort or -sandbox=abort). Allow only expected syscalls to be used by bitcoind. Note that this is an experimental new feature that may cause bitcoind to exit or crash unexpectedly: use with caution. In the \"log-and-abort\" mode the invocation of an unexpected syscall results in a debug handler being invoked which will log the incident and terminate the program (without executing the unexpected syscall). In the \"abort\" mode the invocation of an unexpected syscall results in the entire process being killed immediately by the kernel without executing the unexpected syscall.", ArgsManager::ALLOW_ANY, OptionsCategory::OPTIONS); +#endif // USE_SYSCALL_SANDBOX + // Add the hidden options argsman.AddHiddenArgs(hidden_args); } @@ -1018,6 +1023,37 @@ bool AppInitParameterInteraction(const ArgsManager& args) return InitError(_("No proxy server specified. 
Use -proxy=<ip> or -proxy=<ip:port>.")); } +#if defined(USE_SYSCALL_SANDBOX) + if (args.IsArgSet("-sandbox") && !args.IsArgNegated("-sandbox")) { + const std::string sandbox_arg{args.GetArg("-sandbox", "")}; + bool log_syscall_violation_before_terminating{false}; + if (sandbox_arg == "log-and-abort") { + log_syscall_violation_before_terminating = true; + } else if (sandbox_arg == "abort") { + // log_syscall_violation_before_terminating is false by default. + } else { + return InitError(Untranslated("Unknown syscall sandbox mode (-sandbox=<mode>). Available modes are \"log-and-abort\" and \"abort\".")); + } + // execve(...) is not allowed by the syscall sandbox. + const std::vector<std::string> features_using_execve{ + "-alertnotify", + "-blocknotify", + "-signer", + "-startupnotify", + "-walletnotify", + }; + for (const std::string& feature_using_execve : features_using_execve) { + if (!args.GetArg(feature_using_execve, "").empty()) { + return InitError(Untranslated(strprintf("The experimental syscall sandbox feature (-sandbox=<mode>) is incompatible with %s (which uses execve).", feature_using_execve))); + } + } + if (!SetupSyscallSandbox(log_syscall_violation_before_terminating)) { + return InitError(Untranslated("Installation of the syscall sandbox failed.")); + } + LogPrintf("Experimental syscall sandbox enabled (-sandbox=%s): bitcoind will terminate if an unexpected (not allowlisted) syscall is invoked.\n", sandbox_arg); + } +#endif // USE_SYSCALL_SANDBOX + return true; } diff --git a/src/logging.cpp b/src/logging.cpp index eb2c750296..a352e106e5 100644 --- a/src/logging.cpp +++ b/src/logging.cpp @@ -160,6 +160,7 @@ const CLogCategoryDesc LogCategories[] = {BCLog::I2P, "i2p"}, {BCLog::IPC, "ipc"}, {BCLog::LOCK, "lock"}, + {BCLog::UTIL, "util"}, {BCLog::ALL, "1"}, {BCLog::ALL, "all"}, }; diff --git a/src/logging.h b/src/logging.h index 53a89d28bd..02e64a7c48 100644 --- a/src/logging.h +++ b/src/logging.h @@ -60,6 +60,7 @@ namespace BCLog { I2P = (1 << 22), IPC 
= (1 << 23), LOCK = (1 << 24), + UTIL = (1 << 25), ALL = ~(uint32_t)0, }; diff --git a/src/mapport.cpp b/src/mapport.cpp index 135efb561e..a2d06c68b4 100644 --- a/src/mapport.cpp +++ b/src/mapport.cpp @@ -14,6 +14,7 @@ #include <netaddress.h> #include <netbase.h> #include <threadinterrupt.h> +#include <util/syscall_sandbox.h> #include <util/system.h> #include <util/thread.h> @@ -222,6 +223,7 @@ static bool ProcessUpnp() static void ThreadMapPort() { + SetSyscallSandboxPolicy(SyscallSandboxPolicy::INITIALIZATION_MAP_PORT); bool ok; do { ok = false; diff --git a/src/net.cpp b/src/net.cpp index cc8f4c4316..cceb5b2199 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -25,6 +25,7 @@ #include <scheduler.h> #include <util/sock.h> #include <util/strencodings.h> +#include <util/syscall_sandbox.h> #include <util/system.h> #include <util/thread.h> #include <util/trace.h> @@ -1615,6 +1616,7 @@ void CConnman::SocketHandler() void CConnman::ThreadSocketHandler() { + SetSyscallSandboxPolicy(SyscallSandboxPolicy::NET); while (!interruptNet) { DisconnectNodes(); @@ -1634,6 +1636,7 @@ void CConnman::WakeMessageHandler() void CConnman::ThreadDNSAddressSeed() { + SetSyscallSandboxPolicy(SyscallSandboxPolicy::INITIALIZATION_DNS_SEED); FastRandomContext rng; std::vector<std::string> seeds = Params().DNSSeeds(); Shuffle(seeds.begin(), seeds.end(), rng); @@ -1816,6 +1819,7 @@ int CConnman::GetExtraBlockRelayCount() const void CConnman::ThreadOpenConnections(const std::vector<std::string> connect) { + SetSyscallSandboxPolicy(SyscallSandboxPolicy::NET_OPEN_CONNECTION); // Connect to specific addresses if (!connect.empty()) { @@ -2155,6 +2159,7 @@ std::vector<AddedNodeInfo> CConnman::GetAddedNodeInfo() const void CConnman::ThreadOpenAddedConnections() { + SetSyscallSandboxPolicy(SyscallSandboxPolicy::NET_ADD_CONNECTION); while (true) { CSemaphoreGrant grant(*semAddnode); @@ -2218,6 +2223,7 @@ void CConnman::OpenNetworkConnection(const CAddress& addrConnect, bool fCountFai void 
CConnman::ThreadMessageHandler() { + SetSyscallSandboxPolicy(SyscallSandboxPolicy::MESSAGE_HANDLER); FastRandomContext rng; while (!flagInterruptMsgProc) { diff --git a/src/node/blockstorage.cpp b/src/node/blockstorage.cpp index 5ddcf95c84..e2776f73bc 100644 --- a/src/node/blockstorage.cpp +++ b/src/node/blockstorage.cpp @@ -16,6 +16,7 @@ #include <signet.h> #include <streams.h> #include <undo.h> +#include <util/syscall_sandbox.h> #include <util/system.h> #include <validation.h> @@ -489,6 +490,7 @@ struct CImportingNow { void ThreadImport(ChainstateManager& chainman, std::vector<fs::path> vImportFiles, const ArgsManager& args) { + SetSyscallSandboxPolicy(SyscallSandboxPolicy::INITIALIZATION_LOAD_BLOCKS); ScheduleBatchPriority(); { diff --git a/src/rpc/misc.cpp b/src/rpc/misc.cpp index 14b0e5a984..5fba8c8e07 100644 --- a/src/rpc/misc.cpp +++ b/src/rpc/misc.cpp @@ -22,6 +22,7 @@ #include <util/check.h> #include <util/message.h> // For MessageSign(), MessageVerify() #include <util/strencodings.h> +#include <util/syscall_sandbox.h> #include <util/system.h> #include <optional> @@ -417,6 +418,27 @@ static RPCHelpMan setmocktime() }; } +#if defined(USE_SYSCALL_SANDBOX) +static RPCHelpMan invokedisallowedsyscall() +{ + return RPCHelpMan{ + "invokedisallowedsyscall", + "\nInvoke a disallowed syscall to trigger a syscall sandbox violation. 
Used for testing purposes.\n", + {}, + RPCResult{RPCResult::Type::NONE, "", ""}, + RPCExamples{ + HelpExampleCli("invokedisallowedsyscall", "") + HelpExampleRpc("invokedisallowedsyscall", "")}, + [&](const RPCHelpMan& self, const JSONRPCRequest& request) -> UniValue { + if (!Params().IsTestChain()) { + throw std::runtime_error("invokedisallowedsyscall is used for testing only."); + } + TestDisallowedSandboxCall(); + return NullUniValue; + }, + }; +} +#endif // USE_SYSCALL_SANDBOX + static RPCHelpMan mockscheduler() { return RPCHelpMan{"mockscheduler", @@ -777,6 +799,9 @@ static const CRPCCommand commands[] = { "hidden", &echo, }, { "hidden", &echojson, }, { "hidden", &echoipc, }, +#if defined(USE_SYSCALL_SANDBOX) + { "hidden", &invokedisallowedsyscall, }, +#endif // USE_SYSCALL_SANDBOX }; // clang-format on for (const auto& c : commands) { diff --git a/src/scheduler.cpp b/src/scheduler.cpp index 02ada969a4..162cced6c7 100644 --- a/src/scheduler.cpp +++ b/src/scheduler.cpp @@ -5,6 +5,7 @@ #include <scheduler.h> #include <random.h> +#include <util/syscall_sandbox.h> #include <util/time.h> #include <assert.h> @@ -24,6 +25,7 @@ CScheduler::~CScheduler() void CScheduler::serviceQueue() { + SetSyscallSandboxPolicy(SyscallSandboxPolicy::SCHEDULER); WAIT_LOCK(newTaskMutex, lock); ++nThreadsServicingQueue; diff --git a/src/torcontrol.cpp b/src/torcontrol.cpp index bb296456ba..6d215ede6f 100644 --- a/src/torcontrol.cpp +++ b/src/torcontrol.cpp @@ -14,6 +14,7 @@ #include <netbase.h> #include <util/readwritefile.h> #include <util/strencodings.h> +#include <util/syscall_sandbox.h> #include <util/system.h> #include <util/thread.h> #include <util/time.h> @@ -585,6 +586,7 @@ static std::thread torControlThread; static void TorControlThread(CService onion_service_target) { + SetSyscallSandboxPolicy(SyscallSandboxPolicy::TOR_CONTROL); TorController ctrl(gBase, gArgs.GetArg("-torcontrol", DEFAULT_TOR_CONTROL), onion_service_target); event_base_dispatch(gBase); diff --git 
a/src/util/syscall_sandbox.cpp b/src/util/syscall_sandbox.cpp new file mode 100644 index 0000000000..c4006cbd3c --- /dev/null +++ b/src/util/syscall_sandbox.cpp @@ -0,0 +1,882 @@ +// Copyright (c) 2020 The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#if defined(HAVE_CONFIG_H) +#include <config/bitcoin-config.h> +#endif // defined(HAVE_CONFIG_H) + +#include <util/syscall_sandbox.h> + +#if defined(USE_SYSCALL_SANDBOX) +#include <array> +#include <cassert> +#include <cstdint> +#include <exception> +#include <map> +#include <new> +#include <set> +#include <string> +#include <vector> + +#include <logging.h> +#include <tinyformat.h> +#include <util/threadnames.h> + +#include <linux/audit.h> +#include <linux/filter.h> +#include <linux/seccomp.h> +#include <linux/unistd.h> +#include <signal.h> +#include <sys/prctl.h> +#include <sys/types.h> +#include <unistd.h> + +namespace { +bool g_syscall_sandbox_enabled{false}; +bool g_syscall_sandbox_log_violation_before_terminating{false}; + +#if !defined(__x86_64__) +#error Syscall sandbox is an experimental feature currently available only under Linux x86-64. +#endif // defined(__x86_64__) + +// This list of syscalls in LINUX_SYSCALLS is only used to map syscall numbers to syscall names in +// order to be able to print user friendly error messages which include the syscall name in addition +// to the syscall number. +// +// Example output in case of a syscall violation where the syscall is present in LINUX_SYSCALLS: +// +// ``` +// 2021-06-09T12:34:56Z ERROR: The syscall "execve" (syscall number 59) is not allowed by the syscall sandbox in thread "msghand". Please report. 
+// ``` +// +// Example output in case of a syscall violation where the syscall is not present in LINUX_SYSCALLS: +// +// ``` +// 2021-06-09T12:34:56Z ERROR: The syscall "*unknown*" (syscall number 314) is not allowed by the syscall sandbox in thread "msghand". Please report. +// ``` +// +// LINUX_SYSCALLS contains two types of syscalls: +// 1.) Syscalls that are present under all architectures or relevant Linux kernel versions for which +// we support the syscall sandbox feature (currently only Linux x86-64). Examples include read, +// write, open, close, etc. +// 2.) Syscalls that are present under a subset of architectures or relevant Linux kernel versions +// for which we support the syscall sandbox feature. This type of syscall should be added to +// LINUX_SYSCALLS conditional on availability like in the following example: +// ... +// #if defined(__NR_arch_dependent_syscall) +// {__NR_arch_dependent_syscall, "arch_dependent_syscall"}, +// #endif // defined(__NR_arch_dependent_syscall) +// ... 
+const std::map<uint32_t, std::string> LINUX_SYSCALLS{ + {__NR_accept, "accept"}, + {__NR_accept4, "accept4"}, + {__NR_access, "access"}, + {__NR_acct, "acct"}, + {__NR_add_key, "add_key"}, + {__NR_adjtimex, "adjtimex"}, + {__NR_afs_syscall, "afs_syscall"}, + {__NR_alarm, "alarm"}, + {__NR_arch_prctl, "arch_prctl"}, + {__NR_bind, "bind"}, + {__NR_bpf, "bpf"}, + {__NR_brk, "brk"}, + {__NR_capget, "capget"}, + {__NR_capset, "capset"}, + {__NR_chdir, "chdir"}, + {__NR_chmod, "chmod"}, + {__NR_chown, "chown"}, + {__NR_chroot, "chroot"}, + {__NR_clock_adjtime, "clock_adjtime"}, + {__NR_clock_getres, "clock_getres"}, + {__NR_clock_gettime, "clock_gettime"}, + {__NR_clock_nanosleep, "clock_nanosleep"}, + {__NR_clock_settime, "clock_settime"}, + {__NR_clone, "clone"}, + {__NR_close, "close"}, + {__NR_connect, "connect"}, + {__NR_copy_file_range, "copy_file_range"}, + {__NR_creat, "creat"}, + {__NR_create_module, "create_module"}, + {__NR_delete_module, "delete_module"}, + {__NR_dup, "dup"}, + {__NR_dup2, "dup2"}, + {__NR_dup3, "dup3"}, + {__NR_epoll_create, "epoll_create"}, + {__NR_epoll_create1, "epoll_create1"}, + {__NR_epoll_ctl, "epoll_ctl"}, + {__NR_epoll_ctl_old, "epoll_ctl_old"}, + {__NR_epoll_pwait, "epoll_pwait"}, + {__NR_epoll_wait, "epoll_wait"}, + {__NR_epoll_wait_old, "epoll_wait_old"}, + {__NR_eventfd, "eventfd"}, + {__NR_eventfd2, "eventfd2"}, + {__NR_execve, "execve"}, + {__NR_execveat, "execveat"}, + {__NR_exit, "exit"}, + {__NR_exit_group, "exit_group"}, + {__NR_faccessat, "faccessat"}, + {__NR_fadvise64, "fadvise64"}, + {__NR_fallocate, "fallocate"}, + {__NR_fanotify_init, "fanotify_init"}, + {__NR_fanotify_mark, "fanotify_mark"}, + {__NR_fchdir, "fchdir"}, + {__NR_fchmod, "fchmod"}, + {__NR_fchmodat, "fchmodat"}, + {__NR_fchown, "fchown"}, + {__NR_fchownat, "fchownat"}, + {__NR_fcntl, "fcntl"}, + {__NR_fdatasync, "fdatasync"}, + {__NR_fgetxattr, "fgetxattr"}, + {__NR_finit_module, "finit_module"}, + {__NR_flistxattr, "flistxattr"}, + {__NR_flock, 
"flock"}, + {__NR_fork, "fork"}, + {__NR_fremovexattr, "fremovexattr"}, + {__NR_fsetxattr, "fsetxattr"}, + {__NR_fstat, "fstat"}, + {__NR_fstatfs, "fstatfs"}, + {__NR_fsync, "fsync"}, + {__NR_ftruncate, "ftruncate"}, + {__NR_futex, "futex"}, + {__NR_futimesat, "futimesat"}, + {__NR_getcpu, "getcpu"}, + {__NR_getcwd, "getcwd"}, + {__NR_getdents, "getdents"}, + {__NR_getdents64, "getdents64"}, + {__NR_getegid, "getegid"}, + {__NR_geteuid, "geteuid"}, + {__NR_getgid, "getgid"}, + {__NR_getgroups, "getgroups"}, + {__NR_getitimer, "getitimer"}, + {__NR_get_kernel_syms, "get_kernel_syms"}, + {__NR_get_mempolicy, "get_mempolicy"}, + {__NR_getpeername, "getpeername"}, + {__NR_getpgid, "getpgid"}, + {__NR_getpgrp, "getpgrp"}, + {__NR_getpid, "getpid"}, + {__NR_getpmsg, "getpmsg"}, + {__NR_getppid, "getppid"}, + {__NR_getpriority, "getpriority"}, +#if defined(__NR_getrandom) + {__NR_getrandom, "getrandom"}, +#endif // defined(__NR_getrandom) + {__NR_getresgid, "getresgid"}, + {__NR_getresuid, "getresuid"}, + {__NR_getrlimit, "getrlimit"}, + {__NR_get_robust_list, "get_robust_list"}, + {__NR_getrusage, "getrusage"}, + {__NR_getsid, "getsid"}, + {__NR_getsockname, "getsockname"}, + {__NR_getsockopt, "getsockopt"}, + {__NR_get_thread_area, "get_thread_area"}, + {__NR_gettid, "gettid"}, + {__NR_gettimeofday, "gettimeofday"}, + {__NR_getuid, "getuid"}, + {__NR_getxattr, "getxattr"}, + {__NR_init_module, "init_module"}, + {__NR_inotify_add_watch, "inotify_add_watch"}, + {__NR_inotify_init, "inotify_init"}, + {__NR_inotify_init1, "inotify_init1"}, + {__NR_inotify_rm_watch, "inotify_rm_watch"}, + {__NR_io_cancel, "io_cancel"}, + {__NR_ioctl, "ioctl"}, + {__NR_io_destroy, "io_destroy"}, + {__NR_io_getevents, "io_getevents"}, + {__NR_ioperm, "ioperm"}, + {__NR_iopl, "iopl"}, + {__NR_ioprio_get, "ioprio_get"}, + {__NR_ioprio_set, "ioprio_set"}, + {__NR_io_setup, "io_setup"}, + {__NR_io_submit, "io_submit"}, + {__NR_kcmp, "kcmp"}, + {__NR_kexec_file_load, "kexec_file_load"}, + 
{__NR_kexec_load, "kexec_load"}, + {__NR_keyctl, "keyctl"}, + {__NR_kill, "kill"}, + {__NR_lchown, "lchown"}, + {__NR_lgetxattr, "lgetxattr"}, + {__NR_link, "link"}, + {__NR_linkat, "linkat"}, + {__NR_listen, "listen"}, + {__NR_listxattr, "listxattr"}, + {__NR_llistxattr, "llistxattr"}, + {__NR_lookup_dcookie, "lookup_dcookie"}, + {__NR_lremovexattr, "lremovexattr"}, + {__NR_lseek, "lseek"}, + {__NR_lsetxattr, "lsetxattr"}, + {__NR_lstat, "lstat"}, + {__NR_madvise, "madvise"}, + {__NR_mbind, "mbind"}, +#if defined(__NR_membarrier) + {__NR_membarrier, "membarrier"}, +#endif // defined(__NR_membarrier) + {__NR_memfd_create, "memfd_create"}, + {__NR_migrate_pages, "migrate_pages"}, + {__NR_mincore, "mincore"}, + {__NR_mkdir, "mkdir"}, + {__NR_mkdirat, "mkdirat"}, + {__NR_mknod, "mknod"}, + {__NR_mknodat, "mknodat"}, + {__NR_mlock, "mlock"}, + {__NR_mlock2, "mlock2"}, + {__NR_mlockall, "mlockall"}, + {__NR_mmap, "mmap"}, + {__NR_modify_ldt, "modify_ldt"}, + {__NR_mount, "mount"}, + {__NR_move_pages, "move_pages"}, + {__NR_mprotect, "mprotect"}, + {__NR_mq_getsetattr, "mq_getsetattr"}, + {__NR_mq_notify, "mq_notify"}, + {__NR_mq_open, "mq_open"}, + {__NR_mq_timedreceive, "mq_timedreceive"}, + {__NR_mq_timedsend, "mq_timedsend"}, + {__NR_mq_unlink, "mq_unlink"}, + {__NR_mremap, "mremap"}, + {__NR_msgctl, "msgctl"}, + {__NR_msgget, "msgget"}, + {__NR_msgrcv, "msgrcv"}, + {__NR_msgsnd, "msgsnd"}, + {__NR_msync, "msync"}, + {__NR_munlock, "munlock"}, + {__NR_munlockall, "munlockall"}, + {__NR_munmap, "munmap"}, + {__NR_name_to_handle_at, "name_to_handle_at"}, + {__NR_nanosleep, "nanosleep"}, + {__NR_newfstatat, "newfstatat"}, + {__NR_nfsservctl, "nfsservctl"}, + {__NR_open, "open"}, + {__NR_openat, "openat"}, + {__NR_open_by_handle_at, "open_by_handle_at"}, + {__NR_pause, "pause"}, + {__NR_perf_event_open, "perf_event_open"}, + {__NR_personality, "personality"}, + {__NR_pipe, "pipe"}, + {__NR_pipe2, "pipe2"}, + {__NR_pivot_root, "pivot_root"}, + {__NR_pkey_alloc, 
"pkey_alloc"}, + {__NR_pkey_free, "pkey_free"}, + {__NR_pkey_mprotect, "pkey_mprotect"}, + {__NR_poll, "poll"}, + {__NR_ppoll, "ppoll"}, + {__NR_prctl, "prctl"}, + {__NR_pread64, "pread64"}, + {__NR_preadv, "preadv"}, + {__NR_preadv2, "preadv2"}, + {__NR_prlimit64, "prlimit64"}, + {__NR_process_vm_readv, "process_vm_readv"}, + {__NR_process_vm_writev, "process_vm_writev"}, + {__NR_pselect6, "pselect6"}, + {__NR_ptrace, "ptrace"}, + {__NR_putpmsg, "putpmsg"}, + {__NR_pwrite64, "pwrite64"}, + {__NR_pwritev, "pwritev"}, + {__NR_pwritev2, "pwritev2"}, + {__NR_query_module, "query_module"}, + {__NR_quotactl, "quotactl"}, + {__NR_read, "read"}, + {__NR_readahead, "readahead"}, + {__NR_readlink, "readlink"}, + {__NR_readlinkat, "readlinkat"}, + {__NR_readv, "readv"}, + {__NR_reboot, "reboot"}, + {__NR_recvfrom, "recvfrom"}, + {__NR_recvmmsg, "recvmmsg"}, + {__NR_recvmsg, "recvmsg"}, + {__NR_remap_file_pages, "remap_file_pages"}, + {__NR_removexattr, "removexattr"}, + {__NR_rename, "rename"}, + {__NR_renameat, "renameat"}, + {__NR_renameat2, "renameat2"}, + {__NR_request_key, "request_key"}, + {__NR_restart_syscall, "restart_syscall"}, + {__NR_rmdir, "rmdir"}, + {__NR_rt_sigaction, "rt_sigaction"}, + {__NR_rt_sigpending, "rt_sigpending"}, + {__NR_rt_sigprocmask, "rt_sigprocmask"}, + {__NR_rt_sigqueueinfo, "rt_sigqueueinfo"}, + {__NR_rt_sigreturn, "rt_sigreturn"}, + {__NR_rt_sigsuspend, "rt_sigsuspend"}, + {__NR_rt_sigtimedwait, "rt_sigtimedwait"}, + {__NR_rt_tgsigqueueinfo, "rt_tgsigqueueinfo"}, + {__NR_sched_getaffinity, "sched_getaffinity"}, + {__NR_sched_getattr, "sched_getattr"}, + {__NR_sched_getparam, "sched_getparam"}, + {__NR_sched_get_priority_max, "sched_get_priority_max"}, + {__NR_sched_get_priority_min, "sched_get_priority_min"}, + {__NR_sched_getscheduler, "sched_getscheduler"}, + {__NR_sched_rr_get_interval, "sched_rr_get_interval"}, + {__NR_sched_setaffinity, "sched_setaffinity"}, + {__NR_sched_setattr, "sched_setattr"}, + {__NR_sched_setparam, 
"sched_setparam"}, + {__NR_sched_setscheduler, "sched_setscheduler"}, + {__NR_sched_yield, "sched_yield"}, + {__NR_seccomp, "seccomp"}, + {__NR_security, "security"}, + {__NR_select, "select"}, + {__NR_semctl, "semctl"}, + {__NR_semget, "semget"}, + {__NR_semop, "semop"}, + {__NR_semtimedop, "semtimedop"}, + {__NR_sendfile, "sendfile"}, + {__NR_sendmmsg, "sendmmsg"}, + {__NR_sendmsg, "sendmsg"}, + {__NR_sendto, "sendto"}, + {__NR_setdomainname, "setdomainname"}, + {__NR_setfsgid, "setfsgid"}, + {__NR_setfsuid, "setfsuid"}, + {__NR_setgid, "setgid"}, + {__NR_setgroups, "setgroups"}, + {__NR_sethostname, "sethostname"}, + {__NR_setitimer, "setitimer"}, + {__NR_set_mempolicy, "set_mempolicy"}, + {__NR_setns, "setns"}, + {__NR_setpgid, "setpgid"}, + {__NR_setpriority, "setpriority"}, + {__NR_setregid, "setregid"}, + {__NR_setresgid, "setresgid"}, + {__NR_setresuid, "setresuid"}, + {__NR_setreuid, "setreuid"}, + {__NR_setrlimit, "setrlimit"}, + {__NR_set_robust_list, "set_robust_list"}, + {__NR_setsid, "setsid"}, + {__NR_setsockopt, "setsockopt"}, + {__NR_set_thread_area, "set_thread_area"}, + {__NR_set_tid_address, "set_tid_address"}, + {__NR_settimeofday, "settimeofday"}, + {__NR_setuid, "setuid"}, + {__NR_setxattr, "setxattr"}, + {__NR_shmat, "shmat"}, + {__NR_shmctl, "shmctl"}, + {__NR_shmdt, "shmdt"}, + {__NR_shmget, "shmget"}, + {__NR_shutdown, "shutdown"}, + {__NR_sigaltstack, "sigaltstack"}, + {__NR_signalfd, "signalfd"}, + {__NR_signalfd4, "signalfd4"}, + {__NR_socket, "socket"}, + {__NR_socketpair, "socketpair"}, + {__NR_splice, "splice"}, + {__NR_stat, "stat"}, + {__NR_statfs, "statfs"}, + {__NR_statx, "statx"}, + {__NR_swapoff, "swapoff"}, + {__NR_swapon, "swapon"}, + {__NR_symlink, "symlink"}, + {__NR_symlinkat, "symlinkat"}, + {__NR_sync, "sync"}, + {__NR_sync_file_range, "sync_file_range"}, + {__NR_syncfs, "syncfs"}, + {__NR__sysctl, "_sysctl"}, + {__NR_sysfs, "sysfs"}, + {__NR_sysinfo, "sysinfo"}, + {__NR_syslog, "syslog"}, + {__NR_tee, "tee"}, + 
{__NR_tgkill, "tgkill"}, + {__NR_time, "time"}, + {__NR_timer_create, "timer_create"}, + {__NR_timer_delete, "timer_delete"}, + {__NR_timerfd_create, "timerfd_create"}, + {__NR_timerfd_gettime, "timerfd_gettime"}, + {__NR_timerfd_settime, "timerfd_settime"}, + {__NR_timer_getoverrun, "timer_getoverrun"}, + {__NR_timer_gettime, "timer_gettime"}, + {__NR_timer_settime, "timer_settime"}, + {__NR_times, "times"}, + {__NR_tkill, "tkill"}, + {__NR_truncate, "truncate"}, + {__NR_tuxcall, "tuxcall"}, + {__NR_umask, "umask"}, + {__NR_umount2, "umount2"}, + {__NR_uname, "uname"}, + {__NR_unlink, "unlink"}, + {__NR_unlinkat, "unlinkat"}, + {__NR_unshare, "unshare"}, + {__NR_uselib, "uselib"}, + {__NR_userfaultfd, "userfaultfd"}, + {__NR_ustat, "ustat"}, + {__NR_utime, "utime"}, + {__NR_utimensat, "utimensat"}, + {__NR_utimes, "utimes"}, + {__NR_vfork, "vfork"}, + {__NR_vhangup, "vhangup"}, + {__NR_vmsplice, "vmsplice"}, + {__NR_vserver, "vserver"}, + {__NR_wait4, "wait4"}, + {__NR_waitid, "waitid"}, + {__NR_write, "write"}, + {__NR_writev, "writev"}, +}; + +std::string GetLinuxSyscallName(uint32_t syscall_number) +{ + const auto element = LINUX_SYSCALLS.find(syscall_number); + if (element != LINUX_SYSCALLS.end()) { + return element->second; + } + return "*unknown*"; +} + +// See Linux kernel developer Kees Cook's seccomp guide at <https://outflux.net/teach-seccomp/> for +// an accessible introduction to using seccomp. +// +// This function largely follows <https://outflux.net/teach-seccomp/step-3/syscall-reporter.c> and +// <https://outflux.net/teach-seccomp/step-3/seccomp-bpf.h>. 
+// +// Seccomp BPF resources: +// * Seccomp BPF documentation: <https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html> +// * seccomp(2) manual page: <https://www.kernel.org/doc/man-pages/online/pages/man2/seccomp.2.html> +// * Seccomp BPF demo code samples: <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/samples/seccomp> +void SyscallSandboxDebugSignalHandler(int, siginfo_t* signal_info, void* void_signal_context) +{ + // The si_code field inside the siginfo_t argument that is passed to a SA_SIGINFO signal handler + // is a value indicating why the signal was sent. + // + // The following value can be placed in si_code for a SIGSYS signal: + // * SYS_SECCOMP (since Linux 3.5): Triggered by a seccomp(2) filter rule. + constexpr int32_t SYS_SECCOMP_SI_CODE{1}; + assert(signal_info->si_code == SYS_SECCOMP_SI_CODE); + + // The ucontext_t structure contains signal context information that was saved on the user-space + // stack by the kernel. + const ucontext_t* signal_context = static_cast<ucontext_t*>(void_signal_context); + assert(signal_context != nullptr); + + std::set_new_handler(std::terminate); + // Portability note: REG_RAX is Linux x86_64 specific. + const uint32_t syscall_number = static_cast<uint32_t>(signal_context->uc_mcontext.gregs[REG_RAX]); + const std::string syscall_name = GetLinuxSyscallName(syscall_number); + const std::string thread_name = !util::ThreadGetInternalName().empty() ? util::ThreadGetInternalName() : "*unnamed*"; + const std::string error_message = strprintf("ERROR: The syscall \"%s\" (syscall number %d) is not allowed by the syscall sandbox in thread \"%s\". 
Please report.", syscall_name, syscall_number, thread_name); + tfm::format(std::cerr, "%s\n", error_message); + LogPrintf("%s\n", error_message); + std::terminate(); +} + +// This function largely follows install_syscall_reporter from Kees Cook's seccomp guide: +// <https://outflux.net/teach-seccomp/step-3/syscall-reporter.c> +bool SetupSyscallSandboxDebugHandler() +{ + struct sigaction action = {}; + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, SIGSYS); + action.sa_sigaction = &SyscallSandboxDebugSignalHandler; + action.sa_flags = SA_SIGINFO; + if (sigaction(SIGSYS, &action, nullptr) < 0) { + return false; + } + if (sigprocmask(SIG_UNBLOCK, &mask, nullptr)) { + return false; + } + return true; +} + +enum class SyscallSandboxAction { + KILL_PROCESS, + INVOKE_SIGNAL_HANDLER, +}; + +class SeccompPolicyBuilder +{ + std::set<uint32_t> allowed_syscalls; + +public: + SeccompPolicyBuilder() + { + // Allowed by default. + AllowAddressSpaceAccess(); + AllowEpoll(); + AllowEventFd(); + AllowFutex(); + AllowGeneralIo(); + AllowGetRandom(); + AllowGetSimpleId(); + AllowGetTime(); + AllowGlobalProcessEnvironment(); + AllowGlobalSystemStatus(); + AllowKernelInternalApi(); + AllowNetworkSocketInformation(); + AllowOperationOnExistingFileDescriptor(); + AllowPipe(); + AllowPrctl(); + AllowProcessStartOrDeath(); + AllowScheduling(); + AllowSignalHandling(); + AllowSleep(); + AllowUmask(); + } + + void AllowAddressSpaceAccess() + { + allowed_syscalls.insert(__NR_brk); // change data segment size + allowed_syscalls.insert(__NR_madvise); // give advice about use of memory +#if defined(__NR_membarrier) + allowed_syscalls.insert(__NR_membarrier); // issue memory barriers on a set of threads +#endif // defined(__NR_membarrier) + allowed_syscalls.insert(__NR_mlock); // lock memory + allowed_syscalls.insert(__NR_mmap); // map files or devices into memory + allowed_syscalls.insert(__NR_mprotect); // set protection on a region of memory + allowed_syscalls.insert(__NR_munlock); // 
unlock memory + allowed_syscalls.insert(__NR_munmap); // unmap files or devices from memory + } + + void AllowEpoll() + { + allowed_syscalls.insert(__NR_epoll_create1); // open an epoll file descriptor + allowed_syscalls.insert(__NR_epoll_ctl); // control interface for an epoll file descriptor + allowed_syscalls.insert(__NR_epoll_pwait); // wait for an I/O event on an epoll file descriptor + allowed_syscalls.insert(__NR_epoll_wait); // wait for an I/O event on an epoll file descriptor + } + + void AllowEventFd() + { + allowed_syscalls.insert(__NR_eventfd2); // create a file descriptor for event notification + } + + void AllowFileSystem() + { + allowed_syscalls.insert(__NR_access); // check user's permissions for a file + allowed_syscalls.insert(__NR_chdir); // change working directory + allowed_syscalls.insert(__NR_chmod); // change permissions of a file + allowed_syscalls.insert(__NR_fallocate); // manipulate file space + allowed_syscalls.insert(__NR_fchmod); // change permissions of a file + allowed_syscalls.insert(__NR_fchown); // change ownership of a file + allowed_syscalls.insert(__NR_fdatasync); // synchronize a file's in-core state with storage device + allowed_syscalls.insert(__NR_flock); // apply or remove an advisory lock on an open file + allowed_syscalls.insert(__NR_fstat); // get file status + allowed_syscalls.insert(__NR_fsync); // synchronize a file's in-core state with storage device + allowed_syscalls.insert(__NR_ftruncate); // truncate a file to a specified length + allowed_syscalls.insert(__NR_getcwd); // get current working directory + allowed_syscalls.insert(__NR_getdents); // get directory entries + allowed_syscalls.insert(__NR_getdents64); // get directory entries + allowed_syscalls.insert(__NR_lstat); // get file status + allowed_syscalls.insert(__NR_mkdir); // create a directory + allowed_syscalls.insert(__NR_open); // open and possibly create a file + allowed_syscalls.insert(__NR_openat); // open and possibly create a file + 
allowed_syscalls.insert(__NR_readlink); // read value of a symbolic link + allowed_syscalls.insert(__NR_rename); // change the name or location of a file + allowed_syscalls.insert(__NR_rmdir); // delete a directory + allowed_syscalls.insert(__NR_stat); // get file status + allowed_syscalls.insert(__NR_statfs); // get filesystem statistics + allowed_syscalls.insert(__NR_statx); // get file status (extended) + allowed_syscalls.insert(__NR_unlink); // delete a name and possibly the file it refers to + } + + void AllowFutex() + { + allowed_syscalls.insert(__NR_futex); // fast user-space locking + allowed_syscalls.insert(__NR_set_robust_list); // set list of robust futexes + } + + void AllowGeneralIo() + { + allowed_syscalls.insert(__NR_ioctl); // control device + allowed_syscalls.insert(__NR_lseek); // reposition read/write file offset + allowed_syscalls.insert(__NR_poll); // wait for some event on a file descriptor + allowed_syscalls.insert(__NR_ppoll); // wait for some event on a file descriptor + allowed_syscalls.insert(__NR_pread64); // read from a file descriptor at a given offset + allowed_syscalls.insert(__NR_pwrite64); // write to a file descriptor at a given offset + allowed_syscalls.insert(__NR_read); // read from a file descriptor + allowed_syscalls.insert(__NR_readv); // read data into multiple buffers + allowed_syscalls.insert(__NR_recvfrom); // receive a message from a socket + allowed_syscalls.insert(__NR_recvmsg); // receive a message from a socket + allowed_syscalls.insert(__NR_select); // synchronous I/O multiplexing + allowed_syscalls.insert(__NR_sendmmsg); // send multiple messages on a socket + allowed_syscalls.insert(__NR_sendmsg); // send a message on a socket + allowed_syscalls.insert(__NR_sendto); // send a message on a socket + allowed_syscalls.insert(__NR_write); // write to a file descriptor + allowed_syscalls.insert(__NR_writev); // write data into multiple buffers + } + + void AllowGetRandom() + { +#if defined(__NR_getrandom) + 
allowed_syscalls.insert(__NR_getrandom); // obtain a series of random bytes +#endif // defined(__NR_getrandom) + } + + void AllowGetSimpleId() + { + allowed_syscalls.insert(__NR_getegid); // get group identity + allowed_syscalls.insert(__NR_geteuid); // get user identity + allowed_syscalls.insert(__NR_getgid); // get group identity + allowed_syscalls.insert(__NR_getpgid); // get process group + allowed_syscalls.insert(__NR_getpid); // get process identification + allowed_syscalls.insert(__NR_getppid); // get process identification + allowed_syscalls.insert(__NR_getresgid); // get real, effective and saved group IDs + allowed_syscalls.insert(__NR_getresuid); // get real, effective and saved user IDs + allowed_syscalls.insert(__NR_getsid); // get session ID + allowed_syscalls.insert(__NR_gettid); // get thread identification + allowed_syscalls.insert(__NR_getuid); // get user identity + } + + void AllowGetTime() + { + allowed_syscalls.insert(__NR_clock_getres); // find the resolution (precision) of the specified clock + allowed_syscalls.insert(__NR_clock_gettime); // retrieve the time of the specified clock + } + + void AllowGlobalProcessEnvironment() + { + allowed_syscalls.insert(__NR_getrlimit); // get resource limits + allowed_syscalls.insert(__NR_getrusage); // get resource usage + allowed_syscalls.insert(__NR_prlimit64); // get/set resource limits + } + + void AllowGlobalSystemStatus() + { + allowed_syscalls.insert(__NR_sysinfo); // return system information + allowed_syscalls.insert(__NR_uname); // get name and information about current kernel + } + + void AllowKernelInternalApi() + { + allowed_syscalls.insert(__NR_restart_syscall); // restart a system call after interruption by a stop signal + } + + void AllowNetwork() + { + allowed_syscalls.insert(__NR_accept); // accept a connection on a socket + allowed_syscalls.insert(__NR_accept4); // accept a connection on a socket + allowed_syscalls.insert(__NR_bind); // bind a name to a socket + 
allowed_syscalls.insert(__NR_connect); // initiate a connection on a socket + allowed_syscalls.insert(__NR_listen); // listen for connections on a socket + allowed_syscalls.insert(__NR_setsockopt); // set options on sockets + allowed_syscalls.insert(__NR_socket); // create an endpoint for communication + allowed_syscalls.insert(__NR_socketpair); // create a pair of connected sockets + } + + void AllowNetworkSocketInformation() + { + allowed_syscalls.insert(__NR_getpeername); // get name of connected peer socket + allowed_syscalls.insert(__NR_getsockname); // get socket name + allowed_syscalls.insert(__NR_getsockopt); // get options on sockets + } + + void AllowOperationOnExistingFileDescriptor() + { + allowed_syscalls.insert(__NR_close); // close a file descriptor + allowed_syscalls.insert(__NR_dup); // duplicate a file descriptor + allowed_syscalls.insert(__NR_dup2); // duplicate a file descriptor + allowed_syscalls.insert(__NR_fcntl); // manipulate file descriptor + allowed_syscalls.insert(__NR_shutdown); // shut down part of a full-duplex connection + } + + void AllowPipe() + { + allowed_syscalls.insert(__NR_pipe); // create pipe + allowed_syscalls.insert(__NR_pipe2); // create pipe + } + + void AllowPrctl() + { + allowed_syscalls.insert(__NR_arch_prctl); // set architecture-specific thread state + allowed_syscalls.insert(__NR_prctl); // operations on a process + } + + void AllowProcessStartOrDeath() + { + allowed_syscalls.insert(__NR_clone); // create a child process + allowed_syscalls.insert(__NR_exit); // terminate the calling process + allowed_syscalls.insert(__NR_exit_group); // exit all threads in a process + allowed_syscalls.insert(__NR_fork); // create a child process + allowed_syscalls.insert(__NR_tgkill); // send a signal to a thread + allowed_syscalls.insert(__NR_wait4); // wait for process to change state, BSD style + } + + void AllowScheduling() + { + allowed_syscalls.insert(__NR_sched_getaffinity); // get a thread's CPU affinity mask + 
allowed_syscalls.insert(__NR_sched_getparam); // get scheduling parameters + allowed_syscalls.insert(__NR_sched_getscheduler); // get scheduling policy/parameters + allowed_syscalls.insert(__NR_sched_setscheduler); // set scheduling policy/parameters + allowed_syscalls.insert(__NR_sched_yield); // yield the processor + } + + void AllowSignalHandling() + { + allowed_syscalls.insert(__NR_rt_sigaction); // examine and change a signal action + allowed_syscalls.insert(__NR_rt_sigprocmask); // examine and change blocked signals + allowed_syscalls.insert(__NR_rt_sigreturn); // return from signal handler and cleanup stack frame + allowed_syscalls.insert(__NR_sigaltstack); // set and/or get signal stack context + } + + void AllowSleep() + { + allowed_syscalls.insert(__NR_clock_nanosleep); // high-resolution sleep with specifiable clock + allowed_syscalls.insert(__NR_nanosleep); // high-resolution sleep + } + + void AllowUmask() + { + allowed_syscalls.insert(__NR_umask); // set file mode creation mask + } + + // See Linux kernel developer Kees Cook's seccomp guide at <https://outflux.net/teach-seccomp/> + // for an accessible introduction to using seccomp. + // + // This function largely follows <https://outflux.net/teach-seccomp/step-3/seccomp-bpf.h>. + std::vector<sock_filter> BuildFilter(SyscallSandboxAction default_action) + { + std::vector<sock_filter> bpf_policy; + // See VALIDATE_ARCHITECTURE in seccomp-bpf.h referenced above. + bpf_policy.push_back(BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct seccomp_data, arch))); + // Portability note: AUDIT_ARCH_X86_64 is Linux x86_64 specific. + bpf_policy.push_back(BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, AUDIT_ARCH_X86_64, 1, 0)); + bpf_policy.push_back(BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_KILL_PROCESS)); + // See EXAMINE_SYSCALL in seccomp-bpf.h referenced above. 
+ bpf_policy.push_back(BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct seccomp_data, nr))); + for (const uint32_t allowed_syscall : allowed_syscalls) { + // See ALLOW_SYSCALL in seccomp-bpf.h referenced above. + bpf_policy.push_back(BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, allowed_syscall, 0, 1)); + bpf_policy.push_back(BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW)); + } + switch (default_action) { + case SyscallSandboxAction::KILL_PROCESS: + // Disallow syscall and kill the process. + // + // See KILL_PROCESS in seccomp-bpf.h referenced above. + // + // Note that we're using SECCOMP_RET_KILL_PROCESS (kill the process) instead + // of SECCOMP_RET_KILL_THREAD (kill the thread). The SECCOMP_RET_KILL_PROCESS + // action was introduced in Linux 4.14. + // + // SECCOMP_RET_KILL_PROCESS: Results in the entire process exiting immediately without + // executing the system call. + // + // SECCOMP_RET_KILL_PROCESS documentation: + // <https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html> + bpf_policy.push_back(BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_KILL_PROCESS)); + break; + case SyscallSandboxAction::INVOKE_SIGNAL_HANDLER: + // Disallow syscall and force a SIGSYS to trigger syscall debug reporter. + // + // SECCOMP_RET_TRAP: Results in the kernel sending a SIGSYS signal to the triggering + // task without executing the system call. + // + // SECCOMP_RET_TRAP documentation: + // <https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html> + bpf_policy.push_back(BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_TRAP)); + break; + } + return bpf_policy; + } +}; +} // namespace + +bool SetupSyscallSandbox(bool log_syscall_violation_before_terminating) +{ + assert(!g_syscall_sandbox_enabled && "SetupSyscallSandbox(...) 
should only be called once."); + g_syscall_sandbox_enabled = true; + g_syscall_sandbox_log_violation_before_terminating = log_syscall_violation_before_terminating; + if (log_syscall_violation_before_terminating) { + if (!SetupSyscallSandboxDebugHandler()) { + return false; + } + } + SetSyscallSandboxPolicy(SyscallSandboxPolicy::INITIALIZATION); + return true; +} + +void TestDisallowedSandboxCall() +{ + // The getgroups syscall is assumed NOT to be allowed by the syscall sandbox policy. + std::array<gid_t, 1> groups; + [[maybe_unused]] int32_t ignored = getgroups(groups.size(), groups.data()); +} +#endif // defined(USE_SYSCALL_SANDBOX) + +void SetSyscallSandboxPolicy(SyscallSandboxPolicy syscall_policy) +{ +#if defined(USE_SYSCALL_SANDBOX) + if (!g_syscall_sandbox_enabled) { + return; + } + SeccompPolicyBuilder seccomp_policy_builder; + switch (syscall_policy) { + case SyscallSandboxPolicy::INITIALIZATION: // Thread: main thread (state: init) + // SyscallSandboxPolicy::INITIALIZATION is the first policy loaded. + // + // Subsequently loaded policies can reduce the abilities further, but + // abilities can never be regained. + // + // SyscallSandboxPolicy::INITIALIZATION must thus be a superset of all + // other policies. 
+ seccomp_policy_builder.AllowFileSystem(); + seccomp_policy_builder.AllowNetwork(); + break; + case SyscallSandboxPolicy::INITIALIZATION_DNS_SEED: // Thread: dnsseed + seccomp_policy_builder.AllowFileSystem(); + seccomp_policy_builder.AllowNetwork(); + break; + case SyscallSandboxPolicy::INITIALIZATION_LOAD_BLOCKS: // Thread: loadblk + seccomp_policy_builder.AllowFileSystem(); + break; + case SyscallSandboxPolicy::INITIALIZATION_MAP_PORT: // Thread: mapport + seccomp_policy_builder.AllowFileSystem(); + seccomp_policy_builder.AllowNetwork(); + break; + case SyscallSandboxPolicy::MESSAGE_HANDLER: // Thread: msghand + seccomp_policy_builder.AllowFileSystem(); + break; + case SyscallSandboxPolicy::NET: // Thread: net + seccomp_policy_builder.AllowFileSystem(); + seccomp_policy_builder.AllowNetwork(); + break; + case SyscallSandboxPolicy::NET_ADD_CONNECTION: // Thread: addcon + seccomp_policy_builder.AllowFileSystem(); + seccomp_policy_builder.AllowNetwork(); + break; + case SyscallSandboxPolicy::NET_HTTP_SERVER: // Thread: http + seccomp_policy_builder.AllowFileSystem(); + seccomp_policy_builder.AllowNetwork(); + break; + case SyscallSandboxPolicy::NET_HTTP_SERVER_WORKER: // Thread: httpworker.<N> + seccomp_policy_builder.AllowFileSystem(); + seccomp_policy_builder.AllowNetwork(); + break; + case SyscallSandboxPolicy::NET_OPEN_CONNECTION: // Thread: opencon + seccomp_policy_builder.AllowFileSystem(); + seccomp_policy_builder.AllowNetwork(); + break; + case SyscallSandboxPolicy::SCHEDULER: // Thread: scheduler + seccomp_policy_builder.AllowFileSystem(); + break; + case SyscallSandboxPolicy::TOR_CONTROL: // Thread: torcontrol + seccomp_policy_builder.AllowFileSystem(); + seccomp_policy_builder.AllowNetwork(); + break; + case SyscallSandboxPolicy::TX_INDEX: // Thread: txindex + seccomp_policy_builder.AllowFileSystem(); + break; + case SyscallSandboxPolicy::VALIDATION_SCRIPT_CHECK: // Thread: scriptch.<N> + break; + case SyscallSandboxPolicy::SHUTOFF: // Thread: main 
thread (state: shutoff) + seccomp_policy_builder.AllowFileSystem(); + break; + } + + const SyscallSandboxAction default_action = g_syscall_sandbox_log_violation_before_terminating ? SyscallSandboxAction::INVOKE_SIGNAL_HANDLER : SyscallSandboxAction::KILL_PROCESS; + std::vector<sock_filter> filter = seccomp_policy_builder.BuildFilter(default_action); + const sock_fprog prog = { + .len = static_cast<uint16_t>(filter.size()), + .filter = filter.data(), + }; + // Do not allow abilities to be regained after being dropped. + // + // PR_SET_NO_NEW_PRIVS documentation: <https://www.kernel.org/doc/html/latest/userspace-api/no_new_privs.html> + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) { + throw std::runtime_error("Syscall sandbox enforcement failed: prctl(PR_SET_NO_NEW_PRIVS)"); + } + // Install seccomp-bpf syscall filter. + // + // PR_SET_SECCOMP documentation: <https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html> + if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) != 0) { + throw std::runtime_error("Syscall sandbox enforcement failed: prctl(PR_SET_SECCOMP)"); + } + + const std::string thread_name = !util::ThreadGetInternalName().empty() ? util::ThreadGetInternalName() : "*unnamed*"; + LogPrint(BCLog::UTIL, "Syscall filter installed for thread \"%s\"\n", thread_name); +#endif // defined(USE_SYSCALL_SANDBOX) +} diff --git a/src/util/syscall_sandbox.h b/src/util/syscall_sandbox.h new file mode 100644 index 0000000000..0a0c964f94 --- /dev/null +++ b/src/util/syscall_sandbox.h @@ -0,0 +1,57 @@ +// Copyright (c) 2020 The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#ifndef BITCOIN_UTIL_SYSCALL_SANDBOX_H +#define BITCOIN_UTIL_SYSCALL_SANDBOX_H + +enum class SyscallSandboxPolicy { + // 1. Initialization + INITIALIZATION, + INITIALIZATION_DNS_SEED, + INITIALIZATION_LOAD_BLOCKS, + INITIALIZATION_MAP_PORT, + + // 2. 
Steady state (non-initialization, non-shutdown) + MESSAGE_HANDLER, + NET, + NET_ADD_CONNECTION, + NET_HTTP_SERVER, + NET_HTTP_SERVER_WORKER, + NET_OPEN_CONNECTION, + SCHEDULER, + TOR_CONTROL, + TX_INDEX, + VALIDATION_SCRIPT_CHECK, + + // 3. Shutdown + SHUTOFF, +}; + +//! Force the current thread (and threads created from the current thread) into a restricted-service +//! operating mode where only a subset of all syscalls are available. +//! +//! Subsequent calls to this function can reduce the abilities further, but abilities can never be +//! regained. +//! +//! This function is a no-op unless SetupSyscallSandbox(...) has been called. +//! +//! SetupSyscallSandbox(...) is called during bitcoind initialization if Bitcoin Core was compiled +//! with seccomp-bpf support (--with-seccomp) *and* the parameter -sandbox=<mode> was passed to +//! bitcoind. +//! +//! This experimental feature is available under Linux x86_64 only. +void SetSyscallSandboxPolicy(SyscallSandboxPolicy syscall_policy); + +#if defined(USE_SYSCALL_SANDBOX) +//! Setup and enable the experimental syscall sandbox for the running process. +//! +//! SetSyscallSandboxPolicy(SyscallSandboxPolicy::INITIALIZATION) is called as part of +//! SetupSyscallSandbox(...). +[[nodiscard]] bool SetupSyscallSandbox(bool log_syscall_violation_before_terminating); + +//! Invoke a disallowed syscall. Use for testing purposes. 
+void TestDisallowedSandboxCall(); +#endif // defined(USE_SYSCALL_SANDBOX) + +#endif // BITCOIN_UTIL_SYSCALL_SANDBOX_H diff --git a/test/config.ini.in b/test/config.ini.in index db80bba6f1..8bcba1b39c 100644 --- a/test/config.ini.in +++ b/test/config.ini.in @@ -24,3 +24,4 @@ RPCAUTH=@abs_top_srcdir@/share/rpcauth/rpcauth.py @ENABLE_FUZZ_TRUE@ENABLE_FUZZ=true @ENABLE_ZMQ_TRUE@ENABLE_ZMQ=true @ENABLE_EXTERNAL_SIGNER_TRUE@ENABLE_EXTERNAL_SIGNER=true +@ENABLE_SYSCALL_SANDBOX_TRUE@ENABLE_SYSCALL_SANDBOX=true diff --git a/test/functional/feature_notifications.py b/test/functional/feature_notifications.py index 5ef3860867..48d41432be 100755 --- a/test/functional/feature_notifications.py +++ b/test/functional/feature_notifications.py @@ -27,6 +27,9 @@ class NotificationsTest(BitcoinTestFramework): def set_test_params(self): self.num_nodes = 2 self.setup_clean_chain = True + # The experimental syscall sandbox feature (-sandbox) is not compatible with -alertnotify, + # -blocknotify or -walletnotify (which all invoke execve). + self.disable_syscall_sandbox = True def setup_network(self): self.wallet = ''.join(chr(i) for i in range(FILE_CHAR_START, FILE_CHAR_END) if chr(i) not in FILE_CHARS_DISALLOWED) diff --git a/test/functional/feature_syscall_sandbox.py b/test/functional/feature_syscall_sandbox.py new file mode 100755 index 0000000000..caf7f1e7fc --- /dev/null +++ b/test/functional/feature_syscall_sandbox.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +# Copyright (c) 2021 The Bitcoin Core developers +# Distributed under the MIT software license, see the accompanying +# file COPYING or http://www.opensource.org/licenses/mit-license.php. 
+"""Test bitcoind aborts if a disallowed syscall is used when compiled with the syscall sandbox.""" + +from test_framework.test_framework import BitcoinTestFramework, SkipTest + + +class SyscallSandboxTest(BitcoinTestFramework): + def set_test_params(self): + self.num_nodes = 1 + + def skip_test_if_missing_module(self): + if not self.is_syscall_sandbox_compiled(): + raise SkipTest("bitcoind has not been built with syscall sandbox enabled.") + if self.options.nosandbox: + raise SkipTest("--nosandbox passed to test runner.") + + def run_test(self): + disallowed_syscall_terminated_bitcoind = False + expected_log_entry = 'ERROR: The syscall "getgroups" (syscall number 115) is not allowed by the syscall sandbox' + with self.nodes[0].assert_debug_log([expected_log_entry]): + self.log.info("Invoking disallowed syscall") + try: + self.nodes[0].invokedisallowedsyscall() + except ConnectionError: + disallowed_syscall_terminated_bitcoind = True + assert disallowed_syscall_terminated_bitcoind + self.nodes = [] + + +if __name__ == "__main__": + SyscallSandboxTest().main() diff --git a/test/functional/feature_versionbits_warning.py b/test/functional/feature_versionbits_warning.py index 311d871d49..d74ef5e088 100755 --- a/test/functional/feature_versionbits_warning.py +++ b/test/functional/feature_versionbits_warning.py @@ -28,6 +28,9 @@ class VersionBitsWarningTest(BitcoinTestFramework): def set_test_params(self): self.setup_clean_chain = True self.num_nodes = 1 + # The experimental syscall sandbox feature (-sandbox) is not compatible with -alertnotify + # (which invokes execve). 
+ self.disable_syscall_sandbox = True def setup_network(self): self.alert_filename = os.path.join(self.options.tmpdir, "alert.txt") diff --git a/test/functional/rpc_misc.py b/test/functional/rpc_misc.py index 13f33c321f..ac2a7a309b 100755 --- a/test/functional/rpc_misc.py +++ b/test/functional/rpc_misc.py @@ -57,7 +57,7 @@ class RpcMiscTest(BitcoinTestFramework): self.log.info("test logging rpc and help") # Test logging RPC returns the expected number of logging categories. - assert_equal(len(node.logging()), 25) + assert_equal(len(node.logging()), 26) # Test toggling a logging category on/off/on with the logging RPC. assert_equal(node.logging()['qt'], True) diff --git a/test/functional/rpc_signer.py b/test/functional/rpc_signer.py index 9e963eba57..5c3722ef8f 100755 --- a/test/functional/rpc_signer.py +++ b/test/functional/rpc_signer.py @@ -27,6 +27,9 @@ class RPCSignerTest(BitcoinTestFramework): def set_test_params(self): self.num_nodes = 4 + # The experimental syscall sandbox feature (-sandbox) is not compatible with -signer (which + # invokes execve). 
+ self.disable_syscall_sandbox = True self.extra_args = [ [], diff --git a/test/functional/test_framework/test_framework.py b/test/functional/test_framework/test_framework.py index f382e0fdb3..0fad4071e2 100755 --- a/test/functional/test_framework/test_framework.py +++ b/test/functional/test_framework/test_framework.py @@ -101,6 +101,7 @@ class BitcoinTestFramework(metaclass=BitcoinTestMetaClass): self.supports_cli = True self.bind_to_localhost_only = True self.parse_args() + self.disable_syscall_sandbox = self.options.nosandbox self.default_wallet_name = "default_wallet" if self.options.descriptors else "" self.wallet_data_filename = "wallet.dat" # Optional list of wallet names that can be set in set_test_params to @@ -159,6 +160,8 @@ class BitcoinTestFramework(metaclass=BitcoinTestMetaClass): parser = argparse.ArgumentParser(usage="%(prog)s [options]") parser.add_argument("--nocleanup", dest="nocleanup", default=False, action="store_true", help="Leave bitcoinds and test.* datadir on exit or error") + parser.add_argument("--nosandbox", dest="nosandbox", default=False, action="store_true", + help="Don't use the syscall sandbox") parser.add_argument("--noshutdown", dest="noshutdown", default=False, action="store_true", help="Don't stop bitcoinds after the test execution") parser.add_argument("--cachedir", dest="cachedir", default=os.path.abspath(os.path.dirname(os.path.realpath(__file__)) + "/../../cache"), @@ -468,6 +471,10 @@ class BitcoinTestFramework(metaclass=BitcoinTestMetaClass): extra_args = [[]] * num_nodes if versions is None: versions = [None] * num_nodes + if self.is_syscall_sandbox_compiled() and not self.disable_syscall_sandbox: + for i in range(len(extra_args)): + if versions[i] is None or versions[i] >= 219900: + extra_args[i] = extra_args[i] + ["-sandbox=log-and-abort"] if binary is None: binary = [get_bin_from_version(v, 'bitcoind', self.options.bitcoind) for v in versions] if binary_cli is None: @@ -886,3 +893,7 @@ class 
BitcoinTestFramework(metaclass=BitcoinTestMetaClass): def is_bdb_compiled(self): """Checks whether the wallet module was compiled with BDB support.""" return self.config["components"].getboolean("USE_BDB") + + def is_syscall_sandbox_compiled(self): + """Checks whether the syscall sandbox was compiled.""" + return self.config["components"].getboolean("ENABLE_SYSCALL_SANDBOX") diff --git a/test/functional/test_runner.py b/test/functional/test_runner.py index 3792d751de..c5af10430c 100755 --- a/test/functional/test_runner.py +++ b/test/functional/test_runner.py @@ -170,6 +170,7 @@ BASE_SCRIPTS = [ 'rpc_users.py', 'rpc_whitelist.py', 'feature_proxy.py', + 'feature_syscall_sandbox.py', 'rpc_signrawtransaction.py --legacy-wallet', 'rpc_signrawtransaction.py --descriptors', 'rpc_rawtransaction.py --legacy-wallet', diff --git a/test/functional/wallet_signer.py b/test/functional/wallet_signer.py index 7b77755d64..c6c1cc8784 100755 --- a/test/functional/wallet_signer.py +++ b/test/functional/wallet_signer.py @@ -27,6 +27,9 @@ class WalletSignerTest(BitcoinTestFramework): def set_test_params(self): self.num_nodes = 2 + # The experimental syscall sandbox feature (-sandbox) is not compatible with -signer (which + # invokes execve). + self.disable_syscall_sandbox = True self.extra_args = [ [], |