diff options
Diffstat (limited to 'src/util')
-rw-r--r-- | src/util/syscall_sandbox.cpp | 882 | ||||
-rw-r--r-- | src/util/syscall_sandbox.h | 57 |
2 files changed, 939 insertions, 0 deletions
diff --git a/src/util/syscall_sandbox.cpp b/src/util/syscall_sandbox.cpp new file mode 100644 index 0000000000..c4006cbd3c --- /dev/null +++ b/src/util/syscall_sandbox.cpp @@ -0,0 +1,882 @@ +// Copyright (c) 2020 The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#if defined(HAVE_CONFIG_H) +#include <config/bitcoin-config.h> +#endif // defined(HAVE_CONFIG_H) + +#include <util/syscall_sandbox.h> + +#if defined(USE_SYSCALL_SANDBOX) +#include <array> +#include <cassert> +#include <cstdint> +#include <exception> +#include <map> +#include <new> +#include <set> +#include <string> +#include <vector> + +#include <logging.h> +#include <tinyformat.h> +#include <util/threadnames.h> + +#include <linux/audit.h> +#include <linux/filter.h> +#include <linux/seccomp.h> +#include <linux/unistd.h> +#include <signal.h> +#include <sys/prctl.h> +#include <sys/types.h> +#include <unistd.h> + +namespace { +bool g_syscall_sandbox_enabled{false}; +bool g_syscall_sandbox_log_violation_before_terminating{false}; + +#if !defined(__x86_64__) +#error Syscall sandbox is an experimental feature currently available only under Linux x86-64. +#endif // defined(__x86_64__) + +// This list of syscalls in LINUX_SYSCALLS is only used to map syscall numbers to syscall names in +// order to be able to print user friendly error messages which include the syscall name in addition +// to the syscall number. +// +// Example output in case of a syscall violation where the syscall is present in LINUX_SYSCALLS: +// +// ``` +// 2021-06-09T12:34:56Z ERROR: The syscall "execve" (syscall number 59) is not allowed by the syscall sandbox in thread "msghand". Please report. +// ``` +// +// Example output in case of a syscall violation where the syscall is not present in LINUX_SYSCALLS: +// +// ``` +// 2021-06-09T12:34:56Z ERROR: The syscall "*unknown*" (syscall number 314) is not allowed by the syscall sandbox in thread "msghand". Please report. +// `` +// +// LINUX_SYSCALLS contains two types of syscalls: +// 1.) Syscalls that are present under all architectures or relevant Linux kernel versions for which +// we support the syscall sandbox feature (currently only Linux x86-64). Examples include read, +// write, open, close, etc. +// 2.) Syscalls that are present under a subset of architectures or relevant Linux kernel versions +// for which we support the syscall sandbox feature. This type of syscalls should be added to +// LINUX_SYSCALLS conditional on availability like in the following example: +// ... +// #if defined(__NR_arch_dependent_syscall) +// {__NR_arch_dependent_syscall, "arch_dependent_syscall"}, +// #endif // defined(__NR_arch_dependent_syscall) +// ... +const std::map<uint32_t, std::string> LINUX_SYSCALLS{ + {__NR_accept, "accept"}, + {__NR_accept4, "accept4"}, + {__NR_access, "access"}, + {__NR_acct, "acct"}, + {__NR_add_key, "add_key"}, + {__NR_adjtimex, "adjtimex"}, + {__NR_afs_syscall, "afs_syscall"}, + {__NR_alarm, "alarm"}, + {__NR_arch_prctl, "arch_prctl"}, + {__NR_bind, "bind"}, + {__NR_bpf, "bpf"}, + {__NR_brk, "brk"}, + {__NR_capget, "capget"}, + {__NR_capset, "capset"}, + {__NR_chdir, "chdir"}, + {__NR_chmod, "chmod"}, + {__NR_chown, "chown"}, + {__NR_chroot, "chroot"}, + {__NR_clock_adjtime, "clock_adjtime"}, + {__NR_clock_getres, "clock_getres"}, + {__NR_clock_gettime, "clock_gettime"}, + {__NR_clock_nanosleep, "clock_nanosleep"}, + {__NR_clock_settime, "clock_settime"}, + {__NR_clone, "clone"}, + {__NR_close, "close"}, + {__NR_connect, "connect"}, + {__NR_copy_file_range, "copy_file_range"}, + {__NR_creat, "creat"}, + {__NR_create_module, "create_module"}, + {__NR_delete_module, "delete_module"}, + {__NR_dup, "dup"}, + {__NR_dup2, "dup2"}, + {__NR_dup3, "dup3"}, + {__NR_epoll_create, "epoll_create"}, + {__NR_epoll_create1, "epoll_create1"}, + {__NR_epoll_ctl, "epoll_ctl"}, + {__NR_epoll_ctl_old, "epoll_ctl_old"}, + {__NR_epoll_pwait, "epoll_pwait"}, + {__NR_epoll_wait, "epoll_wait"}, + {__NR_epoll_wait_old, "epoll_wait_old"}, + {__NR_eventfd, "eventfd"}, + {__NR_eventfd2, "eventfd2"}, + {__NR_execve, "execve"}, + {__NR_execveat, "execveat"}, + {__NR_exit, "exit"}, + {__NR_exit_group, "exit_group"}, + {__NR_faccessat, "faccessat"}, + {__NR_fadvise64, "fadvise64"}, + {__NR_fallocate, "fallocate"}, + {__NR_fanotify_init, "fanotify_init"}, + {__NR_fanotify_mark, "fanotify_mark"}, + {__NR_fchdir, "fchdir"}, + {__NR_fchmod, "fchmod"}, + {__NR_fchmodat, "fchmodat"}, + {__NR_fchown, "fchown"}, + {__NR_fchownat, "fchownat"}, + {__NR_fcntl, "fcntl"}, + {__NR_fdatasync, "fdatasync"}, + {__NR_fgetxattr, "fgetxattr"}, + {__NR_finit_module, "finit_module"}, + {__NR_flistxattr, "flistxattr"}, + {__NR_flock, "flock"}, + {__NR_fork, "fork"}, + {__NR_fremovexattr, "fremovexattr"}, + {__NR_fsetxattr, "fsetxattr"}, + {__NR_fstat, "fstat"}, + {__NR_fstatfs, "fstatfs"}, + {__NR_fsync, "fsync"}, + {__NR_ftruncate, "ftruncate"}, + {__NR_futex, "futex"}, + {__NR_futimesat, "futimesat"}, + {__NR_getcpu, "getcpu"}, + {__NR_getcwd, "getcwd"}, + {__NR_getdents, "getdents"}, + {__NR_getdents64, "getdents64"}, + {__NR_getegid, "getegid"}, + {__NR_geteuid, "geteuid"}, + {__NR_getgid, "getgid"}, + {__NR_getgroups, "getgroups"}, + {__NR_getitimer, "getitimer"}, + {__NR_get_kernel_syms, "get_kernel_syms"}, + {__NR_get_mempolicy, "get_mempolicy"}, + {__NR_getpeername, "getpeername"}, + {__NR_getpgid, "getpgid"}, + {__NR_getpgrp, "getpgrp"}, + {__NR_getpid, "getpid"}, + {__NR_getpmsg, "getpmsg"}, + {__NR_getppid, "getppid"}, + {__NR_getpriority, "getpriority"}, +#if defined(__NR_getrandom) + {__NR_getrandom, "getrandom"}, +#endif // defined(__NR_getrandom) + {__NR_getresgid, "getresgid"}, + {__NR_getresuid, "getresuid"}, + {__NR_getrlimit, "getrlimit"}, + {__NR_get_robust_list, "get_robust_list"}, + {__NR_getrusage, "getrusage"}, + {__NR_getsid, "getsid"}, + {__NR_getsockname, "getsockname"}, + {__NR_getsockopt, "getsockopt"}, + {__NR_get_thread_area, "get_thread_area"}, + {__NR_gettid, "gettid"}, + {__NR_gettimeofday, "gettimeofday"}, + {__NR_getuid, "getuid"}, + {__NR_getxattr, "getxattr"}, + {__NR_init_module, "init_module"}, + {__NR_inotify_add_watch, "inotify_add_watch"}, + {__NR_inotify_init, "inotify_init"}, + {__NR_inotify_init1, "inotify_init1"}, + {__NR_inotify_rm_watch, "inotify_rm_watch"}, + {__NR_io_cancel, "io_cancel"}, + {__NR_ioctl, "ioctl"}, + {__NR_io_destroy, "io_destroy"}, + {__NR_io_getevents, "io_getevents"}, + {__NR_ioperm, "ioperm"}, + {__NR_iopl, "iopl"}, + {__NR_ioprio_get, "ioprio_get"}, + {__NR_ioprio_set, "ioprio_set"}, + {__NR_io_setup, "io_setup"}, + {__NR_io_submit, "io_submit"}, + {__NR_kcmp, "kcmp"}, + {__NR_kexec_file_load, "kexec_file_load"}, + {__NR_kexec_load, "kexec_load"}, + {__NR_keyctl, "keyctl"}, + {__NR_kill, "kill"}, + {__NR_lchown, "lchown"}, + {__NR_lgetxattr, "lgetxattr"}, + {__NR_link, "link"}, + {__NR_linkat, "linkat"}, + {__NR_listen, "listen"}, + {__NR_listxattr, "listxattr"}, + {__NR_llistxattr, "llistxattr"}, + {__NR_lookup_dcookie, "lookup_dcookie"}, + {__NR_lremovexattr, "lremovexattr"}, + {__NR_lseek, "lseek"}, + {__NR_lsetxattr, "lsetxattr"}, + {__NR_lstat, "lstat"}, + {__NR_madvise, "madvise"}, + {__NR_mbind, "mbind"}, +#if defined(__NR_membarrier) + {__NR_membarrier, "membarrier"}, +#endif // defined(__NR_membarrier) + {__NR_memfd_create, "memfd_create"}, + {__NR_migrate_pages, "migrate_pages"}, + {__NR_mincore, "mincore"}, + {__NR_mkdir, "mkdir"}, + {__NR_mkdirat, "mkdirat"}, + {__NR_mknod, "mknod"}, + {__NR_mknodat, "mknodat"}, + {__NR_mlock, "mlock"}, + {__NR_mlock2, "mlock2"}, + {__NR_mlockall, "mlockall"}, + {__NR_mmap, "mmap"}, + {__NR_modify_ldt, "modify_ldt"}, + {__NR_mount, "mount"}, + {__NR_move_pages, "move_pages"}, + {__NR_mprotect, "mprotect"}, + {__NR_mq_getsetattr, "mq_getsetattr"}, + {__NR_mq_notify, "mq_notify"}, + {__NR_mq_open, "mq_open"}, + {__NR_mq_timedreceive, "mq_timedreceive"}, + {__NR_mq_timedsend, "mq_timedsend"}, + {__NR_mq_unlink, "mq_unlink"}, + {__NR_mremap, "mremap"}, + {__NR_msgctl, "msgctl"}, + {__NR_msgget, "msgget"}, + {__NR_msgrcv, "msgrcv"}, + {__NR_msgsnd, "msgsnd"}, + {__NR_msync, "msync"}, + {__NR_munlock, "munlock"}, + {__NR_munlockall, "munlockall"}, + {__NR_munmap, "munmap"}, + {__NR_name_to_handle_at, "name_to_handle_at"}, + {__NR_nanosleep, "nanosleep"}, + {__NR_newfstatat, "newfstatat"}, + {__NR_nfsservctl, "nfsservctl"}, + {__NR_open, "open"}, + {__NR_openat, "openat"}, + {__NR_open_by_handle_at, "open_by_handle_at"}, + {__NR_pause, "pause"}, + {__NR_perf_event_open, "perf_event_open"}, + {__NR_personality, "personality"}, + {__NR_pipe, "pipe"}, + {__NR_pipe2, "pipe2"}, + {__NR_pivot_root, "pivot_root"}, + {__NR_pkey_alloc, "pkey_alloc"}, + {__NR_pkey_free, "pkey_free"}, + {__NR_pkey_mprotect, "pkey_mprotect"}, + {__NR_poll, "poll"}, + {__NR_ppoll, "ppoll"}, + {__NR_prctl, "prctl"}, + {__NR_pread64, "pread64"}, + {__NR_preadv, "preadv"}, + {__NR_preadv2, "preadv2"}, + {__NR_prlimit64, "prlimit64"}, + {__NR_process_vm_readv, "process_vm_readv"}, + {__NR_process_vm_writev, "process_vm_writev"}, + {__NR_pselect6, "pselect6"}, + {__NR_ptrace, "ptrace"}, + {__NR_putpmsg, "putpmsg"}, + {__NR_pwrite64, "pwrite64"}, + {__NR_pwritev, "pwritev"}, + {__NR_pwritev2, "pwritev2"}, + {__NR_query_module, "query_module"}, + {__NR_quotactl, "quotactl"}, + {__NR_read, "read"}, + {__NR_readahead, "readahead"}, + {__NR_readlink, "readlink"}, + {__NR_readlinkat, "readlinkat"}, + {__NR_readv, "readv"}, + {__NR_reboot, "reboot"}, + {__NR_recvfrom, "recvfrom"}, + {__NR_recvmmsg, "recvmmsg"}, + {__NR_recvmsg, "recvmsg"}, + {__NR_remap_file_pages, "remap_file_pages"}, + {__NR_removexattr, "removexattr"}, + {__NR_rename, "rename"}, + {__NR_renameat, "renameat"}, + {__NR_renameat2, "renameat2"}, + {__NR_request_key, "request_key"}, + {__NR_restart_syscall, "restart_syscall"}, + {__NR_rmdir, "rmdir"}, + {__NR_rt_sigaction, "rt_sigaction"}, + {__NR_rt_sigpending, "rt_sigpending"}, + {__NR_rt_sigprocmask, "rt_sigprocmask"}, + {__NR_rt_sigqueueinfo, "rt_sigqueueinfo"}, + {__NR_rt_sigreturn, "rt_sigreturn"}, + {__NR_rt_sigsuspend, "rt_sigsuspend"}, + {__NR_rt_sigtimedwait, "rt_sigtimedwait"}, + {__NR_rt_tgsigqueueinfo, "rt_tgsigqueueinfo"}, + {__NR_sched_getaffinity, "sched_getaffinity"}, + {__NR_sched_getattr, "sched_getattr"}, + {__NR_sched_getparam, "sched_getparam"}, + {__NR_sched_get_priority_max, "sched_get_priority_max"}, + {__NR_sched_get_priority_min, "sched_get_priority_min"}, + {__NR_sched_getscheduler, "sched_getscheduler"}, + {__NR_sched_rr_get_interval, "sched_rr_get_interval"}, + {__NR_sched_setaffinity, "sched_setaffinity"}, + {__NR_sched_setattr, "sched_setattr"}, + {__NR_sched_setparam, "sched_setparam"}, + {__NR_sched_setscheduler, "sched_setscheduler"}, + {__NR_sched_yield, "sched_yield"}, + {__NR_seccomp, "seccomp"}, + {__NR_security, "security"}, + {__NR_select, "select"}, + {__NR_semctl, "semctl"}, + {__NR_semget, "semget"}, + {__NR_semop, "semop"}, + {__NR_semtimedop, "semtimedop"}, + {__NR_sendfile, "sendfile"}, + {__NR_sendmmsg, "sendmmsg"}, + {__NR_sendmsg, "sendmsg"}, + {__NR_sendto, "sendto"}, + {__NR_setdomainname, "setdomainname"}, + {__NR_setfsgid, "setfsgid"}, + {__NR_setfsuid, "setfsuid"}, + {__NR_setgid, "setgid"}, + {__NR_setgroups, "setgroups"}, + {__NR_sethostname, "sethostname"}, + {__NR_setitimer, "setitimer"}, + {__NR_set_mempolicy, "set_mempolicy"}, + {__NR_setns, "setns"}, + {__NR_setpgid, "setpgid"}, + {__NR_setpriority, "setpriority"}, + {__NR_setregid, "setregid"}, + {__NR_setresgid, "setresgid"}, + {__NR_setresuid, "setresuid"}, + {__NR_setreuid, "setreuid"}, + {__NR_setrlimit, "setrlimit"}, + {__NR_set_robust_list, "set_robust_list"}, + {__NR_setsid, "setsid"}, + {__NR_setsockopt, "setsockopt"}, + {__NR_set_thread_area, "set_thread_area"}, + {__NR_set_tid_address, "set_tid_address"}, + {__NR_settimeofday, "settimeofday"}, + {__NR_setuid, "setuid"}, + {__NR_setxattr, "setxattr"}, + {__NR_shmat, "shmat"}, + {__NR_shmctl, "shmctl"}, + {__NR_shmdt, "shmdt"}, + {__NR_shmget, "shmget"}, + {__NR_shutdown, "shutdown"}, + {__NR_sigaltstack, "sigaltstack"}, + {__NR_signalfd, "signalfd"}, + {__NR_signalfd4, "signalfd4"}, + {__NR_socket, "socket"}, + {__NR_socketpair, "socketpair"}, + {__NR_splice, "splice"}, + {__NR_stat, "stat"}, + {__NR_statfs, "statfs"}, + {__NR_statx, "statx"}, + {__NR_swapoff, "swapoff"}, + {__NR_swapon, "swapon"}, + {__NR_symlink, "symlink"}, + {__NR_symlinkat, "symlinkat"}, + {__NR_sync, "sync"}, + {__NR_sync_file_range, "sync_file_range"}, + {__NR_syncfs, "syncfs"}, + {__NR__sysctl, "_sysctl"}, + {__NR_sysfs, "sysfs"}, + {__NR_sysinfo, "sysinfo"}, + {__NR_syslog, "syslog"}, + {__NR_tee, "tee"}, + {__NR_tgkill, "tgkill"}, + {__NR_time, "time"}, + {__NR_timer_create, "timer_create"}, + {__NR_timer_delete, "timer_delete"}, + {__NR_timerfd_create, "timerfd_create"}, + {__NR_timerfd_gettime, "timerfd_gettime"}, + {__NR_timerfd_settime, "timerfd_settime"}, + {__NR_timer_getoverrun, "timer_getoverrun"}, + {__NR_timer_gettime, "timer_gettime"}, + {__NR_timer_settime, "timer_settime"}, + {__NR_times, "times"}, + {__NR_tkill, "tkill"}, + {__NR_truncate, "truncate"}, + {__NR_tuxcall, "tuxcall"}, + {__NR_umask, "umask"}, + {__NR_umount2, "umount2"}, + {__NR_uname, "uname"}, + {__NR_unlink, "unlink"}, + {__NR_unlinkat, "unlinkat"}, + {__NR_unshare, "unshare"}, + {__NR_uselib, "uselib"}, + {__NR_userfaultfd, "userfaultfd"}, + {__NR_ustat, "ustat"}, + {__NR_utime, "utime"}, + {__NR_utimensat, "utimensat"}, + {__NR_utimes, "utimes"}, + {__NR_vfork, "vfork"}, + {__NR_vhangup, "vhangup"}, + {__NR_vmsplice, "vmsplice"}, + {__NR_vserver, "vserver"}, + {__NR_wait4, "wait4"}, + {__NR_waitid, "waitid"}, + {__NR_write, "write"}, + {__NR_writev, "writev"}, +}; + +std::string GetLinuxSyscallName(uint32_t syscall_number) +{ + const auto element = LINUX_SYSCALLS.find(syscall_number); + if (element != LINUX_SYSCALLS.end()) { + return element->second; + } + return "*unknown*"; +} + +// See Linux kernel developer Kees Cook's seccomp guide at <https://outflux.net/teach-seccomp/> for +// an accessible introduction to using seccomp. +// +// This function largely follows <https://outflux.net/teach-seccomp/step-3/syscall-reporter.c> and +// <https://outflux.net/teach-seccomp/step-3/seccomp-bpf.h>. +// +// Seccomp BPF resources: +// * Seccomp BPF documentation: <https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html> +// * seccomp(2) manual page: <https://www.kernel.org/doc/man-pages/online/pages/man2/seccomp.2.html> +// * Seccomp BPF demo code samples: <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/samples/seccomp> +void SyscallSandboxDebugSignalHandler(int, siginfo_t* signal_info, void* void_signal_context) +{ + // The si_code field inside the siginfo_t argument that is passed to a SA_SIGINFO signal handler + // is a value indicating why the signal was sent. + // + // The following value can be placed in si_code for a SIGSYS signal: + // * SYS_SECCOMP (since Linux 3.5): Triggered by a seccomp(2) filter rule. + constexpr int32_t SYS_SECCOMP_SI_CODE{1}; + assert(signal_info->si_code == SYS_SECCOMP_SI_CODE); + + // The ucontext_t structure contains signal context information that was saved on the user-space + // stack by the kernel. + const ucontext_t* signal_context = static_cast<ucontext_t*>(void_signal_context); + assert(signal_context != nullptr); + + std::set_new_handler(std::terminate); + // Portability note: REG_RAX is Linux x86_64 specific. + const uint32_t syscall_number = static_cast<uint32_t>(signal_context->uc_mcontext.gregs[REG_RAX]); + const std::string syscall_name = GetLinuxSyscallName(syscall_number); + const std::string thread_name = !util::ThreadGetInternalName().empty() ? util::ThreadGetInternalName() : "*unnamed*"; + const std::string error_message = strprintf("ERROR: The syscall \"%s\" (syscall number %d) is not allowed by the syscall sandbox in thread \"%s\". Please report.", syscall_name, syscall_number, thread_name); + tfm::format(std::cerr, "%s\n", error_message); + LogPrintf("%s\n", error_message); + std::terminate(); +} + +// This function largely follows install_syscall_reporter from Kees Cook's seccomp guide: +// <https://outflux.net/teach-seccomp/step-3/syscall-reporter.c> +bool SetupSyscallSandboxDebugHandler() +{ + struct sigaction action = {}; + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, SIGSYS); + action.sa_sigaction = &SyscallSandboxDebugSignalHandler; + action.sa_flags = SA_SIGINFO; + if (sigaction(SIGSYS, &action, nullptr) < 0) { + return false; + } + if (sigprocmask(SIG_UNBLOCK, &mask, nullptr)) { + return false; + } + return true; +} + +enum class SyscallSandboxAction { + KILL_PROCESS, + INVOKE_SIGNAL_HANDLER, +}; + +class SeccompPolicyBuilder +{ + std::set<uint32_t> allowed_syscalls; + +public: + SeccompPolicyBuilder() + { + // Allowed by default. + AllowAddressSpaceAccess(); + AllowEpoll(); + AllowEventFd(); + AllowFutex(); + AllowGeneralIo(); + AllowGetRandom(); + AllowGetSimpleId(); + AllowGetTime(); + AllowGlobalProcessEnvironment(); + AllowGlobalSystemStatus(); + AllowKernelInternalApi(); + AllowNetworkSocketInformation(); + AllowOperationOnExistingFileDescriptor(); + AllowPipe(); + AllowPrctl(); + AllowProcessStartOrDeath(); + AllowScheduling(); + AllowSignalHandling(); + AllowSleep(); + AllowUmask(); + } + + void AllowAddressSpaceAccess() + { + allowed_syscalls.insert(__NR_brk); // change data segment size + allowed_syscalls.insert(__NR_madvise); // give advice about use of memory +#if defined(__NR_membarrier) + allowed_syscalls.insert(__NR_membarrier); // issue memory barriers on a set of threads +#endif // defined(__NR_membarrier) + allowed_syscalls.insert(__NR_mlock); // lock memory + allowed_syscalls.insert(__NR_mmap); // map files or devices into memory + allowed_syscalls.insert(__NR_mprotect); // set protection on a region of memory + allowed_syscalls.insert(__NR_munlock); // unlock memory + allowed_syscalls.insert(__NR_munmap); // unmap files or devices into memory + } + + void AllowEpoll() + { + allowed_syscalls.insert(__NR_epoll_create1); // open an epoll file descriptor + allowed_syscalls.insert(__NR_epoll_ctl); // control interface for an epoll file descriptor + allowed_syscalls.insert(__NR_epoll_pwait); // wait for an I/O event on an epoll file descriptor + allowed_syscalls.insert(__NR_epoll_wait); // wait for an I/O event on an epoll file descriptor + } + + void AllowEventFd() + { + allowed_syscalls.insert(__NR_eventfd2); // create a file descriptor for event notification + } + + void AllowFileSystem() + { + allowed_syscalls.insert(__NR_access); // check user's permissions for a file + allowed_syscalls.insert(__NR_chdir); // change working directory + allowed_syscalls.insert(__NR_chmod); // change permissions of a file + allowed_syscalls.insert(__NR_fallocate); // manipulate file space + allowed_syscalls.insert(__NR_fchmod); // change permissions of a file + allowed_syscalls.insert(__NR_fchown); // change ownership of a file + allowed_syscalls.insert(__NR_fdatasync); // synchronize a file's in-core state with storage device + allowed_syscalls.insert(__NR_flock); // apply or remove an advisory lock on an open file + allowed_syscalls.insert(__NR_fstat); // get file status + allowed_syscalls.insert(__NR_fsync); // synchronize a file's in-core state with storage device + allowed_syscalls.insert(__NR_ftruncate); // truncate a file to a specified length + allowed_syscalls.insert(__NR_getcwd); // get current working directory + allowed_syscalls.insert(__NR_getdents); // get directory entries + allowed_syscalls.insert(__NR_getdents64); // get directory entries + allowed_syscalls.insert(__NR_lstat); // get file status + allowed_syscalls.insert(__NR_mkdir); // create a directory + allowed_syscalls.insert(__NR_open); // open and possibly create a file + allowed_syscalls.insert(__NR_openat); // open and possibly create a file + allowed_syscalls.insert(__NR_readlink); // read value of a symbolic link + allowed_syscalls.insert(__NR_rename); // change the name or location of a file + allowed_syscalls.insert(__NR_rmdir); // delete a directory + allowed_syscalls.insert(__NR_stat); // get file status + allowed_syscalls.insert(__NR_statfs); // get filesystem statistics + allowed_syscalls.insert(__NR_statx); // get file status (extended) + allowed_syscalls.insert(__NR_unlink); // delete a name and possibly the file it refers to + } + + void AllowFutex() + { + allowed_syscalls.insert(__NR_futex); // fast user-space locking + allowed_syscalls.insert(__NR_set_robust_list); // set list of robust futexes + } + + void AllowGeneralIo() + { + allowed_syscalls.insert(__NR_ioctl); // control device + allowed_syscalls.insert(__NR_lseek); // reposition read/write file offset + allowed_syscalls.insert(__NR_poll); // wait for some event on a file descriptor + allowed_syscalls.insert(__NR_ppoll); // wait for some event on a file descriptor + allowed_syscalls.insert(__NR_pread64); // read from a file descriptor at a given offset + allowed_syscalls.insert(__NR_pwrite64); // write to a file descriptor at a given offset + allowed_syscalls.insert(__NR_read); // read from a file descriptor + allowed_syscalls.insert(__NR_readv); // read data into multiple buffers + allowed_syscalls.insert(__NR_recvfrom); // receive a message from a socket + allowed_syscalls.insert(__NR_recvmsg); // receive a message from a socket + allowed_syscalls.insert(__NR_select); // synchronous I/O multiplexing + allowed_syscalls.insert(__NR_sendmmsg); // send multiple messages on a socket + allowed_syscalls.insert(__NR_sendmsg); // send a message on a socket + allowed_syscalls.insert(__NR_sendto); // send a message on a socket + allowed_syscalls.insert(__NR_write); // write to a file descriptor + allowed_syscalls.insert(__NR_writev); // write data into multiple buffers + } + + void AllowGetRandom() + { +#if defined(__NR_getrandom) + allowed_syscalls.insert(__NR_getrandom); // obtain a series of random bytes +#endif // defined(__NR_getrandom) + } + + void AllowGetSimpleId() + { + allowed_syscalls.insert(__NR_getegid); // get group identity + allowed_syscalls.insert(__NR_geteuid); // get user identity + allowed_syscalls.insert(__NR_getgid); // get group identity + allowed_syscalls.insert(__NR_getpgid); // get process group + allowed_syscalls.insert(__NR_getpid); // get process identification + allowed_syscalls.insert(__NR_getppid); // get process identification + allowed_syscalls.insert(__NR_getresgid); // get real, effective and saved group IDs + allowed_syscalls.insert(__NR_getresuid); // get real, effective and saved user IDs + allowed_syscalls.insert(__NR_getsid); // get session ID + allowed_syscalls.insert(__NR_gettid); // get thread identification + allowed_syscalls.insert(__NR_getuid); // get user identity + } + + void AllowGetTime() + { + allowed_syscalls.insert(__NR_clock_getres); // find the resolution (precision) of the specified clock + allowed_syscalls.insert(__NR_clock_gettime); // retrieve the time of the specified clock + } + + void AllowGlobalProcessEnvironment() + { + allowed_syscalls.insert(__NR_getrlimit); // get resource limits + allowed_syscalls.insert(__NR_getrusage); // get resource usage + allowed_syscalls.insert(__NR_prlimit64); // get/set resource limits + } + + void AllowGlobalSystemStatus() + { + allowed_syscalls.insert(__NR_sysinfo); // return system information + allowed_syscalls.insert(__NR_uname); // get name and information about current kernel + } + + void AllowKernelInternalApi() + { + allowed_syscalls.insert(__NR_restart_syscall); // restart a system call after interruption by a stop signal + } + + void AllowNetwork() + { + allowed_syscalls.insert(__NR_accept); // accept a connection on a socket + allowed_syscalls.insert(__NR_accept4); // accept a connection on a socket + allowed_syscalls.insert(__NR_bind); // bind a name to a socket + allowed_syscalls.insert(__NR_connect); // initiate a connection on a socket + allowed_syscalls.insert(__NR_listen); // listen for connections on a socket + allowed_syscalls.insert(__NR_setsockopt); // set options on sockets + allowed_syscalls.insert(__NR_socket); // create an endpoint for communication + allowed_syscalls.insert(__NR_socketpair); // create a pair of connected sockets + } + + void AllowNetworkSocketInformation() + { + allowed_syscalls.insert(__NR_getpeername); // get name of connected peer socket + allowed_syscalls.insert(__NR_getsockname); // get socket name + allowed_syscalls.insert(__NR_getsockopt); // get options on sockets + } + + void AllowOperationOnExistingFileDescriptor() + { + allowed_syscalls.insert(__NR_close); // close a file descriptor + allowed_syscalls.insert(__NR_dup); // duplicate a file descriptor + allowed_syscalls.insert(__NR_dup2); // duplicate a file descriptor + allowed_syscalls.insert(__NR_fcntl); // manipulate file descriptor + allowed_syscalls.insert(__NR_shutdown); // shut down part of a full-duplex connection + } + + void AllowPipe() + { + allowed_syscalls.insert(__NR_pipe); // create pipe + allowed_syscalls.insert(__NR_pipe2); // create pipe + } + + void AllowPrctl() + { + allowed_syscalls.insert(__NR_arch_prctl); // set architecture-specific thread state + allowed_syscalls.insert(__NR_prctl); // operations on a process + } + + void AllowProcessStartOrDeath() + { + allowed_syscalls.insert(__NR_clone); // create a child process + allowed_syscalls.insert(__NR_exit); // terminate the calling process + allowed_syscalls.insert(__NR_exit_group); // exit all threads in a process + allowed_syscalls.insert(__NR_fork); // create a child process + allowed_syscalls.insert(__NR_tgkill); // send a signal to a thread + allowed_syscalls.insert(__NR_wait4); // wait for process to change state, BSD style + } + + void AllowScheduling() + { + allowed_syscalls.insert(__NR_sched_getaffinity); // set a thread's CPU affinity mask + allowed_syscalls.insert(__NR_sched_getparam); // get scheduling parameters + allowed_syscalls.insert(__NR_sched_getscheduler); // get scheduling policy/parameters + allowed_syscalls.insert(__NR_sched_setscheduler); // set scheduling policy/parameters + allowed_syscalls.insert(__NR_sched_yield); // yield the processor + } + + void AllowSignalHandling() + { + allowed_syscalls.insert(__NR_rt_sigaction); // examine and change a signal action + allowed_syscalls.insert(__NR_rt_sigprocmask); // examine and change blocked signals + allowed_syscalls.insert(__NR_rt_sigreturn); // return from signal handler and cleanup stack frame + allowed_syscalls.insert(__NR_sigaltstack); // set and/or get signal stack context + } + + void AllowSleep() + { + allowed_syscalls.insert(__NR_clock_nanosleep); // high-resolution sleep with specifiable clock + allowed_syscalls.insert(__NR_nanosleep); // high-resolution sleep + } + + void AllowUmask() + { + allowed_syscalls.insert(__NR_umask); // set file mode creation mask + } + + // See Linux kernel developer Kees Cook's seccomp guide at <https://outflux.net/teach-seccomp/> + // for an accessible introduction to using seccomp. + // + // This function largely follows <https://outflux.net/teach-seccomp/step-3/seccomp-bpf.h>. + std::vector<sock_filter> BuildFilter(SyscallSandboxAction default_action) + { + std::vector<sock_filter> bpf_policy; + // See VALIDATE_ARCHITECTURE in seccomp-bpf.h referenced above. + bpf_policy.push_back(BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct seccomp_data, arch))); + // Portability note: AUDIT_ARCH_X86_64 is Linux x86_64 specific. + bpf_policy.push_back(BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, AUDIT_ARCH_X86_64, 1, 0)); + bpf_policy.push_back(BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_KILL_PROCESS)); + // See EXAMINE_SYSCALL in seccomp-bpf.h referenced above. + bpf_policy.push_back(BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct seccomp_data, nr))); + for (const uint32_t allowed_syscall : allowed_syscalls) { + // See ALLOW_SYSCALL in seccomp-bpf.h referenced above. + bpf_policy.push_back(BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, allowed_syscall, 0, 1)); + bpf_policy.push_back(BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW)); + } + switch (default_action) { + case SyscallSandboxAction::KILL_PROCESS: + // Disallow syscall and kill the process. + // + // See KILL_PROCESS in seccomp-bpf.h referenced above. + // + // Note that we're using SECCOMP_RET_KILL_PROCESS (kill the process) instead + // of SECCOMP_RET_KILL_THREAD (kill the thread). The SECCOMP_RET_KILL_PROCESS + // action was introduced in Linux 4.14. + // + // SECCOMP_RET_KILL_PROCESS: Results in the entire process exiting immediately without + // executing the system call. + // + // SECCOMP_RET_KILL_PROCESS documentation: + // <https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html> + bpf_policy.push_back(BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_KILL_PROCESS)); + break; + case SyscallSandboxAction::INVOKE_SIGNAL_HANDLER: + // Disallow syscall and force a SIGSYS to trigger syscall debug reporter. + // + // SECCOMP_RET_TRAP: Results in the kernel sending a SIGSYS signal to the triggering + // task without executing the system call. + // + // SECCOMP_RET_TRAP documentation: + // <https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html> + bpf_policy.push_back(BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_TRAP)); + break; + } + return bpf_policy; + } +}; +} // namespace + +bool SetupSyscallSandbox(bool log_syscall_violation_before_terminating) +{ + assert(!g_syscall_sandbox_enabled && "SetupSyscallSandbox(...) should only be called once."); + g_syscall_sandbox_enabled = true; + g_syscall_sandbox_log_violation_before_terminating = log_syscall_violation_before_terminating; + if (log_syscall_violation_before_terminating) { + if (!SetupSyscallSandboxDebugHandler()) { + return false; + } + } + SetSyscallSandboxPolicy(SyscallSandboxPolicy::INITIALIZATION); + return true; +} + +void TestDisallowedSandboxCall() +{ + // The getgroups syscall is assumed NOT to be allowed by the syscall sandbox policy. + std::array<gid_t, 1> groups; + [[maybe_unused]] int32_t ignored = getgroups(groups.size(), groups.data()); +} +#endif // defined(USE_SYSCALL_SANDBOX) + +void SetSyscallSandboxPolicy(SyscallSandboxPolicy syscall_policy) +{ +#if defined(USE_SYSCALL_SANDBOX) + if (!g_syscall_sandbox_enabled) { + return; + } + SeccompPolicyBuilder seccomp_policy_builder; + switch (syscall_policy) { + case SyscallSandboxPolicy::INITIALIZATION: // Thread: main thread (state: init) + // SyscallSandboxPolicy::INITIALIZATION is the first policy loaded. + // + // Subsequently loaded policies can reduce the abilities further, but + // abilities can never be regained. + // + // SyscallSandboxPolicy::INITIALIZATION must thus be a superset of all + // other policies. + seccomp_policy_builder.AllowFileSystem(); + seccomp_policy_builder.AllowNetwork(); + break; + case SyscallSandboxPolicy::INITIALIZATION_DNS_SEED: // Thread: dnsseed + seccomp_policy_builder.AllowFileSystem(); + seccomp_policy_builder.AllowNetwork(); + break; + case SyscallSandboxPolicy::INITIALIZATION_LOAD_BLOCKS: // Thread: loadblk + seccomp_policy_builder.AllowFileSystem(); + break; + case SyscallSandboxPolicy::INITIALIZATION_MAP_PORT: // Thread: mapport + seccomp_policy_builder.AllowFileSystem(); + seccomp_policy_builder.AllowNetwork(); + break; + case SyscallSandboxPolicy::MESSAGE_HANDLER: // Thread: msghand + seccomp_policy_builder.AllowFileSystem(); + break; + case SyscallSandboxPolicy::NET: // Thread: net + seccomp_policy_builder.AllowFileSystem(); + seccomp_policy_builder.AllowNetwork(); + break; + case SyscallSandboxPolicy::NET_ADD_CONNECTION: // Thread: addcon + seccomp_policy_builder.AllowFileSystem(); + seccomp_policy_builder.AllowNetwork(); + break; + case SyscallSandboxPolicy::NET_HTTP_SERVER: // Thread: http + seccomp_policy_builder.AllowFileSystem(); + seccomp_policy_builder.AllowNetwork(); + break; + case SyscallSandboxPolicy::NET_HTTP_SERVER_WORKER: // Thread: httpworker.<N> + seccomp_policy_builder.AllowFileSystem(); + seccomp_policy_builder.AllowNetwork(); + break; + case SyscallSandboxPolicy::NET_OPEN_CONNECTION: // Thread: opencon + seccomp_policy_builder.AllowFileSystem(); + seccomp_policy_builder.AllowNetwork(); + break; + case SyscallSandboxPolicy::SCHEDULER: // Thread: scheduler + seccomp_policy_builder.AllowFileSystem(); + break; + case SyscallSandboxPolicy::TOR_CONTROL: // Thread: torcontrol + seccomp_policy_builder.AllowFileSystem(); + seccomp_policy_builder.AllowNetwork(); + break; + case SyscallSandboxPolicy::TX_INDEX: // Thread: txindex + seccomp_policy_builder.AllowFileSystem(); + break; + case SyscallSandboxPolicy::VALIDATION_SCRIPT_CHECK: // Thread: scriptch.<N> + break; + case SyscallSandboxPolicy::SHUTOFF: // Thread: main thread (state: shutoff) + seccomp_policy_builder.AllowFileSystem(); + break; + } + + const SyscallSandboxAction default_action = g_syscall_sandbox_log_violation_before_terminating ? SyscallSandboxAction::INVOKE_SIGNAL_HANDLER : SyscallSandboxAction::KILL_PROCESS; + std::vector<sock_filter> filter = seccomp_policy_builder.BuildFilter(default_action); + const sock_fprog prog = { + .len = static_cast<uint16_t>(filter.size()), + .filter = filter.data(), + }; + // Do not allow abilities to be regained after being dropped. + // + // PR_SET_NO_NEW_PRIVS documentation: <https://www.kernel.org/doc/html/latest/userspace-api/no_new_privs.html> + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) { + throw std::runtime_error("Syscall sandbox enforcement failed: prctl(PR_SET_NO_NEW_PRIVS)"); + } + // Install seccomp-bpf syscall filter. + // + // PR_SET_SECCOMP documentation: <https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html> + if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) != 0) { + throw std::runtime_error("Syscall sandbox enforcement failed: prctl(PR_SET_SECCOMP)"); + } + + const std::string thread_name = !util::ThreadGetInternalName().empty() ? util::ThreadGetInternalName() : "*unnamed*"; + LogPrint(BCLog::UTIL, "Syscall filter installed for thread \"%s\"\n", thread_name); +#endif // defined(USE_SYSCALL_SANDBOX) +} diff --git a/src/util/syscall_sandbox.h b/src/util/syscall_sandbox.h new file mode 100644 index 0000000000..0a0c964f94 --- /dev/null +++ b/src/util/syscall_sandbox.h @@ -0,0 +1,57 @@ +// Copyright (c) 2020 The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#ifndef BITCOIN_UTIL_SYSCALL_SANDBOX_H +#define BITCOIN_UTIL_SYSCALL_SANDBOX_H + +enum class SyscallSandboxPolicy { + // 1. Initialization + INITIALIZATION, + INITIALIZATION_DNS_SEED, + INITIALIZATION_LOAD_BLOCKS, + INITIALIZATION_MAP_PORT, + + // 2. Steady state (non-initialization, non-shutdown) + MESSAGE_HANDLER, + NET, + NET_ADD_CONNECTION, + NET_HTTP_SERVER, + NET_HTTP_SERVER_WORKER, + NET_OPEN_CONNECTION, + SCHEDULER, + TOR_CONTROL, + TX_INDEX, + VALIDATION_SCRIPT_CHECK, + + // 3. Shutdown + SHUTOFF, +}; + +//! Force the current thread (and threads created from the current thread) into a restricted-service +//! operating mode where only a subset of all syscalls are available. +//! +//! Subsequent calls to this function can reduce the abilities further, but abilities can never be +//! regained. +//! +//! This function is a no-op unless SetupSyscallSandbox(...) has been called. +//! +//! SetupSyscallSandbox(...) is called during bitcoind initialization if Bitcoin Core was compiled +//! with seccomp-bpf support (--with-seccomp) *and* the parameter -sandbox=<mode> was passed to +//! bitcoind. +//! +//! This experimental feature is available under Linux x86_64 only. +void SetSyscallSandboxPolicy(SyscallSandboxPolicy syscall_policy); + +#if defined(USE_SYSCALL_SANDBOX) +//! Setup and enable the experimental syscall sandbox for the running process. +//! +//! SetSyscallSandboxPolicy(SyscallSandboxPolicy::INITIALIZATION) is called as part of +//! SetupSyscallSandbox(...). +[[nodiscard]] bool SetupSyscallSandbox(bool log_syscall_violation_before_terminating); + +//! Invoke a disallowed syscall. Use for testing purposes. +void TestDisallowedSandboxCall(); +#endif // defined(USE_SYSCALL_SANDBOX) + +#endif // BITCOIN_UTIL_SYSCALL_SANDBOX_H |