diff options
-rw-r--r-- | tools/meson.build | 7 | ||||
-rw-r--r-- | tools/virtiofsd/fuse_common.h | 15 | ||||
-rw-r--r-- | tools/virtiofsd/fuse_lowlevel.c | 13 | ||||
-rw-r--r-- | tools/virtiofsd/fuse_lowlevel.h | 1 | ||||
-rw-r--r-- | tools/virtiofsd/fuse_virtio.c | 49 | ||||
-rw-r--r-- | tools/virtiofsd/passthrough_ll.c | 99 | ||||
-rw-r--r-- | tools/virtiofsd/passthrough_seccomp.c | 12 |
7 files changed, 158 insertions, 38 deletions
diff --git a/tools/meson.build b/tools/meson.build index fdce66857d..3e5a0abfa2 100644 --- a/tools/meson.build +++ b/tools/meson.build @@ -10,8 +10,11 @@ if get_option('virtiofsd').enabled() error('virtiofsd requires Linux') elif not seccomp.found() or not libcap_ng.found() error('virtiofsd requires libcap-ng-devel and seccomp-devel') - elif not have_tools or 'CONFIG_VHOST_USER' not in config_host - error('virtiofsd needs tools and vhost-user support') + elif 'CONFIG_VHOST_USER' not in config_host + error('virtiofsd needs vhost-user support') + else + # Disabled all the tools but virtiofsd. + have_virtiofsd = true endif endif elif get_option('virtiofsd').disabled() or not have_system diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h index a090040bb2..fa9671872e 100644 --- a/tools/virtiofsd/fuse_common.h +++ b/tools/virtiofsd/fuse_common.h @@ -358,6 +358,21 @@ struct fuse_file_info { #define FUSE_CAP_SUBMOUNTS (1 << 27) /** + * Indicates that the filesystem is responsible for clearing + * security.capability xattr and clearing setuid and setgid bits. Following + * are the rules. + * - clear "security.capability" on write, truncate and chown unconditionally + * - clear suid/sgid if following is true. Note, sgid is cleared only if + * group executable bit is set. + * o setattr has FATTR_SIZE and FATTR_KILL_SUIDGID set. + * o setattr has FATTR_UID or FATTR_GID + * o open has O_TRUNC and FUSE_OPEN_KILL_SUIDGID + * o create has O_TRUNC and FUSE_OPEN_KILL_SUIDGID flag set. + * o write has FUSE_WRITE_KILL_SUIDGID + */ +#define FUSE_CAP_HANDLE_KILLPRIV_V2 (1 << 28) + +/** * Ioctl flags * * FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c index e94b71110b..1aa26c6333 100644 --- a/tools/virtiofsd/fuse_lowlevel.c +++ b/tools/virtiofsd/fuse_lowlevel.c @@ -18,7 +18,7 @@ #include <sys/file.h> -#define THREAD_POOL_SIZE 64 +#define THREAD_POOL_SIZE 0 #define OFFSET_MAX 0x7fffffffffffffffLL @@ -855,7 +855,7 @@ static void do_setattr(fuse_req_t req, fuse_ino_t nodeid, FUSE_SET_ATTR_GID | FUSE_SET_ATTR_SIZE | FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME | FUSE_SET_ATTR_ATIME_NOW | FUSE_SET_ATTR_MTIME_NOW | - FUSE_SET_ATTR_CTIME; + FUSE_SET_ATTR_CTIME | FUSE_SET_ATTR_KILL_SUIDGID; req->se->op.setattr(req, nodeid, &stbuf, arg->valid, fi); } else { @@ -1069,6 +1069,7 @@ static void do_create(fuse_req_t req, fuse_ino_t nodeid, memset(&fi, 0, sizeof(fi)); fi.flags = arg->flags; + fi.kill_priv = arg->open_flags & FUSE_OPEN_KILL_SUIDGID; req->ctx.umask = arg->umask; @@ -1092,6 +1093,7 @@ static void do_open(fuse_req_t req, fuse_ino_t nodeid, memset(&fi, 0, sizeof(fi)); fi.flags = arg->flags; + fi.kill_priv = arg->open_flags & FUSE_OPEN_KILL_SUIDGID; if (req->se->op.open) { req->se->op.open(req, nodeid, &fi); @@ -1983,6 +1985,9 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, if (arg->flags & FUSE_SUBMOUNTS) { se->conn.capable |= FUSE_CAP_SUBMOUNTS; } + if (arg->flags & FUSE_HANDLE_KILLPRIV_V2) { + se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV_V2; + } #ifdef HAVE_SPLICE #ifdef HAVE_VMSPLICE se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE; @@ -2114,6 +2119,10 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, outarg.congestion_threshold = se->conn.congestion_threshold; outarg.time_gran = se->conn.time_gran; + if (se->conn.want & FUSE_CAP_HANDLE_KILLPRIV_V2) { + outarg.flags |= FUSE_HANDLE_KILLPRIV_V2; + } + fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, outarg.minor); fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags); fuse_log(FUSE_LOG_DEBUG, " max_readahead=0x%08x\n", outarg.max_readahead); diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h index 0e10a14bc9..3bf786b034 100644 --- a/tools/virtiofsd/fuse_lowlevel.h +++ b/tools/virtiofsd/fuse_lowlevel.h @@ -143,6 +143,7 @@ struct fuse_forget_data { #define FUSE_SET_ATTR_ATIME_NOW (1 << 7) #define FUSE_SET_ATTR_MTIME_NOW (1 << 8) #define FUSE_SET_ATTR_CTIME (1 << 10) +#define FUSE_SET_ATTR_KILL_SUIDGID (1 << 11) /* * Request methods and replies diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c index ddcefee427..523ee64fb7 100644 --- a/tools/virtiofsd/fuse_virtio.c +++ b/tools/virtiofsd/fuse_virtio.c @@ -188,6 +188,31 @@ static void copy_iov(struct iovec *src_iov, int src_count, } /* + * pthread_rwlock_rdlock() and pthread_rwlock_wrlock can fail if + * a deadlock condition is detected or the current thread already + * owns the lock. They can also fail, like pthread_rwlock_unlock(), + * if the mutex wasn't properly initialized. None of these are ever + * expected to happen. + */ +static void vu_dispatch_rdlock(struct fv_VuDev *vud) +{ + int ret = pthread_rwlock_rdlock(&vud->vu_dispatch_rwlock); + assert(ret == 0); +} + +static void vu_dispatch_wrlock(struct fv_VuDev *vud) +{ + int ret = pthread_rwlock_wrlock(&vud->vu_dispatch_rwlock); + assert(ret == 0); +} + +static void vu_dispatch_unlock(struct fv_VuDev *vud) +{ + int ret = pthread_rwlock_unlock(&vud->vu_dispatch_rwlock); + assert(ret == 0); +} + +/* * Called back by ll whenever it wants to send a reply/message back * The 1st element of the iov starts with the fuse_out_header * 'unique'==0 means it's a notify message. @@ -240,12 +265,12 @@ int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, copy_iov(iov, count, in_sg, in_num, tosend_len); - pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); + vu_dispatch_rdlock(qi->virtio_dev); pthread_mutex_lock(&qi->vq_lock); vu_queue_push(dev, q, elem, tosend_len); vu_queue_notify(dev, q); pthread_mutex_unlock(&qi->vq_lock); - pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); + vu_dispatch_unlock(qi->virtio_dev); req->reply_sent = true; @@ -403,12 +428,12 @@ int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, ret = 0; - pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); + vu_dispatch_rdlock(qi->virtio_dev); pthread_mutex_lock(&qi->vq_lock); vu_queue_push(dev, q, elem, tosend_len); vu_queue_notify(dev, q); pthread_mutex_unlock(&qi->vq_lock); - pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); + vu_dispatch_unlock(qi->virtio_dev); err: if (ret == 0) { @@ -558,12 +583,12 @@ out: fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", __func__, elem->index); - pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); + vu_dispatch_rdlock(qi->virtio_dev); pthread_mutex_lock(&qi->vq_lock); vu_queue_push(dev, q, elem, 0); vu_queue_notify(dev, q); pthread_mutex_unlock(&qi->vq_lock); - pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); + vu_dispatch_unlock(qi->virtio_dev); } pthread_mutex_destroy(&req->ch.lock); @@ -596,7 +621,6 @@ static void *fv_queue_thread(void *opaque) qi->qidx, qi->kick_fd); while (1) { struct pollfd pf[2]; - int ret; pf[0].fd = qi->kick_fd; pf[0].events = POLLIN; @@ -645,8 +669,7 @@ static void *fv_queue_thread(void *opaque) break; } /* Mutual exclusion with virtio_loop() */ - ret = pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); - assert(ret == 0); /* there is no possible error case */ + vu_dispatch_rdlock(qi->virtio_dev); pthread_mutex_lock(&qi->vq_lock); /* out is from guest, in is too guest */ unsigned int in_bytes, out_bytes; @@ -672,7 +695,7 @@ static void *fv_queue_thread(void *opaque) } pthread_mutex_unlock(&qi->vq_lock); - pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); + vu_dispatch_unlock(qi->virtio_dev); /* Process all the requests. */ if (!se->thread_pool_size && req_list != NULL) { @@ -799,7 +822,6 @@ int virtio_loop(struct fuse_session *se) while (!fuse_session_exited(se)) { struct pollfd pf[1]; bool ok; - int ret; pf[0].fd = se->vu_socketfd; pf[0].events = POLLIN; pf[0].revents = 0; @@ -825,12 +847,11 @@ int virtio_loop(struct fuse_session *se) assert(pf[0].revents & POLLIN); fuse_log(FUSE_LOG_DEBUG, "%s: Got VU event\n", __func__); /* Mutual exclusion with fv_queue_thread() */ - ret = pthread_rwlock_wrlock(&se->virtio_dev->vu_dispatch_rwlock); - assert(ret == 0); /* there is no possible error case */ + vu_dispatch_wrlock(se->virtio_dev); ok = vu_dispatch(&se->virtio_dev->dev); - pthread_rwlock_unlock(&se->virtio_dev->vu_dispatch_rwlock); + vu_dispatch_unlock(se->virtio_dev); if (!ok) { fuse_log(FUSE_LOG_ERR, "%s: vu_dispatch failed\n", __func__); diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c index 147b59338a..58d24c0010 100644 --- a/tools/virtiofsd/passthrough_ll.c +++ b/tools/virtiofsd/passthrough_ll.c @@ -168,6 +168,7 @@ struct lo_data { /* An O_PATH file descriptor to /proc/self/fd/ */ int proc_self_fd; + int user_killpriv_v2, killpriv_v2; }; static const struct fuse_opt lo_opts[] = { @@ -198,6 +199,8 @@ static const struct fuse_opt lo_opts[] = { { "allow_direct_io", offsetof(struct lo_data, allow_direct_io), 1 }, { "no_allow_direct_io", offsetof(struct lo_data, allow_direct_io), 0 }, { "announce_submounts", offsetof(struct lo_data, announce_submounts), 1 }, + { "killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 1 }, + { "no_killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 0 }, FUSE_OPT_END }; static bool use_syslog = false; @@ -630,6 +633,34 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn) "does not support it\n"); lo->announce_submounts = false; } + + if (lo->user_killpriv_v2 == 1) { + /* + * User explicitly asked for this option. Enable it unconditionally. + * If connection does not have this capability, it should fail + * in fuse_lowlevel.c + */ + fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n"); + conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2; + lo->killpriv_v2 = 1; + } else if (lo->user_killpriv_v2 == -1 && + conn->capable & FUSE_CAP_HANDLE_KILLPRIV_V2) { + /* + * User did not specify a value for killpriv_v2. By default enable it + * if connection offers this capability + */ + fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n"); + conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2; + lo->killpriv_v2 = 1; + } else { + /* + * Either user specified to disable killpriv_v2, or connection does + * not offer this capability. Disable killpriv_v2 in both the cases + */ + fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling killpriv_v2\n"); + conn->want &= ~FUSE_CAP_HANDLE_KILLPRIV_V2; + lo->killpriv_v2 = 0; + } } static void lo_getattr(fuse_req_t req, fuse_ino_t ino, @@ -698,6 +729,7 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0); } if (res == -1) { + saverr = errno; goto out_err; } } @@ -707,27 +739,47 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); if (res == -1) { + saverr = errno; goto out_err; } } if (valid & FUSE_SET_ATTR_SIZE) { int truncfd; + bool kill_suidgid; + bool cap_fsetid_dropped = false; + kill_suidgid = lo->killpriv_v2 && (valid & FUSE_SET_ATTR_KILL_SUIDGID); if (fi) { truncfd = fd; } else { truncfd = lo_inode_open(lo, inode, O_RDWR); if (truncfd < 0) { - errno = -truncfd; + saverr = -truncfd; + goto out_err; + } + } + + if (kill_suidgid) { + res = drop_effective_cap("FSETID", &cap_fsetid_dropped); + if (res != 0) { + saverr = res; + if (!fi) { + close(truncfd); + } goto out_err; } } res = ftruncate(truncfd, attr->st_size); + saverr = res == -1 ? errno : 0; + + if (cap_fsetid_dropped) { + if (gain_effective_cap("FSETID")) { + fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n"); + } + } if (!fi) { - saverr = errno; close(truncfd); - errno = saverr; } if (res == -1) { goto out_err; @@ -760,6 +812,7 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, res = utimensat(lo->proc_self_fd, procname, tv, 0); } if (res == -1) { + saverr = errno; goto out_err; } } @@ -768,7 +821,6 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, return lo_getattr(req, ino, fi); out_err: - saverr = errno; lo_inode_put(lo, &inode); fuse_reply_err(req, saverr); } @@ -1708,11 +1760,27 @@ static int lo_do_open(struct lo_data *lo, struct lo_inode *inode, { ssize_t fh; int fd = existing_fd; + int err; + bool cap_fsetid_dropped = false; + bool kill_suidgid = lo->killpriv_v2 && fi->kill_priv; update_open_flags(lo->writeback, lo->allow_direct_io, fi); if (fd < 0) { + if (kill_suidgid) { + err = drop_effective_cap("FSETID", &cap_fsetid_dropped); + if (err) { + return err; + } + } + fd = lo_inode_open(lo, inode, fi->flags); + + if (cap_fsetid_dropped) { + if (gain_effective_cap("FSETID")) { + fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n"); + } + } if (fd < 0) { return -fd; } @@ -1746,8 +1814,8 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, int err; struct lo_cred old = {}; - fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", parent, - name); + fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)" + " kill_priv=%d\n", parent, name, fi->kill_priv); if (!is_safe_path_component(name)) { fuse_reply_err(req, EINVAL); @@ -1980,8 +2048,8 @@ static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) struct lo_inode *inode = lo_inode(req, ino); int err; - fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino, - fi->flags); + fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d, kill_priv=%d)" + "\n", ino, fi->flags, fi->kill_priv); if (!inode) { fuse_reply_err(req, EBADF); @@ -2120,12 +2188,14 @@ static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, out_buf.buf[0].pos = off; fuse_log(FUSE_LOG_DEBUG, - "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu)\n", ino, - out_buf.buf[0].size, (unsigned long)off); + "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu kill_priv=%d)\n", + ino, out_buf.buf[0].size, (unsigned long)off, fi->kill_priv); /* * If kill_priv is set, drop CAP_FSETID which should lead to kernel - * clearing setuid/setgid on file. + * clearing setuid/setgid on file. Note, for WRITE, we need to do + * this even if killpriv_v2 is not enabled. fuse direct write path + * relies on this. */ if (fi->kill_priv) { res = drop_effective_cap("FSETID", &cap_fsetid_dropped); @@ -3204,7 +3274,7 @@ static void setup_mounts(const char *source) } /* - * Only keep whitelisted capabilities that are needed for file system operation + * Only keep capabilities in allowlist that are needed for file system operation * The (possibly NULL) modcaps_in string passed in is free'd before exit. */ static void setup_capabilities(char *modcaps_in) @@ -3214,8 +3284,8 @@ static void setup_capabilities(char *modcaps_in) capng_restore_state(&cap.saved); /* - * Whitelist file system-related capabilities that are needed for a file - * server to act like root. Drop everything else like networking and + * Add to allowlist file system-related capabilities that are needed for a + * file server to act like root. Drop everything else like networking and * sysadmin capabilities. * * Exclusions: @@ -3533,6 +3603,7 @@ int main(int argc, char *argv[]) .posix_lock = 0, .allow_direct_io = 0, .proc_self_fd = -1, + .user_killpriv_v2 = -1, }; struct lo_map_elem *root_elem; struct lo_map_elem *reserve_elem; diff --git a/tools/virtiofsd/passthrough_seccomp.c b/tools/virtiofsd/passthrough_seccomp.c index ea852e2e33..62441cfcdb 100644 --- a/tools/virtiofsd/passthrough_seccomp.c +++ b/tools/virtiofsd/passthrough_seccomp.c @@ -21,7 +21,7 @@ #endif #endif -static const int syscall_whitelist[] = { +static const int syscall_allowlist[] = { /* TODO ireg sem*() syscalls */ SCMP_SYS(brk), SCMP_SYS(capget), /* For CAP_FSETID */ @@ -117,12 +117,12 @@ static const int syscall_whitelist[] = { }; /* Syscalls used when --syslog is enabled */ -static const int syscall_whitelist_syslog[] = { +static const int syscall_allowlist_syslog[] = { SCMP_SYS(send), SCMP_SYS(sendto), }; -static void add_whitelist(scmp_filter_ctx ctx, const int syscalls[], size_t len) +static void add_allowlist(scmp_filter_ctx ctx, const int syscalls[], size_t len) { size_t i; @@ -153,10 +153,10 @@ void setup_seccomp(bool enable_syslog) exit(1); } - add_whitelist(ctx, syscall_whitelist, G_N_ELEMENTS(syscall_whitelist)); + add_allowlist(ctx, syscall_allowlist, G_N_ELEMENTS(syscall_allowlist)); if (enable_syslog) { - add_whitelist(ctx, syscall_whitelist_syslog, - G_N_ELEMENTS(syscall_whitelist_syslog)); + add_allowlist(ctx, syscall_allowlist_syslog, + G_N_ELEMENTS(syscall_allowlist_syslog)); } /* libvhost-user calls this for post-copy migration, we don't need it */ |