diff options
Diffstat (limited to 'contrib/libvhost-user/libvhost-user.c')
-rw-r--r-- | contrib/libvhost-user/libvhost-user.c | 302 |
1 files changed, 300 insertions, 2 deletions
diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c index 2e358b5bce..beeed0c43f 100644 --- a/contrib/libvhost-user/libvhost-user.c +++ b/contrib/libvhost-user/libvhost-user.c @@ -26,9 +26,20 @@ #include <sys/socket.h> #include <sys/eventfd.h> #include <sys/mman.h> +#include "qemu/compiler.h" + +#if defined(__linux__) +#include <sys/syscall.h> +#include <fcntl.h> +#include <sys/ioctl.h> #include <linux/vhost.h> -#include "qemu/compiler.h" +#ifdef __NR_userfaultfd +#include <linux/userfaultfd.h> +#endif + +#endif + #include "qemu/atomic.h" #include "libvhost-user.h" @@ -86,6 +97,9 @@ vu_request_to_string(unsigned int req) REQ(VHOST_USER_SET_VRING_ENDIAN), REQ(VHOST_USER_GET_CONFIG), REQ(VHOST_USER_SET_CONFIG), + REQ(VHOST_USER_POSTCOPY_ADVISE), + REQ(VHOST_USER_POSTCOPY_LISTEN), + REQ(VHOST_USER_POSTCOPY_END), REQ(VHOST_USER_MAX), }; #undef REQ @@ -171,6 +185,35 @@ vmsg_close_fds(VhostUserMsg *vmsg) } } +/* A test to see if we have userfault available */ +static bool +have_userfault(void) +{ +#if defined(__linux__) && defined(__NR_userfaultfd) &&\ + defined(UFFD_FEATURE_MISSING_SHMEM) &&\ + defined(UFFD_FEATURE_MISSING_HUGETLBFS) + /* Now test the kernel we're running on really has the features */ + int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + struct uffdio_api api_struct; + if (ufd < 0) { + return false; + } + + api_struct.api = UFFD_API; + api_struct.features = UFFD_FEATURE_MISSING_SHMEM | + UFFD_FEATURE_MISSING_HUGETLBFS; + if (ioctl(ufd, UFFDIO_API, &api_struct)) { + close(ufd); + return false; + } + close(ufd); + return true; + +#else + return false; +#endif +} + static bool vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) { @@ -245,6 +288,31 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) { int rc; uint8_t *p = (uint8_t *)vmsg; + char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { }; + struct iovec iov = { + .iov_base = (char *)vmsg, + .iov_len = 
VHOST_USER_HDR_SIZE, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = control, + }; + struct cmsghdr *cmsg; + + memset(control, 0, sizeof(control)); + assert(vmsg->fd_num <= VHOST_MEMORY_MAX_NREGIONS); + if (vmsg->fd_num > 0) { + size_t fdsize = vmsg->fd_num * sizeof(int); + msg.msg_controllen = CMSG_SPACE(fdsize); + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_len = CMSG_LEN(fdsize); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize); + } else { + msg.msg_controllen = 0; + } /* Set the version in the flags when sending the reply */ vmsg->flags &= ~VHOST_USER_VERSION_MASK; @@ -252,7 +320,7 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) vmsg->flags |= VHOST_USER_REPLY_MASK; do { - rc = write(conn_fd, p, VHOST_USER_HDR_SIZE); + rc = sendmsg(conn_fd, &msg, 0); } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); do { @@ -345,6 +413,7 @@ vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg) } vmsg->size = sizeof(vmsg->payload.u64); + vmsg->fd_num = 0; DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64); @@ -410,6 +479,148 @@ vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg) } static bool +vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg) +{ + int i; + VhostUserMemory *memory = &vmsg->payload.memory; + dev->nregions = memory->nregions; + + DPRINT("Nregions: %d\n", memory->nregions); + for (i = 0; i < dev->nregions; i++) { + void *mmap_addr; + VhostUserMemoryRegion *msg_region = &memory->regions[i]; + VuDevRegion *dev_region = &dev->regions[i]; + + DPRINT("Region %d\n", i); + DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", + msg_region->guest_phys_addr); + DPRINT(" memory_size: 0x%016"PRIx64"\n", + msg_region->memory_size); + DPRINT(" userspace_addr 0x%016"PRIx64"\n", + msg_region->userspace_addr); + DPRINT(" mmap_offset 0x%016"PRIx64"\n", + msg_region->mmap_offset); + + dev_region->gpa = msg_region->guest_phys_addr; + 
dev_region->size = msg_region->memory_size; + dev_region->qva = msg_region->userspace_addr; + dev_region->mmap_offset = msg_region->mmap_offset; + + /* We don't use offset argument of mmap() since the + * mapped address has to be page aligned, and we use huge + * pages. + * In postcopy we're using PROT_NONE here to catch anyone + * accessing it before we userfault + */ + mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, + PROT_NONE, MAP_SHARED, + vmsg->fds[i], 0); + + if (mmap_addr == MAP_FAILED) { + vu_panic(dev, "region mmap error: %s", strerror(errno)); + } else { + dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; + DPRINT(" mmap_addr: 0x%016"PRIx64"\n", + dev_region->mmap_addr); + } + + /* Return the address to QEMU so that it can translate the ufd + * fault addresses back. + */ + msg_region->userspace_addr = (uintptr_t)(mmap_addr + + dev_region->mmap_offset); + close(vmsg->fds[i]); + } + + /* Send the message back to qemu with the addresses filled in */ + vmsg->fd_num = 0; + if (!vu_message_write(dev, dev->sock, vmsg)) { + vu_panic(dev, "failed to respond to set-mem-table for postcopy"); + return false; + } + + /* Wait for QEMU to confirm that it's registered the handler for the + * faults. + */ + if (!vu_message_read(dev, dev->sock, vmsg) || + vmsg->size != sizeof(vmsg->payload.u64) || + vmsg->payload.u64 != 0) { + vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table"); + return false; + } + + /* OK, now we can go and register the memory and generate faults */ + for (i = 0; i < dev->nregions; i++) { + VuDevRegion *dev_region = &dev->regions[i]; + int ret; +#ifdef UFFDIO_REGISTER + /* We should already have an open ufd. Mark each memory + * range as ufd. + * Discard any mapping we have here; note I can't use MADV_REMOVE + * or fallocate to make the hole since I don't want to lose + * data that's already arrived in the shared process. 
+ * TODO: How to do hugepage + */ + ret = madvise((void *)dev_region->mmap_addr, + dev_region->size + dev_region->mmap_offset, + MADV_DONTNEED); + if (ret) { + fprintf(stderr, + "%s: Failed to madvise(DONTNEED) region %d: %s\n", + __func__, i, strerror(errno)); + } + /* Turn off transparent hugepages so we don't get lost wakeups + * in neighbouring pages. + * TODO: Turn this back on later. + */ + ret = madvise((void *)dev_region->mmap_addr, + dev_region->size + dev_region->mmap_offset, + MADV_NOHUGEPAGE); + if (ret) { + /* Note: This can happen legally on kernels that are configured + * without madvise'able hugepages + */ + fprintf(stderr, + "%s: Failed to madvise(NOHUGEPAGE) region %d: %s\n", + __func__, i, strerror(errno)); + } + struct uffdio_register reg_struct; + reg_struct.range.start = (uintptr_t)dev_region->mmap_addr; + reg_struct.range.len = dev_region->size + dev_region->mmap_offset; + reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING; + + if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, &reg_struct)) { + vu_panic(dev, "%s: Failed to userfault region %d " + "@%p + size:%zx offset: %zx: (ufd=%d)%s\n", + __func__, i, + dev_region->mmap_addr, + dev_region->size, dev_region->mmap_offset, + dev->postcopy_ufd, strerror(errno)); + return false; + } + if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) { + vu_panic(dev, "%s Region (%d) doesn't support COPY", + __func__, i); + return false; + } + DPRINT("%s: region %d: Registered userfault for %llx + %llx\n", + __func__, i, reg_struct.range.start, reg_struct.range.len); + /* Now it's registered we can let the client at it */ + if (mprotect((void *)dev_region->mmap_addr, + dev_region->size + dev_region->mmap_offset, + PROT_READ | PROT_WRITE)) { + vu_panic(dev, "failed to mprotect region %d for postcopy (%s)", + i, strerror(errno)); + return false; + } + /* TODO: Stash 'zero' support flags somewhere */ +#endif + } + + return false; +} + +static bool vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) { int i; @@ -425,6 
+636,10 @@ vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) } dev->nregions = memory->nregions; + if (dev->postcopy_listening) { + return vu_set_mem_table_exec_postcopy(dev, vmsg); + } + DPRINT("Nregions: %d\n", memory->nregions); for (i = 0; i < dev->nregions; i++) { void *mmap_addr; @@ -500,6 +715,7 @@ vu_set_log_base_exec(VuDev *dev, VhostUserMsg *vmsg) dev->log_size = log_mmap_size; vmsg->size = sizeof(vmsg->payload.u64); + vmsg->fd_num = 0; return true; } @@ -752,12 +968,17 @@ vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ; + if (have_userfault()) { + features |= 1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT; + } + if (dev->iface->get_protocol_features) { features |= dev->iface->get_protocol_features(dev); } vmsg->payload.u64 = features; vmsg->size = sizeof(vmsg->payload.u64); + vmsg->fd_num = 0; return true; } @@ -857,6 +1078,77 @@ vu_set_config(VuDev *dev, VhostUserMsg *vmsg) } static bool +vu_set_postcopy_advise(VuDev *dev, VhostUserMsg *vmsg) +{ + dev->postcopy_ufd = -1; +#ifdef UFFDIO_API + struct uffdio_api api_struct; + + dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + vmsg->size = 0; +#endif + + if (dev->postcopy_ufd == -1) { + vu_panic(dev, "Userfaultfd not available: %s", strerror(errno)); + goto out; + } + +#ifdef UFFDIO_API + api_struct.api = UFFD_API; + api_struct.features = 0; + if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) { + vu_panic(dev, "Failed UFFDIO_API: %s", strerror(errno)); + close(dev->postcopy_ufd); + dev->postcopy_ufd = -1; + goto out; + } + /* TODO: Stash feature flags somewhere */ +#endif + +out: + /* Return a ufd to the QEMU */ + vmsg->fd_num = 1; + vmsg->fds[0] = dev->postcopy_ufd; + return true; /* = send a reply */ +} + +static bool +vu_set_postcopy_listen(VuDev *dev, VhostUserMsg *vmsg) +{ + vmsg->payload.u64 = -1; + vmsg->size = sizeof(vmsg->payload.u64); + + if (dev->nregions) 
{ + vu_panic(dev, "Regions already registered at postcopy-listen"); + return true; + } + dev->postcopy_listening = true; + + vmsg->flags = VHOST_USER_VERSION | VHOST_USER_REPLY_MASK; + vmsg->payload.u64 = 0; /* Success */ + return true; +} + +static bool +vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("%s: Entry\n", __func__); + dev->postcopy_listening = false; + if (dev->postcopy_ufd > 0) { + close(dev->postcopy_ufd); + dev->postcopy_ufd = -1; + DPRINT("%s: Done close\n", __func__); + } + + vmsg->fd_num = 0; + vmsg->payload.u64 = 0; + vmsg->size = sizeof(vmsg->payload.u64); + vmsg->flags = VHOST_USER_VERSION | VHOST_USER_REPLY_MASK; + DPRINT("%s: exit\n", __func__); + return true; +} + +static bool vu_process_message(VuDev *dev, VhostUserMsg *vmsg) { int do_reply = 0; @@ -927,6 +1219,12 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg) return vu_set_config(dev, vmsg); case VHOST_USER_NONE: break; + case VHOST_USER_POSTCOPY_ADVISE: + return vu_set_postcopy_advise(dev, vmsg); + case VHOST_USER_POSTCOPY_LISTEN: + return vu_set_postcopy_listen(dev, vmsg); + case VHOST_USER_POSTCOPY_END: + return vu_set_postcopy_end(dev, vmsg); default: vmsg_close_fds(vmsg); vu_panic(dev, "Unhandled request: %d", vmsg->request); |