Diffstat (limited to 'contrib/libvhost-user/libvhost-user.c')
 contrib/libvhost-user/libvhost-user.c | 302 +++++++++++++++++++++++++++++++--
 1 file changed, 300 insertions(+), 2 deletions(-)
diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index 2e358b5bce..beeed0c43f 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -26,9 +26,20 @@
#include <sys/socket.h>
#include <sys/eventfd.h>
#include <sys/mman.h>
+#include "qemu/compiler.h"
+
+#if defined(__linux__)
+#include <sys/syscall.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
#include <linux/vhost.h>
-#include "qemu/compiler.h"
+#ifdef __NR_userfaultfd
+#include <linux/userfaultfd.h>
+#endif
+
+#endif
+
#include "qemu/atomic.h"
#include "libvhost-user.h"
@@ -86,6 +97,9 @@ vu_request_to_string(unsigned int req)
REQ(VHOST_USER_SET_VRING_ENDIAN),
REQ(VHOST_USER_GET_CONFIG),
REQ(VHOST_USER_SET_CONFIG),
+ REQ(VHOST_USER_POSTCOPY_ADVISE),
+ REQ(VHOST_USER_POSTCOPY_LISTEN),
+ REQ(VHOST_USER_POSTCOPY_END),
REQ(VHOST_USER_MAX),
};
#undef REQ
@@ -171,6 +185,35 @@ vmsg_close_fds(VhostUserMsg *vmsg)
}
}
+/* Test whether userfaultfd is available, at build time and on the running kernel */
+static bool
+have_userfault(void)
+{
+#if defined(__linux__) && defined(__NR_userfaultfd) &&\
+ defined(UFFD_FEATURE_MISSING_SHMEM) &&\
+ defined(UFFD_FEATURE_MISSING_HUGETLBFS)
+    /* Now test that the kernel we're running on really has the features */
+ int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ struct uffdio_api api_struct;
+ if (ufd < 0) {
+ return false;
+ }
+
+ api_struct.api = UFFD_API;
+ api_struct.features = UFFD_FEATURE_MISSING_SHMEM |
+ UFFD_FEATURE_MISSING_HUGETLBFS;
+ if (ioctl(ufd, UFFDIO_API, &api_struct)) {
+ close(ufd);
+ return false;
+ }
+ close(ufd);
+ return true;
+
+#else
+ return false;
+#endif
+}
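
For context, the features negotiated above (MISSING_SHMEM/MISSING_HUGETLBFS) are what allow shared-memory mappings to generate missing-page events. A minimal sketch of the consumer side of such a fd, illustrative only and not part of this patch (ufd is assumed to be an API-initialised userfaultfd with a registered range):

    struct uffd_msg msg;
    /* the kernel delivers one struct uffd_msg per read() */
    ssize_t len = read(ufd, &msg, sizeof(msg));
    if (len == sizeof(msg) && msg.event == UFFD_EVENT_PAGEFAULT) {
        uint64_t fault_addr = msg.arg.pagefault.address;
        /* fetch the page contents, then resolve the fault with UFFDIO_COPY */
    }
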
+
static bool
vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
{
@@ -245,6 +288,31 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
{
int rc;
uint8_t *p = (uint8_t *)vmsg;
+ char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
+ struct iovec iov = {
+ .iov_base = (char *)vmsg,
+ .iov_len = VHOST_USER_HDR_SIZE,
+ };
+ struct msghdr msg = {
+ .msg_iov = &iov,
+ .msg_iovlen = 1,
+ .msg_control = control,
+ };
+ struct cmsghdr *cmsg;
+
+ assert(vmsg->fd_num <= VHOST_MEMORY_MAX_NREGIONS);
+ if (vmsg->fd_num > 0) {
+ size_t fdsize = vmsg->fd_num * sizeof(int);
+ msg.msg_controllen = CMSG_SPACE(fdsize);
+ cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_len = CMSG_LEN(fdsize);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize);
+ } else {
+ msg.msg_controllen = 0;
+ }
/* Set the version in the flags when sending the reply */
vmsg->flags &= ~VHOST_USER_VERSION_MASK;
@@ -252,7 +320,7 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
vmsg->flags |= VHOST_USER_REPLY_MASK;
do {
- rc = write(conn_fd, p, VHOST_USER_HDR_SIZE);
+ rc = sendmsg(conn_fd, &msg, 0);
} while (rc < 0 && (errno == EINTR || errno == EAGAIN));
do {
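
Note that sendmsg() here carries only the header plus any ancillary fds; the payload is still written by the second loop in this function (its opening "do {" is the last context line above). The matching receive side, which vu_message_read() in this file already implements, reduces to roughly this (condensed sketch, not the verbatim code):

    struct cmsghdr *cmsg;
    recvmsg(conn_fd, &msg, 0);
    for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
        if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
            vmsg->fd_num = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
            memcpy(vmsg->fds, CMSG_DATA(cmsg), vmsg->fd_num * sizeof(int));
        }
    }

This is also why several handlers below now set vmsg->fd_num = 0 before replying: replies reuse the request's VhostUserMsg, and any fds that arrived with the request must not be echoed back.
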
@@ -345,6 +413,7 @@ vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg)
}
vmsg->size = sizeof(vmsg->payload.u64);
+ vmsg->fd_num = 0;
DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
@@ -410,6 +479,148 @@ vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg)
}
static bool
+vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
+{
+ int i;
+ VhostUserMemory *memory = &vmsg->payload.memory;
+ dev->nregions = memory->nregions;
+
+ DPRINT("Nregions: %d\n", memory->nregions);
+ for (i = 0; i < dev->nregions; i++) {
+ void *mmap_addr;
+ VhostUserMemoryRegion *msg_region = &memory->regions[i];
+ VuDevRegion *dev_region = &dev->regions[i];
+
+ DPRINT("Region %d\n", i);
+ DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n",
+ msg_region->guest_phys_addr);
+ DPRINT(" memory_size: 0x%016"PRIx64"\n",
+ msg_region->memory_size);
+ DPRINT(" userspace_addr 0x%016"PRIx64"\n",
+ msg_region->userspace_addr);
+ DPRINT(" mmap_offset 0x%016"PRIx64"\n",
+ msg_region->mmap_offset);
+
+ dev_region->gpa = msg_region->guest_phys_addr;
+ dev_region->size = msg_region->memory_size;
+ dev_region->qva = msg_region->userspace_addr;
+ dev_region->mmap_offset = msg_region->mmap_offset;
+
+        /* We don't use the offset argument of mmap() since the
+         * mapped address has to be page aligned, and we use huge
+         * pages.
+         * In postcopy we map with PROT_NONE to catch anyone
+         * accessing the region before we userfault-register it.
+         */
+ mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
+ PROT_NONE, MAP_SHARED,
+ vmsg->fds[i], 0);
+
+ if (mmap_addr == MAP_FAILED) {
+ vu_panic(dev, "region mmap error: %s", strerror(errno));
+ } else {
+ dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
+ DPRINT(" mmap_addr: 0x%016"PRIx64"\n",
+ dev_region->mmap_addr);
+ }
+
+ /* Return the address to QEMU so that it can translate the ufd
+ * fault addresses back.
+ */
+ msg_region->userspace_addr = (uintptr_t)(mmap_addr +
+ dev_region->mmap_offset);
+ close(vmsg->fds[i]);
+ }
+
+ /* Send the message back to qemu with the addresses filled in */
+ vmsg->fd_num = 0;
+ if (!vu_message_write(dev, dev->sock, vmsg)) {
+ vu_panic(dev, "failed to respond to set-mem-table for postcopy");
+ return false;
+ }
+
+ /* Wait for QEMU to confirm that it's registered the handler for the
+ * faults.
+ */
+ if (!vu_message_read(dev, dev->sock, vmsg) ||
+ vmsg->size != sizeof(vmsg->payload.u64) ||
+ vmsg->payload.u64 != 0) {
+ vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table");
+ return false;
+ }
+
+ /* OK, now we can go and register the memory and generate faults */
+ for (i = 0; i < dev->nregions; i++) {
+ VuDevRegion *dev_region = &dev->regions[i];
+ int ret;
+#ifdef UFFDIO_REGISTER
+        /* We should already have an open ufd; mark each memory
+         * range for userfault.
+         * Discard any mapping we have here; note we can't use MADV_REMOVE
+         * or fallocate to make the hole, since we don't want to lose
+         * data that has already arrived in the shared process.
+         * TODO: How to handle hugepages
+         */
+ ret = madvise((void *)dev_region->mmap_addr,
+ dev_region->size + dev_region->mmap_offset,
+ MADV_DONTNEED);
+ if (ret) {
+ fprintf(stderr,
+ "%s: Failed to madvise(DONTNEED) region %d: %s\n",
+ __func__, i, strerror(errno));
+ }
+        /* Turn off transparent hugepages so we don't lose wakeups
+         * on neighbouring pages.
+         * TODO: Turn this back on later.
+         */
+ ret = madvise((void *)dev_region->mmap_addr,
+ dev_region->size + dev_region->mmap_offset,
+ MADV_NOHUGEPAGE);
+ if (ret) {
+            /* Note: This can legitimately happen on kernels configured
+             * without madvise'able hugepages
+             */
+ fprintf(stderr,
+ "%s: Failed to madvise(NOHUGEPAGE) region %d: %s\n",
+ __func__, i, strerror(errno));
+ }
+ struct uffdio_register reg_struct;
+ reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
+ reg_struct.range.len = dev_region->size + dev_region->mmap_offset;
+ reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
+
+ if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, &reg_struct)) {
+            vu_panic(dev, "%s: Failed to userfault region %d "
+                          "@0x%"PRIx64" + size:0x%"PRIx64
+                          " offset:0x%"PRIx64" (ufd=%d)%s\n",
+                     __func__, i,
+                     dev_region->mmap_addr,
+                     dev_region->size, dev_region->mmap_offset,
+                     dev->postcopy_ufd, strerror(errno));
+ return false;
+ }
+ if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
+ vu_panic(dev, "%s Region (%d) doesn't support COPY",
+ __func__, i);
+ return false;
+ }
+ DPRINT("%s: region %d: Registered userfault for %llx + %llx\n",
+ __func__, i, reg_struct.range.start, reg_struct.range.len);
+        /* Now that it's registered we can let the client at it */
+ if (mprotect((void *)dev_region->mmap_addr,
+ dev_region->size + dev_region->mmap_offset,
+ PROT_READ | PROT_WRITE)) {
+ vu_panic(dev, "failed to mprotect region %d for postcopy (%s)",
+ i, strerror(errno));
+ return false;
+ }
+ /* TODO: Stash 'zero' support flags somewhere */
+#endif
+ }
+
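+    /* Reply already sent above; tell the caller not to send another */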
+ return false;
+}
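
The userspace_addr values returned above are the key to the protocol: QEMU keys incoming fault addresses off them once pages start faulting. A rough sketch of that translation on the QEMU side, illustrative only (regions[] is assumed to mirror the reply sent back here):

    /* fault_addr as reported via the ufd the backend registered */
    for (i = 0; i < nregions; i++) {
        uint64_t base = regions[i].userspace_addr;
        if (fault_addr >= base &&
            fault_addr <  base + regions[i].memory_size) {
            uint64_t gpa = regions[i].guest_phys_addr + (fault_addr - base);
            /* request the page for gpa, then place it with UFFDIO_COPY */
            break;
        }
    }
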
+
+static bool
vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg)
{
int i;
@@ -425,6 +636,10 @@ vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg)
}
dev->nregions = memory->nregions;
+ if (dev->postcopy_listening) {
+ return vu_set_mem_table_exec_postcopy(dev, vmsg);
+ }
+
DPRINT("Nregions: %d\n", memory->nregions);
for (i = 0; i < dev->nregions; i++) {
void *mmap_addr;
@@ -500,6 +715,7 @@ vu_set_log_base_exec(VuDev *dev, VhostUserMsg *vmsg)
dev->log_size = log_mmap_size;
vmsg->size = sizeof(vmsg->payload.u64);
+ vmsg->fd_num = 0;
return true;
}
@@ -752,12 +968,17 @@ vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ;
+ if (have_userfault()) {
+ features |= 1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT;
+ }
+
if (dev->iface->get_protocol_features) {
features |= dev->iface->get_protocol_features(dev);
}
vmsg->payload.u64 = features;
vmsg->size = sizeof(vmsg->payload.u64);
+ vmsg->fd_num = 0;
return true;
}
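
A front-end is expected to check this bit before attempting postcopy; roughly (illustrative):

    if (protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT)) {
        /* safe to send VHOST_USER_POSTCOPY_ADVISE and friends */
    }
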
@@ -857,6 +1078,77 @@ vu_set_config(VuDev *dev, VhostUserMsg *vmsg)
}
static bool
+vu_set_postcopy_advise(VuDev *dev, VhostUserMsg *vmsg)
+{
+ dev->postcopy_ufd = -1;
+#ifdef UFFDIO_API
+ struct uffdio_api api_struct;
+
+ dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ vmsg->size = 0;
+#endif
+
+ if (dev->postcopy_ufd == -1) {
+ vu_panic(dev, "Userfaultfd not available: %s", strerror(errno));
+ goto out;
+ }
+
+#ifdef UFFDIO_API
+ api_struct.api = UFFD_API;
+ api_struct.features = 0;
+ if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) {
+ vu_panic(dev, "Failed UFFDIO_API: %s", strerror(errno));
+ close(dev->postcopy_ufd);
+ dev->postcopy_ufd = -1;
+ goto out;
+ }
+ /* TODO: Stash feature flags somewhere */
+#endif
+
+out:
+    /* Return the ufd to QEMU */
+ vmsg->fd_num = 1;
+ vmsg->fds[0] = dev->postcopy_ufd;
+ return true; /* = send a reply */
+}
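
The ufd handed back here is what QEMU drives for the rest of the migration: when the backend touches a missing page in the shared region, the fault surfaces on this fd and QEMU places the page. One resolution looks roughly like the following (illustrative only; page_buf and page_size are assumptions, not names from this patch):

    struct uffdio_copy copy = {
        .dst  = fault_addr & ~(uint64_t)(page_size - 1), /* page-aligned */
        .src  = (uint64_t)(uintptr_t)page_buf,           /* incoming page data */
        .len  = page_size,
        .mode = 0,
    };
    if (ioctl(ufd, UFFDIO_COPY, &copy) && errno != EEXIST) {
        /* the page could not be placed */
    }
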
+
+static bool
+vu_set_postcopy_listen(VuDev *dev, VhostUserMsg *vmsg)
+{
+ vmsg->payload.u64 = -1;
+ vmsg->size = sizeof(vmsg->payload.u64);
+
+ if (dev->nregions) {
+ vu_panic(dev, "Regions already registered at postcopy-listen");
+ return true;
+ }
+ dev->postcopy_listening = true;
+
+ vmsg->flags = VHOST_USER_VERSION | VHOST_USER_REPLY_MASK;
+ vmsg->payload.u64 = 0; /* Success */
+ return true;
+}
+
+static bool
+vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg)
+{
+ DPRINT("%s: Entry\n", __func__);
+ dev->postcopy_listening = false;
+    if (dev->postcopy_ufd >= 0) {
+ close(dev->postcopy_ufd);
+ dev->postcopy_ufd = -1;
+ DPRINT("%s: Done close\n", __func__);
+ }
+
+ vmsg->fd_num = 0;
+ vmsg->payload.u64 = 0;
+ vmsg->size = sizeof(vmsg->payload.u64);
+ vmsg->flags = VHOST_USER_VERSION | VHOST_USER_REPLY_MASK;
+ DPRINT("%s: exit\n", __func__);
+ return true;
+}
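
Taken together, the handlers above imply this ordering for a postcopy migration (a sketch pieced together from the checks in the code, not stated anywhere in the patch itself):

    VHOST_USER_POSTCOPY_ADVISE   backend opens a userfaultfd, passes it to QEMU
    VHOST_USER_POSTCOPY_LISTEN   must precede SET_MEM_TABLE (nregions check)
    VHOST_USER_SET_MEM_TABLE     PROT_NONE maps, address reply, UFFDIO_REGISTER
        ... pages are placed via UFFDIO_COPY while the guest runs ...
    VHOST_USER_POSTCOPY_END      ufd is closed, normal operation resumes
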
+
+static bool
vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
{
int do_reply = 0;
@@ -927,6 +1219,12 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
return vu_set_config(dev, vmsg);
case VHOST_USER_NONE:
break;
+ case VHOST_USER_POSTCOPY_ADVISE:
+ return vu_set_postcopy_advise(dev, vmsg);
+ case VHOST_USER_POSTCOPY_LISTEN:
+ return vu_set_postcopy_listen(dev, vmsg);
+ case VHOST_USER_POSTCOPY_END:
+ return vu_set_postcopy_end(dev, vmsg);
default:
vmsg_close_fds(vmsg);
vu_panic(dev, "Unhandled request: %d", vmsg->request);