aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile1
-rw-r--r--Makefile.objs2
-rw-r--r--contrib/libvhost-user/Makefile.objs1
-rw-r--r--contrib/libvhost-user/libvhost-user.c1499
-rw-r--r--contrib/libvhost-user/libvhost-user.h435
-rw-r--r--hw/i386/amd_iommu.c2
-rw-r--r--hw/i386/amd_iommu.h4
-rw-r--r--hw/pci/pci.c4
-rw-r--r--hw/s390x/virtio-ccw.c4
-rw-r--r--hw/virtio/virtio-mmio.c2
-rw-r--r--tests/Makefile.include2
-rw-r--r--tests/vhost-user-bridge.c1183
12 files changed, 2175 insertions, 964 deletions
diff --git a/Makefile b/Makefile
index 214cbad35d..1a8bfb225c 100644
--- a/Makefile
+++ b/Makefile
@@ -149,6 +149,7 @@ dummy := $(call unnest-vars,, \
qga-obj-y \
ivshmem-client-obj-y \
ivshmem-server-obj-y \
+ libvhost-user-obj-y \
qga-vss-dll-obj-y \
block-obj-y \
block-obj-m \
diff --git a/Makefile.objs b/Makefile.objs
index 51c36a4d54..01cef866e4 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -115,7 +115,7 @@ qga-vss-dll-obj-y = qga/
# contrib
ivshmem-client-obj-y = contrib/ivshmem-client/
ivshmem-server-obj-y = contrib/ivshmem-server/
-
+libvhost-user-obj-y = contrib/libvhost-user/
######################################################################
trace-events-y = trace-events
diff --git a/contrib/libvhost-user/Makefile.objs b/contrib/libvhost-user/Makefile.objs
new file mode 100644
index 0000000000..cef1ad6e31
--- /dev/null
+++ b/contrib/libvhost-user/Makefile.objs
@@ -0,0 +1 @@
+libvhost-user-obj-y = libvhost-user.o
diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
new file mode 100644
index 0000000000..af4faad60b
--- /dev/null
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -0,0 +1,1499 @@
+/*
+ * Vhost User library
+ *
+ * Copyright IBM, Corp. 2007
+ * Copyright (c) 2016 Red Hat, Inc.
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Marc-André Lureau <mlureau@redhat.com>
+ * Victor Kaplansky <victork@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include <qemu/osdep.h>
+#include <sys/eventfd.h>
+#include <linux/vhost.h>
+
+#include "qemu/atomic.h"
+
+#include "libvhost-user.h"
+
+/* Offset of payload.u64 inside VhostUserMsg; used below as the number of
+ * header bytes that are read/written ahead of the payload. */
+#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
+
+/* The version of the protocol we support */
+#define VHOST_USER_VERSION 1
+/* Set to a non-zero value to make DPRINT() actually print. */
+#define LIBVHOST_USER_DEBUG 0
+
+/* printf-style trace to stderr, emitted only when LIBVHOST_USER_DEBUG is
+ * non-zero. The call stays in the source either way, so the arguments are
+ * always compiled. */
+#define DPRINT(...) \
+ do { \
+ if (LIBVHOST_USER_DEBUG) { \
+ fprintf(stderr, __VA_ARGS__); \
+ } \
+ } while (0)
+
+/*
+ * Map a vhost-user request number to its symbolic name (for debug traces).
+ *
+ * Returns "unknown" for any value without a table entry: negative numbers,
+ * values >= VHOST_USER_MAX, or slots the table leaves NULL.
+ */
+static const char *
+vu_request_to_string(int req)
+{
+#define REQ(req) [req] = #req
+    static const char *vu_request_str[] = {
+        REQ(VHOST_USER_NONE),
+        REQ(VHOST_USER_GET_FEATURES),
+        REQ(VHOST_USER_SET_FEATURES),
+        REQ(VHOST_USER_SET_OWNER),
+        REQ(VHOST_USER_RESET_OWNER),
+        REQ(VHOST_USER_SET_MEM_TABLE),
+        REQ(VHOST_USER_SET_LOG_BASE),
+        REQ(VHOST_USER_SET_LOG_FD),
+        REQ(VHOST_USER_SET_VRING_NUM),
+        REQ(VHOST_USER_SET_VRING_ADDR),
+        REQ(VHOST_USER_SET_VRING_BASE),
+        REQ(VHOST_USER_GET_VRING_BASE),
+        REQ(VHOST_USER_SET_VRING_KICK),
+        REQ(VHOST_USER_SET_VRING_CALL),
+        REQ(VHOST_USER_SET_VRING_ERR),
+        REQ(VHOST_USER_GET_PROTOCOL_FEATURES),
+        REQ(VHOST_USER_SET_PROTOCOL_FEATURES),
+        REQ(VHOST_USER_GET_QUEUE_NUM),
+        REQ(VHOST_USER_SET_VRING_ENABLE),
+        REQ(VHOST_USER_SEND_RARP),
+        REQ(VHOST_USER_INPUT_GET_CONFIG),
+        /* Keeps the array VHOST_USER_MAX + 1 entries long, so every index
+         * below VHOST_USER_MAX is inside the table. */
+        REQ(VHOST_USER_MAX),
+    };
+#undef REQ
+
+    /* req is signed: a negative value must not be used as an index. */
+    if (req >= 0 && req < VHOST_USER_MAX && vu_request_str[req]) {
+        return vu_request_str[req];
+    }
+
+    return "unknown";
+}
+
+/*
+ * Report a fatal device error.
+ *
+ * Formats msg printf-style, marks dev as broken and passes the resulting
+ * text to the dev->panic callback. If the message cannot be formatted
+ * (vasprintf failure) the raw format string is passed instead.
+ */
+static void
+vu_panic(VuDev *dev, const char *msg, ...)
+{
+    char *buf = NULL;
+    va_list ap;
+
+    va_start(ap, msg);
+    if (vasprintf(&buf, msg, ap) < 0) {
+        /* The contents of buf are undefined after a failed vasprintf(). */
+        buf = NULL;
+    }
+    va_end(ap);
+
+    dev->broken = true;
+    dev->panic(dev, buf ? buf : msg);
+    free(buf);
+
+    /* FIXME: find a way to call virtio_error? */
+}
+
+/* Translate guest physical address to our virtual address.
+ * Returns NULL when guest_addr does not fall inside any region of dev. */
+void *
+vu_gpa_to_va(VuDev *dev, uint64_t guest_addr)
+{
+    int i;
+
+    /* Find matching memory region. */
+    for (i = 0; i < dev->nregions; i++) {
+        VuDevRegion *r = &dev->regions[i];
+
+        /* Compare the offset into the region instead of computing
+         * gpa + size, which could wrap around. */
+        if (guest_addr >= r->gpa && guest_addr - r->gpa < r->size) {
+            /* Do the whole sum on integers and convert to a pointer once,
+             * rather than casting guest_addr alone and relying on void *
+             * arithmetic for the rest. */
+            return (void *)(uintptr_t)
+                (guest_addr - r->gpa + r->mmap_addr + r->mmap_offset);
+        }
+    }
+
+    return NULL;
+}
+
+/* Translate qemu virtual address to our virtual address.
+ * Returns NULL when qemu_addr does not fall inside any region of dev. */
+static void *
+qva_to_va(VuDev *dev, uint64_t qemu_addr)
+{
+    int i;
+
+    /* Find matching memory region. */
+    for (i = 0; i < dev->nregions; i++) {
+        VuDevRegion *r = &dev->regions[i];
+
+        /* Compare the offset into the region instead of computing
+         * qva + size, which could wrap around. */
+        if (qemu_addr >= r->qva && qemu_addr - r->qva < r->size) {
+            /* Do the whole sum on integers and convert to a pointer once,
+             * rather than casting qemu_addr alone and relying on void *
+             * arithmetic for the rest. */
+            return (void *)(uintptr_t)
+                (qemu_addr - r->qva + r->mmap_addr + r->mmap_offset);
+        }
+    }
+
+    return NULL;
+}
+
+/* Close each of the vmsg->fd_num descriptors stored in vmsg->fds. */
+static void
+vmsg_close_fds(VhostUserMsg *vmsg)
+{
+    int idx = 0;
+
+    while (idx < vmsg->fd_num) {
+        close(vmsg->fds[idx]);
+        idx++;
+    }
+}
+
+/*
+ * Read one message from conn_fd into vmsg.
+ *
+ * The header is received with recvmsg() so that descriptors passed as
+ * SCM_RIGHTS ancillary data can be collected into vmsg->fds; the payload
+ * (vmsg->size bytes) is then read separately.
+ *
+ * Returns true on success. On failure vu_panic() is called and false is
+ * returned; descriptors already received are closed.
+ */
+static bool
+vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
+{
+    char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
+    struct iovec iov = {
+        .iov_base = (char *)vmsg,
+        .iov_len = VHOST_USER_HDR_SIZE,
+    };
+    struct msghdr msg = {
+        .msg_iov = &iov,
+        .msg_iovlen = 1,
+        .msg_control = control,
+        .msg_controllen = sizeof(control),
+    };
+    size_t fd_size;
+    struct cmsghdr *cmsg;
+    int rc;
+
+    do {
+        rc = recvmsg(conn_fd, &msg, 0);
+    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
+
+    if (rc < 0) {
+        vu_panic(dev, "Error while recvmsg: %s", strerror(errno));
+        return false;
+    }
+    if (rc == 0) {
+        /* Peer closed the connection: errno is not set by a 0 return. */
+        vu_panic(dev, "Error while recvmsg: connection closed by peer");
+        return false;
+    }
+
+    /* Collect the descriptors first so that every later failure path can
+     * close them. */
+    vmsg->fd_num = 0;
+    for (cmsg = CMSG_FIRSTHDR(&msg);
+         cmsg != NULL;
+         cmsg = CMSG_NXTHDR(&msg, cmsg))
+    {
+        if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
+            fd_size = cmsg->cmsg_len - CMSG_LEN(0);
+            vmsg->fd_num = fd_size / sizeof(int);
+            memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
+            break;
+        }
+    }
+
+    if ((size_t)rc != VHOST_USER_HDR_SIZE) {
+        /* A truncated header would leave request/flags/size unreliable. */
+        vu_panic(dev, "Error while recvmsg: short header (%d bytes)", rc);
+        goto fail;
+    }
+
+    if (vmsg->size > sizeof(vmsg->payload)) {
+        vu_panic(dev,
+                 "Error: too big message request: %d, size: vmsg->size: %u, "
+                 "while sizeof(vmsg->payload) = %zu\n",
+                 vmsg->request, vmsg->size, sizeof(vmsg->payload));
+        goto fail;
+    }
+
+    if (vmsg->size) {
+        do {
+            rc = read(conn_fd, &vmsg->payload, vmsg->size);
+        } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
+
+        if (rc < 0) {
+            vu_panic(dev, "Error while reading: %s", strerror(errno));
+            goto fail;
+        }
+
+        /* Covers both EOF (rc == 0) and a partial payload; report it
+         * instead of aborting the process with assert(). */
+        if ((size_t)rc != vmsg->size) {
+            vu_panic(dev, "Error while reading: short payload (%d of %u)",
+                     rc, vmsg->size);
+            goto fail;
+        }
+    }
+
+    return true;
+
+fail:
+    vmsg_close_fds(vmsg);
+
+    return false;
+}
+
+/*
+ * Send vmsg on conn_fd as a reply.
+ *
+ * The version and reply bits are set in vmsg->flags, then the header is
+ * written, followed by vmsg->size payload bytes taken from vmsg->data when
+ * it is set, or from the in-message payload otherwise.
+ *
+ * Returns true on success; on a write error vu_panic() is called and false
+ * is returned.
+ */
+static bool
+vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
+{
+    int rc;
+    uint8_t *p = (uint8_t *)vmsg;
+
+    /* Set the version in the flags when sending the reply */
+    vmsg->flags &= ~VHOST_USER_VERSION_MASK;
+    vmsg->flags |= VHOST_USER_VERSION;
+    vmsg->flags |= VHOST_USER_REPLY_MASK;
+
+    do {
+        rc = write(conn_fd, p, VHOST_USER_HDR_SIZE);
+    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
+
+    /* Do not let the payload write below hide a failed header write. */
+    if (rc <= 0) {
+        vu_panic(dev, "Error while writing: %s", strerror(errno));
+        return false;
+    }
+
+    /* write() of zero bytes returns 0, which must not be read as an error. */
+    if (vmsg->size == 0) {
+        return true;
+    }
+
+    do {
+        if (vmsg->data) {
+            rc = write(conn_fd, vmsg->data, vmsg->size);
+        } else {
+            rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size);
+        }
+    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
+
+    if (rc <= 0) {
+        vu_panic(dev, "Error while writing: %s", strerror(errno));
+        return false;
+    }
+
+    return true;
+}
+
+/* Signal dev->log_call_fd with an eventfd write of 1, unless no such
+ * descriptor is set (-1). A failed write is reported through vu_panic(). */
+static void
+vu_log_kick(VuDev *dev)
+{
+    if (dev->log_call_fd == -1) {
+        return;
+    }
+
+    DPRINT("Kicking the QEMU's log...\n");
+    if (eventfd_write(dev->log_call_fd, 1) < 0) {
+        vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
+    }
+}
+
+/* Atomically set the bit for `page` in log_table, a bitmap holding eight
+ * pages per byte. */
+static void
+vu_log_page(uint8_t *log_table, uint64_t page)
+{
+    uint8_t *slot = &log_table[page / 8];
+
+    DPRINT("Logged dirty guest page: %"PRId64"\n", page);
+    atomic_or(slot, 1 << (page % 8));
+}
+
+/*
+ * Mark every log page touched by [address, address + length) as dirty and
+ * kick the log eventfd.
+ *
+ * Does nothing unless VHOST_F_LOG_ALL is set in dev->features, a log table
+ * is present and length is non-zero.
+ */
+static void
+vu_log_write(VuDev *dev, uint64_t address, uint64_t length)
+{
+    uint64_t page;
+
+    if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) ||
+        !dev->log_table || !length) {
+        return;
+    }
+
+    assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8));
+
+    page = address / VHOST_LOG_PAGE;
+    while (page * VHOST_LOG_PAGE < address + length) {
+        vu_log_page(dev->log_table, page);
+        /* page is a page index, not a byte address: step one page at a
+         * time so that no page inside the range is skipped. */
+        page += 1;
+    }
+
+    vu_log_kick(dev);
+}
+
+/*
+ * Watch callback for a queue's kick eventfd.
+ *
+ * data carries the queue index. The eventfd counter is read; on failure the
+ * device panics and the watch is removed, otherwise the queue handler (if
+ * any) is invoked.
+ */
+static void
+vu_kick_cb(VuDev *dev, int condition, void *data)
+{
+    int qidx = (intptr_t)data;
+    VuVirtq *vq = &dev->vq[qidx];
+    eventfd_t kick_data;
+
+    if (eventfd_read(vq->kick_fd, &kick_data) == -1) {
+        vu_panic(dev, "kick eventfd_read(): %s", strerror(errno));
+        dev->remove_watch(dev, dev->vq[qidx].kick_fd);
+        return;
+    }
+
+    DPRINT("Got kick_data: %016"PRIx64" handler:%p idx:%d\n",
+           kick_data, vq->handler, qidx);
+    if (vq->handler) {
+        vq->handler(dev, qidx);
+    }
+}
+
+/*
+ * GET_FEATURES handler: reply with VHOST_F_LOG_ALL and
+ * VHOST_USER_F_PROTOCOL_FEATURES, OR-ed with whatever the optional
+ * iface->get_features callback returns. Always returns true (reply).
+ */
+static bool
+vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg)
+{
+    uint64_t features = 1ULL << VHOST_F_LOG_ALL |
+                        1ULL << VHOST_USER_F_PROTOCOL_FEATURES;
+
+    if (dev->iface->get_features) {
+        features |= dev->iface->get_features(dev);
+    }
+
+    vmsg->payload.u64 = features;
+    vmsg->size = sizeof(vmsg->payload.u64);
+
+    DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
+
+    return true;
+}
+
+/* Set the enable flag of all VHOST_MAX_NR_VIRTQUEUE queues to `enabled`. */
+static void
+vu_set_enable_all_rings(VuDev *dev, bool enabled)
+{
+    int qidx = 0;
+
+    while (qidx < VHOST_MAX_NR_VIRTQUEUE) {
+        dev->vq[qidx].enable = enabled;
+        qidx++;
+    }
+}
+
+/*
+ * SET_FEATURES handler: record the negotiated feature bits and forward them
+ * to the optional iface->set_features callback.
+ *
+ * When VHOST_USER_F_PROTOCOL_FEATURES was not negotiated, all rings are
+ * enabled here. Always returns false (no reply).
+ */
+static bool
+vu_set_features_exec(VuDev *dev, VhostUserMsg *vmsg)
+{
+    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
+
+    dev->features = vmsg->payload.u64;
+
+    /* VHOST_USER_F_PROTOCOL_FEATURES is a bit number, not a mask. */
+    if (!(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) {
+        vu_set_enable_all_rings(dev, true);
+    }
+
+    if (dev->iface->set_features) {
+        dev->iface->set_features(dev, dev->features);
+    }
+
+    return false;
+}
+
+/* SET_OWNER handler: nothing to do; returns false (no reply). */
+static bool
+vu_set_owner_exec(VuDev *dev, VhostUserMsg *vmsg)
+{
+    bool reply = false;
+
+    return reply;
+}
+
+/* Unmap the dirty log table (if mapped) and close the log eventfd (if
+ * open), resetting both fields. */
+static void
+vu_close_log(VuDev *dev)
+{
+    if (dev->log_table != NULL) {
+        int rc = munmap(dev->log_table, dev->log_size);
+
+        if (rc != 0) {
+            perror("close log munmap() error");
+        }
+        dev->log_table = NULL;
+    }
+
+    if (dev->log_call_fd == -1) {
+        return;
+    }
+
+    close(dev->log_call_fd);
+    dev->log_call_fd = -1;
+}
+
+/* RESET_OWNER handler: disable every ring; returns false (no reply). */
+static bool
+vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg)
+{
+    int qidx;
+
+    for (qidx = 0; qidx < VHOST_MAX_NR_VIRTQUEUE; qidx++) {
+        dev->vq[qidx].enable = false;
+    }
+
+    return false;
+}
+
+/*
+ * SET_MEM_TABLE handler: record each memory region from the message and
+ * mmap() it from the descriptor passed alongside it.
+ *
+ * The message is rejected when the region count exceeds
+ * VHOST_MEMORY_MAX_NREGIONS or does not match the number of descriptors
+ * received. Always returns false (no reply).
+ */
+static bool
+vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg)
+{
+    int i;
+    VhostUserMemory *memory = &vmsg->payload.memory;
+
+    /* nregions comes from the peer and indexes memory->regions,
+     * dev->regions and vmsg->fds below.
+     * NOTE(review): assumes dev->regions holds VHOST_MEMORY_MAX_NREGIONS
+     * entries - confirm against libvhost-user.h. */
+    if (memory->nregions > VHOST_MEMORY_MAX_NREGIONS ||
+        vmsg->fd_num != memory->nregions) {
+        vmsg_close_fds(vmsg);
+        vu_panic(dev, "Invalid mem_table message");
+        return false;
+    }
+
+    dev->nregions = memory->nregions;
+
+    DPRINT("Nregions: %d\n", memory->nregions);
+    for (i = 0; i < dev->nregions; i++) {
+        void *mmap_addr;
+        VhostUserMemoryRegion *msg_region = &memory->regions[i];
+        VuDevRegion *dev_region = &dev->regions[i];
+
+        DPRINT("Region %d\n", i);
+        DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n",
+               msg_region->guest_phys_addr);
+        DPRINT(" memory_size: 0x%016"PRIx64"\n",
+               msg_region->memory_size);
+        DPRINT(" userspace_addr 0x%016"PRIx64"\n",
+               msg_region->userspace_addr);
+        DPRINT(" mmap_offset 0x%016"PRIx64"\n",
+               msg_region->mmap_offset);
+
+        dev_region->gpa = msg_region->guest_phys_addr;
+        dev_region->size = msg_region->memory_size;
+        dev_region->qva = msg_region->userspace_addr;
+        dev_region->mmap_offset = msg_region->mmap_offset;
+
+        /* We don't use offset argument of mmap() since the
+         * mapped address has to be page aligned, and we use huge
+         * pages. */
+        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
+                         PROT_READ | PROT_WRITE, MAP_SHARED,
+                         vmsg->fds[i], 0);
+
+        if (mmap_addr == MAP_FAILED) {
+            vu_panic(dev, "region mmap error: %s", strerror(errno));
+        } else {
+            DPRINT(" mmap_addr: 0x%016"PRIx64"\n",
+                   (uint64_t)(uintptr_t)mmap_addr);
+        }
+        /* Store the result even on failure: vu_deinit() skips regions
+         * whose mmap_addr is MAP_FAILED, so a stale value must not be
+         * left behind. */
+        dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
+
+        close(vmsg->fds[i]);
+    }
+
+    return false;
+}
+
+/*
+ * SET_LOG_BASE handler: map the dirty log table described by the message
+ * from the single descriptor that accompanies it.
+ *
+ * On success any previously mapped table is unmapped and replaced. On a
+ * malformed message or mmap() failure the device panics and the current
+ * table is left untouched. The received descriptor is always closed.
+ * Returns true (a reply is sent), as before.
+ */
+static bool
+vu_set_log_base_exec(VuDev *dev, VhostUserMsg *vmsg)
+{
+    int fd;
+    uint64_t log_mmap_size, log_mmap_offset;
+    void *rc;
+
+    if (vmsg->fd_num != 1 ||
+        vmsg->size != sizeof(vmsg->payload.log)) {
+        vmsg_close_fds(vmsg);
+        vu_panic(dev, "Invalid log_base message");
+        return true;
+    }
+
+    fd = vmsg->fds[0];
+    log_mmap_offset = vmsg->payload.log.mmap_offset;
+    log_mmap_size = vmsg->payload.log.mmap_size;
+    DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset);
+    DPRINT("Log mmap_size: %"PRId64"\n", log_mmap_size);
+
+    rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
+              log_mmap_offset);
+    if (rc == MAP_FAILED) {
+        int err = errno;    /* close() below may overwrite errno */
+
+        close(fd);
+        /* Never store MAP_FAILED: vu_log_write() only tests for NULL. */
+        vu_panic(dev, "log mmap error: %s", strerror(err));
+    } else {
+        /* The mapping outlives the descriptor. */
+        close(fd);
+
+        if (dev->log_table && munmap(dev->log_table, dev->log_size) != 0) {
+            perror("log munmap error");
+        }
+        dev->log_table = rc;
+        dev->log_size = log_mmap_size;
+    }
+
+    vmsg->size = sizeof(vmsg->payload.u64);
+
+    return true;
+}
+
+/*
+ * SET_LOG_FD handler: adopt the single descriptor carried by the message as
+ * dev->log_call_fd, closing the previous one if set. Panics when the
+ * message does not carry exactly one descriptor. Returns false (no reply).
+ */
+static bool
+vu_set_log_fd_exec(VuDev *dev, VhostUserMsg *vmsg)
+{
+    int old_fd;
+
+    if (vmsg->fd_num != 1) {
+        vu_panic(dev, "Invalid log_fd message");
+        return false;
+    }
+
+    old_fd = dev->log_call_fd;
+    if (old_fd != -1) {
+        close(old_fd);
+    }
+
+    dev->log_call_fd = vmsg->fds[0];
+    DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]);
+
+    return false;
+}
+
+/*
+ * SET_VRING_NUM handler: store the ring size for the queue named in the
+ * message. Panics on an out-of-range queue index. Returns false (no reply).
+ */
+static bool
+vu_set_vring_num_exec(VuDev *dev, VhostUserMsg *vmsg)
+{
+    unsigned int index = vmsg->payload.state.index;
+    unsigned int num = vmsg->payload.state.num;
+
+    DPRINT("State.index: %d\n", index);
+    DPRINT("State.num: %d\n", num);
+
+    /* index comes from the peer and is used to index dev->vq. */
+    if (index >= VHOST_MAX_NR_VIRTQUEUE) {
+        vu_panic(dev, "Invalid vring_num index: %u", index);
+        return false;
+    }
+
+    dev->vq[index].vring.num = num;
+
+    return false;
+}
+
+/*
+ * SET_VRING_ADDR handler: translate the descriptor/used/avail ring
+ * addresses from the message with qva_to_va() and store them in the queue.
+ *
+ * Panics on an out-of-range queue index or when any address cannot be
+ * translated. Returns false (no reply).
+ */
+static bool
+vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg)
+{
+    struct vhost_vring_addr *vra = &vmsg->payload.addr;
+    unsigned int index = vra->index;
+    VuVirtq *vq;
+
+    DPRINT("vhost_vring_addr:\n");
+    DPRINT(" index: %d\n", vra->index);
+    DPRINT(" flags: %d\n", vra->flags);
+    DPRINT(" desc_user_addr: 0x%016llx\n", vra->desc_user_addr);
+    DPRINT(" used_user_addr: 0x%016llx\n", vra->used_user_addr);
+    DPRINT(" avail_user_addr: 0x%016llx\n", vra->avail_user_addr);
+    DPRINT(" log_guest_addr: 0x%016llx\n", vra->log_guest_addr);
+
+    /* index comes from the peer and is used to index dev->vq. */
+    if (index >= VHOST_MAX_NR_VIRTQUEUE) {
+        vu_panic(dev, "Invalid vring_addr index: %u", index);
+        return false;
+    }
+    vq = &dev->vq[index];
+
+    vq->vring.flags = vra->flags;
+    vq->vring.desc = qva_to_va(dev, vra->desc_user_addr);
+    vq->vring.used = qva_to_va(dev, vra->used_user_addr);
+    vq->vring.avail = qva_to_va(dev, vra->avail_user_addr);
+    vq->vring.log_guest_addr = vra->log_guest_addr;
+
+    DPRINT("Setting virtq addresses:\n");
+    DPRINT(" vring_desc at %p\n", vq->vring.desc);
+    DPRINT(" vring_used at %p\n", vq->vring.used);
+    DPRINT(" vring_avail at %p\n", vq->vring.avail);
+
+    if (!(vq->vring.desc && vq->vring.used && vq->vring.avail)) {
+        vu_panic(dev, "Invalid vring_addr message");
+        return false;
+    }
+
+    vq->used_idx = vq->vring.used->idx;
+
+    return false;
+}
+
+/*
+ * SET_VRING_BASE handler: set both last_avail_idx and shadow_avail_idx of
+ * the named queue to the value in the message. Panics on an out-of-range
+ * queue index. Returns false (no reply).
+ */
+static bool
+vu_set_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg)
+{
+    unsigned int index = vmsg->payload.state.index;
+    unsigned int num = vmsg->payload.state.num;
+
+    DPRINT("State.index: %d\n", index);
+    DPRINT("State.num: %d\n", num);
+
+    /* index comes from the peer and is used to index dev->vq. */
+    if (index >= VHOST_MAX_NR_VIRTQUEUE) {
+        vu_panic(dev, "Invalid vring_base index: %u", index);
+        return false;
+    }
+
+    dev->vq[index].shadow_avail_idx = dev->vq[index].last_avail_idx = num;
+
+    return false;
+}
+
+/*
+ * GET_VRING_BASE handler: reply with the queue's last_avail_idx and stop
+ * the queue.
+ *
+ * The queue is marked not started, iface->queue_set_started (if set) is
+ * told so, and the queue's call and kick descriptors are closed (the kick
+ * watch is removed first).
+ *
+ * Returns true (reply) on success; panics and returns false on an
+ * out-of-range queue index.
+ */
+static bool
+vu_get_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg)
+{
+    unsigned int index = vmsg->payload.state.index;
+
+    DPRINT("State.index: %d\n", index);
+
+    /* index comes from the peer and is used to index dev->vq. */
+    if (index >= VHOST_MAX_NR_VIRTQUEUE) {
+        vu_panic(dev, "Invalid get_vring_base index: %u", index);
+        return false;
+    }
+
+    vmsg->payload.state.num = dev->vq[index].last_avail_idx;
+    vmsg->size = sizeof(vmsg->payload.state);
+
+    dev->vq[index].started = false;
+    if (dev->iface->queue_set_started) {
+        dev->iface->queue_set_started(dev, index, false);
+    }
+
+    if (dev->vq[index].call_fd != -1) {
+        close(dev->vq[index].call_fd);
+        dev->vq[index].call_fd = -1;
+    }
+    if (dev->vq[index].kick_fd != -1) {
+        dev->remove_watch(dev, dev->vq[index].kick_fd);
+        close(dev->vq[index].kick_fd);
+        dev->vq[index].kick_fd = -1;
+    }
+
+    return true;
+}
+
+/*
+ * Validate a per-queue message that is expected to carry a descriptor.
+ *
+ * Returns true when the queue index (low bits of payload.u64) is in range,
+ * the NOFD flag is clear and exactly one descriptor was received. Otherwise
+ * closes the received descriptors, panics and returns false.
+ */
+static bool
+vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
+{
+    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+    bool nofd;
+
+    if (index >= VHOST_MAX_NR_VIRTQUEUE) {
+        vmsg_close_fds(vmsg);
+        vu_panic(dev, "Invalid queue index: %u", index);
+        return false;
+    }
+
+    nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
+    if (!nofd && vmsg->fd_num == 1) {
+        return true;
+    }
+
+    vmsg_close_fds(vmsg);
+    vu_panic(dev, "Invalid fds in request: %d", vmsg->request);
+    return false;
+}
+
+/*
+ * SET_VRING_KICK handler: replace the queue's kick descriptor with the one
+ * carried by the message, mark the queue started and, when a handler is
+ * registered, start watching the new descriptor. Returns false (no reply).
+ */
+static bool
+vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
+{
+    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+    VuVirtq *vq;
+
+    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
+
+    if (!vu_check_queue_msg_file(dev, vmsg)) {
+        return false;
+    }
+
+    vq = &dev->vq[index];
+
+    /* Drop the previous kick descriptor, watch first. */
+    if (vq->kick_fd != -1) {
+        dev->remove_watch(dev, vq->kick_fd);
+        close(vq->kick_fd);
+        vq->kick_fd = -1;
+    }
+
+    if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
+        vq->kick_fd = vmsg->fds[0];
+        DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index);
+    }
+
+    vq->started = true;
+    if (dev->iface->queue_set_started) {
+        dev->iface->queue_set_started(dev, index, true);
+    }
+
+    if (vq->kick_fd != -1 && vq->handler) {
+        dev->set_watch(dev, vq->kick_fd, VU_WATCH_IN,
+                       vu_kick_cb, (void *)(long)index);
+
+        DPRINT("Waiting for kicks on fd: %d for vq: %d\n",
+               vq->kick_fd, index);
+    }
+
+    return false;
+}
+
+/*
+ * Install (or clear, when handler is NULL) the kick handler of vq.
+ *
+ * If the queue already has a kick descriptor, the watch on it is set up or
+ * removed to match.
+ */
+void vu_set_queue_handler(VuDev *dev, VuVirtq *vq,
+                          vu_queue_handler_cb handler)
+{
+    int qidx = vq - dev->vq;
+
+    vq->handler = handler;
+
+    if (vq->kick_fd < 0) {
+        return;
+    }
+
+    if (!handler) {
+        dev->remove_watch(dev, vq->kick_fd);
+        return;
+    }
+
+    dev->set_watch(dev, vq->kick_fd, VU_WATCH_IN,
+                   vu_kick_cb, (void *)(long)qidx);
+}
+
+/*
+ * SET_VRING_CALL handler: replace the queue's call descriptor with the one
+ * carried by the message. Returns false (no reply).
+ */
+static bool
+vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg)
+{
+    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+    VuVirtq *vq;
+
+    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
+
+    if (!vu_check_queue_msg_file(dev, vmsg)) {
+        return false;
+    }
+
+    vq = &dev->vq[index];
+
+    /* Drop the previous call descriptor. */
+    if (vq->call_fd != -1) {
+        close(vq->call_fd);
+        vq->call_fd = -1;
+    }
+
+    if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
+        vq->call_fd = vmsg->fds[0];
+    }
+
+    DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index);
+
+    return false;
+}
+
+/*
+ * SET_VRING_ERR handler: replace the queue's error descriptor with the one
+ * carried by the message. Returns false (no reply).
+ */
+static bool
+vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg)
+{
+    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+    VuVirtq *vq;
+
+    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
+
+    if (!vu_check_queue_msg_file(dev, vmsg)) {
+        return false;
+    }
+
+    vq = &dev->vq[index];
+
+    /* Drop the previous error descriptor. */
+    if (vq->err_fd != -1) {
+        close(vq->err_fd);
+        vq->err_fd = -1;
+    }
+
+    if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
+        vq->err_fd = vmsg->fds[0];
+    }
+
+    return false;
+}
+
+/*
+ * GET_PROTOCOL_FEATURES handler: reply with VHOST_USER_PROTOCOL_F_LOG_SHMFD
+ * OR-ed with whatever the optional iface->get_protocol_features callback
+ * returns. Always returns true (reply).
+ */
+static bool
+vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
+{
+    uint64_t supported = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD;
+
+    if (dev->iface->get_protocol_features != NULL) {
+        supported = supported | dev->iface->get_protocol_features(dev);
+    }
+
+    vmsg->size = sizeof(vmsg->payload.u64);
+    vmsg->payload.u64 = supported;
+
+    return true;
+}
+
+/*
+ * SET_PROTOCOL_FEATURES handler: record the protocol feature bits from the
+ * message and forward them to the optional iface->set_protocol_features
+ * callback. Returns false (no reply).
+ */
+static bool
+vu_set_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
+{
+    uint64_t negotiated = vmsg->payload.u64;
+
+    DPRINT("u64: 0x%016"PRIx64"\n", negotiated);
+
+    dev->protocol_features = vmsg->payload.u64;
+
+    if (dev->iface->set_protocol_features != NULL) {
+        dev->iface->set_protocol_features(dev, negotiated);
+    }
+
+    return false;
+}
+
+/* GET_QUEUE_NUM handler: not implemented; only traces the call and returns
+ * false, so no reply is sent. */
+static bool
+vu_get_queue_num_exec(VuDev *dev, VhostUserMsg *vmsg)
+{
+    bool reply = false;
+
+    DPRINT("Function %s() not implemented yet.\n", __func__);
+
+    return reply;
+}
+
+/*
+ * SET_VRING_ENABLE handler: set the enable flag of the named queue from the
+ * message. Panics on an out-of-range queue index. Returns false (no reply).
+ */
+static bool
+vu_set_vring_enable_exec(VuDev *dev, VhostUserMsg *vmsg)
+{
+    unsigned int qidx = vmsg->payload.state.index;
+    unsigned int enable = vmsg->payload.state.num;
+
+    DPRINT("State.index: %d\n", qidx);
+    DPRINT("State.enable: %d\n", enable);
+
+    if (qidx < VHOST_MAX_NR_VIRTQUEUE) {
+        dev->vq[qidx].enable = enable;
+    } else {
+        vu_panic(dev, "Invalid vring_enable index: %u", qidx);
+    }
+
+    return false;
+}
+
+/*
+ * Dispatch one received message to its handler.
+ *
+ * The optional iface->process_msg callback gets the first chance; when it
+ * returns non-zero its do_reply output decides the result. Otherwise the
+ * built-in handler for vmsg->request runs. Unknown requests close the
+ * received descriptors and panic.
+ *
+ * Returns true when vmsg now holds a reply that must be sent back.
+ */
+static bool
+vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
+{
+    int do_reply = 0;
+    bool reply = false;
+
+    /* Print out generic part of the request. */
+    DPRINT("================ Vhost user message ================\n");
+    DPRINT("Request: %s (%d)\n", vu_request_to_string(vmsg->request),
+           vmsg->request);
+    DPRINT("Flags: 0x%x\n", vmsg->flags);
+    DPRINT("Size: %d\n", vmsg->size);
+
+    if (vmsg->fd_num) {
+        int fd_idx = 0;
+
+        DPRINT("Fds:");
+        while (fd_idx < vmsg->fd_num) {
+            DPRINT(" %d", vmsg->fds[fd_idx]);
+            fd_idx++;
+        }
+        DPRINT("\n");
+    }
+
+    if (dev->iface->process_msg &&
+        dev->iface->process_msg(dev, vmsg, &do_reply)) {
+        return do_reply;
+    }
+
+    switch (vmsg->request) {
+    case VHOST_USER_GET_FEATURES:
+        reply = vu_get_features_exec(dev, vmsg);
+        break;
+    case VHOST_USER_SET_FEATURES:
+        reply = vu_set_features_exec(dev, vmsg);
+        break;
+    case VHOST_USER_GET_PROTOCOL_FEATURES:
+        reply = vu_get_protocol_features_exec(dev, vmsg);
+        break;
+    case VHOST_USER_SET_PROTOCOL_FEATURES:
+        reply = vu_set_protocol_features_exec(dev, vmsg);
+        break;
+    case VHOST_USER_SET_OWNER:
+        reply = vu_set_owner_exec(dev, vmsg);
+        break;
+    case VHOST_USER_RESET_OWNER:
+        reply = vu_reset_device_exec(dev, vmsg);
+        break;
+    case VHOST_USER_SET_MEM_TABLE:
+        reply = vu_set_mem_table_exec(dev, vmsg);
+        break;
+    case VHOST_USER_SET_LOG_BASE:
+        reply = vu_set_log_base_exec(dev, vmsg);
+        break;
+    case VHOST_USER_SET_LOG_FD:
+        reply = vu_set_log_fd_exec(dev, vmsg);
+        break;
+    case VHOST_USER_SET_VRING_NUM:
+        reply = vu_set_vring_num_exec(dev, vmsg);
+        break;
+    case VHOST_USER_SET_VRING_ADDR:
+        reply = vu_set_vring_addr_exec(dev, vmsg);
+        break;
+    case VHOST_USER_SET_VRING_BASE:
+        reply = vu_set_vring_base_exec(dev, vmsg);
+        break;
+    case VHOST_USER_GET_VRING_BASE:
+        reply = vu_get_vring_base_exec(dev, vmsg);
+        break;
+    case VHOST_USER_SET_VRING_KICK:
+        reply = vu_set_vring_kick_exec(dev, vmsg);
+        break;
+    case VHOST_USER_SET_VRING_CALL:
+        reply = vu_set_vring_call_exec(dev, vmsg);
+        break;
+    case VHOST_USER_SET_VRING_ERR:
+        reply = vu_set_vring_err_exec(dev, vmsg);
+        break;
+    case VHOST_USER_GET_QUEUE_NUM:
+        reply = vu_get_queue_num_exec(dev, vmsg);
+        break;
+    case VHOST_USER_SET_VRING_ENABLE:
+        reply = vu_set_vring_enable_exec(dev, vmsg);
+        break;
+    default:
+        vmsg_close_fds(vmsg);
+        vu_panic(dev, "Unhandled request: %d", vmsg->request);
+        break;
+    }
+
+    return reply;
+}
+
+/*
+ * Read one message from dev->sock, process it and, when the handler asks
+ * for it, write the reply back.
+ *
+ * Returns false when reading the message or writing the reply failed, true
+ * otherwise. vmsg.data is released with g_free() in every case.
+ */
+bool
+vu_dispatch(VuDev *dev)
+{
+    VhostUserMsg vmsg = { 0, };
+    bool success = false;
+
+    if (vu_message_read(dev, dev->sock, &vmsg)) {
+        int reply_requested = vu_process_message(dev, &vmsg);
+
+        if (!reply_requested) {
+            success = true;
+        } else if (vu_message_write(dev, dev->sock, &vmsg)) {
+            success = true;
+        }
+    }
+
+    g_free(vmsg.data);
+    return success;
+}
+
+/*
+ * Release everything held by dev: unmap the memory regions, close the
+ * call/kick/err descriptors of every queue, tear down the dirty log and
+ * close the socket.
+ */
+void
+vu_deinit(VuDev *dev)
+{
+    int i;
+
+    for (i = 0; i < dev->nregions; i++) {
+        VuDevRegion *r = &dev->regions[i];
+        void *m = (void *) (uintptr_t) r->mmap_addr;
+
+        if (m == MAP_FAILED) {
+            continue;
+        }
+        munmap(m, r->size + r->mmap_offset);
+    }
+    dev->nregions = 0;
+
+    for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) {
+        VuVirtq *vq = &dev->vq[i];
+        /* Same closing order as before: call, kick, err. */
+        int *fds[] = { &vq->call_fd, &vq->kick_fd, &vq->err_fd };
+        size_t k;
+
+        for (k = 0; k < sizeof(fds) / sizeof(fds[0]); k++) {
+            if (*fds[k] != -1) {
+                close(*fds[k]);
+                *fds[k] = -1;
+            }
+        }
+    }
+
+    vu_close_log(dev);
+
+    if (dev->sock != -1) {
+        close(dev->sock);
+    }
+}
+
+/*
+ * Initialise dev for use on an already connected socket.
+ *
+ * All arguments are mandatory (asserted). dev is zeroed, the callbacks and
+ * socket are stored, and every queue starts with no descriptors (-1) and
+ * notifications turned on.
+ */
+void
+vu_init(VuDev *dev,
+        int socket,
+        vu_panic_cb panic,
+        vu_set_watch_cb set_watch,
+        vu_remove_watch_cb remove_watch,
+        const VuDevIface *iface)
+{
+    const VuVirtq initial_vq = {
+        .call_fd = -1, .kick_fd = -1, .err_fd = -1,
+        .notification = true,
+    };
+    int qidx;
+
+    assert(socket >= 0);
+    assert(set_watch);
+    assert(remove_watch);
+    assert(iface);
+    assert(panic);
+
+    memset(dev, 0, sizeof(*dev));
+
+    dev->sock = socket;
+    dev->panic = panic;
+    dev->set_watch = set_watch;
+    dev->remove_watch = remove_watch;
+    dev->iface = iface;
+    dev->log_call_fd = -1;
+
+    for (qidx = 0; qidx < VHOST_MAX_NR_VIRTQUEUE; qidx++) {
+        dev->vq[qidx] = initial_vq;
+    }
+}
+
+/* Return the queue with index qidx; qidx must be in
+ * [0, VHOST_MAX_NR_VIRTQUEUE) (asserted). */
+VuVirtq *
+vu_get_queue(VuDev *dev, int qidx)
+{
+    /* qidx is signed: a negative value would index before dev->vq. */
+    assert(qidx >= 0 && qidx < VHOST_MAX_NR_VIRTQUEUE);
+    return &dev->vq[qidx];
+}
+
+/* Report the enable flag of vq. */
+bool
+vu_queue_enabled(VuDev *dev, VuVirtq *vq)
+{
+    bool enabled = vq->enable;
+
+    return enabled;
+}
+
+/* Read the flags field of the queue's avail ring. */
+static inline uint16_t
+vring_avail_flags(VuVirtq *vq)
+{
+    uint16_t flags = vq->vring.avail->flags;
+
+    return flags;
+}
+
+/* Read the avail ring's idx field, cache it in vq->shadow_avail_idx and
+ * return the cached value. */
+static inline uint16_t
+vring_avail_idx(VuVirtq *vq)
+{
+    return vq->shadow_avail_idx = vq->vring.avail->idx;
+}
+
+/* Read entry i of the queue's avail ring. */
+static inline uint16_t
+vring_avail_ring(VuVirtq *vq, int i)
+{
+    uint16_t entry = vq->vring.avail->ring[i];
+
+    return entry;
+}
+
+/* Read the used_event value, which is stored in the avail ring slot at
+ * index vring.num (one past the last regular entry). */
+static inline uint16_t
+vring_get_used_event(VuVirtq *vq)
+{
+    int slot = vq->vring.num;
+
+    return vring_avail_ring(vq, slot);
+}
+
+/*
+ * Number of avail ring entries published after position idx.
+ *
+ * Re-reads the avail index. Returns -1 (after a panic) when the difference
+ * exceeds the ring size.
+ */
+static int
+virtqueue_num_heads(VuDev *dev, VuVirtq *vq, unsigned int idx)
+{
+    uint16_t avail = vring_avail_idx(vq);
+    uint16_t pending = avail - idx;
+
+    /* Check it isn't doing very strange things with descriptor numbers. */
+    if (pending > vq->vring.num) {
+        vu_panic(dev, "Guest moved used index from %u to %u",
+                 idx, vq->shadow_avail_idx);
+        return -1;
+    }
+
+    if (pending != 0) {
+        /* On success, callers read a descriptor at vq->last_avail_idx.
+         * Make sure descriptor read does not bypass avail index read. */
+        smp_rmb();
+    }
+
+    return pending;
+}
+
+/*
+ * Fetch the descriptor head stored at avail ring position idx (taken
+ * modulo the ring size) into *head.
+ *
+ * Returns false, after a panic, when the value is not a valid descriptor
+ * index for this ring.
+ */
+static bool
+virtqueue_get_head(VuDev *dev, VuVirtq *vq,
+                   unsigned int idx, unsigned int *head)
+{
+    /* Grab the next descriptor number they're advertising, and increment
+     * the index we've seen. */
+    *head = vring_avail_ring(vq, idx % vq->vring.num);
+
+    /* If their number is silly, that's a fatal mistake. */
+    if (*head >= vq->vring.num) {
+        /* Print the value, not the pointer that holds it. */
+        vu_panic(dev, "Guest says index %u is available", *head);
+        return false;
+    }
+
+    return true;
+}
+
+/* Result codes of virtqueue_read_next_desc(). */
+enum {
+ VIRTQUEUE_READ_DESC_ERROR = -1,
+ VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */
+ VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */
+};
+
+/*
+ * Follow the chain link of descriptor desc[i].
+ *
+ * Returns VIRTQUEUE_READ_DESC_DONE when the descriptor has no NEXT flag,
+ * VIRTQUEUE_READ_DESC_MORE with *next set to the following index, or
+ * VIRTQUEUE_READ_DESC_ERROR (after a panic) when that index is >= max.
+ */
+static int
+virtqueue_read_next_desc(VuDev *dev, struct vring_desc *desc,
+                         int i, unsigned int max, unsigned int *next)
+{
+    /* If this descriptor says it doesn't chain, we're done. */
+    if (!(desc[i].flags & VRING_DESC_F_NEXT)) {
+        return VIRTQUEUE_READ_DESC_DONE;
+    }
+
+    /* Check they're not leading us off end of descriptors. */
+    *next = desc[i].next;
+    /* Make sure compiler knows to grab that: we don't want it changing! */
+    smp_wmb();
+
+    if (*next >= max) {
+        /* Print the value, not the pointer that holds it. */
+        vu_panic(dev, "Desc next is %u", *next);
+        return VIRTQUEUE_READ_DESC_ERROR;
+    }
+
+    return VIRTQUEUE_READ_DESC_MORE;
+}
+
+/*
+ * Sum the buffer bytes currently offered by the guest on vq without
+ * consuming anything: the walk starts at vq->last_avail_idx and only a
+ * local copy of the index is advanced.
+ *
+ * Descriptors with VRING_DESC_F_WRITE are added to the "in" total, all
+ * others to the "out" total. The walk stops early once both totals have
+ * reached max_in_bytes and max_out_bytes.
+ *
+ * The totals are stored through in_bytes / out_bytes (either may be NULL).
+ * On any error both totals are reported as 0.
+ */
+void
+vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes,
+ unsigned int *out_bytes,
+ unsigned max_in_bytes, unsigned max_out_bytes)
+{
+ unsigned int idx;
+ unsigned int total_bufs, in_total, out_total;
+ int rc;
+
+ idx = vq->last_avail_idx;
+
+ total_bufs = in_total = out_total = 0;
+ /* One iteration per available descriptor chain. */
+ while ((rc = virtqueue_num_heads(dev, vq, idx)) > 0) {
+ unsigned int max, num_bufs, indirect = 0;
+ struct vring_desc *desc;
+ unsigned int i;
+
+ max = vq->vring.num;
+ num_bufs = total_bufs;
+ if (!virtqueue_get_head(dev, vq, idx++, &i)) {
+ goto err;
+ }
+ desc = vq->vring.desc;
+
+ if (desc[i].flags & VRING_DESC_F_INDIRECT) {
+ if (desc[i].len % sizeof(struct vring_desc)) {
+ vu_panic(dev, "Invalid size for indirect buffer table");
+ goto err;
+ }
+
+ /* If we've got too many, that implies a descriptor loop. */
+ if (num_bufs >= max) {
+ vu_panic(dev, "Looped descriptor");
+ goto err;
+ }
+
+ /* loop over the indirect descriptor table */
+ indirect = 1;
+ max = desc[i].len / sizeof(struct vring_desc);
+ /* NOTE(review): vu_gpa_to_va() returns NULL for an unmapped address
+ * and the result is used below without a check - confirm whether a
+ * guard is needed here. */
+ desc = vu_gpa_to_va(dev, desc[i].addr);
+ num_bufs = i = 0;
+ }
+
+ do {
+ /* If we've got too many, that implies a descriptor loop. */
+ if (++num_bufs > max) {
+ vu_panic(dev, "Looped descriptor");
+ goto err;
+ }
+
+ if (desc[i].flags & VRING_DESC_F_WRITE) {
+ in_total += desc[i].len;
+ } else {
+ out_total += desc[i].len;
+ }
+ /* Both limits reached: no need to look any further. */
+ if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
+ goto done;
+ }
+ rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
+ } while (rc == VIRTQUEUE_READ_DESC_MORE);
+
+ if (rc == VIRTQUEUE_READ_DESC_ERROR) {
+ goto err;
+ }
+
+ /* An indirect table counts as a single buffer of the main ring. */
+ if (!indirect) {
+ total_bufs = num_bufs;
+ } else {
+ total_bufs++;
+ }
+ }
+ /* virtqueue_num_heads() returned -1. */
+ if (rc < 0) {
+ goto err;
+ }
+done:
+ if (in_bytes) {
+ *in_bytes = in_total;
+ }
+ if (out_bytes) {
+ *out_bytes = out_total;
+ }
+ return;
+
+err:
+ in_total = out_total = 0;
+ goto done;
+}
+
+/* Return true when vq currently offers at least in_bytes of writable and
+ * out_bytes of readable buffer space. */
+bool
+vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes,
+                     unsigned int out_bytes)
+{
+    unsigned int avail_in, avail_out;
+
+    vu_queue_get_avail_bytes(dev, vq, &avail_in, &avail_out,
+                             in_bytes, out_bytes);
+
+    if (avail_in < in_bytes) {
+        return false;
+    }
+
+    return avail_out >= out_bytes;
+}
+
+/* Return non-zero when no unprocessed buffer is available on vq.
+ * The avail index is re-read from ring memory only when the cached
+ * shadow copy does not already show pending buffers. */
+int
+vu_queue_empty(VuDev *dev, VuVirtq *vq)
+{
+    if (vq->shadow_avail_idx == vq->last_avail_idx) {
+        return vring_avail_idx(vq) == vq->last_avail_idx;
+    }
+
+    return 0;
+}
+
+/* Test whether bit number fbit (must be below 64) is set in features. */
+static inline
+bool has_feature(uint64_t features, unsigned int fbit)
+{
+    uint64_t mask;
+
+    assert(fbit < 64);
+    mask = 1ULL << fbit;
+
+    return (features & mask) != 0;
+}
+
+/* Test whether feature bit fbit is set in dev->features. */
+static inline
+bool vu_has_feature(VuDev *dev,
+                    unsigned int fbit)
+{
+    uint64_t negotiated = dev->features;
+
+    return has_feature(negotiated, fbit);
+}
+
+/*
+ * Decide whether the guest has to be notified about vq.
+ *
+ * Three cases, checked in order:
+ * - VIRTIO_F_NOTIFY_ON_EMPTY negotiated, nothing in flight and the queue
+ * empty: always notify;
+ * - VIRTIO_RING_F_EVENT_IDX not negotiated: notify unless the avail flags
+ * carry VRING_AVAIL_F_NO_INTERRUPT;
+ * - otherwise: compare the guest's used_event with the used index moved
+ * since the last call, via vring_need_event().
+ *
+ * Side effect in the last case: signalled_used / signalled_used_valid are
+ * updated.
+ */
+static bool
+vring_notify(VuDev *dev, VuVirtq *vq)
+{
+ uint16_t old, new;
+ bool v;
+
+ /* We need to expose used array entries before checking used event. */
+ smp_mb();
+
+ /* Always notify when queue is empty (when feature acknowledge) */
+ if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
+ !vq->inuse && vu_queue_empty(dev, vq)) {
+ return true;
+ }
+
+ if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
+ return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
+ }
+
+ /* Remember whether a previous signalled_used value exists, then record
+ * the current used index as the new one. */
+ v = vq->signalled_used_valid;
+ vq->signalled_used_valid = true;
+ old = vq->signalled_used;
+ new = vq->signalled_used = vq->used_idx;
+ /* With no previous value to compare against, always notify. */
+ return !v || vring_need_event(vring_get_used_event(vq), new, old);
+}
+
+/* Notify the guest about vq by writing to its call eventfd, unless the
+ * device is broken or vring_notify() says no notification is needed. */
+void
+vu_queue_notify(VuDev *dev, VuVirtq *vq)
+{
+    if (unlikely(dev->broken)) {
+        return;
+    }
+
+    if (vring_notify(dev, vq)) {
+        if (eventfd_write(vq->call_fd, 1) < 0) {
+            vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
+        }
+        return;
+    }
+
+    DPRINT("skipped notify...\n");
+}
+
+/* OR mask into the flags field of the queue's used ring. */
+static inline void
+vring_used_flags_set_bit(VuVirtq *vq, int mask)
+{
+    char *used_base = (char *)vq->vring.used;
+    uint16_t *flags_ptr =
+        (uint16_t *)(used_base + offsetof(struct vring_used, flags));
+
+    *flags_ptr = *flags_ptr | mask;
+}
+
+/* Clear the bits of mask in the flags field of the queue's used ring. */
+static inline void
+vring_used_flags_unset_bit(VuVirtq *vq, int mask)
+{
+    char *used_base = (char *)vq->vring.used;
+    uint16_t *flags_ptr =
+        (uint16_t *)(used_base + offsetof(struct vring_used, flags));
+
+    *flags_ptr = *flags_ptr & ~mask;
+}
+
+/* Store val as the avail_event value, which lives in the used ring slot at
+ * index vring.num. Skipped while notifications are turned off for vq. */
+static inline void
+vring_set_avail_event(VuVirtq *vq, uint16_t val)
+{
+    uint16_t *avail_event;
+
+    if (vq->notification) {
+        avail_event = (uint16_t *) &vq->vring.used->ring[vq->vring.num];
+        *avail_event = val;
+    }
+}
+
+/*
+ * Turn guest->device notifications for vq on or off.
+ *
+ * With VIRTIO_RING_F_EVENT_IDX the avail_event value is refreshed from the
+ * current avail index; otherwise the VRING_USED_F_NO_NOTIFY flag is cleared
+ * (enable) or set (disable).
+ */
+void
+vu_queue_set_notification(VuDev *dev, VuVirtq *vq, int enable)
+{
+    vq->notification = enable;
+
+    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
+        vring_set_avail_event(vq, vring_avail_idx(vq));
+    } else if (!enable) {
+        vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY);
+    } else {
+        vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY);
+    }
+
+    if (!enable) {
+        return;
+    }
+
+    /* Expose avail event/used flags before caller checks the avail idx. */
+    smp_mb();
+}
+
+/*
+ * Append one guest buffer (guest physical address pa, length sz) to iov.
+ *
+ * *p_num_sg is the number of entries already used and is incremented on
+ * success; max_num_sg is the capacity of iov. Errors (full array, zero
+ * length, unmapped address) are reported through vu_panic() and leave
+ * *p_num_sg unchanged.
+ */
+static void
+virtqueue_map_desc(VuDev *dev,
+                   unsigned int *p_num_sg, struct iovec *iov,
+                   unsigned int max_num_sg, bool is_write,
+                   uint64_t pa, size_t sz)
+{
+    unsigned num_sg = *p_num_sg;
+    void *va;
+
+    assert(num_sg <= max_num_sg);
+
+    /* iov[num_sg] is written below, so num_sg == max_num_sg is already
+     * one entry too many. */
+    if (num_sg == max_num_sg) {
+        vu_panic(dev, "virtio: too many descriptors in indirect table");
+        return;
+    }
+
+    if (!sz) {
+        vu_panic(dev, "virtio: zero sized buffers are not allowed");
+        return;
+    }
+
+    va = vu_gpa_to_va(dev, pa);
+    if (!va) {
+        /* Do not hand an iovec with a NULL base to the device code. */
+        vu_panic(dev, "virtio: invalid address for buffers");
+        return;
+    }
+
+    iov[num_sg].iov_base = va;
+    iov[num_sg].iov_len = sz;
+    num_sg++;
+
+    *p_num_sg = num_sg;
+}
+
+/* Round number down to multiple.
+ * Note: m is expanded twice, so avoid arguments with side effects. */
+#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
+
+/* Round number up to multiple.
+ * Note: m is expanded several times, so avoid arguments with side effects. */
+#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
+
+/*
+ * Allocate a queue element of sz bytes followed by room for in_num "in"
+ * and out_num "out" iovec entries, in that order.
+ *
+ * sz must be at least sizeof(VuVirtqElement) (asserted). The in_sg/out_sg
+ * pointers and the two counts are filled in; everything else is left
+ * uninitialised. Returns NULL when the allocation fails; the caller frees
+ * the element with free().
+ */
+static void *
+virtqueue_alloc_element(size_t sz,
+                        unsigned out_num, unsigned in_num)
+{
+    VuVirtqElement *elem;
+    size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
+    size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
+    size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);
+
+    assert(sz >= sizeof(VuVirtqElement));
+    elem = malloc(out_sg_end);
+    if (!elem) {
+        /* Do not write through a NULL pointer on allocation failure. */
+        return NULL;
+    }
+    elem->out_num = out_num;
+    elem->in_num = in_num;
+    /* Byte offsets: go through char * rather than void * arithmetic. */
+    elem->in_sg = (void *)((char *)elem + in_sg_ofs);
+    elem->out_sg = (void *)((char *)elem + out_sg_ofs);
+    return elem;
+}
+
+/*
+ * Take the next available descriptor chain off vq.
+ *
+ * Returns a newly allocated element of sz bytes (sz must be at least
+ * sizeof(VuVirtqElement)) whose out_sg/in_sg arrays describe the chain's
+ * buffers, or NULL when the device is broken, the queue is empty, the
+ * chain is malformed or memory cannot be allocated. Every malformed-chain
+ * case goes through vu_panic(). The caller frees the element with free().
+ */
+void *
+vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
+{
+    unsigned int i, head, max;
+    VuVirtqElement *elem;
+    unsigned out_num, in_num;
+    struct iovec iov[VIRTQUEUE_MAX_SIZE];
+    struct vring_desc *desc;
+    int rc;
+
+    if (unlikely(dev->broken)) {
+        return NULL;
+    }
+
+    if (vu_queue_empty(dev, vq)) {
+        return NULL;
+    }
+    /* Needed after virtio_queue_empty(), see comment in
+     * virtqueue_num_heads(). */
+    smp_rmb();
+
+    /* When we start there are none of either input nor output. */
+    out_num = in_num = 0;
+
+    max = vq->vring.num;
+    if (vq->inuse >= vq->vring.num) {
+        vu_panic(dev, "Virtqueue size exceeded");
+        return NULL;
+    }
+
+    if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) {
+        return NULL;
+    }
+
+    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
+        vring_set_avail_event(vq, vq->last_avail_idx);
+    }
+
+    i = head;
+    desc = vq->vring.desc;
+    if (desc[i].flags & VRING_DESC_F_INDIRECT) {
+        /* An empty table would make the loop below read desc[0] of a
+         * table that has no entries. */
+        if (!desc[i].len || desc[i].len % sizeof(struct vring_desc)) {
+            vu_panic(dev, "Invalid size for indirect buffer table");
+            return NULL;
+        }
+
+        /* loop over the indirect descriptor table */
+        max = desc[i].len / sizeof(struct vring_desc);
+        desc = vu_gpa_to_va(dev, desc[i].addr);
+        if (!desc) {
+            vu_panic(dev, "Invalid address for indirect buffer table");
+            return NULL;
+        }
+        i = 0;
+    }
+
+    /* Collect all the descriptors */
+    do {
+        if (desc[i].flags & VRING_DESC_F_WRITE) {
+            virtqueue_map_desc(dev, &in_num, iov + out_num,
+                               VIRTQUEUE_MAX_SIZE - out_num, true,
+                               desc[i].addr, desc[i].len);
+        } else {
+            if (in_num) {
+                vu_panic(dev, "Incorrect order for descriptors");
+                return NULL;
+            }
+            virtqueue_map_desc(dev, &out_num, iov,
+                               VIRTQUEUE_MAX_SIZE, false,
+                               desc[i].addr, desc[i].len);
+        }
+
+        /* virtqueue_map_desc() reports its errors through vu_panic(),
+         * which marks the device broken (it was not broken on entry). */
+        if (dev->broken) {
+            return NULL;
+        }
+
+        /* If we've got too many, that implies a descriptor loop. */
+        if ((in_num + out_num) > max) {
+            vu_panic(dev, "Looped descriptor");
+            /* Stop here: carrying on would follow the loop forever. */
+            return NULL;
+        }
+        rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
+    } while (rc == VIRTQUEUE_READ_DESC_MORE);
+
+    if (rc == VIRTQUEUE_READ_DESC_ERROR) {
+        return NULL;
+    }
+
+    /* Now copy what we have collected and mapped */
+    elem = virtqueue_alloc_element(sz, out_num, in_num);
+    if (!elem) {
+        vu_panic(dev, "Failed to allocate virtqueue element");
+        return NULL;
+    }
+    elem->index = head;
+    for (i = 0; i < out_num; i++) {
+        elem->out_sg[i] = iov[i];
+    }
+    for (i = 0; i < in_num; i++) {
+        elem->in_sg[i] = iov[out_num + i];
+    }
+
+    vq->inuse++;
+
+    return elem;
+}
+
+/*
+ * Undo the last @num pops: both last_avail_idx and inuse are moved back
+ * by @num. Nothing is changed and false is returned when @num exceeds the
+ * number of elements currently in use. @dev is not used.
+ */
+bool
+vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num)
+{
+    bool in_range = num <= vq->inuse;
+
+    if (in_range) {
+        vq->inuse -= num;
+        vq->last_avail_idx -= num;
+    }
+
+    return in_range;
+}
+
+/*
+ * Store *@uelem into slot @i of the used ring and report the written
+ * bytes to vu_log_write(), addressed relative to the ring's
+ * log_guest_addr.
+ */
+static inline
+void vring_used_write(VuDev *dev, VuVirtq *vq,
+                      struct vring_used_elem *uelem, int i)
+{
+    struct vring_used_elem *slot = &vq->vring.used->ring[i];
+    uint64_t log_addr = vq->vring.log_guest_addr +
+                        offsetof(struct vring_used, ring[i]);
+
+    *slot = *uelem;
+    vu_log_write(dev, log_addr, sizeof(*slot));
+}
+
+
+/*
+ * Report to vu_log_write() the device-writable buffers of @elem that are
+ * covered by the first @len bytes written.
+ *
+ * The descriptor chain starting at elem->index (or the indirect table it
+ * points to) is walked again; for every descriptor flagged
+ * VRING_DESC_F_WRITE, min(descriptor length, remaining @len) bytes are
+ * logged. The walk stops when @len is exhausted, the chain ends, or a
+ * malformed chain is detected (vu_panic() is called in that case).
+ */
+static void
+vu_log_queue_fill(VuDev *dev, VuVirtq *vq,
+                  const VuVirtqElement *elem,
+                  unsigned int len)
+{
+    struct vring_desc *desc = vq->vring.desc;
+    unsigned int i, max, min;
+    unsigned num_bufs = 0;
+
+    max = vq->vring.num;
+    i = elem->index;
+
+    if (desc[i].flags & VRING_DESC_F_INDIRECT) {
+        if (desc[i].len % sizeof(struct vring_desc)) {
+            vu_panic(dev, "Invalid size for indirect buffer table");
+            /* Do not walk a table whose size is not a whole number
+             * of descriptors. */
+            return;
+        }
+
+        /* loop over the indirect descriptor table */
+        max = desc[i].len / sizeof(struct vring_desc);
+        desc = vu_gpa_to_va(dev, desc[i].addr);
+        if (!desc) {
+            /* vu_gpa_to_va() returns NULL for an unmapped address. */
+            vu_panic(dev, "Invalid address for indirect buffer table");
+            return;
+        }
+        i = 0;
+    }
+
+    do {
+        /* More buffers than table entries implies a descriptor loop. */
+        if (++num_bufs > max) {
+            vu_panic(dev, "Looped descriptor");
+            return;
+        }
+
+        if (desc[i].flags & VRING_DESC_F_WRITE) {
+            min = MIN(desc[i].len, len);
+            vu_log_write(dev, desc[i].addr, min);
+            len -= min;
+        }
+
+    } while (len > 0 &&
+             (virtqueue_read_next_desc(dev, desc, i, max, &i)
+              == VIRTQUEUE_READ_DESC_MORE));
+}
+
+/*
+ * Write a used-ring entry for @elem without publishing it.
+ *
+ * The entry (id = elem->index, len = @len) goes into slot
+ * (used_idx + @idx) modulo the ring size; the used index itself is not
+ * advanced here. Does nothing when the device is marked broken.
+ */
+void
+vu_queue_fill(VuDev *dev, VuVirtq *vq,
+              const VuVirtqElement *elem,
+              unsigned int len, unsigned int idx)
+{
+    struct vring_used_elem entry;
+
+    if (unlikely(dev->broken)) {
+        return;
+    }
+
+    vu_log_queue_fill(dev, vq, elem, len);
+
+    idx = (vq->used_idx + idx) % vq->vring.num;
+
+    entry.len = len;
+    entry.id = elem->index;
+    vring_used_write(dev, vq, &entry, idx);
+}
+
+/*
+ * Store @val as the used ring index, report the written field to
+ * vu_log_write(), and mirror the value in vq->used_idx.
+ */
+static inline
+void vring_used_idx_set(VuDev *dev, VuVirtq *vq, uint16_t val)
+{
+    struct vring_used *used = vq->vring.used;
+    uint64_t log_addr = vq->vring.log_guest_addr +
+                        offsetof(struct vring_used, idx);
+
+    used->idx = val;
+    vu_log_write(dev, log_addr, sizeof(used->idx));
+
+    vq->used_idx = val;
+}
+
+/*
+ * Advance the used index by @count entries and decrement vq->inuse by the
+ * same amount. Does nothing when the device is marked broken.
+ */
+void
+vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count)
+{
+ uint16_t old, new;
+
+ if (unlikely(dev->broken)) {
+ return;
+ }
+
+ /* Make sure buffer is written before we update index. */
+ smp_wmb();
+
+ old = vq->used_idx;
+ /* The sum is stored in a uint16_t, so it wraps modulo 65536. */
+ new = old + count;
+ vring_used_idx_set(dev, vq, new);
+ vq->inuse -= count;
+ /* NOTE(review): this looks like a wrap-around guard that drops the
+ * cached signalled_used once the index has moved too far past it --
+ * confirm against the code that reads signalled_used_valid. */
+ if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
+ vq->signalled_used_valid = false;
+ }
+}
+
+/*
+ * Complete a single element: fill the used-ring slot at offset 0 with
+ * @elem/@len, then flush exactly one entry.
+ */
+void
+vu_queue_push(VuDev *dev, VuVirtq *vq,
+ const VuVirtqElement *elem, unsigned int len)
+{
+ vu_queue_fill(dev, vq, elem, len, 0);
+ vu_queue_flush(dev, vq, 1);
+}
diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
new file mode 100644
index 0000000000..156b50e989
--- /dev/null
+++ b/contrib/libvhost-user/libvhost-user.h
@@ -0,0 +1,435 @@
+/*
+ * Vhost User library
+ *
+ * Copyright (c) 2016 Red Hat, Inc.
+ *
+ * Authors:
+ * Victor Kaplansky <victork@redhat.com>
+ * Marc-André Lureau <mlureau@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#ifndef LIBVHOST_USER_H
+#define LIBVHOST_USER_H
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <linux/vhost.h>
+#include "standard-headers/linux/virtio_ring.h"
+
+/* Based on qemu/hw/virtio/vhost-user.c */
+#define VHOST_USER_F_PROTOCOL_FEATURES 30
+#define VHOST_LOG_PAGE 4096
+
+#define VHOST_MAX_NR_VIRTQUEUE 8
+#define VIRTQUEUE_MAX_SIZE 1024
+
+#define VHOST_MEMORY_MAX_NREGIONS 8
+
+/*
+ * Bit numbers of the vhost-user protocol features.
+ * VHOST_USER_PROTOCOL_F_MAX is one past the last defined bit and is used
+ * to build VHOST_USER_PROTOCOL_FEATURE_MASK.
+ */
+enum VhostUserProtocolFeature {
+ VHOST_USER_PROTOCOL_F_MQ = 0,
+ VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1,
+ VHOST_USER_PROTOCOL_F_RARP = 2,
+
+ VHOST_USER_PROTOCOL_F_MAX
+};
+
+#define VHOST_USER_PROTOCOL_FEATURE_MASK ((1 << VHOST_USER_PROTOCOL_F_MAX) - 1)
+
+/*
+ * Request codes stored in VhostUserMsg.request.
+ * VHOST_USER_MAX is one past the last defined request.
+ */
+typedef enum VhostUserRequest {
+ VHOST_USER_NONE = 0,
+ VHOST_USER_GET_FEATURES = 1,
+ VHOST_USER_SET_FEATURES = 2,
+ VHOST_USER_SET_OWNER = 3,
+ VHOST_USER_RESET_OWNER = 4,
+ VHOST_USER_SET_MEM_TABLE = 5,
+ VHOST_USER_SET_LOG_BASE = 6,
+ VHOST_USER_SET_LOG_FD = 7,
+ VHOST_USER_SET_VRING_NUM = 8,
+ VHOST_USER_SET_VRING_ADDR = 9,
+ VHOST_USER_SET_VRING_BASE = 10,
+ VHOST_USER_GET_VRING_BASE = 11,
+ VHOST_USER_SET_VRING_KICK = 12,
+ VHOST_USER_SET_VRING_CALL = 13,
+ VHOST_USER_SET_VRING_ERR = 14,
+ VHOST_USER_GET_PROTOCOL_FEATURES = 15,
+ VHOST_USER_SET_PROTOCOL_FEATURES = 16,
+ VHOST_USER_GET_QUEUE_NUM = 17,
+ VHOST_USER_SET_VRING_ENABLE = 18,
+ VHOST_USER_SEND_RARP = 19,
+ VHOST_USER_INPUT_GET_CONFIG = 20,
+ VHOST_USER_MAX
+} VhostUserRequest;
+
+/*
+ * One entry of VhostUserMemory.regions.
+ * NOTE(review): presumably describes one guest memory region as sent by
+ * the master -- confirm against the vhost-user protocol description.
+ */
+typedef struct VhostUserMemoryRegion {
+ uint64_t guest_phys_addr;
+ uint64_t memory_size;
+ uint64_t userspace_addr;
+ uint64_t mmap_offset;
+} VhostUserMemoryRegion;
+
+/*
+ * Memory table payload of VhostUserMsg: a region count followed by a
+ * fixed array of VHOST_MEMORY_MAX_NREGIONS entries.
+ */
+typedef struct VhostUserMemory {
+ uint32_t nregions;
+ uint32_t padding;
+ VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
+} VhostUserMemory;
+
+/*
+ * Log payload of VhostUserMsg.
+ * NOTE(review): presumably the size and offset used to mmap the log --
+ * confirm against the message handler that consumes it.
+ */
+typedef struct VhostUserLog {
+ uint64_t mmap_size;
+ uint64_t mmap_offset;
+} VhostUserLog;
+
+#if defined(_WIN32)
+# define VU_PACKED __attribute__((gcc_struct, packed))
+#else
+# define VU_PACKED __attribute__((packed))
+#endif
+
+/*
+ * A vhost-user message: request code, flags, payload size and a payload
+ * union, followed by file-descriptor bookkeeping. The struct is packed
+ * (VU_PACKED).
+ * NOTE(review): fds, fd_num and data look like local bookkeeping rather
+ * than bytes sent on the socket -- confirm against the read/write code.
+ */
+typedef struct VhostUserMsg {
+ VhostUserRequest request;
+
+#define VHOST_USER_VERSION_MASK (0x3)
+#define VHOST_USER_REPLY_MASK (0x1 << 2)
+ uint32_t flags;
+ uint32_t size; /* the following payload size */
+
+ union {
+#define VHOST_USER_VRING_IDX_MASK (0xff)
+#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8)
+ uint64_t u64;
+ struct vhost_vring_state state;
+ struct vhost_vring_addr addr;
+ VhostUserMemory memory;
+ VhostUserLog log;
+ } payload;
+
+ /* Room for up to VHOST_MEMORY_MAX_NREGIONS file descriptors. */
+ int fds[VHOST_MEMORY_MAX_NREGIONS];
+ int fd_num;
+ uint8_t *data;
+} VU_PACKED VhostUserMsg;
+
+/* One entry of VuDev.regions: a memory region known to the device. */
+typedef struct VuDevRegion {
+ /* Guest Physical address. */
+ uint64_t gpa;
+ /* Memory region size. */
+ uint64_t size;
+ /* QEMU virtual address (userspace). */
+ uint64_t qva;
+ /* Starting offset in our mmaped space. */
+ uint64_t mmap_offset;
+ /* Start address of mmaped space. */
+ uint64_t mmap_addr;
+} VuDevRegion;
+
+typedef struct VuDev VuDev;
+
+typedef uint64_t (*vu_get_features_cb) (VuDev *dev);
+typedef void (*vu_set_features_cb) (VuDev *dev, uint64_t features);
+typedef int (*vu_process_msg_cb) (VuDev *dev, VhostUserMsg *vmsg,
+ int *do_reply);
+typedef void (*vu_queue_set_started_cb) (VuDev *dev, int qidx, bool started);
+
+/*
+ * Device callbacks supplied by the user of the library; a pointer to this
+ * structure is passed to vu_init() as @iface and kept in VuDev.iface.
+ */
+typedef struct VuDevIface {
+ /* called by VHOST_USER_GET_FEATURES to get the features bitmask */
+ vu_get_features_cb get_features;
+ /* enable vhost implementation features */
+ vu_set_features_cb set_features;
+ /* get the protocol feature bitmask from the underlying vhost
+ * implementation */
+ vu_get_features_cb get_protocol_features;
+ /* enable protocol features in the underlying vhost implementation. */
+ vu_set_features_cb set_protocol_features;
+ /* process_msg is called for each vhost-user message received */
+ /* skip libvhost-user processing if return value != 0 */
+ vu_process_msg_cb process_msg;
+ /* tells when queues can be processed */
+ vu_queue_set_started_cb queue_set_started;
+} VuDevIface;
+
+typedef void (*vu_queue_handler_cb) (VuDev *dev, int qidx);
+
+/* The vring of one virtqueue: its size and pointers to its three parts. */
+typedef struct VuRing {
+ /* Ring size; used as the modulus for used-ring slots. */
+ unsigned int num;
+ struct vring_desc *desc;
+ struct vring_avail *avail;
+ struct vring_used *used;
+ /* Base address passed to vu_log_write() for used-ring updates. */
+ uint64_t log_guest_addr;
+ uint32_t flags;
+} VuRing;
+
+/* State of one virtqueue. */
+typedef struct VuVirtq {
+ VuRing vring;
+
+ /* Next head to pop */
+ uint16_t last_avail_idx;
+
+ /* Last avail_idx read from VQ. */
+ uint16_t shadow_avail_idx;
+
+ /* Local copy of the used ring index, updated with it. */
+ uint16_t used_idx;
+
+ /* Last used index value we have signalled on */
+ uint16_t signalled_used;
+
+ /* Whether signalled_used is valid; cleared by vu_queue_flush() */
+ bool signalled_used_valid;
+
+ /* Notification enabled? */
+ bool notification;
+
+ /* Elements popped and not yet flushed or rewound. */
+ int inuse;
+
+ vu_queue_handler_cb handler;
+
+ int call_fd;
+ int kick_fd;
+ int err_fd;
+ unsigned int enable;
+ bool started;
+} VuVirtq;
+
+/*
+ * Bit flags describing watch conditions.
+ * NOTE(review): presumably the values carried by the "condition" argument
+ * of vu_set_watch_cb and vu_watch_cb -- confirm against their users.
+ * The type name is misspelled ("Condtion"); it is part of the public
+ * interface, so it is left as is.
+ */
+enum VuWatchCondtion {
+ VU_WATCH_IN = 1 << 0,
+ VU_WATCH_OUT = 1 << 1,
+ VU_WATCH_PRI = 1 << 2,
+ VU_WATCH_ERR = 1 << 3,
+ VU_WATCH_HUP = 1 << 4,
+};
+
+typedef void (*vu_panic_cb) (VuDev *dev, const char *err);
+typedef void (*vu_watch_cb) (VuDev *dev, int condition, void *data);
+typedef void (*vu_set_watch_cb) (VuDev *dev, int fd, int condition,
+ vu_watch_cb cb, void *data);
+typedef void (*vu_remove_watch_cb) (VuDev *dev, int fd);
+
+/*
+ * A vhost-user device context, initialized by vu_init() and cleaned up by
+ * vu_deinit().
+ */
+struct VuDev {
+ int sock;
+ /* Number of entries of regions[] in use. */
+ uint32_t nregions;
+ VuDevRegion regions[VHOST_MEMORY_MAX_NREGIONS];
+ VuVirtq vq[VHOST_MAX_NR_VIRTQUEUE];
+ int log_call_fd;
+ uint64_t log_size;
+ uint8_t *log_table;
+ uint64_t features;
+ uint64_t protocol_features;
+ /* When set, the queue operations return without doing anything. */
+ bool broken;
+
+ /* @set_watch: add or update the given fd to the watch set,
+ * call cb when condition is met */
+ vu_set_watch_cb set_watch;
+
+ /* @remove_watch: remove the given fd from the watch set */
+ vu_remove_watch_cb remove_watch;
+
+ /* @panic: encountered an unrecoverable error, you may try to
+ * re-initialize */
+ vu_panic_cb panic;
+ const VuDevIface *iface;
+};
+
+/*
+ * One element popped from a virtqueue by vu_queue_pop().
+ * The element and both iovec arrays live in a single heap allocation, so
+ * one free() of the element releases everything.
+ */
+typedef struct VuVirtqElement {
+ /* Head descriptor index; written back as the used-ring entry id. */
+ unsigned int index;
+ /* Number of entries in out_sg. */
+ unsigned int out_num;
+ /* Number of entries in in_sg. */
+ unsigned int in_num;
+ /* Buffers whose descriptor has VRING_DESC_F_WRITE set. */
+ struct iovec *in_sg;
+ /* Buffers whose descriptor does not have VRING_DESC_F_WRITE set. */
+ struct iovec *out_sg;
+} VuVirtqElement;
+
+/**
+ * vu_init:
+ * @dev: a VuDev context
+ * @socket: the socket connected to vhost-user master
+ * @panic: a panic callback
+ * @set_watch: a set_watch callback
+ * @remove_watch: a remove_watch callback
+ * @iface: a VuDevIface structure with vhost-user device callbacks
+ *
+ * Initializes a VuDev vhost-user context.
+ **/
+void vu_init(VuDev *dev,
+ int socket,
+ vu_panic_cb panic,
+ vu_set_watch_cb set_watch,
+ vu_remove_watch_cb remove_watch,
+ const VuDevIface *iface);
+
+
+/**
+ * vu_deinit:
+ * @dev: a VuDev context
+ *
+ * Cleans up the VuDev context
+ */
+void vu_deinit(VuDev *dev);
+
+/**
+ * vu_dispatch:
+ * @dev: a VuDev context
+ *
+ * Process one vhost-user message.
+ *
+ * Returns: TRUE on success, FALSE on failure.
+ */
+bool vu_dispatch(VuDev *dev);
+
+/**
+ * vu_gpa_to_va:
+ * @dev: a VuDev context
+ * @guest_addr: guest address
+ *
+ * Translate a guest address to a pointer. Returns NULL on failure.
+ */
+void *vu_gpa_to_va(VuDev *dev, uint64_t guest_addr);
+
+/**
+ * vu_get_queue:
+ * @dev: a VuDev context
+ * @qidx: queue index
+ *
+ * Returns the queue number @qidx.
+ */
+VuVirtq *vu_get_queue(VuDev *dev, int qidx);
+
+/**
+ * vu_set_queue_handler:
+ * @dev: a VuDev context
+ * @vq: a VuVirtq queue
+ * @handler: the queue handler callback
+ *
+ * Set the queue handler. This function may be called several times
+ * for the same queue. If called with NULL @handler, the handler is
+ * removed.
+ */
+void vu_set_queue_handler(VuDev *dev, VuVirtq *vq,
+ vu_queue_handler_cb handler);
+
+
+/**
+ * vu_queue_set_notification:
+ * @dev: a VuDev context
+ * @vq: a VuVirtq queue
+ * @enable: state
+ *
+ * Set whether the queue notifies (via event index or interrupt)
+ */
+void vu_queue_set_notification(VuDev *dev, VuVirtq *vq, int enable);
+
+/**
+ * vu_queue_enabled:
+ * @dev: a VuDev context
+ * @vq: a VuVirtq queue
+ *
+ * Returns: whether the queue is enabled.
+ */
+bool vu_queue_enabled(VuDev *dev, VuVirtq *vq);
+
+/**
+ * vu_queue_empty:
+ * @dev: a VuDev context
+ * @vq: a VuVirtq queue
+ *
+ * Returns: whether the queue is empty.
+ */
+int vu_queue_empty(VuDev *dev, VuVirtq *vq);
+
+/**
+ * vu_queue_notify:
+ * @dev: a VuDev context
+ * @vq: a VuVirtq queue
+ *
+ * Request to notify the queue via callfd (skipped if unnecessary)
+ */
+void vu_queue_notify(VuDev *dev, VuVirtq *vq);
+
+/**
+ * vu_queue_pop:
+ * @dev: a VuDev context
+ * @vq: a VuVirtq queue
+ * @sz: the size of struct to return (must be >= VuVirtqElement)
+ *
+ * Returns: a VuVirtqElement filled from the queue or NULL.
+ */
+void *vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz);
+
+/**
+ * vu_queue_rewind:
+ * @dev: a VuDev context
+ * @vq: a VuVirtq queue
+ * @num: number of elements to push back
+ *
+ * Pretend that elements weren't popped from the virtqueue. The next
+ * vu_queue_pop() will refetch the oldest element.
+ *
+ * Returns: true on success, false if @num is greater than the number of in use
+ * elements.
+ */
+bool vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num);
+
+/**
+ * vu_queue_fill:
+ * @dev: a VuDev context
+ * @vq: a VuVirtq queue
+ * @elem: a VuVirtqElement
+ * @len: length in bytes to write
+ * @idx: optional offset for the used ring index (0 in general)
+ *
+ * Fill the used ring with @elem element.
+ */
+void vu_queue_fill(VuDev *dev, VuVirtq *vq,
+ const VuVirtqElement *elem,
+ unsigned int len, unsigned int idx);
+
+/**
+ * vu_queue_push:
+ * @dev: a VuDev context
+ * @vq: a VuVirtq queue
+ * @elem: a VuVirtqElement
+ * @len: length in bytes to write
+ *
+ * Helper that combines vu_queue_fill() with a vu_queue_flush().
+ */
+void vu_queue_push(VuDev *dev, VuVirtq *vq,
+ const VuVirtqElement *elem, unsigned int len);
+
+/**
+ * vu_queue_flush:
+ * @dev: a VuDev context
+ * @vq: a VuVirtq queue
+ * @num: number of elements to flush
+ *
+ * Mark the last number of elements as done (used.idx is updated by
+ * num elements).
+*/
+void vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int num);
+
+/**
+ * vu_queue_get_avail_bytes:
+ * @vdev: a VuDev context
+ * @vq: a VuVirtq queue
+ * @in_bytes: in bytes
+ * @out_bytes: out bytes
+ * @max_in_bytes: stop counting after max_in_bytes
+ * @max_out_bytes: stop counting after max_out_bytes
+ *
+ * Count the number of available bytes, up to max_in_bytes/max_out_bytes.
+ */
+void vu_queue_get_avail_bytes(VuDev *vdev, VuVirtq *vq, unsigned int *in_bytes,
+ unsigned int *out_bytes,
+ unsigned max_in_bytes, unsigned max_out_bytes);
+
+/**
+ * vu_queue_avail_bytes:
+ * @dev: a VuDev context
+ * @vq: a VuVirtq queue
+ * @in_bytes: expected in bytes
+ * @out_bytes: expected out bytes
+ *
+ * Returns: true if in_bytes <= in_total && out_bytes <= out_total
+ */
+bool vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes,
+ unsigned int out_bytes);
+
+#endif /* LIBVHOST_USER_H */
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 47b79d9112..e0732ccaf1 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -562,7 +562,7 @@ static void amdvi_mmio_trace(hwaddr addr, unsigned size)
trace_amdvi_mmio_read(amdvi_mmio_high[index], addr, size, addr & ~0x07);
} else {
index = index >= AMDVI_MMIO_REGS_LOW ? AMDVI_MMIO_REGS_LOW : index;
- trace_amdvi_mmio_read(amdvi_mmio_high[index], addr, size, addr & ~0x07);
+ trace_amdvi_mmio_read(amdvi_mmio_low[index], addr, size, addr & ~0x07);
}
}
diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
index 884926e9e7..0d3dc6a9f2 100644
--- a/hw/i386/amd_iommu.h
+++ b/hw/i386/amd_iommu.h
@@ -49,8 +49,8 @@
#define AMDVI_CAPAB_INIT_TYPE (3 << 16)
/* No. of used MMIO registers */
-#define AMDVI_MMIO_REGS_HIGH 8
-#define AMDVI_MMIO_REGS_LOW 7
+#define AMDVI_MMIO_REGS_HIGH 7
+#define AMDVI_MMIO_REGS_LOW 8
/* MMIO registers */
#define AMDVI_MMIO_DEVICE_TABLE 0x0000
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 24fae1689d..637d54549e 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -982,8 +982,8 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
pci_get_function_0(pci_dev)) {
error_setg(errp, "PCI: slot %d function 0 already ocuppied by %s,"
" new func %s cannot be exposed to guest.",
- PCI_SLOT(devfn),
- bus->devices[PCI_DEVFN(PCI_SLOT(devfn), 0)]->name,
+ PCI_SLOT(pci_get_function_0(pci_dev)->devfn),
+ pci_get_function_0(pci_dev)->name,
name);
return NULL;
diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
index f5c1d98192..07650683f7 100644
--- a/hw/s390x/virtio-ccw.c
+++ b/hw/s390x/virtio-ccw.c
@@ -1098,7 +1098,7 @@ static int virtio_ccw_set_guest_notifier(VirtioCcwDevice *dev, int n,
* We do not support individual masking for channel devices, so we
* need to manually trigger any guest masking callbacks here.
*/
- if (k->guest_notifier_mask) {
+ if (k->guest_notifier_mask && vdev->use_guest_notifier_mask) {
k->guest_notifier_mask(vdev, n, false);
}
/* get lost events and re-inject */
@@ -1107,7 +1107,7 @@ static int virtio_ccw_set_guest_notifier(VirtioCcwDevice *dev, int n,
event_notifier_set(notifier);
}
} else {
- if (k->guest_notifier_mask) {
+ if (k->guest_notifier_mask && vdev->use_guest_notifier_mask) {
k->guest_notifier_mask(vdev, n, true);
}
if (with_irqfd) {
diff --git a/hw/virtio/virtio-mmio.c b/hw/virtio/virtio-mmio.c
index 17412cb7b5..60654dc19d 100644
--- a/hw/virtio/virtio-mmio.c
+++ b/hw/virtio/virtio-mmio.c
@@ -402,7 +402,7 @@ static int virtio_mmio_set_guest_notifier(DeviceState *d, int n, bool assign,
event_notifier_cleanup(notifier);
}
- if (vdc->guest_notifier_mask) {
+ if (vdc->guest_notifier_mask && vdev->use_guest_notifier_mask) {
vdc->guest_notifier_mask(vdev, n, !assign);
}
diff --git a/tests/Makefile.include b/tests/Makefile.include
index 4841d582a1..f776404d86 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -689,7 +689,7 @@ tests/test-filter-mirror$(EXESUF): tests/test-filter-mirror.o $(qtest-obj-y)
tests/test-filter-redirector$(EXESUF): tests/test-filter-redirector.o $(qtest-obj-y)
tests/test-x86-cpuid-compat$(EXESUF): tests/test-x86-cpuid-compat.o $(qtest-obj-y)
tests/ivshmem-test$(EXESUF): tests/ivshmem-test.o contrib/ivshmem-server/ivshmem-server.o $(libqos-pc-obj-y)
-tests/vhost-user-bridge$(EXESUF): tests/vhost-user-bridge.o
+tests/vhost-user-bridge$(EXESUF): tests/vhost-user-bridge.o contrib/libvhost-user/libvhost-user.o $(test-util-obj-y)
tests/test-uuid$(EXESUF): tests/test-uuid.o $(test-util-obj-y)
tests/test-arm-mptimer$(EXESUF): tests/test-arm-mptimer.o
diff --git a/tests/vhost-user-bridge.c b/tests/vhost-user-bridge.c
index 775e031069..8618c20d53 100644
--- a/tests/vhost-user-bridge.c
+++ b/tests/vhost-user-bridge.c
@@ -30,17 +30,9 @@
#define _FILE_OFFSET_BITS 64
#include "qemu/osdep.h"
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <sys/unistd.h>
-#include <sys/eventfd.h>
-#include <arpa/inet.h>
-#include <netdb.h>
-#include <linux/vhost.h>
-
-#include "qemu/atomic.h"
+#include "qemu/iov.h"
#include "standard-headers/linux/virtio_net.h"
-#include "standard-headers/linux/virtio_ring.h"
+#include "contrib/libvhost-user/libvhost-user.h"
#define VHOST_USER_BRIDGE_DEBUG 1
@@ -64,6 +56,17 @@ typedef struct Dispatcher {
Event events[FD_SETSIZE];
} Dispatcher;
+typedef struct VubrDev {
+ VuDev vudev;
+ Dispatcher dispatcher;
+ int backend_udp_sock;
+ struct sockaddr_in backend_udp_dest;
+ int hdrlen;
+ int sock;
+ int ready;
+ int quit;
+} VubrDev;
+
static void
vubr_die(const char *s)
{
@@ -101,8 +104,6 @@ dispatcher_add(Dispatcher *dispr, int sock, void *ctx, CallbackFunc cb)
return 0;
}
-/* dispatcher_remove() is not currently in use but may be useful
- * in the future. */
static int
dispatcher_remove(Dispatcher *dispr, int sock)
{
@@ -157,1039 +158,313 @@ dispatcher_wait(Dispatcher *dispr, uint32_t timeout)
return 0;
}
-typedef struct VubrVirtq {
- int call_fd;
- int kick_fd;
- uint32_t size;
- uint16_t last_avail_index;
- uint16_t last_used_index;
- struct vring_desc *desc;
- struct vring_avail *avail;
- struct vring_used *used;
- uint64_t log_guest_addr;
- int enable;
-} VubrVirtq;
-
-/* Based on qemu/hw/virtio/vhost-user.c */
-
-#define VHOST_MEMORY_MAX_NREGIONS 8
-#define VHOST_USER_F_PROTOCOL_FEATURES 30
-/* v1.0 compliant. */
-#define VIRTIO_F_VERSION_1 32
-
-#define VHOST_LOG_PAGE 4096
-
-enum VhostUserProtocolFeature {
- VHOST_USER_PROTOCOL_F_MQ = 0,
- VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1,
- VHOST_USER_PROTOCOL_F_RARP = 2,
-
- VHOST_USER_PROTOCOL_F_MAX
-};
-
-#define VHOST_USER_PROTOCOL_FEATURE_MASK ((1 << VHOST_USER_PROTOCOL_F_MAX) - 1)
-
-typedef enum VhostUserRequest {
- VHOST_USER_NONE = 0,
- VHOST_USER_GET_FEATURES = 1,
- VHOST_USER_SET_FEATURES = 2,
- VHOST_USER_SET_OWNER = 3,
- VHOST_USER_RESET_OWNER = 4,
- VHOST_USER_SET_MEM_TABLE = 5,
- VHOST_USER_SET_LOG_BASE = 6,
- VHOST_USER_SET_LOG_FD = 7,
- VHOST_USER_SET_VRING_NUM = 8,
- VHOST_USER_SET_VRING_ADDR = 9,
- VHOST_USER_SET_VRING_BASE = 10,
- VHOST_USER_GET_VRING_BASE = 11,
- VHOST_USER_SET_VRING_KICK = 12,
- VHOST_USER_SET_VRING_CALL = 13,
- VHOST_USER_SET_VRING_ERR = 14,
- VHOST_USER_GET_PROTOCOL_FEATURES = 15,
- VHOST_USER_SET_PROTOCOL_FEATURES = 16,
- VHOST_USER_GET_QUEUE_NUM = 17,
- VHOST_USER_SET_VRING_ENABLE = 18,
- VHOST_USER_SEND_RARP = 19,
- VHOST_USER_MAX
-} VhostUserRequest;
-
-typedef struct VhostUserMemoryRegion {
- uint64_t guest_phys_addr;
- uint64_t memory_size;
- uint64_t userspace_addr;
- uint64_t mmap_offset;
-} VhostUserMemoryRegion;
-
-typedef struct VhostUserMemory {
- uint32_t nregions;
- uint32_t padding;
- VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
-} VhostUserMemory;
-
-typedef struct VhostUserLog {
- uint64_t mmap_size;
- uint64_t mmap_offset;
-} VhostUserLog;
-
-typedef struct VhostUserMsg {
- VhostUserRequest request;
-
-#define VHOST_USER_VERSION_MASK (0x3)
-#define VHOST_USER_REPLY_MASK (0x1<<2)
- uint32_t flags;
- uint32_t size; /* the following payload size */
- union {
-#define VHOST_USER_VRING_IDX_MASK (0xff)
-#define VHOST_USER_VRING_NOFD_MASK (0x1<<8)
- uint64_t u64;
- struct vhost_vring_state state;
- struct vhost_vring_addr addr;
- VhostUserMemory memory;
- VhostUserLog log;
- } payload;
- int fds[VHOST_MEMORY_MAX_NREGIONS];
- int fd_num;
-} QEMU_PACKED VhostUserMsg;
-
-#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
-
-/* The version of the protocol we support */
-#define VHOST_USER_VERSION (0x1)
-
-#define MAX_NR_VIRTQUEUE (8)
-
-typedef struct VubrDevRegion {
- /* Guest Physical address. */
- uint64_t gpa;
- /* Memory region size. */
- uint64_t size;
- /* QEMU virtual address (userspace). */
- uint64_t qva;
- /* Starting offset in our mmaped space. */
- uint64_t mmap_offset;
- /* Start address of mmaped space. */
- uint64_t mmap_addr;
-} VubrDevRegion;
-
-typedef struct VubrDev {
- int sock;
- Dispatcher dispatcher;
- uint32_t nregions;
- VubrDevRegion regions[VHOST_MEMORY_MAX_NREGIONS];
- VubrVirtq vq[MAX_NR_VIRTQUEUE];
- int log_call_fd;
- uint64_t log_size;
- uint8_t *log_table;
- int backend_udp_sock;
- struct sockaddr_in backend_udp_dest;
- int ready;
- uint64_t features;
- int hdrlen;
-} VubrDev;
-
-static const char *vubr_request_str[] = {
- [VHOST_USER_NONE] = "VHOST_USER_NONE",
- [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
- [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
- [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
- [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
- [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
- [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
- [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
- [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
- [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
- [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
- [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
- [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
- [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
- [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR",
- [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES",
- [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES",
- [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM",
- [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE",
- [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP",
- [VHOST_USER_MAX] = "VHOST_USER_MAX",
-};
-
static void
-print_buffer(uint8_t *buf, size_t len)
+vubr_handle_tx(VuDev *dev, int qidx)
{
- int i;
- printf("Raw buffer:\n");
- for (i = 0; i < len; i++) {
- if (i % 16 == 0) {
- printf("\n");
- }
- if (i % 4 == 0) {
- printf(" ");
- }
- printf("%02x ", buf[i]);
- }
- printf("\n............................................................\n");
-}
+ VuVirtq *vq = vu_get_queue(dev, qidx);
+ VubrDev *vubr = container_of(dev, VubrDev, vudev);
+ int hdrlen = vubr->hdrlen;
+ VuVirtqElement *elem = NULL;
-/* Translate guest physical address to our virtual address. */
-static uint64_t
-gpa_to_va(VubrDev *dev, uint64_t guest_addr)
-{
- int i;
-
- /* Find matching memory region. */
- for (i = 0; i < dev->nregions; i++) {
- VubrDevRegion *r = &dev->regions[i];
-
- if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) {
- return guest_addr - r->gpa + r->mmap_addr + r->mmap_offset;
- }
- }
-
- assert(!"address not found in regions");
- return 0;
-}
-
-/* Translate qemu virtual address to our virtual address. */
-static uint64_t
-qva_to_va(VubrDev *dev, uint64_t qemu_addr)
-{
- int i;
+ assert(qidx % 2);
- /* Find matching memory region. */
- for (i = 0; i < dev->nregions; i++) {
- VubrDevRegion *r = &dev->regions[i];
+ for (;;) {
+ ssize_t ret;
+ unsigned int out_num;
+ struct iovec sg[VIRTQUEUE_MAX_SIZE], *out_sg;
- if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) {
- return qemu_addr - r->qva + r->mmap_addr + r->mmap_offset;
+ elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement));
+ if (!elem) {
+ break;
}
- }
-
- assert(!"address not found in regions");
- return 0;
-}
-static void
-vubr_message_read(int conn_fd, VhostUserMsg *vmsg)
-{
- char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
- struct iovec iov = {
- .iov_base = (char *)vmsg,
- .iov_len = VHOST_USER_HDR_SIZE,
- };
- struct msghdr msg = {
- .msg_iov = &iov,
- .msg_iovlen = 1,
- .msg_control = control,
- .msg_controllen = sizeof(control),
- };
- size_t fd_size;
- struct cmsghdr *cmsg;
- int rc;
-
- rc = recvmsg(conn_fd, &msg, 0);
-
- if (rc == 0) {
- vubr_die("recvmsg");
- fprintf(stderr, "Peer disconnected.\n");
- exit(1);
- }
- if (rc < 0) {
- vubr_die("recvmsg");
- }
-
- vmsg->fd_num = 0;
- for (cmsg = CMSG_FIRSTHDR(&msg);
- cmsg != NULL;
- cmsg = CMSG_NXTHDR(&msg, cmsg))
- {
- if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
- fd_size = cmsg->cmsg_len - CMSG_LEN(0);
- vmsg->fd_num = fd_size / sizeof(int);
- memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
+ out_num = elem->out_num;
+ out_sg = elem->out_sg;
+ if (out_num < 1) {
+ fprintf(stderr, "virtio-net header not in first element\n");
break;
}
- }
-
- if (vmsg->size > sizeof(vmsg->payload)) {
- fprintf(stderr,
- "Error: too big message request: %d, size: vmsg->size: %u, "
- "while sizeof(vmsg->payload) = %zu\n",
- vmsg->request, vmsg->size, sizeof(vmsg->payload));
- exit(1);
- }
-
- if (vmsg->size) {
- rc = read(conn_fd, &vmsg->payload, vmsg->size);
- if (rc == 0) {
- vubr_die("recvmsg");
- fprintf(stderr, "Peer disconnected.\n");
- exit(1);
+ if (VHOST_USER_BRIDGE_DEBUG) {
+ iov_hexdump(out_sg, out_num, stderr, "TX:", 1024);
}
- if (rc < 0) {
- vubr_die("recvmsg");
+
+ if (hdrlen) {
+ unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg),
+ out_sg, out_num,
+ hdrlen, -1);
+ out_num = sg_num;
+ out_sg = sg;
}
- assert(rc == vmsg->size);
- }
-}
+ struct msghdr msg = {
+ .msg_name = (struct sockaddr *) &vubr->backend_udp_dest,
+ .msg_namelen = sizeof(struct sockaddr_in),
+ .msg_iov = out_sg,
+ .msg_iovlen = out_num,
+ };
+ do {
+ ret = sendmsg(vubr->backend_udp_sock, &msg, 0);
+ } while (ret == -1 && (errno == EAGAIN || errno == EINTR));
-static void
-vubr_message_write(int conn_fd, VhostUserMsg *vmsg)
-{
- int rc;
+ if (ret == -1) {
+ vubr_die("sendmsg()");
+ }
- do {
- rc = write(conn_fd, vmsg, VHOST_USER_HDR_SIZE + vmsg->size);
- } while (rc < 0 && errno == EINTR);
+ vu_queue_push(dev, vq, elem, 0);
+ vu_queue_notify(dev, vq);
- if (rc < 0) {
- vubr_die("write");
+ free(elem);
+ elem = NULL;
}
-}
-static void
-vubr_backend_udp_sendbuf(VubrDev *dev, uint8_t *buf, size_t len)
-{
- int slen = sizeof(struct sockaddr_in);
-
- if (sendto(dev->backend_udp_sock, buf, len, 0,
- (struct sockaddr *) &dev->backend_udp_dest, slen) == -1) {
- vubr_die("sendto()");
- }
+ free(elem);
}
-static int
-vubr_backend_udp_recvbuf(VubrDev *dev, uint8_t *buf, size_t buflen)
+/*
+ * Walk the iovec entries from @front up to (not including) @iov,
+ * subtracting each length from @bytes, then move the entry at @iov back
+ * by the remaining @bytes (base lowered, length raised).
+ * NOTE(review): presumably undoes an earlier trim of @bytes from the
+ * front of the vector -- confirm against the caller.
+ */
+static void
+iov_restore_front(struct iovec *front, struct iovec *iov, size_t bytes)
{
- int slen = sizeof(struct sockaddr_in);
- int rc;
+ struct iovec *cur;
- rc = recvfrom(dev->backend_udp_sock, buf, buflen, 0,
- (struct sockaddr *) &dev->backend_udp_dest,
- (socklen_t *)&slen);
- if (rc == -1) {
- vubr_die("recvfrom()");
+ /* Terminate on the cursor: front itself never changes. */
+ for (cur = front; cur != iov; cur++) {
+ bytes -= cur->iov_len;
}
- return rc;
+ cur->iov_base -= bytes;
+ cur->iov_len += bytes;
}
static void
-vubr_consume_raw_packet(VubrDev *dev, uint8_t *buf, uint32_t len)
+iov_truncate(struct iovec *iov, unsigned iovc, size_t bytes)
{
- int hdrlen = dev->hdrlen;
- DPRINT(" hdrlen = %d\n", dev->hdrlen);
+ unsigned i;
- if (VHOST_USER_BRIDGE_DEBUG) {
- print_buffer(buf, len);
- }
- vubr_backend_udp_sendbuf(dev, buf + hdrlen, len - hdrlen);
-}
+ for (i = 0; i < iovc; i++, iov++) {
+ if (bytes < iov->iov_len) {
+ iov->iov_len = bytes;
+ return;
+ }
-/* Kick the log_call_fd if required. */
-static void
-vubr_log_kick(VubrDev *dev)
-{
- if (dev->log_call_fd != -1) {
- DPRINT("Kicking the QEMU's log...\n");
- eventfd_write(dev->log_call_fd, 1);
+ bytes -= iov->iov_len;
}
-}
-/* Kick the guest if necessary. */
-static void
-vubr_virtqueue_kick(VubrVirtq *vq)
-{
- if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
- DPRINT("Kicking the guest...\n");
- eventfd_write(vq->call_fd, 1);
- }
+ assert(!"couldn't truncate iov");
}
+/*
+ * Callback for the backend UDP socket: move one datagram into the guest.
+ *
+ * @sock: descriptor that fired (unused; the socket is taken from @ctx)
+ * @ctx: the VubrDev this callback was registered with
+ *
+ * One element is popped from virtqueue 0, a virtio_net_hdr is written at the
+ * front of its in-buffers and the datagram is received straight into the
+ * space behind that header. The element is then filled, flushed and the
+ * guest notified. When hdrlen is 12 the location of num_buffers is
+ * remembered up front and patched with the number of elements used.
+ */
static void
-vubr_log_page(uint8_t *log_table, uint64_t page)
+vubr_backend_recv_cb(int sock, void *ctx)
{
- DPRINT("Logged dirty guest page: %"PRId64"\n", page);
- atomic_or(&log_table[page / 8], 1 << (page % 8));
-}
+ VubrDev *vubr = (VubrDev *) ctx;
+ VuDev *dev = &vubr->vudev;
+ VuVirtq *vq = vu_get_queue(dev, 0);
+ VuVirtqElement *elem = NULL;
+ struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE];
+ struct virtio_net_hdr_mrg_rxbuf mhdr;
+ unsigned mhdr_cnt = 0;
+ int hdrlen = vubr->hdrlen;
+ int i = 0;
+ struct virtio_net_hdr hdr = {
+ .flags = 0,
+ .gso_type = VIRTIO_NET_HDR_GSO_NONE
+ };
-static void
-vubr_log_write(VubrDev *dev, uint64_t address, uint64_t length)
-{
- uint64_t page;
+ DPRINT("\n\n *** IN UDP RECEIVE CALLBACK ***\n\n");
+ DPRINT(" hdrlen = %d\n", hdrlen);
- if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) ||
- !dev->log_table || !length) {
+ if (!vu_queue_enabled(dev, vq) ||
+ !vu_queue_avail_bytes(dev, vq, hdrlen, 0)) {
+ DPRINT("Got UDP packet, but no available descriptors on RX virtq.\n");
return;
}
- assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8));
-
- page = address / VHOST_LOG_PAGE;
- while (page * VHOST_LOG_PAGE < address + length) {
- vubr_log_page(dev->log_table, page);
- page += VHOST_LOG_PAGE;
- }
- vubr_log_kick(dev);
-}
-
-static void
-vubr_post_buffer(VubrDev *dev, VubrVirtq *vq, uint8_t *buf, int32_t len)
-{
- struct vring_desc *desc = vq->desc;
- struct vring_avail *avail = vq->avail;
- struct vring_used *used = vq->used;
- uint64_t log_guest_addr = vq->log_guest_addr;
- int32_t remaining_len = len;
-
- unsigned int size = vq->size;
-
- uint16_t avail_index = atomic_mb_read(&avail->idx);
-
- /* We check the available descriptors before posting the
- * buffer, so here we assume that enough available
- * descriptors. */
- assert(vq->last_avail_index != avail_index);
- uint16_t a_index = vq->last_avail_index % size;
- uint16_t u_index = vq->last_used_index % size;
- uint16_t d_index = avail->ring[a_index];
-
- int i = d_index;
- uint32_t written_len = 0;
-
do {
- DPRINT("Post packet to guest on vq:\n");
- DPRINT(" size = %d\n", vq->size);
- DPRINT(" last_avail_index = %d\n", vq->last_avail_index);
- DPRINT(" last_used_index = %d\n", vq->last_used_index);
- DPRINT(" a_index = %d\n", a_index);
- DPRINT(" u_index = %d\n", u_index);
- DPRINT(" d_index = %d\n", d_index);
- DPRINT(" desc[%d].addr = 0x%016"PRIx64"\n", i, desc[i].addr);
- DPRINT(" desc[%d].len = %d\n", i, desc[i].len);
- DPRINT(" desc[%d].flags = %d\n", i, desc[i].flags);
- DPRINT(" avail->idx = %d\n", avail_index);
- DPRINT(" used->idx = %d\n", used->idx);
-
- if (!(desc[i].flags & VRING_DESC_F_WRITE)) {
- /* FIXME: we should find writable descriptor. */
- fprintf(stderr, "Error: descriptor is not writable. Exiting.\n");
- exit(1);
- }
-
- void *chunk_start = (void *)(uintptr_t)gpa_to_va(dev, desc[i].addr);
- uint32_t chunk_len = desc[i].len;
- uint32_t chunk_write_len = MIN(remaining_len, chunk_len);
+ struct iovec *sg;
+ ssize_t ret, total = 0;
+ unsigned int num;
+ size_t discarded;
- memcpy(chunk_start, buf + written_len, chunk_write_len);
- vubr_log_write(dev, desc[i].addr, chunk_write_len);
- remaining_len -= chunk_write_len;
- written_len += chunk_write_len;
-
- if ((remaining_len == 0) || !(desc[i].flags & VRING_DESC_F_NEXT)) {
+ elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement));
+ if (!elem) {
break;
}
- i = desc[i].next;
- } while (1);
-
- if (remaining_len > 0) {
- fprintf(stderr,
- "Too long packet for RX, remaining_len = %d, Dropping...\n",
- remaining_len);
- return;
- }
-
- /* Add descriptor to the used ring. */
- used->ring[u_index].id = d_index;
- used->ring[u_index].len = len;
- vubr_log_write(dev,
- log_guest_addr + offsetof(struct vring_used, ring[u_index]),
- sizeof(used->ring[u_index]));
-
- vq->last_avail_index++;
- vq->last_used_index++;
-
- atomic_mb_set(&used->idx, vq->last_used_index);
- vubr_log_write(dev,
- log_guest_addr + offsetof(struct vring_used, idx),
- sizeof(used->idx));
-
- /* Kick the guest if necessary. */
- vubr_virtqueue_kick(vq);
-}
-
-static int
-vubr_process_desc(VubrDev *dev, VubrVirtq *vq)
-{
- struct vring_desc *desc = vq->desc;
- struct vring_avail *avail = vq->avail;
- struct vring_used *used = vq->used;
- uint64_t log_guest_addr = vq->log_guest_addr;
-
- unsigned int size = vq->size;
-
- uint16_t a_index = vq->last_avail_index % size;
- uint16_t u_index = vq->last_used_index % size;
- uint16_t d_index = avail->ring[a_index];
-
- uint32_t i, len = 0;
- size_t buf_size = 4096;
- uint8_t buf[4096];
-
- DPRINT("Chunks: ");
- i = d_index;
- do {
- void *chunk_start = (void *)(uintptr_t)gpa_to_va(dev, desc[i].addr);
- uint32_t chunk_len = desc[i].len;
-
- assert(!(desc[i].flags & VRING_DESC_F_WRITE));
-
- if (len + chunk_len < buf_size) {
- memcpy(buf + len, chunk_start, chunk_len);
- DPRINT("%d ", chunk_len);
- } else {
- fprintf(stderr, "Error: too long packet. Dropping...\n");
+ if (elem->in_num < 1) {
+ fprintf(stderr, "virtio-net contains no in buffers\n");
break;
}
- len += chunk_len;
-
- if (!(desc[i].flags & VRING_DESC_F_NEXT)) {
- break;
+ sg = elem->in_sg;
+ num = elem->in_num;
+ if (i == 0) {
+ if (hdrlen == 12) {
+ /* remember where num_buffers lives so it can be patched below */
+ mhdr_cnt = iov_copy(mhdr_sg, ARRAY_SIZE(mhdr_sg),
+ sg, elem->in_num,
+ offsetof(typeof(mhdr), num_buffers),
+ sizeof(mhdr.num_buffers));
+ }
+ iov_from_buf(sg, elem->in_num, 0, &hdr, sizeof hdr);
+ total += hdrlen;
+ /* keep the call outside assert() so it still runs under NDEBUG */
+ discarded = iov_discard_front(&sg, &num, hdrlen);
+ assert(discarded == hdrlen);
}
- i = desc[i].next;
- } while (1);
- DPRINT("\n");
-
- if (!len) {
- return -1;
- }
-
- /* Add descriptor to the used ring. */
- used->ring[u_index].id = d_index;
- used->ring[u_index].len = len;
- vubr_log_write(dev,
- log_guest_addr + offsetof(struct vring_used, ring[u_index]),
- sizeof(used->ring[u_index]));
-
- vubr_consume_raw_packet(dev, buf, len);
-
- return 0;
-}
+ /*
+ * sg and num were advanced together past the header, so the element
+ * count handed to recvmsg() must be num, not elem->in_num; otherwise
+ * the kernel would read past the end of in_sg whenever the header
+ * used up a whole leading element.
+ *
+ * NOTE(review): recvmsg() reports flags through msg_flags on return;
+ * MSG_DONTWAIT stored there is not applied to the call (see the
+ * comment at the end of the loop).
+ */
+ struct msghdr msg = {
+ .msg_name = (struct sockaddr *) &vubr->backend_udp_dest,
+ .msg_namelen = sizeof(struct sockaddr_in),
+ .msg_iov = sg,
+ .msg_iovlen = num,
+ .msg_flags = MSG_DONTWAIT,
+ };
+ do {
+ ret = recvmsg(vubr->backend_udp_sock, &msg, 0);
+ } while (ret == -1 && (errno == EINTR));
-static void
-vubr_process_avail(VubrDev *dev, VubrVirtq *vq)
-{
- struct vring_avail *avail = vq->avail;
- struct vring_used *used = vq->used;
- uint64_t log_guest_addr = vq->log_guest_addr;
-
- while (vq->last_avail_index != atomic_mb_read(&avail->idx)) {
- vubr_process_desc(dev, vq);
- vq->last_avail_index++;
- vq->last_used_index++;
- }
+ /* put the header bytes back so in_sg covers the whole buffer again */
+ if (i == 0) {
+ iov_restore_front(elem->in_sg, sg, hdrlen);
+ }
- atomic_mb_set(&used->idx, vq->last_used_index);
- vubr_log_write(dev,
- log_guest_addr + offsetof(struct vring_used, idx),
- sizeof(used->idx));
-}
+ if (ret == -1) {
+ if (errno == EWOULDBLOCK) {
+ /* nothing to read: hand the popped element back to the queue */
+ vu_queue_rewind(dev, vq, 1);
+ break;
+ }
-static void
-vubr_backend_recv_cb(int sock, void *ctx)
-{
- VubrDev *dev = (VubrDev *) ctx;
- VubrVirtq *rx_vq = &dev->vq[0];
- uint8_t buf[4096];
- struct virtio_net_hdr_v1 *hdr = (struct virtio_net_hdr_v1 *)buf;
- int hdrlen = dev->hdrlen;
- int buflen = sizeof(buf);
- int len;
-
- if (!dev->ready) {
- return;
- }
+ vubr_die("recvmsg()");
+ }
- DPRINT("\n\n *** IN UDP RECEIVE CALLBACK ***\n\n");
- DPRINT(" hdrlen = %d\n", hdrlen);
+ total += ret;
+ iov_truncate(elem->in_sg, elem->in_num, total);
+ vu_queue_fill(dev, vq, elem, total, i++);
- uint16_t avail_index = atomic_mb_read(&rx_vq->avail->idx);
+ free(elem);
+ elem = NULL;
+ } while (false); /* could loop if DONTWAIT worked? */
- /* If there is no available descriptors, just do nothing.
- * The buffer will be handled by next arrived UDP packet,
- * or next kick on receive virtq. */
- if (rx_vq->last_avail_index == avail_index) {
- DPRINT("Got UDP packet, but no available descriptors on RX virtq.\n");
- return;
+ /* i is the number of elements filled above */
+ if (mhdr_cnt) {
+ mhdr.num_buffers = i;
+ iov_from_buf(mhdr_sg, mhdr_cnt,
+ 0,
+ &mhdr.num_buffers, sizeof mhdr.num_buffers);
}
- memset(buf, 0, hdrlen);
- /* TODO: support mergeable buffers. */
- if (hdrlen == 12)
- hdr->num_buffers = 1;
- len = vubr_backend_udp_recvbuf(dev, buf + hdrlen, buflen - hdrlen);
+ vu_queue_flush(dev, vq, i);
+ vu_queue_notify(dev, vq);
- vubr_post_buffer(dev, rx_vq, buf, len + hdrlen);
+ /* non-NULL only when the loop was left through one of the breaks */
+ free(elem);
}
+/*
+ * Socket callback for the vhost-user connection: let the library process
+ * whatever is pending and report a failed dispatch on stderr.
+ *
+ * @sock: descriptor that fired (unused)
+ * @ctx: the VubrDev owning the connection
+ */
static void
-vubr_kick_cb(int sock, void *ctx)
+vubr_receive_cb(int sock, void *ctx)
{
- VubrDev *dev = (VubrDev *) ctx;
- eventfd_t kick_data;
- ssize_t rc;
+ VubrDev *bridge = ctx;
+ bool dispatched = vu_dispatch(&bridge->vudev);
- rc = eventfd_read(sock, &kick_data);
- if (rc == -1) {
- vubr_die("eventfd_read()");
- } else {
- DPRINT("Got kick_data: %016"PRIx64"\n", kick_data);
- vubr_process_avail(dev, &dev->vq[1]);
+ if (!dispatched) {
+ fprintf(stderr, "Error while dispatching\n");
}
}
-static int
-vubr_none_exec(VubrDev *dev, VhostUserMsg *vmsg)
-{
- DPRINT("Function %s() not implemented yet.\n", __func__);
- return 0;
-}
+/*
+ * Per-descriptor record tying a library watch callback to the bridge's
+ * dispatcher: watch_cb() receives one of these as its context and calls
+ * cb(dev, VU_WATCH_IN, data).
+ */
+typedef struct WatchData {
+ VuDev *dev; /* device passed back as the first callback argument */
+ vu_watch_cb cb; /* callback to invoke when the descriptor fires */
+ void *data; /* opaque pointer passed back as the last argument */
+} WatchData;
-static int
-vubr_get_features_exec(VubrDev *dev, VhostUserMsg *vmsg)
+/*
+ * Dispatcher-side trampoline for watches installed by vubr_set_watch():
+ * unpacks the WatchData given as context and invokes the stored callback,
+ * always reporting VU_WATCH_IN as the condition.
+ *
+ * @sock: descriptor that fired (unused)
+ * @ctx: the WatchData registered for that descriptor
+ */
+static void
+watch_cb(int sock, void *ctx)
{
- vmsg->payload.u64 =
- ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
- (1ULL << VHOST_F_LOG_ALL) |
- (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
- (1ULL << VHOST_USER_F_PROTOCOL_FEATURES));
-
- vmsg->size = sizeof(vmsg->payload.u64);
+ WatchData *watch = ctx;
+ vu_watch_cb notify = watch->cb;
- DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
-
- /* Reply */
- return 1;
+ notify(watch->dev, VU_WATCH_IN, watch->data);
}
-static int
-vubr_set_features_exec(VubrDev *dev, VhostUserMsg *vmsg)
+/*
+ * set_watch hook given to vu_init(): start watching @fd on the bridge's
+ * dispatcher and have @cb called with @data when it fires.
+ *
+ * @dev: library device; the enclosing VubrDev is derived from it
+ * @fd: descriptor to watch; must be in [0, FD_SETSIZE)
+ * @condition: requested condition (not used here; watch_cb() always
+ * reports VU_WATCH_IN)
+ * @cb: callback to invoke
+ * @data: opaque pointer handed back to @cb
+ *
+ * The bookkeeping lives in a static table indexed by descriptor, so a
+ * descriptor outside the table is rejected instead of being used as an
+ * index.
+ */
+static void
+vubr_set_watch(VuDev *dev, int fd, int condition,
+ vu_watch_cb cb, void *data)
{
- DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
-
- dev->features = vmsg->payload.u64;
- if ((dev->features & (1ULL << VIRTIO_F_VERSION_1)) ||
- (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))) {
- dev->hdrlen = 12;
- } else {
- dev->hdrlen = 10;
- }
+ VubrDev *vubr = container_of(dev, VubrDev, vudev);
+ static WatchData watches[FD_SETSIZE];
+ struct WatchData *wd;
+
+ /* never index the table with a descriptor it has no slot for */
+ if (fd < 0 || fd >= FD_SETSIZE) {
+ fprintf(stderr, "Error: cannot watch fd %d, it should be in [0, %d)\n",
+ fd, FD_SETSIZE);
+ return;
+ }
+ wd = &watches[fd];
- return 0;
-}
-
-static int
-vubr_set_owner_exec(VubrDev *dev, VhostUserMsg *vmsg)
-{
- return 0;
+ wd->cb = cb;
+ wd->data = data;
+ wd->dev = dev;
+ dispatcher_add(&vubr->dispatcher, fd, wd, watch_cb);
}
+/*
+ * remove_watch hook given to vu_init(): stop watching @fd on the
+ * dispatcher of the VubrDev that embeds @dev.
+ */
static void
-vubr_close_log(VubrDev *dev)
+vubr_remove_watch(VuDev *dev, int fd)
{
- if (dev->log_table) {
- if (munmap(dev->log_table, dev->log_size) != 0) {
- vubr_die("munmap()");
- }
+ VubrDev *bridge = container_of(dev, VubrDev, vudev);
- dev->log_table = 0;
- }
- if (dev->log_call_fd != -1) {
- close(dev->log_call_fd);
- dev->log_call_fd = -1;
- }
-}
-
-static int
-vubr_reset_device_exec(VubrDev *dev, VhostUserMsg *vmsg)
-{
- vubr_close_log(dev);
- dev->ready = 0;
- dev->features = 0;
- return 0;
+ dispatcher_remove(&bridge->dispatcher, fd);
}
+/*
+ * Placeholder handler for VHOST_USER_SEND_RARP: only logs that the request
+ * is not implemented. Returns 0, which vubr_process_msg() stores into
+ * *do_reply.
+ */
static int
-vubr_set_mem_table_exec(VubrDev *dev, VhostUserMsg *vmsg)
+vubr_send_rarp_exec(VuDev *dev, VhostUserMsg *vmsg)
{
- int i;
- VhostUserMemory *memory = &vmsg->payload.memory;
- dev->nregions = memory->nregions;
-
- DPRINT("Nregions: %d\n", memory->nregions);
- for (i = 0; i < dev->nregions; i++) {
- void *mmap_addr;
- VhostUserMemoryRegion *msg_region = &memory->regions[i];
- VubrDevRegion *dev_region = &dev->regions[i];
-
- DPRINT("Region %d\n", i);
- DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n",
- msg_region->guest_phys_addr);
- DPRINT(" memory_size: 0x%016"PRIx64"\n",
- msg_region->memory_size);
- DPRINT(" userspace_addr 0x%016"PRIx64"\n",
- msg_region->userspace_addr);
- DPRINT(" mmap_offset 0x%016"PRIx64"\n",
- msg_region->mmap_offset);
-
- dev_region->gpa = msg_region->guest_phys_addr;
- dev_region->size = msg_region->memory_size;
- dev_region->qva = msg_region->userspace_addr;
- dev_region->mmap_offset = msg_region->mmap_offset;
-
- /* We don't use offset argument of mmap() since the
- * mapped address has to be page aligned, and we use huge
- * pages. */
- mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
- PROT_READ | PROT_WRITE, MAP_SHARED,
- vmsg->fds[i], 0);
-
- if (mmap_addr == MAP_FAILED) {
- vubr_die("mmap");
- }
- dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
- DPRINT(" mmap_addr: 0x%016"PRIx64"\n", dev_region->mmap_addr);
-
- close(vmsg->fds[i]);
- }
-
+ DPRINT("Function %s() not implemented yet.\n", __func__);
return 0;
}
+/*
+ * process_msg hook (see vuiface): gives the bridge first look at every
+ * vhost-user request.
+ *
+ * VHOST_USER_SEND_RARP is handled here: *do_reply receives the result of
+ * vubr_send_rarp_exec() and 1 is returned. Every other request returns 0
+ * so that the library handles it.
+ *
+ * NOTE(review): both switch arms return, so the final "return 0" is never
+ * reached.
+ */
static int
-vubr_set_log_base_exec(VubrDev *dev, VhostUserMsg *vmsg)
+vubr_process_msg(VuDev *dev, VhostUserMsg *vmsg, int *do_reply)
{
- int fd;
- uint64_t log_mmap_size, log_mmap_offset;
- void *rc;
-
- assert(vmsg->fd_num == 1);
- fd = vmsg->fds[0];
-
- assert(vmsg->size == sizeof(vmsg->payload.log));
- log_mmap_offset = vmsg->payload.log.mmap_offset;
- log_mmap_size = vmsg->payload.log.mmap_size;
- DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset);
- DPRINT("Log mmap_size: %"PRId64"\n", log_mmap_size);
-
- rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
- log_mmap_offset);
- if (rc == MAP_FAILED) {
- vubr_die("mmap");
+ switch (vmsg->request) {
+ case VHOST_USER_SEND_RARP:
+ *do_reply = vubr_send_rarp_exec(dev, vmsg);
+ return 1;
+ default:
+ /* let the library handle the rest */
+ return 0;
}
- dev->log_table = rc;
- dev->log_size = log_mmap_size;
- vmsg->size = sizeof(vmsg->payload.u64);
- /* Reply */
- return 1;
-}
-
-static int
-vubr_set_log_fd_exec(VubrDev *dev, VhostUserMsg *vmsg)
-{
- assert(vmsg->fd_num == 1);
- dev->log_call_fd = vmsg->fds[0];
- DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]);
return 0;
}
-static int
-vubr_set_vring_num_exec(VubrDev *dev, VhostUserMsg *vmsg)
+/*
+ * set_features hook (see vuiface): derive the header length used by the
+ * receive path from the negotiated feature bits. hdrlen becomes 12 when
+ * either VIRTIO_F_VERSION_1 or VIRTIO_NET_F_MRG_RXBUF is set, else 10.
+ */
+static void
+vubr_set_features(VuDev *dev, uint64_t features)
{
- unsigned int index = vmsg->payload.state.index;
- unsigned int num = vmsg->payload.state.num;
-
- DPRINT("State.index: %d\n", index);
- DPRINT("State.num: %d\n", num);
- dev->vq[index].size = num;
- return 0;
-}
+ VubrDev *bridge = container_of(dev, VubrDev, vudev);
+ /* either of these bits selects the 12-byte header */
+ const uint64_t long_hdr_features = (1ULL << VIRTIO_F_VERSION_1) |
+ (1ULL << VIRTIO_NET_F_MRG_RXBUF);
-static int
-vubr_set_vring_addr_exec(VubrDev *dev, VhostUserMsg *vmsg)
-{
- struct vhost_vring_addr *vra = &vmsg->payload.addr;
- unsigned int index = vra->index;
- VubrVirtq *vq = &dev->vq[index];
-
- DPRINT("vhost_vring_addr:\n");
- DPRINT(" index: %d\n", vra->index);
- DPRINT(" flags: %d\n", vra->flags);
- DPRINT(" desc_user_addr: 0x%016llx\n", vra->desc_user_addr);
- DPRINT(" used_user_addr: 0x%016llx\n", vra->used_user_addr);
- DPRINT(" avail_user_addr: 0x%016llx\n", vra->avail_user_addr);
- DPRINT(" log_guest_addr: 0x%016llx\n", vra->log_guest_addr);
-
- vq->desc = (struct vring_desc *)(uintptr_t)qva_to_va(dev, vra->desc_user_addr);
- vq->used = (struct vring_used *)(uintptr_t)qva_to_va(dev, vra->used_user_addr);
- vq->avail = (struct vring_avail *)(uintptr_t)qva_to_va(dev, vra->avail_user_addr);
- vq->log_guest_addr = vra->log_guest_addr;
-
- DPRINT("Setting virtq addresses:\n");
- DPRINT(" vring_desc at %p\n", vq->desc);
- DPRINT(" vring_used at %p\n", vq->used);
- DPRINT(" vring_avail at %p\n", vq->avail);
-
- vq->last_used_index = vq->used->idx;
-
- if (vq->last_avail_index != vq->used->idx) {
- DPRINT("Last avail index != used index: %d != %d, resuming",
- vq->last_avail_index, vq->used->idx);
- vq->last_avail_index = vq->used->idx;
+ if (!(features & long_hdr_features)) {
+ bridge->hdrlen = 10;
+ } else {
+ bridge->hdrlen = 12;
}
-
- return 0;
}
-static int
-vubr_set_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg)
-{
- unsigned int index = vmsg->payload.state.index;
- unsigned int num = vmsg->payload.state.num;
-
- DPRINT("State.index: %d\n", index);
- DPRINT("State.num: %d\n", num);
- dev->vq[index].last_avail_index = num;
-
- return 0;
-}
-
-static int
-vubr_get_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg)
-{
- unsigned int index = vmsg->payload.state.index;
-
- DPRINT("State.index: %d\n", index);
- vmsg->payload.state.num = dev->vq[index].last_avail_index;
- vmsg->size = sizeof(vmsg->payload.state);
- /* FIXME: this is a work-around for a bug in QEMU enabling
- * too early vrings. When protocol features are enabled,
- * we have to respect * VHOST_USER_SET_VRING_ENABLE request. */
- dev->ready = 0;
-
- if (dev->vq[index].call_fd != -1) {
- close(dev->vq[index].call_fd);
- dispatcher_remove(&dev->dispatcher, dev->vq[index].call_fd);
- dev->vq[index].call_fd = -1;
- }
- if (dev->vq[index].kick_fd != -1) {
- close(dev->vq[index].kick_fd);
- dispatcher_remove(&dev->dispatcher, dev->vq[index].kick_fd);
- dev->vq[index].kick_fd = -1;
- }
-
- /* Reply */
- return 1;
-}
-
-static int
-vubr_set_vring_kick_exec(VubrDev *dev, VhostUserMsg *vmsg)
+/*
+ * get_features hook (see vuiface): report the feature bits the bridge
+ * itself offers, VIRTIO_NET_F_GUEST_ANNOUNCE and VIRTIO_NET_F_MRG_RXBUF.
+ */
+static uint64_t
+vubr_get_features(VuDev *dev)
{
- uint64_t u64_arg = vmsg->payload.u64;
- int index = u64_arg & VHOST_USER_VRING_IDX_MASK;
-
- DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
-
- assert((u64_arg & VHOST_USER_VRING_NOFD_MASK) == 0);
- assert(vmsg->fd_num == 1);
-
- if (dev->vq[index].kick_fd != -1) {
- close(dev->vq[index].kick_fd);
- dispatcher_remove(&dev->dispatcher, dev->vq[index].kick_fd);
- }
- dev->vq[index].kick_fd = vmsg->fds[0];
- DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index);
-
- if (index % 2 == 1) {
- /* TX queue. */
- dispatcher_add(&dev->dispatcher, dev->vq[index].kick_fd,
- dev, vubr_kick_cb);
-
- DPRINT("Waiting for kicks on fd: %d for vq: %d\n",
- dev->vq[index].kick_fd, index);
- }
- /* We temporarily use this hack to determine that both TX and RX
- * queues are set up and ready for processing.
- * FIXME: we need to rely in VHOST_USER_SET_VRING_ENABLE and
- * actual kicks. */
- if (dev->vq[0].kick_fd != -1 &&
- dev->vq[1].kick_fd != -1) {
- dev->ready = 1;
- DPRINT("vhost-user-bridge is ready for processing queues.\n");
- }
- return 0;
-
+ uint64_t offered = 0;
+
+ offered |= 1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE;
+ offered |= 1ULL << VIRTIO_NET_F_MRG_RXBUF;
+ return offered;
}
-static int
-vubr_set_vring_call_exec(VubrDev *dev, VhostUserMsg *vmsg)
+/*
+ * queue_set_started hook (see vuiface): odd-numbered queues get
+ * vubr_handle_tx installed as their handler when started and have the
+ * handler cleared when stopped. Even-numbered queues are left alone.
+ */
+static void
+vubr_queue_set_started(VuDev *dev, int qidx, bool started)
{
- uint64_t u64_arg = vmsg->payload.u64;
- int index = u64_arg & VHOST_USER_VRING_IDX_MASK;
+ VuVirtq *queue = vu_get_queue(dev, qidx);
+ bool odd_queue = (qidx % 2 == 1);
- DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
- assert((u64_arg & VHOST_USER_VRING_NOFD_MASK) == 0);
- assert(vmsg->fd_num == 1);
-
- if (dev->vq[index].call_fd != -1) {
- close(dev->vq[index].call_fd);
- dispatcher_remove(&dev->dispatcher, dev->vq[index].call_fd);
+ if (odd_queue) {
+ if (started) {
+ vu_set_queue_handler(dev, queue, vubr_handle_tx);
+ } else {
+ vu_set_queue_handler(dev, queue, NULL);
+ }
}
- dev->vq[index].call_fd = vmsg->fds[0];
- DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index);
-
- return 0;
-}
-
-static int
-vubr_set_vring_err_exec(VubrDev *dev, VhostUserMsg *vmsg)
-{
- DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
- return 0;
-}
-
-static int
-vubr_get_protocol_features_exec(VubrDev *dev, VhostUserMsg *vmsg)
-{
- vmsg->payload.u64 = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD;
- DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
- vmsg->size = sizeof(vmsg->payload.u64);
-
- /* Reply */
- return 1;
}
-static int
-vubr_set_protocol_features_exec(VubrDev *dev, VhostUserMsg *vmsg)
-{
- /* FIXME: unimplented */
- DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
- return 0;
-}
-
-static int
-vubr_get_queue_num_exec(VubrDev *dev, VhostUserMsg *vmsg)
-{
- DPRINT("Function %s() not implemented yet.\n", __func__);
- return 0;
-}
-
-static int
-vubr_set_vring_enable_exec(VubrDev *dev, VhostUserMsg *vmsg)
+/*
+ * Panic hook given to vu_init(): print @msg on stderr, stop watching the
+ * device socket and set the quit flag that ends the vubr_run() loop.
+ */
+static void
+vubr_panic(VuDev *dev, const char *msg)
{
- unsigned int index = vmsg->payload.state.index;
- unsigned int enable = vmsg->payload.state.num;
+ VubrDev *bridge = container_of(dev, VubrDev, vudev);
- DPRINT("State.index: %d\n", index);
- DPRINT("State.enable: %d\n", enable);
- dev->vq[index].enable = enable;
- return 0;
-}
+ fprintf(stderr, "PANIC: %s\n", msg);
-static int
-vubr_send_rarp_exec(VubrDev *dev, VhostUserMsg *vmsg)
-{
- DPRINT("Function %s() not implemented yet.\n", __func__);
- return 0;
+ /* no further events from the vhost-user socket, then leave the loop */
+ dispatcher_remove(&bridge->dispatcher, dev->sock);
+ bridge->quit = 1;
}
-static int
-vubr_execute_request(VubrDev *dev, VhostUserMsg *vmsg)
-{
- /* Print out generic part of the request. */
- DPRINT(
- "================== Vhost user message from QEMU ==================\n");
- DPRINT("Request: %s (%d)\n", vubr_request_str[vmsg->request],
- vmsg->request);
- DPRINT("Flags: 0x%x\n", vmsg->flags);
- DPRINT("Size: %d\n", vmsg->size);
-
- if (vmsg->fd_num) {
- int i;
- DPRINT("Fds:");
- for (i = 0; i < vmsg->fd_num; i++) {
- DPRINT(" %d", vmsg->fds[i]);
- }
- DPRINT("\n");
- }
-
- switch (vmsg->request) {
- case VHOST_USER_NONE:
- return vubr_none_exec(dev, vmsg);
- case VHOST_USER_GET_FEATURES:
- return vubr_get_features_exec(dev, vmsg);
- case VHOST_USER_SET_FEATURES:
- return vubr_set_features_exec(dev, vmsg);
- case VHOST_USER_SET_OWNER:
- return vubr_set_owner_exec(dev, vmsg);
- case VHOST_USER_RESET_OWNER:
- return vubr_reset_device_exec(dev, vmsg);
- case VHOST_USER_SET_MEM_TABLE:
- return vubr_set_mem_table_exec(dev, vmsg);
- case VHOST_USER_SET_LOG_BASE:
- return vubr_set_log_base_exec(dev, vmsg);
- case VHOST_USER_SET_LOG_FD:
- return vubr_set_log_fd_exec(dev, vmsg);
- case VHOST_USER_SET_VRING_NUM:
- return vubr_set_vring_num_exec(dev, vmsg);
- case VHOST_USER_SET_VRING_ADDR:
- return vubr_set_vring_addr_exec(dev, vmsg);
- case VHOST_USER_SET_VRING_BASE:
- return vubr_set_vring_base_exec(dev, vmsg);
- case VHOST_USER_GET_VRING_BASE:
- return vubr_get_vring_base_exec(dev, vmsg);
- case VHOST_USER_SET_VRING_KICK:
- return vubr_set_vring_kick_exec(dev, vmsg);
- case VHOST_USER_SET_VRING_CALL:
- return vubr_set_vring_call_exec(dev, vmsg);
- case VHOST_USER_SET_VRING_ERR:
- return vubr_set_vring_err_exec(dev, vmsg);
- case VHOST_USER_GET_PROTOCOL_FEATURES:
- return vubr_get_protocol_features_exec(dev, vmsg);
- case VHOST_USER_SET_PROTOCOL_FEATURES:
- return vubr_set_protocol_features_exec(dev, vmsg);
- case VHOST_USER_GET_QUEUE_NUM:
- return vubr_get_queue_num_exec(dev, vmsg);
- case VHOST_USER_SET_VRING_ENABLE:
- return vubr_set_vring_enable_exec(dev, vmsg);
- case VHOST_USER_SEND_RARP:
- return vubr_send_rarp_exec(dev, vmsg);
-
- case VHOST_USER_MAX:
- assert(vmsg->request != VHOST_USER_MAX);
- }
- return 0;
-}
-
-static void
-vubr_receive_cb(int sock, void *ctx)
-{
- VubrDev *dev = (VubrDev *) ctx;
- VhostUserMsg vmsg;
- int reply_requested;
-
- vubr_message_read(sock, &vmsg);
- reply_requested = vubr_execute_request(dev, &vmsg);
- if (reply_requested) {
- /* Set the version in the flags when sending the reply */
- vmsg.flags &= ~VHOST_USER_VERSION_MASK;
- vmsg.flags |= VHOST_USER_VERSION;
- vmsg.flags |= VHOST_USER_REPLY_MASK;
- vubr_message_write(sock, &vmsg);
- }
-}
+/*
+ * Callback table passed to vu_init() in vubr_accept_cb() and vubr_new();
+ * it wires the bridge's feature, message and queue-start hooks into the
+ * library.
+ */
+static const VuDevIface vuiface = {
+ .get_features = vubr_get_features,
+ .set_features = vubr_set_features,
+ .process_msg = vubr_process_msg,
+ .queue_set_started = vubr_queue_set_started,
+};
static void
vubr_accept_cb(int sock, void *ctx)
@@ -1204,36 +479,26 @@ vubr_accept_cb(int sock, void *ctx)
vubr_die("accept()");
}
DPRINT("Got connection from remote peer on sock %d\n", conn_fd);
+
+ vu_init(&dev->vudev,
+ conn_fd,
+ vubr_panic,
+ vubr_set_watch,
+ vubr_remove_watch,
+ &vuiface);
+
dispatcher_add(&dev->dispatcher, conn_fd, ctx, vubr_receive_cb);
+ dispatcher_remove(&dev->dispatcher, sock);
}
static VubrDev *
vubr_new(const char *path, bool client)
{
VubrDev *dev = (VubrDev *) calloc(1, sizeof(VubrDev));
- dev->nregions = 0;
- int i;
struct sockaddr_un un;
CallbackFunc cb;
size_t len;
- for (i = 0; i < MAX_NR_VIRTQUEUE; i++) {
- dev->vq[i] = (VubrVirtq) {
- .call_fd = -1, .kick_fd = -1,
- .size = 0,
- .last_avail_index = 0, .last_used_index = 0,
- .desc = 0, .avail = 0, .used = 0,
- .enable = 0,
- };
- }
-
- /* Init log */
- dev->log_call_fd = -1;
- dev->log_size = 0;
- dev->log_table = 0;
- dev->ready = 0;
- dev->features = 0;
-
/* Get a UNIX socket. */
dev->sock = socket(AF_UNIX, SOCK_STREAM, 0);
if (dev->sock == -1) {
@@ -1261,10 +526,17 @@ vubr_new(const char *path, bool client)
if (connect(dev->sock, (struct sockaddr *)&un, len) == -1) {
vubr_die("connect");
}
+ vu_init(&dev->vudev,
+ dev->sock,
+ vubr_panic,
+ vubr_set_watch,
+ vubr_remove_watch,
+ &vuiface);
cb = vubr_receive_cb;
}
dispatcher_init(&dev->dispatcher);
+
dispatcher_add(&dev->dispatcher, dev->sock, (void *)dev, cb);
return dev;
@@ -1345,7 +617,7 @@ vubr_backend_udp_setup(VubrDev *dev,
static void
vubr_run(VubrDev *dev)
{
- while (1) {
+ while (!dev->quit) {
/* timeout 200ms */
dispatcher_wait(&dev->dispatcher, 200000);
/* Here one can try polling strategy. */
@@ -1421,6 +693,9 @@ main(int argc, char *argv[])
vubr_backend_udp_setup(dev, lhost, lport, rhost, rport);
vubr_run(dev);
+
+ vu_deinit(&dev->vudev);
+
return 0;
out: