diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2015-10-22 12:41:44 +0100 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2015-10-22 12:41:44 +0100 |
commit | ca3e40e233e87f7b29442311736a82da01c0df7b (patch) | |
tree | dba688d01dceded9b998e5d5cfb8cb9264354604 /hw | |
parent | c1bd8997438f1b556acfeab1d52245ff7cc680c0 (diff) | |
parent | 3c23402d4032f69af44a87fdb8019ad3229a4f31 (diff) |
Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging
vhost, pc, virtio features, fixes, cleanups
New features:
VT-d support for devices behind a bridge
vhost-user migration support
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
# gpg: Signature made Thu 22 Oct 2015 12:39:19 BST using RSA key ID D28D5469
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>"
# gpg: aka "Michael S. Tsirkin <mst@redhat.com>"
* remotes/mst/tags/for_upstream: (37 commits)
hw/isa/lpc_ich9: inject the SMI on the VCPU that is writing to APM_CNT
i386: keep cpu_model field in MachineState uptodate
vhost: set the correct queue index in case of migration with multiqueue
piix: fix resource leak reported by Coverity
seccomp: add memfd_create to whitelist
vhost-user-test: check ownership during migration
vhost-user-test: add live-migration test
vhost-user-test: learn to tweak various qemu arguments
vhost-user-test: wrap server in TestServer struct
vhost-user-test: remove useless static check
vhost-user-test: move wait_for_fds() out
vhost: add migration block if memfd failed
vhost-user: use an enum helper for features mask
vhost user: add rarp sending after live migration for legacy guest
vhost user: add support of live migration
net: add trace_vhost_user_event
vhost-user: document migration log
vhost: use a function for each call
vhost-user: add a migration blocker
vhost-user: send log shm fd along with log_base
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'hw')
-rw-r--r-- | hw/i386/intel_iommu.c | 89 | ||||
-rw-r--r-- | hw/i386/pc.c | 17 | ||||
-rw-r--r-- | hw/i386/pc_piix.c | 2 | ||||
-rw-r--r-- | hw/i386/pc_q35.c | 2 | ||||
-rw-r--r-- | hw/isa/lpc_ich9.c | 2 | ||||
-rw-r--r-- | hw/mem/pc-dimm.c | 7 | ||||
-rw-r--r-- | hw/net/vhost_net.c | 35 | ||||
-rw-r--r-- | hw/pci-host/piix.c | 11 | ||||
-rw-r--r-- | hw/pci-host/q35.c | 25 | ||||
-rw-r--r-- | hw/scsi/vhost-scsi.c | 7 | ||||
-rw-r--r-- | hw/virtio/vhost-backend.c | 138 | ||||
-rw-r--r-- | hw/virtio/vhost-user.c | 581 | ||||
-rw-r--r-- | hw/virtio/vhost.c | 156 |
13 files changed, 761 insertions, 311 deletions
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 08055a8d8a..3fe27fa512 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -22,6 +22,7 @@ #include "hw/sysbus.h" #include "exec/address-spaces.h" #include "intel_iommu_internal.h" +#include "hw/pci/pci.h" /*#define DEBUG_INTEL_IOMMU*/ #ifdef DEBUG_INTEL_IOMMU @@ -166,19 +167,17 @@ static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value, */ static void vtd_reset_context_cache(IntelIOMMUState *s) { - VTDAddressSpace **pvtd_as; VTDAddressSpace *vtd_as; - uint32_t bus_it; + VTDBus *vtd_bus; + GHashTableIter bus_it; uint32_t devfn_it; + g_hash_table_iter_init(&bus_it, s->vtd_as_by_busptr); + VTD_DPRINTF(CACHE, "global context_cache_gen=1"); - for (bus_it = 0; bus_it < VTD_PCI_BUS_MAX; ++bus_it) { - pvtd_as = s->address_spaces[bus_it]; - if (!pvtd_as) { - continue; - } + while (g_hash_table_iter_next (&bus_it, NULL, (void**)&vtd_bus)) { for (devfn_it = 0; devfn_it < VTD_PCI_DEVFN_MAX; ++devfn_it) { - vtd_as = pvtd_as[devfn_it]; + vtd_as = vtd_bus->dev_as[devfn_it]; if (!vtd_as) { continue; } @@ -754,12 +753,13 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr) * @is_write: The access is a write operation * @entry: IOMMUTLBEntry that contain the addr to be translated and result */ -static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, uint8_t bus_num, +static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, uint8_t devfn, hwaddr addr, bool is_write, IOMMUTLBEntry *entry) { IntelIOMMUState *s = vtd_as->iommu_state; VTDContextEntry ce; + uint8_t bus_num = pci_bus_num(bus); VTDContextCacheEntry *cc_entry = &vtd_as->context_cache_entry; uint64_t slpte; uint32_t level; @@ -874,6 +874,29 @@ static void vtd_context_global_invalidate(IntelIOMMUState *s) } } + +/* Find the VTD address space currently associated with a given bus number, + */ +static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) +{ + VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num]; + if (!vtd_bus) { + /* Iterate over the registered buses to find the one + * which currently hold this bus number, and update the bus_num lookup table: + */ + GHashTableIter iter; + + g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); + while (g_hash_table_iter_next (&iter, NULL, (void**)&vtd_bus)) { + if (pci_bus_num(vtd_bus->bus) == bus_num) { + s->vtd_as_by_bus_num[bus_num] = vtd_bus; + return vtd_bus; + } + } + } + return vtd_bus; +} + /* Do a context-cache device-selective invalidation. * @func_mask: FM field after shifting */ @@ -882,7 +905,7 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, uint16_t func_mask) { uint16_t mask; - VTDAddressSpace **pvtd_as; + VTDBus *vtd_bus; VTDAddressSpace *vtd_as; uint16_t devfn; uint16_t devfn_it; @@ -903,11 +926,11 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, } VTD_DPRINTF(INV, "device-selective invalidation source 0x%"PRIx16 " mask %"PRIu16, source_id, mask); - pvtd_as = s->address_spaces[VTD_SID_TO_BUS(source_id)]; - if (pvtd_as) { + vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id)); + if (vtd_bus) { devfn = VTD_SID_TO_DEVFN(source_id); for (devfn_it = 0; devfn_it < VTD_PCI_DEVFN_MAX; ++devfn_it) { - vtd_as = pvtd_as[devfn_it]; + vtd_as = vtd_bus->dev_as[devfn_it]; if (vtd_as && ((devfn_it & mask) == (devfn & mask))) { VTD_DPRINTF(INV, "invalidate context-cahce of devfn 0x%"PRIx16, devfn_it); @@ -1805,11 +1828,11 @@ static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr, return ret; } - vtd_do_iommu_translate(vtd_as, vtd_as->bus_num, vtd_as->devfn, addr, + vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn, addr, is_write, &ret); VTD_DPRINTF(MMU, "bus %"PRIu8 " slot %"PRIu8 " func %"PRIu8 " devfn %"PRIu8 - " gpa 0x%"PRIx64 " hpa 0x%"PRIx64, vtd_as->bus_num, + " gpa 0x%"PRIx64 " hpa 0x%"PRIx64, pci_bus_num(vtd_as->bus), VTD_PCI_SLOT(vtd_as->devfn), VTD_PCI_FUNC(vtd_as->devfn), vtd_as->devfn, addr, ret.translated_addr); return ret; @@ -1839,6 +1862,38 @@ static Property vtd_properties[] = { DEFINE_PROP_END_OF_LIST(), }; + +VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) +{ + uintptr_t key = (uintptr_t)bus; + VTDBus *vtd_bus = g_hash_table_lookup(s->vtd_as_by_busptr, &key); + VTDAddressSpace *vtd_dev_as; + + if (!vtd_bus) { + /* No corresponding free() */ + vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) * VTD_PCI_DEVFN_MAX); + vtd_bus->bus = bus; + key = (uintptr_t)bus; + g_hash_table_insert(s->vtd_as_by_busptr, &key, vtd_bus); + } + + vtd_dev_as = vtd_bus->dev_as[devfn]; + + if (!vtd_dev_as) { + vtd_bus->dev_as[devfn] = vtd_dev_as = g_malloc0(sizeof(VTDAddressSpace)); + + vtd_dev_as->bus = bus; + vtd_dev_as->devfn = (uint8_t)devfn; + vtd_dev_as->iommu_state = s; + vtd_dev_as->context_cache_entry.context_cache_gen = 0; + memory_region_init_iommu(&vtd_dev_as->iommu, OBJECT(s), + &s->iommu_ops, "intel_iommu", UINT64_MAX); + address_space_init(&vtd_dev_as->as, + &vtd_dev_as->iommu, "intel_iommu"); + } + return vtd_dev_as; +} + /* Do the initialization. It will also be called when reset, so pay * attention when adding new initialization stuff. */ @@ -1931,13 +1986,15 @@ static void vtd_realize(DeviceState *dev, Error **errp) IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); VTD_DPRINTF(GENERAL, ""); - memset(s->address_spaces, 0, sizeof(s->address_spaces)); + memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, "intel_iommu", DMAR_REG_SIZE); sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->csrmem); /* No corresponding destroy */ s->iotlb = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, g_free, g_free); + s->vtd_as_by_busptr = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, + g_free, g_free); vtd_init(s); } diff --git a/hw/i386/pc.c b/hw/i386/pc.c index b25a87267e..3d958bae5b 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1078,11 +1078,10 @@ out: return cpu; } -static const char *current_cpu_model; - void pc_hot_add_cpu(const int64_t id, Error **errp) { X86CPU *cpu; + MachineState *machine = MACHINE(qdev_get_machine()); int64_t apic_id = x86_cpu_apic_id_from_index(id); Error *local_err = NULL; @@ -1110,7 +1109,7 @@ void pc_hot_add_cpu(const int64_t id, Error **errp) return; } - cpu = pc_new_cpu(current_cpu_model, apic_id, &local_err); + cpu = pc_new_cpu(machine->cpu_model, apic_id, &local_err); if (local_err) { error_propagate(errp, local_err); return; @@ -1118,22 +1117,22 @@ void pc_hot_add_cpu(const int64_t id, Error **errp) object_unref(OBJECT(cpu)); } -void pc_cpus_init(const char *cpu_model) +void pc_cpus_init(PCMachineState *pcms) { int i; X86CPU *cpu = NULL; + MachineState *machine = MACHINE(pcms); Error *error = NULL; unsigned long apic_id_limit; /* init CPUs */ - if (cpu_model == NULL) { + if (machine->cpu_model == NULL) { #ifdef TARGET_X86_64 - cpu_model = "qemu64"; + machine->cpu_model = "qemu64"; #else - cpu_model = "qemu32"; + machine->cpu_model = "qemu32"; #endif } - current_cpu_model = cpu_model; apic_id_limit = pc_apic_id_limit(max_cpus); if (apic_id_limit > ACPI_CPU_HOTPLUG_ID_LIMIT) { @@ -1143,7 +1142,7 @@ void pc_cpus_init(const char *cpu_model) } for (i = 0; i < smp_cpus; i++) { - cpu = pc_new_cpu(cpu_model, x86_cpu_apic_id_from_index(i), + cpu = pc_new_cpu(machine->cpu_model, x86_cpu_apic_id_from_index(i), &error); if (error) { error_report_err(error); diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index a91cc3dada..9d4425a5b9 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -139,7 +139,7 @@ static void pc_init1(MachineState *machine, exit(1); } - pc_cpus_init(machine->cpu_model); + pc_cpus_init(pcms); if (kvm_enabled() && kvmclock_enabled) { kvmclock_create(); diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 19e66702e0..3744abd397 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -128,7 +128,7 @@ static void pc_q35_init(MachineState *machine) exit(1); } - pc_cpus_init(machine->cpu_model); + pc_cpus_init(pcms); pc_acpi_init("q35-acpi-dsdt.aml"); kvmclock_create(); diff --git a/hw/isa/lpc_ich9.c b/hw/isa/lpc_ich9.c index 360699f6fd..1ffc80362b 100644 --- a/hw/isa/lpc_ich9.c +++ b/hw/isa/lpc_ich9.c @@ -394,7 +394,7 @@ static void ich9_apm_ctrl_changed(uint32_t val, void *arg) /* SMI_EN = PMBASE + 30. SMI control and enable register */ if (lpc->pm.smi_en & ICH9_PMIO_SMI_EN_APMC_EN) { - cpu_interrupt(first_cpu, CPU_INTERRUPT_SMI); + cpu_interrupt(current_cpu, CPU_INTERRUPT_SMI); } } diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c index 506fe0d2a8..2bae994667 100644 --- a/hw/mem/pc-dimm.c +++ b/hw/mem/pc-dimm.c @@ -25,6 +25,7 @@ #include "sysemu/numa.h" #include "sysemu/kvm.h" #include "trace.h" +#include "hw/virtio/vhost.h" typedef struct pc_dimms_capacity { uint64_t size; @@ -96,6 +97,12 @@ void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms, goto out; } + if (!vhost_has_free_slot()) { + error_setg(&local_err, "a used vhost backend has no free" + " memory slots left"); + goto out; + } + memory_region_add_subregion(&hpms->mr, addr - hpms->base, mr); vmstate_register_ram(mr, dev); numa_set_mem_node_id(addr, memory_region_size(mr), dimm->node); diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index 2bce89129d..d91b7b155e 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -85,6 +85,8 @@ static const int user_feature_bits[] = { VIRTIO_NET_F_CTRL_MAC_ADDR, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, + VIRTIO_NET_F_GUEST_ANNOUNCE, + VIRTIO_NET_F_MQ, VHOST_INVALID_FEATURE_BIT @@ -252,8 +254,7 @@ static int vhost_net_start_one(struct vhost_net *net, file.fd = net->backend; for (file.index = 0; file.index < net->dev.nvqs; ++file.index) { const VhostOps *vhost_ops = net->dev.vhost_ops; - r = vhost_ops->vhost_call(&net->dev, VHOST_NET_SET_BACKEND, - &file); + r = vhost_ops->vhost_net_set_backend(&net->dev, &file); if (r < 0) { r = -errno; goto fail; @@ -266,8 +267,7 @@ fail: if (net->nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP) { while (file.index-- > 0) { const VhostOps *vhost_ops = net->dev.vhost_ops; - int r = vhost_ops->vhost_call(&net->dev, VHOST_NET_SET_BACKEND, - &file); + int r = vhost_ops->vhost_net_set_backend(&net->dev, &file); assert(r >= 0); } } @@ -289,15 +289,13 @@ static void vhost_net_stop_one(struct vhost_net *net, if (net->nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP) { for (file.index = 0; file.index < net->dev.nvqs; ++file.index) { const VhostOps *vhost_ops = net->dev.vhost_ops; - int r = vhost_ops->vhost_call(&net->dev, VHOST_NET_SET_BACKEND, - &file); + int r = vhost_ops->vhost_net_set_backend(&net->dev, &file); assert(r >= 0); } } else if (net->nc->info->type == NET_CLIENT_OPTIONS_KIND_VHOST_USER) { for (file.index = 0; file.index < net->dev.nvqs; ++file.index) { const VhostOps *vhost_ops = net->dev.vhost_ops; - int r = vhost_ops->vhost_call(&net->dev, VHOST_RESET_DEVICE, - NULL); + int r = vhost_ops->vhost_reset_device(&net->dev); assert(r >= 0); } } @@ -390,6 +388,18 @@ void vhost_net_cleanup(struct vhost_net *net) g_free(net); } +int vhost_net_notify_migration_done(struct vhost_net *net, char* mac_addr) +{ + const VhostOps *vhost_ops = net->dev.vhost_ops; + int r = -1; + + if (vhost_ops->vhost_migration_done) { + r = vhost_ops->vhost_migration_done(&net->dev, mac_addr); + } + + return r; +} + bool vhost_net_virtqueue_pending(VHostNetState *net, int idx) { return vhost_virtqueue_pending(&net->dev, idx); @@ -428,8 +438,8 @@ int vhost_set_vring_enable(NetClientState *nc, int enable) VHostNetState *net = get_vhost_net(nc); const VhostOps *vhost_ops = net->dev.vhost_ops; - if (vhost_ops->vhost_backend_set_vring_enable) { - return vhost_ops->vhost_backend_set_vring_enable(&net->dev, enable); + if (vhost_ops->vhost_set_vring_enable) { + return vhost_ops->vhost_set_vring_enable(&net->dev, enable); } return 0; @@ -481,6 +491,11 @@ void vhost_net_virtqueue_mask(VHostNetState *net, VirtIODevice *dev, { } +int vhost_net_notify_migration_done(struct vhost_net *net, char* mac_addr) +{ + return -1; +} + VHostNetState *get_vhost_net(NetClientState *nc) { return 0; diff --git a/hw/pci-host/piix.c b/hw/pci-host/piix.c index 1fb71c8081..7b2fbf9598 100644 --- a/hw/pci-host/piix.c +++ b/hw/pci-host/piix.c @@ -764,6 +764,7 @@ static int host_pci_config_read(int pos, int len, uint32_t val) /* Access real host bridge. */ int rc = snprintf(path, size, "/sys/bus/pci/devices/%04x:%02x:%02x.%d/%s", 0, 0, 0, 0, "config"); + int ret = 0; if (rc >= size || rc < 0) { return -ENODEV; @@ -775,16 +776,18 @@ static int host_pci_config_read(int pos, int len, uint32_t val) } if (lseek(config_fd, pos, SEEK_SET) != pos) { - return -errno; + ret = -errno; + goto out; } do { rc = read(config_fd, (uint8_t *)&val, len); } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); if (rc != len) { - return -errno; + ret = -errno; } - - return 0; +out: + close(config_fd); + return ret; } static int igd_pt_i440fx_initfn(struct PCIDevice *pci_dev) diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c index bd7409456f..c81507d710 100644 --- a/hw/pci-host/q35.c +++ b/hw/pci-host/q35.c @@ -426,31 +426,12 @@ static void mch_reset(DeviceState *qdev) static AddressSpace *q35_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) { IntelIOMMUState *s = opaque; - VTDAddressSpace **pvtd_as; - int bus_num = pci_bus_num(bus); + VTDAddressSpace *vtd_as; - assert(0 <= bus_num && bus_num <= VTD_PCI_BUS_MAX); assert(0 <= devfn && devfn <= VTD_PCI_DEVFN_MAX); - pvtd_as = s->address_spaces[bus_num]; - if (!pvtd_as) { - /* No corresponding free() */ - pvtd_as = g_malloc0(sizeof(VTDAddressSpace *) * VTD_PCI_DEVFN_MAX); - s->address_spaces[bus_num] = pvtd_as; - } - if (!pvtd_as[devfn]) { - pvtd_as[devfn] = g_malloc0(sizeof(VTDAddressSpace)); - - pvtd_as[devfn]->bus_num = (uint8_t)bus_num; - pvtd_as[devfn]->devfn = (uint8_t)devfn; - pvtd_as[devfn]->iommu_state = s; - pvtd_as[devfn]->context_cache_entry.context_cache_gen = 0; - memory_region_init_iommu(&pvtd_as[devfn]->iommu, OBJECT(s), - &s->iommu_ops, "intel_iommu", UINT64_MAX); - address_space_init(&pvtd_as[devfn]->as, - &pvtd_as[devfn]->iommu, "intel_iommu"); - } - return &pvtd_as[devfn]->as; + vtd_as = vtd_find_add_as(s, bus, devfn); + return &vtd_as->as; } static void mch_init_dmar(MCHPCIState *mch) diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c index fb7983d9dc..00cdac62f9 100644 --- a/hw/scsi/vhost-scsi.c +++ b/hw/scsi/vhost-scsi.c @@ -46,7 +46,7 @@ static int vhost_scsi_set_endpoint(VHostSCSI *s) memset(&backend, 0, sizeof(backend)); pstrcpy(backend.vhost_wwpn, sizeof(backend.vhost_wwpn), vs->conf.wwpn); - ret = vhost_ops->vhost_call(&s->dev, VHOST_SCSI_SET_ENDPOINT, &backend); + ret = vhost_ops->vhost_scsi_set_endpoint(&s->dev, &backend); if (ret < 0) { return -errno; } @@ -61,7 +61,7 @@ static void vhost_scsi_clear_endpoint(VHostSCSI *s) memset(&backend, 0, sizeof(backend)); pstrcpy(backend.vhost_wwpn, sizeof(backend.vhost_wwpn), vs->conf.wwpn); - vhost_ops->vhost_call(&s->dev, VHOST_SCSI_CLEAR_ENDPOINT, &backend); + vhost_ops->vhost_scsi_clear_endpoint(&s->dev, &backend); } static int vhost_scsi_start(VHostSCSI *s) @@ -77,8 +77,7 @@ static int vhost_scsi_start(VHostSCSI *s) return -ENOSYS; } - ret = vhost_ops->vhost_call(&s->dev, - VHOST_SCSI_GET_ABI_VERSION, &abi_version); + ret = vhost_ops->vhost_scsi_get_abi_version(&s->dev, &abi_version); if (ret < 0) { return -errno; } diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c index 72d1392b35..1d5f684eb0 100644 --- a/hw/virtio/vhost-backend.c +++ b/hw/virtio/vhost-backend.c @@ -11,6 +11,7 @@ #include "hw/virtio/vhost.h" #include "hw/virtio/vhost-backend.h" #include "qemu/error-report.h" +#include "linux/vhost.h" #include <sys/ioctl.h> @@ -42,6 +43,122 @@ static int vhost_kernel_cleanup(struct vhost_dev *dev) return close(fd); } +static int vhost_kernel_memslots_limit(struct vhost_dev *dev) +{ + int limit = 64; + char *s; + + if (g_file_get_contents("/sys/module/vhost/parameters/max_mem_regions", + &s, NULL, NULL)) { + uint64_t val = g_ascii_strtoull(s, NULL, 10); + if (!((val == G_MAXUINT64 || !val) && errno)) { + return val; + } + error_report("ignoring invalid max_mem_regions value in vhost module:" + " %s", s); + } + return limit; +} + +static int vhost_kernel_net_set_backend(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + return vhost_kernel_call(dev, VHOST_NET_SET_BACKEND, file); +} + +static int vhost_kernel_scsi_set_endpoint(struct vhost_dev *dev, + struct vhost_scsi_target *target) +{ + return vhost_kernel_call(dev, VHOST_SCSI_SET_ENDPOINT, target); +} + +static int vhost_kernel_scsi_clear_endpoint(struct vhost_dev *dev, + struct vhost_scsi_target *target) +{ + return vhost_kernel_call(dev, VHOST_SCSI_CLEAR_ENDPOINT, target); +} + +static int vhost_kernel_scsi_get_abi_version(struct vhost_dev *dev, int *version) +{ + return vhost_kernel_call(dev, VHOST_SCSI_GET_ABI_VERSION, version); +} + +static int vhost_kernel_set_log_base(struct vhost_dev *dev, uint64_t base, + struct vhost_log *log) +{ + return vhost_kernel_call(dev, VHOST_SET_LOG_BASE, &base); +} + +static int vhost_kernel_set_mem_table(struct vhost_dev *dev, + struct vhost_memory *mem) +{ + return vhost_kernel_call(dev, VHOST_SET_MEM_TABLE, mem); +} + +static int vhost_kernel_set_vring_addr(struct vhost_dev *dev, + struct vhost_vring_addr *addr) +{ + return vhost_kernel_call(dev, VHOST_SET_VRING_ADDR, addr); +} + +static int vhost_kernel_set_vring_endian(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + return vhost_kernel_call(dev, VHOST_SET_VRING_ENDIAN, ring); +} + +static int vhost_kernel_set_vring_num(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + return vhost_kernel_call(dev, VHOST_SET_VRING_NUM, ring); +} + +static int vhost_kernel_set_vring_base(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + return vhost_kernel_call(dev, VHOST_SET_VRING_BASE, ring); +} + +static int vhost_kernel_get_vring_base(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + return vhost_kernel_call(dev, VHOST_GET_VRING_BASE, ring); +} + +static int vhost_kernel_set_vring_kick(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + return vhost_kernel_call(dev, VHOST_SET_VRING_KICK, file); +} + +static int vhost_kernel_set_vring_call(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + return vhost_kernel_call(dev, VHOST_SET_VRING_CALL, file); +} + +static int vhost_kernel_set_features(struct vhost_dev *dev, + uint64_t features) +{ + return vhost_kernel_call(dev, VHOST_SET_FEATURES, &features); +} + +static int vhost_kernel_get_features(struct vhost_dev *dev, + uint64_t *features) +{ + return vhost_kernel_call(dev, VHOST_GET_FEATURES, features); +} + +static int vhost_kernel_set_owner(struct vhost_dev *dev) +{ + return vhost_kernel_call(dev, VHOST_SET_OWNER, NULL); +} + +static int vhost_kernel_reset_device(struct vhost_dev *dev) +{ + return vhost_kernel_call(dev, VHOST_RESET_DEVICE, NULL); +} + static int vhost_kernel_get_vq_index(struct vhost_dev *dev, int idx) { assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs); @@ -51,10 +168,27 @@ static int vhost_kernel_get_vq_index(struct vhost_dev *dev, int idx) static const VhostOps kernel_ops = { .backend_type = VHOST_BACKEND_TYPE_KERNEL, - .vhost_call = vhost_kernel_call, .vhost_backend_init = vhost_kernel_init, .vhost_backend_cleanup = vhost_kernel_cleanup, - .vhost_backend_get_vq_index = vhost_kernel_get_vq_index, + .vhost_backend_memslots_limit = vhost_kernel_memslots_limit, + .vhost_net_set_backend = vhost_kernel_net_set_backend, + .vhost_scsi_set_endpoint = vhost_kernel_scsi_set_endpoint, + .vhost_scsi_clear_endpoint = vhost_kernel_scsi_clear_endpoint, + .vhost_scsi_get_abi_version = vhost_kernel_scsi_get_abi_version, + .vhost_set_log_base = vhost_kernel_set_log_base, + .vhost_set_mem_table = vhost_kernel_set_mem_table, + .vhost_set_vring_addr = vhost_kernel_set_vring_addr, + .vhost_set_vring_endian = vhost_kernel_set_vring_endian, + .vhost_set_vring_num = vhost_kernel_set_vring_num, + .vhost_set_vring_base = vhost_kernel_set_vring_base, + .vhost_get_vring_base = vhost_kernel_get_vring_base, + .vhost_set_vring_kick = vhost_kernel_set_vring_kick, + .vhost_set_vring_call = vhost_kernel_set_vring_call, + .vhost_set_features = vhost_kernel_set_features, + .vhost_get_features = vhost_kernel_get_features, + .vhost_set_owner = vhost_kernel_set_owner, + .vhost_reset_device = vhost_kernel_reset_device, + .vhost_get_vq_index = vhost_kernel_get_vq_index, }; int vhost_set_backend_type(struct vhost_dev *dev, VhostBackendType backend_type) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index b11c0d21a0..78442ba980 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -10,11 +10,13 @@ #include "hw/virtio/vhost.h" #include "hw/virtio/vhost-backend.h" +#include "hw/virtio/virtio-net.h" #include "sysemu/char.h" #include "sysemu/kvm.h" #include "qemu/error-report.h" #include "qemu/sockets.h" #include "exec/ram_addr.h" +#include "migration/migration.h" #include <fcntl.h> #include <unistd.h> @@ -25,9 +27,16 @@ #define VHOST_MEMORY_MAX_NREGIONS 8 #define VHOST_USER_F_PROTOCOL_FEATURES 30 -#define VHOST_USER_PROTOCOL_FEATURE_MASK 0x1ULL -#define VHOST_USER_PROTOCOL_F_MQ 0 +enum VhostUserProtocolFeature { + VHOST_USER_PROTOCOL_F_MQ = 0, + VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1, + VHOST_USER_PROTOCOL_F_RARP = 2, + + VHOST_USER_PROTOCOL_F_MAX +}; + +#define VHOST_USER_PROTOCOL_FEATURE_MASK ((1 << VHOST_USER_PROTOCOL_F_MAX) - 1) typedef enum VhostUserRequest { VHOST_USER_NONE = 0, @@ -49,6 +58,7 @@ typedef enum VhostUserRequest { VHOST_USER_SET_PROTOCOL_FEATURES = 16, VHOST_USER_GET_QUEUE_NUM = 17, VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, VHOST_USER_MAX } VhostUserRequest; @@ -97,37 +107,6 @@ static bool ioeventfd_enabled(void) return kvm_enabled() && kvm_eventfds_enabled(); } -static unsigned long int ioctl_to_vhost_user_request[VHOST_USER_MAX] = { - -1, /* VHOST_USER_NONE */ - VHOST_GET_FEATURES, /* VHOST_USER_GET_FEATURES */ - VHOST_SET_FEATURES, /* VHOST_USER_SET_FEATURES */ - VHOST_SET_OWNER, /* VHOST_USER_SET_OWNER */ - VHOST_RESET_DEVICE, /* VHOST_USER_RESET_DEVICE */ - VHOST_SET_MEM_TABLE, /* VHOST_USER_SET_MEM_TABLE */ - VHOST_SET_LOG_BASE, /* VHOST_USER_SET_LOG_BASE */ - VHOST_SET_LOG_FD, /* VHOST_USER_SET_LOG_FD */ - VHOST_SET_VRING_NUM, /* VHOST_USER_SET_VRING_NUM */ - VHOST_SET_VRING_ADDR, /* VHOST_USER_SET_VRING_ADDR */ - VHOST_SET_VRING_BASE, /* VHOST_USER_SET_VRING_BASE */ - VHOST_GET_VRING_BASE, /* VHOST_USER_GET_VRING_BASE */ - VHOST_SET_VRING_KICK, /* VHOST_USER_SET_VRING_KICK */ - VHOST_SET_VRING_CALL, /* VHOST_USER_SET_VRING_CALL */ - VHOST_SET_VRING_ERR /* VHOST_USER_SET_VRING_ERR */ -}; - -static VhostUserRequest vhost_user_request_translate(unsigned long int request) -{ - VhostUserRequest idx; - - for (idx = 0; idx < VHOST_USER_MAX; idx++) { - if (ioctl_to_vhost_user_request[idx] == request) { - break; - } - } - - return (idx == VHOST_USER_MAX) ? VHOST_USER_NONE : idx; -} - static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg) { CharDriverState *chr = dev->opaque; @@ -174,12 +153,35 @@ fail: return -1; } +static bool vhost_user_one_time_request(VhostUserRequest request) +{ + switch (request) { + case VHOST_USER_SET_OWNER: + case VHOST_USER_RESET_DEVICE: + case VHOST_USER_SET_MEM_TABLE: + case VHOST_USER_GET_QUEUE_NUM: + return true; + default: + return false; + } +} + +/* most non-init callers ignore the error */ static int vhost_user_write(struct vhost_dev *dev, VhostUserMsg *msg, int *fds, int fd_num) { CharDriverState *chr = dev->opaque; int size = VHOST_USER_HDR_SIZE + msg->size; + /* + * For non-vring specific requests, like VHOST_USER_SET_MEM_TABLE, + * we just need send it once in the first time. For later such + * request, we just ignore it. + */ + if (vhost_user_one_time_request(msg->request) && dev->vq_index != 0) { + return 0; + } + if (fd_num) { qemu_chr_fe_set_msgfds(chr, fds, fd_num); } @@ -188,195 +190,321 @@ static int vhost_user_write(struct vhost_dev *dev, VhostUserMsg *msg, 0 : -1; } -static bool vhost_user_one_time_request(VhostUserRequest request) +static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base, + struct vhost_log *log) { - switch (request) { - case VHOST_USER_SET_OWNER: - case VHOST_USER_RESET_DEVICE: - case VHOST_USER_SET_MEM_TABLE: - case VHOST_USER_GET_QUEUE_NUM: - return true; - default: - return false; + int fds[VHOST_MEMORY_MAX_NREGIONS]; + size_t fd_num = 0; + bool shmfd = virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_LOG_SHMFD); + VhostUserMsg msg = { + .request = VHOST_USER_SET_LOG_BASE, + .flags = VHOST_USER_VERSION, + .u64 = base, + .size = sizeof(m.u64), + }; + + if (shmfd && log->fd != -1) { + fds[fd_num++] = log->fd; } + + vhost_user_write(dev, &msg, fds, fd_num); + + if (shmfd) { + msg.size = 0; + if (vhost_user_read(dev, &msg) < 0) { + return 0; + } + + if (msg.request != VHOST_USER_SET_LOG_BASE) { + error_report("Received unexpected msg type. " + "Expected %d received %d", + VHOST_USER_SET_LOG_BASE, msg.request); + return -1; + } + } + + return 0; } -static int vhost_user_call(struct vhost_dev *dev, unsigned long int request, - void *arg) +static int vhost_user_set_mem_table(struct vhost_dev *dev, + struct vhost_memory *mem) { - VhostUserMsg msg; - VhostUserRequest msg_request; - struct vhost_vring_file *file = 0; - int need_reply = 0; int fds[VHOST_MEMORY_MAX_NREGIONS]; int i, fd; size_t fd_num = 0; + VhostUserMsg msg = { + .request = VHOST_USER_SET_MEM_TABLE, + .flags = VHOST_USER_VERSION, + }; - assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); - - /* only translate vhost ioctl requests */ - if (request > VHOST_USER_MAX) { - msg_request = vhost_user_request_translate(request); - } else { - msg_request = request; + for (i = 0; i < dev->mem->nregions; ++i) { + struct vhost_memory_region *reg = dev->mem->regions + i; + ram_addr_t ram_addr; + + assert((uintptr_t)reg->userspace_addr == reg->userspace_addr); + qemu_ram_addr_from_host((void *)(uintptr_t)reg->userspace_addr, + &ram_addr); + fd = qemu_get_ram_fd(ram_addr); + if (fd > 0) { + msg.memory.regions[fd_num].userspace_addr = reg->userspace_addr; + msg.memory.regions[fd_num].memory_size = reg->memory_size; + msg.memory.regions[fd_num].guest_phys_addr = reg->guest_phys_addr; + msg.memory.regions[fd_num].mmap_offset = reg->userspace_addr - + (uintptr_t) qemu_get_ram_block_host_ptr(ram_addr); + assert(fd_num < VHOST_MEMORY_MAX_NREGIONS); + fds[fd_num++] = fd; + } } - /* - * For non-vring specific requests, like VHOST_USER_SET_MEM_TABLE, - * we just need send it once in the first time. For later such - * request, we just ignore it. - */ - if (vhost_user_one_time_request(msg_request) && dev->vq_index != 0) { - return 0; + msg.memory.nregions = fd_num; + + if (!fd_num) { + error_report("Failed initializing vhost-user memory map, " + "consider using -object memory-backend-file share=on"); + return -1; } - msg.request = msg_request; - msg.flags = VHOST_USER_VERSION; - msg.size = 0; + msg.size = sizeof(m.memory.nregions); + msg.size += sizeof(m.memory.padding); + msg.size += fd_num * sizeof(VhostUserMemoryRegion); - switch (msg_request) { - case VHOST_USER_GET_FEATURES: - case VHOST_USER_GET_PROTOCOL_FEATURES: - case VHOST_USER_GET_QUEUE_NUM: - need_reply = 1; - break; + vhost_user_write(dev, &msg, fds, fd_num); - case VHOST_USER_SET_FEATURES: - case VHOST_USER_SET_LOG_BASE: - case VHOST_USER_SET_PROTOCOL_FEATURES: - msg.u64 = *((__u64 *) arg); - msg.size = sizeof(m.u64); - break; + return 0; +} - case VHOST_USER_SET_OWNER: - case VHOST_USER_RESET_DEVICE: - break; +static int vhost_user_set_vring_addr(struct vhost_dev *dev, + struct vhost_vring_addr *addr) +{ + VhostUserMsg msg = { + .request = VHOST_USER_SET_VRING_ADDR, + .flags = VHOST_USER_VERSION, + .addr = *addr, + .size = sizeof(*addr), + }; - case VHOST_USER_SET_MEM_TABLE: - for (i = 0; i < dev->mem->nregions; ++i) { - struct vhost_memory_region *reg = dev->mem->regions + i; - ram_addr_t ram_addr; - - assert((uintptr_t)reg->userspace_addr == reg->userspace_addr); - qemu_ram_addr_from_host((void *)(uintptr_t)reg->userspace_addr, &ram_addr); - fd = qemu_get_ram_fd(ram_addr); - if (fd > 0) { - msg.memory.regions[fd_num].userspace_addr = reg->userspace_addr; - msg.memory.regions[fd_num].memory_size = reg->memory_size; - msg.memory.regions[fd_num].guest_phys_addr = reg->guest_phys_addr; - msg.memory.regions[fd_num].mmap_offset = reg->userspace_addr - - (uintptr_t) qemu_get_ram_block_host_ptr(ram_addr); - assert(fd_num < VHOST_MEMORY_MAX_NREGIONS); - fds[fd_num++] = fd; - } - } + vhost_user_write(dev, &msg, NULL, 0); - msg.memory.nregions = fd_num; + return 0; +} - if (!fd_num) { - error_report("Failed initializing vhost-user memory map, " - "consider using -object memory-backend-file share=on"); - return -1; - } +static int vhost_user_set_vring_endian(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + error_report("vhost-user trying to send unhandled ioctl"); + return -1; +} - msg.size = sizeof(m.memory.nregions); - msg.size += sizeof(m.memory.padding); - msg.size += fd_num * sizeof(VhostUserMemoryRegion); - - break; - - case VHOST_USER_SET_LOG_FD: - fds[fd_num++] = *((int *) arg); - break; - - case VHOST_USER_SET_VRING_NUM: - case VHOST_USER_SET_VRING_BASE: - case VHOST_USER_SET_VRING_ENABLE: - memcpy(&msg.state, arg, sizeof(struct vhost_vring_state)); - msg.size = sizeof(m.state); - break; - - case VHOST_USER_GET_VRING_BASE: - memcpy(&msg.state, arg, sizeof(struct vhost_vring_state)); - msg.size = sizeof(m.state); - need_reply = 1; - break; - - case VHOST_USER_SET_VRING_ADDR: - memcpy(&msg.addr, arg, sizeof(struct vhost_vring_addr)); - msg.size = sizeof(m.addr); - break; - - case VHOST_USER_SET_VRING_KICK: - case VHOST_USER_SET_VRING_CALL: - case VHOST_USER_SET_VRING_ERR: - file = arg; - msg.u64 = file->index & VHOST_USER_VRING_IDX_MASK; - msg.size = sizeof(m.u64); - if (ioeventfd_enabled() && file->fd > 0) { - fds[fd_num++] = file->fd; - } else { - msg.u64 |= VHOST_USER_VRING_NOFD_MASK; - } - break; - default: - error_report("vhost-user trying to send unhandled ioctl"); +static int vhost_set_vring(struct vhost_dev *dev, + unsigned long int request, + struct vhost_vring_state *ring) +{ + VhostUserMsg msg = { + .request = request, + .flags = VHOST_USER_VERSION, + .state = *ring, + .size = sizeof(*ring), + }; + + vhost_user_write(dev, &msg, NULL, 0); + + return 0; +} + +static int vhost_user_set_vring_num(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + return vhost_set_vring(dev, VHOST_USER_SET_VRING_NUM, ring); +} + +static int vhost_user_set_vring_base(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + return vhost_set_vring(dev, VHOST_USER_SET_VRING_BASE, ring); +} + +static int vhost_user_set_vring_enable(struct vhost_dev *dev, int enable) +{ + struct vhost_vring_state state = { + .index = dev->vq_index, + .num = enable, + }; + + if (!(dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ))) { return -1; - break; } - if (vhost_user_write(dev, &msg, fds, fd_num) < 0) { + return vhost_set_vring(dev, VHOST_USER_SET_VRING_ENABLE, &state); +} + + +static int vhost_user_get_vring_base(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + VhostUserMsg msg = { + .request = VHOST_USER_GET_VRING_BASE, + .flags = VHOST_USER_VERSION, + .state = *ring, + .size = sizeof(*ring), + }; + + vhost_user_write(dev, &msg, NULL, 0); + + if (vhost_user_read(dev, &msg) < 0) { return 0; } - if (need_reply) { - if (vhost_user_read(dev, &msg) < 0) { - return 0; - } + if (msg.request != VHOST_USER_GET_VRING_BASE) { + error_report("Received unexpected msg type. Expected %d received %d", + VHOST_USER_GET_VRING_BASE, msg.request); + return -1; + } - if (msg_request != msg.request) { - error_report("Received unexpected msg type." - " Expected %d received %d", msg_request, msg.request); - return -1; - } + if (msg.size != sizeof(m.state)) { + error_report("Received bad msg size."); + return -1; + } - switch (msg_request) { - case VHOST_USER_GET_FEATURES: - case VHOST_USER_GET_PROTOCOL_FEATURES: - case VHOST_USER_GET_QUEUE_NUM: - if (msg.size != sizeof(m.u64)) { - error_report("Received bad msg size."); - return -1; - } - *((__u64 *) arg) = msg.u64; - break; - case VHOST_USER_GET_VRING_BASE: - if (msg.size != sizeof(m.state)) { - error_report("Received bad msg size."); - return -1; - } - memcpy(arg, &msg.state, sizeof(struct vhost_vring_state)); - break; - default: - error_report("Received unexpected msg type."); - return -1; - break; - } + *ring = msg.state; + + return 0; +} + +static int vhost_set_vring_file(struct vhost_dev *dev, + VhostUserRequest request, + struct vhost_vring_file *file) +{ + int fds[VHOST_MEMORY_MAX_NREGIONS]; + size_t fd_num = 0; + VhostUserMsg msg = { + .request = request, + .flags = VHOST_USER_VERSION, + .u64 = file->index & VHOST_USER_VRING_IDX_MASK, + .size = sizeof(m.u64), + }; + + if (ioeventfd_enabled() && file->fd > 0) { + fds[fd_num++] = file->fd; + } else { + msg.u64 |= VHOST_USER_VRING_NOFD_MASK; + } + + vhost_user_write(dev, &msg, fds, fd_num); + + return 0; +} + +static int vhost_user_set_vring_kick(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + return vhost_set_vring_file(dev, VHOST_USER_SET_VRING_KICK, file); +} + +static int vhost_user_set_vring_call(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + return vhost_set_vring_file(dev, VHOST_USER_SET_VRING_CALL, file); +} + +static int vhost_user_set_u64(struct vhost_dev *dev, int request, uint64_t u64) +{ + VhostUserMsg msg = { + .request = request, + .flags = VHOST_USER_VERSION, + .u64 = u64, + .size = sizeof(m.u64), + }; + + vhost_user_write(dev, &msg, NULL, 0); + + return 0; +} + +static int vhost_user_set_features(struct vhost_dev *dev, + uint64_t features) +{ + return vhost_user_set_u64(dev, VHOST_USER_SET_FEATURES, features); +} + +static int vhost_user_set_protocol_features(struct vhost_dev *dev, + uint64_t features) +{ + return vhost_user_set_u64(dev, VHOST_USER_SET_PROTOCOL_FEATURES, features); +} + +static int vhost_user_get_u64(struct vhost_dev *dev, int request, uint64_t *u64) +{ + VhostUserMsg msg = { + .request = request, + .flags = VHOST_USER_VERSION, + }; + + if (vhost_user_one_time_request(request) && dev->vq_index != 0) { + return 0; + } + + vhost_user_write(dev, &msg, NULL, 0); + + if (vhost_user_read(dev, &msg) < 0) { + return 0; + } + + if (msg.request != request) { + error_report("Received unexpected msg type. Expected %d received %d", + request, msg.request); + return -1; } + if (msg.size != sizeof(m.u64)) { + error_report("Received bad msg size."); + return -1; + } + + *u64 = msg.u64; + + return 0; +} + +static int vhost_user_get_features(struct vhost_dev *dev, uint64_t *features) +{ + return vhost_user_get_u64(dev, VHOST_USER_GET_FEATURES, features); +} + +static int vhost_user_set_owner(struct vhost_dev *dev) +{ + VhostUserMsg msg = { + .request = VHOST_USER_SET_OWNER, + .flags = VHOST_USER_VERSION, + }; + + vhost_user_write(dev, &msg, NULL, 0); + + return 0; +} + +static int vhost_user_reset_device(struct vhost_dev *dev) +{ + VhostUserMsg msg = { + .request = VHOST_USER_RESET_DEVICE, + .flags = VHOST_USER_VERSION, + }; + + vhost_user_write(dev, &msg, NULL, 0); + return 0; } static int vhost_user_init(struct vhost_dev *dev, void *opaque) { - unsigned long long features; + uint64_t features; int err; assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); dev->opaque = opaque; - err = vhost_user_call(dev, VHOST_USER_GET_FEATURES, &features); + err = vhost_user_get_features(dev, &features); if (err < 0) { return err; } @@ -384,44 +512,37 @@ static int vhost_user_init(struct vhost_dev *dev, void *opaque) if (virtio_has_feature(features, VHOST_USER_F_PROTOCOL_FEATURES)) { dev->backend_features |= 1ULL << VHOST_USER_F_PROTOCOL_FEATURES; - err = vhost_user_call(dev, VHOST_USER_GET_PROTOCOL_FEATURES, &features); + err = vhost_user_get_u64(dev, VHOST_USER_GET_PROTOCOL_FEATURES, + &features); if (err < 0) { return err; } dev->protocol_features = features & VHOST_USER_PROTOCOL_FEATURE_MASK; - err = vhost_user_call(dev, VHOST_USER_SET_PROTOCOL_FEATURES, - &dev->protocol_features); + err = vhost_user_set_protocol_features(dev, dev->protocol_features); if (err < 0) { return err; } /* query the max queues we support if backend supports Multiple Queue */ if (dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ)) { - err = vhost_user_call(dev, VHOST_USER_GET_QUEUE_NUM, &dev->max_queues); + err = vhost_user_get_u64(dev, VHOST_USER_GET_QUEUE_NUM, + &dev->max_queues); if (err < 0) { return err; } } } - return 0; -} - -static int vhost_user_set_vring_enable(struct vhost_dev *dev, int enable) -{ - struct vhost_vring_state state = { - .index = dev->vq_index, - .num = enable, - }; - - assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); - - if (!(dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ))) { - return -1; + if (dev->migration_blocker == NULL && + !virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_LOG_SHMFD)) { + error_setg(&dev->migration_blocker, + "Migration disabled: vhost-user backend lacks " + "VHOST_USER_PROTOCOL_F_LOG_SHMFD feature."); } - return vhost_user_call(dev, VHOST_USER_SET_VRING_ENABLE, &state); + return 0; } static int vhost_user_cleanup(struct vhost_dev *dev) @@ -440,11 +561,65 @@ static int vhost_user_get_vq_index(struct vhost_dev *dev, int idx) return idx; } +static int vhost_user_memslots_limit(struct vhost_dev *dev) +{ + return VHOST_MEMORY_MAX_NREGIONS; +} + +static bool vhost_user_requires_shm_log(struct vhost_dev *dev) +{ + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); + + return virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_LOG_SHMFD); +} + +static int vhost_user_migration_done(struct vhost_dev *dev, char* mac_addr) +{ + VhostUserMsg msg = { 0 }; + int err; + + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); + + /* If guest supports GUEST_ANNOUNCE do nothing */ + if (virtio_has_feature(dev->acked_features, VIRTIO_NET_F_GUEST_ANNOUNCE)) { + return 0; + } + + /* if backend supports VHOST_USER_PROTOCOL_F_RARP ask it to send the RARP */ + if (virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_RARP)) { + msg.request = VHOST_USER_SEND_RARP; + msg.flags = VHOST_USER_VERSION; + memcpy((char *)&msg.u64, mac_addr, 6); + msg.size = sizeof(m.u64); + + err = vhost_user_write(dev, &msg, NULL, 0); + return err; + } + return -1; +} + const VhostOps user_ops = { .backend_type = VHOST_BACKEND_TYPE_USER, - .vhost_call = vhost_user_call, .vhost_backend_init = vhost_user_init, .vhost_backend_cleanup = vhost_user_cleanup, - .vhost_backend_get_vq_index = vhost_user_get_vq_index, - .vhost_backend_set_vring_enable = vhost_user_set_vring_enable, + .vhost_backend_memslots_limit = vhost_user_memslots_limit, + .vhost_set_log_base = vhost_user_set_log_base, + .vhost_set_mem_table = vhost_user_set_mem_table, + .vhost_set_vring_addr = vhost_user_set_vring_addr, + .vhost_set_vring_endian = vhost_user_set_vring_endian, + .vhost_set_vring_num = vhost_user_set_vring_num, + .vhost_set_vring_base = vhost_user_set_vring_base, + .vhost_get_vring_base = vhost_user_get_vring_base, + .vhost_set_vring_kick = vhost_user_set_vring_kick, + .vhost_set_vring_call = vhost_user_set_vring_call, + .vhost_set_features = vhost_user_set_features, + .vhost_get_features = vhost_user_get_features, + .vhost_set_owner = vhost_user_set_owner, + .vhost_reset_device = vhost_user_reset_device, + .vhost_get_vq_index = vhost_user_get_vq_index, + .vhost_set_vring_enable = vhost_user_set_vring_enable, + .vhost_requires_shm_log = vhost_user_requires_shm_log, + .vhost_migration_done = vhost_user_migration_done, }; diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index c0ed5b263f..de29968a79 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -18,6 +18,7 @@ #include "qemu/atomic.h" #include "qemu/range.h" #include "qemu/error-report.h" +#include "qemu/memfd.h" #include <linux/vhost.h> #include "exec/address-spaces.h" #include "hw/virtio/virtio-bus.h" @@ -25,6 +26,23 @@ #include "migration/migration.h" static struct vhost_log *vhost_log; +static struct vhost_log *vhost_log_shm; + +static unsigned int used_memslots; +static QLIST_HEAD(, vhost_dev) vhost_devices = + QLIST_HEAD_INITIALIZER(vhost_devices); + +bool vhost_has_free_slot(void) +{ + unsigned int slots_limit = ~0U; + struct vhost_dev *hdev; + + QLIST_FOREACH(hdev, &vhost_devices, entry) { + unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev); + slots_limit = MIN(slots_limit, r); + } + return slots_limit > used_memslots; +} static void vhost_dev_sync_region(struct vhost_dev *dev, MemoryRegionSection *section, @@ -286,25 +304,46 @@ static uint64_t vhost_get_log_size(struct vhost_dev *dev) } return log_size; } -static struct vhost_log *vhost_log_alloc(uint64_t size) + +static struct vhost_log *vhost_log_alloc(uint64_t size, bool share) { - struct vhost_log *log = g_malloc0(sizeof *log + size * sizeof(*(log->log))); + struct vhost_log *log; + uint64_t logsize = size * sizeof(*(log->log)); + int fd = -1; + + log = g_new0(struct vhost_log, 1); + if (share) { + log->log = qemu_memfd_alloc("vhost-log", logsize, + F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL, + &fd); + memset(log->log, 0, logsize); + } else { + log->log = g_malloc0(logsize); + } log->size = size; log->refcnt = 1; + log->fd = fd; return log; } -static struct vhost_log *vhost_log_get(uint64_t size) +static struct vhost_log *vhost_log_get(uint64_t size, bool share) { - if (!vhost_log || vhost_log->size != size) { - vhost_log = vhost_log_alloc(size); + struct vhost_log *log = share ? vhost_log_shm : vhost_log; + + if (!log || log->size != size) { + log = vhost_log_alloc(size, share); + if (share) { + vhost_log_shm = log; + } else { + vhost_log = log; + } } else { - ++vhost_log->refcnt; + ++log->refcnt; } - return vhost_log; + return log; } static void vhost_log_put(struct vhost_dev *dev, bool sync) @@ -321,20 +360,35 @@ static void vhost_log_put(struct vhost_dev *dev, bool sync) if (dev->log_size && sync) { vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1); } + if (vhost_log == log) { + g_free(log->log); vhost_log = NULL; + } else if (vhost_log_shm == log) { + qemu_memfd_free(log->log, log->size * sizeof(*(log->log)), + log->fd); + vhost_log_shm = NULL; } + g_free(log); } } -static inline void vhost_dev_log_resize(struct vhost_dev* dev, uint64_t size) +static bool vhost_dev_log_is_shared(struct vhost_dev *dev) { - struct vhost_log *log = vhost_log_get(size); + return dev->vhost_ops->vhost_requires_shm_log && + dev->vhost_ops->vhost_requires_shm_log(dev); +} + +static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size) +{ + struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev)); uint64_t log_base = (uintptr_t)log->log; int r; - r = dev->vhost_ops->vhost_call(dev, VHOST_SET_LOG_BASE, &log_base); + /* inform backend of log switching, this must be done before + releasing the current log, to ensure no logging is lost */ + r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log); assert(r >= 0); vhost_log_put(dev, true); dev->log = log; @@ -457,6 +511,7 @@ static void vhost_set_memory(MemoryListener *listener, dev->mem_changed_start_addr = MIN(dev->mem_changed_start_addr, start_addr); dev->mem_changed_end_addr = MAX(dev->mem_changed_end_addr, start_addr + size - 1); dev->memory_changed = true; + used_memslots = dev->mem->nregions; } static bool vhost_section(MemoryRegionSection *section) @@ -500,7 +555,7 @@ static void vhost_commit(MemoryListener *listener) } if (!dev->log_enabled) { - r = dev->vhost_ops->vhost_call(dev, VHOST_SET_MEM_TABLE, dev->mem); + r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem); assert(r >= 0); dev->memory_changed = false; return; @@ -513,7 +568,7 @@ static void vhost_commit(MemoryListener *listener) if (dev->log_size < log_size) { vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER); } - r = dev->vhost_ops->vhost_call(dev, VHOST_SET_MEM_TABLE, dev->mem); + r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem); assert(r >= 0); /* To log less, can only decrease log size after table update. */ if (dev->log_size > log_size + VHOST_LOG_BUFFER) { @@ -581,7 +636,7 @@ static int vhost_virtqueue_set_addr(struct vhost_dev *dev, .log_guest_addr = vq->used_phys, .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0, }; - int r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_ADDR, &addr); + int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr); if (r < 0) { return -errno; } @@ -595,19 +650,20 @@ static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log) if (enable_log) { features |= 0x1ULL << VHOST_F_LOG_ALL; } - r = dev->vhost_ops->vhost_call(dev, VHOST_SET_FEATURES, &features); + r = dev->vhost_ops->vhost_set_features(dev, features); return r < 0 ? -errno : 0; } static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log) { - int r, t, i; + int r, t, i, idx; r = vhost_dev_set_features(dev, enable_log); if (r < 0) { goto err_features; } for (i = 0; i < dev->nvqs; ++i) { - r = vhost_virtqueue_set_addr(dev, dev->vqs + i, i, + idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i); + r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx, enable_log); if (r < 0) { goto err_vq; @@ -616,7 +672,8 @@ static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log) return 0; err_vq: for (; i >= 0; --i) { - t = vhost_virtqueue_set_addr(dev, dev->vqs + i, i, + idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i); + t = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx, dev->log_enabled); assert(t >= 0); } @@ -700,7 +757,7 @@ static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev, .num = is_big_endian }; - if (!dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_ENDIAN, &s)) { + if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) { return 0; } @@ -719,7 +776,7 @@ static int vhost_virtqueue_start(struct vhost_dev *dev, { hwaddr s, l, a; int r; - int vhost_vq_index = dev->vhost_ops->vhost_backend_get_vq_index(dev, idx); + int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx); struct vhost_vring_file file = { .index = vhost_vq_index }; @@ -730,13 +787,13 @@ static int vhost_virtqueue_start(struct vhost_dev *dev, vq->num = state.num = virtio_queue_get_num(vdev, idx); - r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_NUM, &state); + r = dev->vhost_ops->vhost_set_vring_num(dev, &state); if (r) { return -errno; } state.num = virtio_queue_get_last_avail_idx(vdev, idx); - r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_BASE, &state); + r = dev->vhost_ops->vhost_set_vring_base(dev, &state); if (r) { return -errno; } @@ -788,7 +845,7 @@ static int vhost_virtqueue_start(struct vhost_dev *dev, } file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq)); - r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_KICK, &file); + r = dev->vhost_ops->vhost_set_vring_kick(dev, &file); if (r) { r = -errno; goto fail_kick; @@ -821,13 +878,13 @@ static void vhost_virtqueue_stop(struct vhost_dev *dev, struct vhost_virtqueue *vq, unsigned idx) { - int vhost_vq_index = dev->vhost_ops->vhost_backend_get_vq_index(dev, idx); + int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx); struct vhost_vring_state state = { .index = vhost_vq_index, }; int r; - r = dev->vhost_ops->vhost_call(dev, VHOST_GET_VRING_BASE, &state); + r = dev->vhost_ops->vhost_get_vring_base(dev, &state); if (r < 0) { fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r); fflush(stderr); @@ -874,7 +931,7 @@ static void vhost_eventfd_del(MemoryListener *listener, static int vhost_virtqueue_init(struct vhost_dev *dev, struct vhost_virtqueue *vq, int n) { - int vhost_vq_index = dev->vhost_ops->vhost_backend_get_vq_index(dev, n); + int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n); struct vhost_vring_file file = { .index = vhost_vq_index, }; @@ -884,7 +941,7 @@ static int vhost_virtqueue_init(struct vhost_dev *dev, } file.fd = event_notifier_get_fd(&vq->masked_notifier); - r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_CALL, &file); + r = dev->vhost_ops->vhost_set_vring_call(dev, &file); if (r) { r = -errno; goto fail_call; @@ -906,6 +963,8 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, uint64_t features; int i, r; + hdev->migration_blocker = NULL; + if (vhost_set_backend_type(hdev, backend_type) < 0) { close((uintptr_t)opaque); return -1; @@ -916,12 +975,20 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, return -errno; } - r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_OWNER, NULL); + if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) { + fprintf(stderr, "vhost backend memory slots limit is less" + " than current number of present memory slots\n"); + close((uintptr_t)opaque); + return -1; + } + QLIST_INSERT_HEAD(&vhost_devices, hdev, entry); + + r = hdev->vhost_ops->vhost_set_owner(hdev); if (r < 0) { goto fail; } - r = hdev->vhost_ops->vhost_call(hdev, VHOST_GET_FEATURES, &features); + r = hdev->vhost_ops->vhost_get_features(hdev, &features); if (r < 0) { goto fail; } @@ -949,12 +1016,21 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, .eventfd_del = vhost_eventfd_del, .priority = 10 }; - hdev->migration_blocker = NULL; - if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) { - error_setg(&hdev->migration_blocker, - "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature."); + + if (hdev->migration_blocker == NULL) { + if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) { + error_setg(&hdev->migration_blocker, + "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature."); + } else if (!qemu_memfd_check()) { + error_setg(&hdev->migration_blocker, + "Migration disabled: failed to allocate shared memory"); + } + } + + if (hdev->migration_blocker != NULL) { migrate_add_blocker(hdev->migration_blocker); } + hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions)); hdev->n_mem_sections = 0; hdev->mem_sections = NULL; @@ -972,6 +1048,7 @@ fail_vq: fail: r = -errno; hdev->vhost_ops->vhost_backend_cleanup(hdev); + QLIST_REMOVE(hdev, entry); return r; } @@ -989,6 +1066,7 @@ void vhost_dev_cleanup(struct vhost_dev *hdev) g_free(hdev->mem); g_free(hdev->mem_sections); hdev->vhost_ops->vhost_backend_cleanup(hdev); + QLIST_REMOVE(hdev, entry); } /* Stop processing guest IO notifications in qemu. @@ -1074,8 +1152,8 @@ void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n, file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq)); } - file.index = hdev->vhost_ops->vhost_backend_get_vq_index(hdev, n); - r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_VRING_CALL, &file); + file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n); + r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file); assert(r >= 0); } @@ -1117,7 +1195,7 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev) if (r < 0) { goto fail_features; } - r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_MEM_TABLE, hdev->mem); + r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem); if (r < 0) { r = -errno; goto fail_mem; @@ -1136,10 +1214,12 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev) uint64_t log_base; hdev->log_size = vhost_get_log_size(hdev); - hdev->log = vhost_log_get(hdev->log_size); + hdev->log = vhost_log_get(hdev->log_size, + vhost_dev_log_is_shared(hdev)); log_base = (uintptr_t)hdev->log->log; - r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_LOG_BASE, - hdev->log_size ? &log_base : NULL); + r = hdev->vhost_ops->vhost_set_log_base(hdev, + hdev->log_size ? log_base : 0, + hdev->log); if (r < 0) { r = -errno; goto fail_log; |