diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2020-11-05 15:16:43 +0000 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2020-11-05 15:16:43 +0000 |
commit | 85c3ed44171d757e399bcbb3db3608c1848c0984 (patch) | |
tree | 777c2c5df9154caa0f0787b15b76deda2f2c2399 /hw | |
parent | 747c6b3811ef5f06278ab364261e3723bcbb4031 (diff) | |
parent | 9f6df01d0e128c2df179789b37140d6aeddfcb92 (diff) |
Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging
pc,pci,vhost,virtio: fixes
Lots of fixes all over the place.
virtio-mem and virtio-iommu patches are kind of fixes but
it seems better to just make them behave sanely than
try to educate users about the limitations ...
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
# gpg: Signature made Wed 04 Nov 2020 18:40:03 GMT
# gpg: using RSA key 5D09FD0871C8F85B94CA8A0D281F0DB8D28D5469
# gpg: issuer "mst@redhat.com"
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>" [full]
# gpg: aka "Michael S. Tsirkin <mst@redhat.com>" [full]
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67
# Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469
* remotes/mst/tags/for_upstream: (31 commits)
contrib/vhost-user-blk: fix get_config() information leak
block/export: fix vhost-user-blk get_config() information leak
block/export: make vhost-user-blk config space little-endian
configure: introduce --enable-vhost-user-blk-server
libvhost-user: follow QEMU comment style
vhost-blk: set features before setting inflight feature
Revert "vhost-blk: set features before setting inflight feature"
net: Add vhost-vdpa in show_netdevs()
vhost-vdpa: Add qemu_close in vhost_vdpa_cleanup
vfio: Don't issue full 2^64 unmap
virtio-iommu: Set supported page size mask
vfio: Set IOMMU page size as per host supported page size
memory: Add interface to set iommu page size mask
virtio-iommu: Add notify_flag_changed() memory region callback
virtio-iommu: Add replay() memory region callback
virtio-iommu: Call memory notifiers in attach/detach
virtio-iommu: Add memory notifiers for map/unmap
virtio-iommu: Store memory region in endpoint struct
virtio-iommu: Fix virtio_iommu_mr()
hw/smbios: Fix leaked fd in save_opt_one() error path
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'hw')
-rw-r--r-- | hw/acpi/core.c | 2 | ||||
-rw-r--r-- | hw/acpi/nvdimm.c | 20 | ||||
-rw-r--r-- | hw/acpi/pcihp.c | 2 | ||||
-rw-r--r-- | hw/block/vhost-user-blk.c | 2 | ||||
-rw-r--r-- | hw/i386/pc.c | 9 | ||||
-rw-r--r-- | hw/mem/memory-device.c | 20 | ||||
-rw-r--r-- | hw/smbios/smbios.c | 4 | ||||
-rw-r--r-- | hw/vfio/common.c | 19 | ||||
-rw-r--r-- | hw/virtio/trace-events | 6 | ||||
-rw-r--r-- | hw/virtio/vhost-backend.c | 4 | ||||
-rw-r--r-- | hw/virtio/vhost.c | 8 | ||||
-rw-r--r-- | hw/virtio/virtio-iommu.c | 205 | ||||
-rw-r--r-- | hw/virtio/virtio-mem-pci.c | 7 | ||||
-rw-r--r-- | hw/virtio/virtio-mem.c | 113 |
14 files changed, 386 insertions, 35 deletions
diff --git a/hw/acpi/core.c b/hw/acpi/core.c index ade9158cbf..2c0c83221f 100644 --- a/hw/acpi/core.c +++ b/hw/acpi/core.c @@ -558,7 +558,7 @@ static void acpi_pm1_cnt_write(ACPIREGS *ar, uint16_t val) if (val & ACPI_BITMASK_SLEEP_ENABLE) { /* change suspend type */ uint16_t sus_typ = (val >> 10) & 7; - switch(sus_typ) { + switch (sus_typ) { case 0: /* soft power off */ qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); break; diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index 8f7cc16add..8ad5516142 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -556,7 +556,7 @@ static void nvdimm_dsm_func_read_fit(NVDIMMState *state, NvdimmDsmIn *in, fit = fit_buf->fit; - nvdimm_debug("Read FIT: offset %#x FIT size %#x Dirty %s.\n", + nvdimm_debug("Read FIT: offset 0x%x FIT size 0x%x Dirty %s.\n", read_fit->offset, fit->len, fit_buf->dirty ? "Yes" : "No"); if (read_fit->offset > fit->len) { @@ -664,7 +664,7 @@ static void nvdimm_dsm_label_size(NVDIMMDevice *nvdimm, hwaddr dsm_mem_addr) label_size = nvdimm->label_size; mxfer = nvdimm_get_max_xfer_label_size(); - nvdimm_debug("label_size %#x, max_xfer %#x.\n", label_size, mxfer); + nvdimm_debug("label_size 0x%x, max_xfer 0x%x.\n", label_size, mxfer); label_size_out.func_ret_status = cpu_to_le32(NVDIMM_DSM_RET_STATUS_SUCCESS); label_size_out.label_size = cpu_to_le32(label_size); @@ -680,19 +680,19 @@ static uint32_t nvdimm_rw_label_data_check(NVDIMMDevice *nvdimm, uint32_t ret = NVDIMM_DSM_RET_STATUS_INVALID; if (offset + length < offset) { - nvdimm_debug("offset %#x + length %#x is overflow.\n", offset, + nvdimm_debug("offset 0x%x + length 0x%x is overflow.\n", offset, length); return ret; } if (nvdimm->label_size < offset + length) { - nvdimm_debug("position %#x is beyond label data (len = %" PRIx64 ").\n", + nvdimm_debug("position 0x%x is beyond label data (len = %" PRIx64 ").\n", offset + length, nvdimm->label_size); return ret; } if (length > nvdimm_get_max_xfer_label_size()) { - nvdimm_debug("length (%#x) is larger than max_xfer (%#x).\n", + nvdimm_debug("length (0x%x) is larger than max_xfer (0x%x).\n", length, nvdimm_get_max_xfer_label_size()); return ret; } @@ -716,7 +716,7 @@ static void nvdimm_dsm_get_label_data(NVDIMMDevice *nvdimm, NvdimmDsmIn *in, get_label_data->offset = le32_to_cpu(get_label_data->offset); get_label_data->length = le32_to_cpu(get_label_data->length); - nvdimm_debug("Read Label Data: offset %#x length %#x.\n", + nvdimm_debug("Read Label Data: offset 0x%x length 0x%x.\n", get_label_data->offset, get_label_data->length); status = nvdimm_rw_label_data_check(nvdimm, get_label_data->offset, @@ -755,7 +755,7 @@ static void nvdimm_dsm_set_label_data(NVDIMMDevice *nvdimm, NvdimmDsmIn *in, set_label_data->offset = le32_to_cpu(set_label_data->offset); set_label_data->length = le32_to_cpu(set_label_data->length); - nvdimm_debug("Write Label Data: offset %#x length %#x.\n", + nvdimm_debug("Write Label Data: offset 0x%x length 0x%x.\n", set_label_data->offset, set_label_data->length); status = nvdimm_rw_label_data_check(nvdimm, set_label_data->offset, @@ -838,7 +838,7 @@ nvdimm_dsm_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) NvdimmDsmIn *in; hwaddr dsm_mem_addr = val; - nvdimm_debug("dsm memory address %#" HWADDR_PRIx ".\n", dsm_mem_addr); + nvdimm_debug("dsm memory address 0x%" HWADDR_PRIx ".\n", dsm_mem_addr); /* * The DSM memory is mapped to guest address space so an evil guest @@ -852,11 +852,11 @@ nvdimm_dsm_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) in->function = le32_to_cpu(in->function); in->handle = le32_to_cpu(in->handle); - nvdimm_debug("Revision %#x Handler %#x Function %#x.\n", in->revision, + nvdimm_debug("Revision 0x%x Handler 0x%x Function 0x%x.\n", in->revision, in->handle, in->function); if (in->revision != 0x1 /* Currently we only support DSM Spec Rev1. */) { - nvdimm_debug("Revision %#x is not supported, expect %#x.\n", + nvdimm_debug("Revision 0x%x is not supported, expect 0x%x.\n", in->revision, 0x1); nvdimm_dsm_no_payload(NVDIMM_DSM_RET_STATUS_UNSUPPORT, dsm_mem_addr); goto exit; diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c index 32ae8b2c0a..17c32e0ffd 100644 --- a/hw/acpi/pcihp.c +++ b/hw/acpi/pcihp.c @@ -400,7 +400,7 @@ void acpi_pcihp_init(Object *owner, AcpiPciHpState *s, PCIBus *root_bus, s->io_len = ACPI_PCIHP_SIZE; s->io_base = ACPI_PCIHP_ADDR; - s->root= root_bus; + s->root = root_bus; s->legacy_piix = !bridges_enabled; memory_region_init_io(&s->io, owner, &acpi_pcihp_io_ops, s, diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c index f67b29bbf3..2dd3d93ca0 100644 --- a/hw/block/vhost-user-blk.c +++ b/hw/block/vhost-user-blk.c @@ -131,7 +131,7 @@ static int vhost_user_blk_start(VirtIODevice *vdev) s->dev.acked_features = vdev->guest_features; - ret = vhost_dev_prepare_inflight(&s->dev); + ret = vhost_dev_prepare_inflight(&s->dev, vdev); if (ret < 0) { error_report("Error set inflight format: %d", -ret); goto err_guest_notifiers; diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 5e6c0023e0..17b514d1da 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1149,10 +1149,11 @@ void pc_basic_device_init(struct PCMachineState *pcms, error_report("couldn't create HPET device"); exit(1); } - /* For pc-piix-*, hpet's intcap is always IRQ2. For pc-q35-1.7 - * and earlier, use IRQ2 for compat. Otherwise, use IRQ16~23, - * IRQ8 and IRQ2. - */ + /* + * For pc-piix-*, hpet's intcap is always IRQ2. For pc-q35-1.7 and + * earlier, use IRQ2 for compat. Otherwise, use IRQ16~23, IRQ8 and + * IRQ2. + */ uint8_t compat = object_property_get_uint(OBJECT(hpet), HPET_INTCAP, NULL); if (!compat) { diff --git a/hw/mem/memory-device.c b/hw/mem/memory-device.c index 4bc9cf0917..cf0627fd01 100644 --- a/hw/mem/memory-device.c +++ b/hw/mem/memory-device.c @@ -119,9 +119,10 @@ static uint64_t memory_device_get_free_addr(MachineState *ms, /* start of address space indicates the maximum alignment we expect */ if (!QEMU_IS_ALIGNED(range_lob(&as), align)) { - error_setg(errp, "the alignment (0x%" PRIx64 ") is not supported", - align); - return 0; + warn_report("the alignment (0x%" PRIx64 ") exceeds the expected" + " maximum alignment, memory will get fragmented and not" + " all 'maxmem' might be usable for memory devices.", + align); } memory_device_check_addable(ms, size, &err); @@ -151,7 +152,7 @@ static uint64_t memory_device_get_free_addr(MachineState *ms, return 0; } } else { - if (range_init(&new, range_lob(&as), size)) { + if (range_init(&new, QEMU_ALIGN_UP(range_lob(&as), align), size)) { error_setg(errp, "can't add memory device, device too big"); return 0; } @@ -258,7 +259,7 @@ void memory_device_pre_plug(MemoryDeviceState *md, MachineState *ms, { const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); Error *local_err = NULL; - uint64_t addr, align; + uint64_t addr, align = 0; MemoryRegion *mr; mr = mdc->get_memory_region(md, &local_err); @@ -266,7 +267,14 @@ void memory_device_pre_plug(MemoryDeviceState *md, MachineState *ms, goto out; } - align = legacy_align ? *legacy_align : memory_region_get_alignment(mr); + if (legacy_align) { + align = *legacy_align; + } else { + if (mdc->get_min_alignment) { + align = mdc->get_min_alignment(md); + } + align = MAX(align, memory_region_get_alignment(mr)); + } addr = mdc->get_addr(md); addr = memory_device_get_free_addr(ms, !addr ? NULL : &addr, align, memory_region_size(mr), &local_err); diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c index 8b30906e50..6a3d39793b 100644 --- a/hw/smbios/smbios.c +++ b/hw/smbios/smbios.c @@ -988,16 +988,18 @@ static int save_opt_one(void *opaque, if (ret < 0) { error_setg(errp, "Unable to read from %s: %s", value, strerror(errno)); + qemu_close(fd); return -1; } if (memchr(buf, '\0', ret)) { error_setg(errp, "NUL in OEM strings value in %s", value); + qemu_close(fd); return -1; } g_byte_array_append(data, (guint8 *)buf, ret); } - close(fd); + qemu_close(fd); *opt->dest = g_renew(char *, *opt->dest, (*opt->ndest) + 1); (*opt->dest)[*opt->ndest] = (char *)g_byte_array_free(data, FALSE); diff --git a/hw/vfio/common.c b/hw/vfio/common.c index e18ea2cf91..c1fdbf17f2 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -789,6 +789,14 @@ static void vfio_listener_region_add(MemoryListener *listener, int128_get64(llend), iommu_idx); + ret = memory_region_iommu_set_page_size_mask(giommu->iommu, + container->pgsizes, + &err); + if (ret) { + g_free(giommu); + goto fail; + } + ret = memory_region_register_iommu_notifier(section->mr, &giommu->n, &err); if (ret) { @@ -942,6 +950,17 @@ static void vfio_listener_region_del(MemoryListener *listener, } if (try_unmap) { + if (int128_eq(llsize, int128_2_64())) { + /* The unmap ioctl doesn't accept a full 64-bit span. */ + llsize = int128_rshift(llsize, 1); + ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); + if (ret) { + error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%m)", + container, iova, int128_get64(llsize), ret); + } + iova += int128_get64(llsize); + } ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); if (ret) { error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index cf1e59de30..2060a144a2 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -106,6 +106,12 @@ virtio_iommu_put_domain(uint32_t domain_id) "Free domain=%d" virtio_iommu_translate_out(uint64_t virt_addr, uint64_t phys_addr, uint32_t sid) "0x%"PRIx64" -> 0x%"PRIx64 " for sid=%d" virtio_iommu_report_fault(uint8_t reason, uint32_t flags, uint32_t endpoint, uint64_t addr) "FAULT reason=%d flags=%d endpoint=%d address =0x%"PRIx64 virtio_iommu_fill_resv_property(uint32_t devid, uint8_t subtype, uint64_t start, uint64_t end) "dev= %d, type=%d start=0x%"PRIx64" end=0x%"PRIx64 +virtio_iommu_notify_map(const char *name, uint64_t virt_start, uint64_t virt_end, uint64_t phys_start, uint32_t flags) "mr=%s virt_start=0x%"PRIx64" virt_end=0x%"PRIx64" phys_start=0x%"PRIx64" flags=%d" +virtio_iommu_notify_unmap(const char *name, uint64_t virt_start, uint64_t virt_end) "mr=%s virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 +virtio_iommu_remap(const char *name, uint64_t virt_start, uint64_t virt_end, uint64_t phys_start) "mr=%s virt_start=0x%"PRIx64" virt_end=0x%"PRIx64" phys_start=0x%"PRIx64 +virtio_iommu_set_page_size_mask(const char *name, uint64_t old, uint64_t new) "mr=%s old_mask=0x%"PRIx64" new_mask=0x%"PRIx64 +virtio_iommu_notify_flag_add(const char *name) "add notifier to mr %s" +virtio_iommu_notify_flag_del(const char *name) "del notifier from mr %s" # virtio-mem.c virtio_mem_send_response(uint16_t type) "type=%" PRIu16 diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c index 88c8ecc9e0..222bbcc62d 100644 --- a/hw/virtio/vhost-backend.c +++ b/hw/virtio/vhost-backend.c @@ -257,7 +257,7 @@ static int vhost_kernel_send_device_iotlb_msg(struct vhost_dev *dev, struct vhost_iotlb_msg *imsg) { if (dev->backend_cap & (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2)) { - struct vhost_msg_v2 msg; + struct vhost_msg_v2 msg = {}; msg.type = VHOST_IOTLB_MSG_V2; msg.iotlb = *imsg; @@ -267,7 +267,7 @@ static int vhost_kernel_send_device_iotlb_msg(struct vhost_dev *dev, return -EFAULT; } } else { - struct vhost_msg msg; + struct vhost_msg msg = {}; msg.type = VHOST_IOTLB_MSG; msg.iotlb = *imsg; diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index f2482378c6..614ccc2bcb 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -1645,15 +1645,17 @@ int vhost_dev_load_inflight(struct vhost_inflight *inflight, QEMUFile *f) return 0; } -int vhost_dev_prepare_inflight(struct vhost_dev *hdev) +int vhost_dev_prepare_inflight(struct vhost_dev *hdev, VirtIODevice *vdev) { int r; - + if (hdev->vhost_ops->vhost_get_inflight_fd == NULL || hdev->vhost_ops->vhost_set_inflight_fd == NULL) { return 0; } - + + hdev->vdev = vdev; + r = vhost_dev_set_features(hdev, hdev->log_enabled); if (r < 0) { VHOST_OPS_DEBUG("vhost_dev_prepare_inflight failed"); diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 21ec63b108..fc5c75d693 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -49,6 +49,7 @@ typedef struct VirtIOIOMMUDomain { typedef struct VirtIOIOMMUEndpoint { uint32_t id; VirtIOIOMMUDomain *domain; + IOMMUMemoryRegion *iommu_mr; QLIST_ENTRY(VirtIOIOMMUEndpoint) next; } VirtIOIOMMUEndpoint; @@ -101,7 +102,7 @@ static IOMMUMemoryRegion *virtio_iommu_mr(VirtIOIOMMU *s, uint32_t sid) bus_n = PCI_BUS_NUM(sid); iommu_pci_bus = iommu_find_iommu_pcibus(s, bus_n); if (iommu_pci_bus) { - devfn = sid & PCI_DEVFN_MAX; + devfn = sid & (PCI_DEVFN_MAX - 1); dev = iommu_pci_bus->pbdev[devfn]; if (dev) { return &dev->iommu_mr; @@ -124,11 +125,84 @@ static gint interval_cmp(gconstpointer a, gconstpointer b, gpointer user_data) } } +static void virtio_iommu_notify_map(IOMMUMemoryRegion *mr, hwaddr virt_start, + hwaddr virt_end, hwaddr paddr, + uint32_t flags) +{ + IOMMUTLBEntry entry; + IOMMUAccessFlags perm = IOMMU_ACCESS_FLAG(flags & VIRTIO_IOMMU_MAP_F_READ, + flags & VIRTIO_IOMMU_MAP_F_WRITE); + + if (!(mr->iommu_notify_flags & IOMMU_NOTIFIER_MAP) || + (flags & VIRTIO_IOMMU_MAP_F_MMIO) || !perm) { + return; + } + + trace_virtio_iommu_notify_map(mr->parent_obj.name, virt_start, virt_end, + paddr, perm); + + entry.target_as = &address_space_memory; + entry.addr_mask = virt_end - virt_start; + entry.iova = virt_start; + entry.perm = perm; + entry.translated_addr = paddr; + + memory_region_notify_iommu(mr, 0, entry); +} + +static void virtio_iommu_notify_unmap(IOMMUMemoryRegion *mr, hwaddr virt_start, + hwaddr virt_end) +{ + IOMMUTLBEntry entry; + + if (!(mr->iommu_notify_flags & IOMMU_NOTIFIER_UNMAP)) { + return; + } + + trace_virtio_iommu_notify_unmap(mr->parent_obj.name, virt_start, virt_end); + + entry.target_as = &address_space_memory; + entry.addr_mask = virt_end - virt_start; + entry.iova = virt_start; + entry.perm = IOMMU_NONE; + entry.translated_addr = 0; + + memory_region_notify_iommu(mr, 0, entry); +} + +static gboolean virtio_iommu_notify_unmap_cb(gpointer key, gpointer value, + gpointer data) +{ + VirtIOIOMMUInterval *interval = (VirtIOIOMMUInterval *) key; + IOMMUMemoryRegion *mr = (IOMMUMemoryRegion *) data; + + virtio_iommu_notify_unmap(mr, interval->low, interval->high); + + return false; +} + +static gboolean virtio_iommu_notify_map_cb(gpointer key, gpointer value, + gpointer data) +{ + VirtIOIOMMUMapping *mapping = (VirtIOIOMMUMapping *) value; + VirtIOIOMMUInterval *interval = (VirtIOIOMMUInterval *) key; + IOMMUMemoryRegion *mr = (IOMMUMemoryRegion *) data; + + virtio_iommu_notify_map(mr, interval->low, interval->high, + mapping->phys_addr, mapping->flags); + + return false; +} + static void virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep) { + VirtIOIOMMUDomain *domain = ep->domain; + if (!ep->domain) { return; } + g_tree_foreach(domain->mappings, virtio_iommu_notify_unmap_cb, + ep->iommu_mr); QLIST_REMOVE(ep, next); ep->domain = NULL; } @@ -137,16 +211,19 @@ static VirtIOIOMMUEndpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s, uint32_t ep_id) { VirtIOIOMMUEndpoint *ep; + IOMMUMemoryRegion *mr; ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(ep_id)); if (ep) { return ep; } - if (!virtio_iommu_mr(s, ep_id)) { + mr = virtio_iommu_mr(s, ep_id); + if (!mr) { return NULL; } ep = g_malloc0(sizeof(*ep)); ep->id = ep_id; + ep->iommu_mr = mr; trace_virtio_iommu_get_endpoint(ep_id); g_tree_insert(s->endpoints, GUINT_TO_POINTER(ep_id), ep); return ep; @@ -268,6 +345,10 @@ static int virtio_iommu_attach(VirtIOIOMMU *s, ep->domain = domain; + /* Replay domain mappings on the associated memory region */ + g_tree_foreach(domain->mappings, virtio_iommu_notify_map_cb, + ep->iommu_mr); + return VIRTIO_IOMMU_S_OK; } @@ -311,6 +392,7 @@ static int virtio_iommu_map(VirtIOIOMMU *s, VirtIOIOMMUDomain *domain; VirtIOIOMMUInterval *interval; VirtIOIOMMUMapping *mapping; + VirtIOIOMMUEndpoint *ep; if (flags & ~VIRTIO_IOMMU_MAP_F_MASK) { return VIRTIO_IOMMU_S_INVAL; @@ -340,6 +422,11 @@ static int virtio_iommu_map(VirtIOIOMMU *s, g_tree_insert(domain->mappings, interval, mapping); + QLIST_FOREACH(ep, &domain->endpoint_list, next) { + virtio_iommu_notify_map(ep->iommu_mr, virt_start, virt_end, phys_start, + flags); + } + return VIRTIO_IOMMU_S_OK; } @@ -352,6 +439,7 @@ static int virtio_iommu_unmap(VirtIOIOMMU *s, VirtIOIOMMUMapping *iter_val; VirtIOIOMMUInterval interval, *iter_key; VirtIOIOMMUDomain *domain; + VirtIOIOMMUEndpoint *ep; int ret = VIRTIO_IOMMU_S_OK; trace_virtio_iommu_unmap(domain_id, virt_start, virt_end); @@ -369,6 +457,10 @@ static int virtio_iommu_unmap(VirtIOIOMMU *s, uint64_t current_high = iter_key->high; if (interval.low <= current_low && interval.high >= current_high) { + QLIST_FOREACH(ep, &domain->endpoint_list, next) { + virtio_iommu_notify_unmap(ep->iommu_mr, current_low, + current_high); + } g_tree_remove(domain->mappings, iter_key); trace_virtio_iommu_unmap_done(domain_id, current_low, current_high); } else { @@ -755,6 +847,107 @@ static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data) return (ua > ub) - (ua < ub); } +static gboolean virtio_iommu_remap(gpointer key, gpointer value, gpointer data) +{ + VirtIOIOMMUMapping *mapping = (VirtIOIOMMUMapping *) value; + VirtIOIOMMUInterval *interval = (VirtIOIOMMUInterval *) key; + IOMMUMemoryRegion *mr = (IOMMUMemoryRegion *) data; + + trace_virtio_iommu_remap(mr->parent_obj.name, interval->low, interval->high, + mapping->phys_addr); + virtio_iommu_notify_map(mr, interval->low, interval->high, + mapping->phys_addr, mapping->flags); + return false; +} + +static void virtio_iommu_replay(IOMMUMemoryRegion *mr, IOMMUNotifier *n) +{ + IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr); + VirtIOIOMMU *s = sdev->viommu; + uint32_t sid; + VirtIOIOMMUEndpoint *ep; + + sid = virtio_iommu_get_bdf(sdev); + + qemu_mutex_lock(&s->mutex); + + if (!s->endpoints) { + goto unlock; + } + + ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid)); + if (!ep || !ep->domain) { + goto unlock; + } + + g_tree_foreach(ep->domain->mappings, virtio_iommu_remap, mr); + +unlock: + qemu_mutex_unlock(&s->mutex); +} + +static int virtio_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu_mr, + IOMMUNotifierFlag old, + IOMMUNotifierFlag new, + Error **errp) +{ + if (old == IOMMU_NOTIFIER_NONE) { + trace_virtio_iommu_notify_flag_add(iommu_mr->parent_obj.name); + } else if (new == IOMMU_NOTIFIER_NONE) { + trace_virtio_iommu_notify_flag_del(iommu_mr->parent_obj.name); + } + return 0; +} + +/* + * The default mask (TARGET_PAGE_MASK) is the smallest supported guest granule, + * for example 0xfffffffffffff000. When an assigned device has page size + * restrictions due to the hardware IOMMU configuration, apply this restriction + * to the mask. + */ +static int virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr, + uint64_t new_mask, + Error **errp) +{ + IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr); + VirtIOIOMMU *s = sdev->viommu; + uint64_t cur_mask = s->config.page_size_mask; + + trace_virtio_iommu_set_page_size_mask(mr->parent_obj.name, cur_mask, + new_mask); + + if ((cur_mask & new_mask) == 0) { + error_setg(errp, "virtio-iommu page mask 0x%"PRIx64 + " is incompatible with mask 0x%"PRIx64, cur_mask, new_mask); + return -1; + } + + /* + * After the machine is finalized, we can't change the mask anymore. If by + * chance the hotplugged device supports the same granule, we can still + * accept it. Having a different masks is possible but the guest will use + * sub-optimal block sizes, so warn about it. + */ + if (qdev_hotplug) { + int new_granule = ctz64(new_mask); + int cur_granule = ctz64(cur_mask); + + if (new_granule != cur_granule) { + error_setg(errp, "virtio-iommu page mask 0x%"PRIx64 + " is incompatible with mask 0x%"PRIx64, cur_mask, + new_mask); + return -1; + } else if (new_mask != cur_mask) { + warn_report("virtio-iommu page mask 0x%"PRIx64 + " does not match 0x%"PRIx64, cur_mask, new_mask); + } + return 0; + } + + s->config.page_size_mask &= new_mask; + return 0; +} + static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) { VirtIODevice *vdev = VIRTIO_DEVICE(dev); @@ -910,9 +1103,14 @@ static gboolean reconstruct_endpoints(gpointer key, gpointer value, VirtIOIOMMU *s = (VirtIOIOMMU *)data; VirtIOIOMMUDomain *d = (VirtIOIOMMUDomain *)value; VirtIOIOMMUEndpoint *iter; + IOMMUMemoryRegion *mr; QLIST_FOREACH(iter, &d->endpoint_list, next) { + mr = virtio_iommu_mr(s, iter->id); + assert(mr); + iter->domain = d; + iter->iommu_mr = mr; g_tree_insert(s->endpoints, GUINT_TO_POINTER(iter->id), iter); } return false; /* continue the domain traversal */ @@ -979,6 +1177,9 @@ static void virtio_iommu_memory_region_class_init(ObjectClass *klass, IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); imrc->translate = virtio_iommu_translate; + imrc->replay = virtio_iommu_replay; + imrc->notify_flag_changed = virtio_iommu_notify_flag_changed; + imrc->iommu_set_page_size_mask = virtio_iommu_set_page_size_mask; } static const TypeInfo virtio_iommu_info = { diff --git a/hw/virtio/virtio-mem-pci.c b/hw/virtio/virtio-mem-pci.c index 913f4a3326..fa5395cd88 100644 --- a/hw/virtio/virtio-mem-pci.c +++ b/hw/virtio/virtio-mem-pci.c @@ -76,6 +76,12 @@ static void virtio_mem_pci_fill_device_info(const MemoryDeviceState *md, info->type = MEMORY_DEVICE_INFO_KIND_VIRTIO_MEM; } +static uint64_t virtio_mem_pci_get_min_alignment(const MemoryDeviceState *md) +{ + return object_property_get_uint(OBJECT(md), VIRTIO_MEM_BLOCK_SIZE_PROP, + &error_abort); +} + static void virtio_mem_pci_size_change_notify(Notifier *notifier, void *data) { VirtIOMEMPCI *pci_mem = container_of(notifier, VirtIOMEMPCI, @@ -110,6 +116,7 @@ static void virtio_mem_pci_class_init(ObjectClass *klass, void *data) mdc->get_plugged_size = virtio_mem_pci_get_plugged_size; mdc->get_memory_region = virtio_mem_pci_get_memory_region; mdc->fill_device_info = virtio_mem_pci_fill_device_info; + mdc->get_min_alignment = virtio_mem_pci_get_min_alignment; } static void virtio_mem_pci_instance_init(Object *obj) diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c index 7c8ca9f28b..655824ff81 100644 --- a/hw/virtio/virtio-mem.c +++ b/hw/virtio/virtio-mem.c @@ -33,10 +33,83 @@ #include "trace.h" /* - * Use QEMU_VMALLOC_ALIGN, so no THP will have to be split when unplugging - * memory (e.g., 2MB on x86_64). + * Let's not allow blocks smaller than 1 MiB, for example, to keep the tracking + * bitmap small. */ -#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)QEMU_VMALLOC_ALIGN) +#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB)) + +#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || \ + defined(__powerpc64__) +#define VIRTIO_MEM_DEFAULT_THP_SIZE ((uint32_t)(2 * MiB)) +#else + /* fallback to 1 MiB (e.g., the THP size on s390x) */ +#define VIRTIO_MEM_DEFAULT_THP_SIZE VIRTIO_MEM_MIN_BLOCK_SIZE +#endif + +/* + * We want to have a reasonable default block size such that + * 1. We avoid splitting THPs when unplugging memory, which degrades + * performance. + * 2. We avoid placing THPs for plugged blocks that also cover unplugged + * blocks. + * + * The actual THP size might differ between Linux kernels, so we try to probe + * it. In the future (if we ever run into issues regarding 2.), we might want + * to disable THP in case we fail to properly probe the THP size, or if the + * block size is configured smaller than the THP size. + */ +static uint32_t thp_size; + +#define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" +static uint32_t virtio_mem_thp_size(void) +{ + gchar *content = NULL; + const char *endptr; + uint64_t tmp; + + if (thp_size) { + return thp_size; + } + + /* + * Try to probe the actual THP size, fallback to (sane but eventually + * incorrect) default sizes. + */ + if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) && + !qemu_strtou64(content, &endptr, 0, &tmp) && + (!endptr || *endptr == '\n')) { + /* + * Sanity-check the value, if it's too big (e.g., aarch64 with 64k base + * pages) or weird, fallback to something smaller. + */ + if (!tmp || !is_power_of_2(tmp) || tmp > 16 * MiB) { + warn_report("Read unsupported THP size: %" PRIx64, tmp); + } else { + thp_size = tmp; + } + } + + if (!thp_size) { + thp_size = VIRTIO_MEM_DEFAULT_THP_SIZE; + warn_report("Could not detect THP size, falling back to %" PRIx64 + " MiB.", thp_size / MiB); + } + + g_free(content); + return thp_size; +} + +static uint64_t virtio_mem_default_block_size(RAMBlock *rb) +{ + const uint64_t page_size = qemu_ram_pagesize(rb); + + /* We can have hugetlbfs with a page size smaller than the THP size. */ + if (page_size == qemu_real_host_page_size) { + return MAX(page_size, virtio_mem_thp_size()); + } + return MAX(page_size, VIRTIO_MEM_MIN_BLOCK_SIZE); +} + /* * Size the usable region bigger than the requested size if possible. Esp. * Linux guests will only add (aligned) memory blocks in case they fully @@ -227,6 +300,9 @@ static void virtio_mem_resize_usable_region(VirtIOMEM *vmem, uint64_t newsize = MIN(memory_region_size(&vmem->memdev->mr), requested_size + VIRTIO_MEM_USABLE_EXTENT); + /* The usable region size always has to be multiples of the block size. */ + newsize = QEMU_ALIGN_UP(newsize, vmem->block_size); + if (!requested_size) { newsize = 0; } @@ -440,15 +516,33 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp) rb = vmem->memdev->mr.ram_block; page_size = qemu_ram_pagesize(rb); + /* + * If the block size wasn't configured by the user, use a sane default. This + * allows using hugetlbfs backends of any page size without manual + * intervention. + */ + if (!vmem->block_size) { + vmem->block_size = virtio_mem_default_block_size(rb); + } + if (vmem->block_size < page_size) { error_setg(errp, "'%s' property has to be at least the page size (0x%" PRIx64 ")", VIRTIO_MEM_BLOCK_SIZE_PROP, page_size); return; + } else if (vmem->block_size < virtio_mem_default_block_size(rb)) { + warn_report("'%s' property is smaller than the default block size (%" + PRIx64 " MiB)", VIRTIO_MEM_BLOCK_SIZE_PROP, + virtio_mem_default_block_size(rb) / MiB); } else if (!QEMU_IS_ALIGNED(vmem->requested_size, vmem->block_size)) { error_setg(errp, "'%s' property has to be multiples of '%s' (0x%" PRIx64 ")", VIRTIO_MEM_REQUESTED_SIZE_PROP, VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size); return; + } else if (!QEMU_IS_ALIGNED(vmem->addr, vmem->block_size)) { + error_setg(errp, "'%s' property has to be multiples of '%s' (0x%" PRIx64 + ")", VIRTIO_MEM_ADDR_PROP, VIRTIO_MEM_BLOCK_SIZE_PROP, + vmem->block_size); + return; } else if (!QEMU_IS_ALIGNED(memory_region_size(&vmem->memdev->mr), vmem->block_size)) { error_setg(errp, "'%s' property memdev size has to be multiples of" @@ -734,6 +828,18 @@ static void virtio_mem_get_block_size(Object *obj, Visitor *v, const char *name, const VirtIOMEM *vmem = VIRTIO_MEM(obj); uint64_t value = vmem->block_size; + /* + * If not configured by the user (and we're not realized yet), use the + * default block size we would use with the current memory backend. + */ + if (!value) { + if (vmem->memdev && memory_region_is_ram(&vmem->memdev->mr)) { + value = virtio_mem_default_block_size(vmem->memdev->mr.ram_block); + } else { + value = virtio_mem_thp_size(); + } + } + visit_type_size(v, name, &value, errp); } @@ -813,7 +919,6 @@ static void virtio_mem_instance_init(Object *obj) { VirtIOMEM *vmem = VIRTIO_MEM(obj); - vmem->block_size = VIRTIO_MEM_MIN_BLOCK_SIZE; notifier_list_init(&vmem->size_change_notifiers); vmem->precopy_notifier.notify = virtio_mem_precopy_notify; |