diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2024-03-13 15:11:53 +0000 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2024-03-13 15:11:53 +0000 |
commit | 6fc69312313a2207a8fbc083658e0548746b707f (patch) | |
tree | bec474d21181ee12f06f0b185552d0726dcb8d40 /hw | |
parent | 51e31f21407190df9bd048a539267534cea7dd66 (diff) | |
parent | 73279cecca03f7c2b4489c5fea994e7349eaafaa (diff) |
Merge tag 'for_upstream' of https://git.kernel.org/pub/scm/virt/kvm/mst/qemu into staging
virtio,pc,pci: features, cleanups, fixes
more memslots support in libvhost-user
support PCIe Gen5/Gen6 link speeds in pcie
more traces in vdpa
network simulation devices support in vdpa
SMBIOS type 9 descriptor implementation
Bump max_cpus to 4096 vcpus in q35
aw-bits and granule options in VIRTIO-IOMMU
Support report NUMA nodes for device memory using GI in acpi
Beginning of shutdown event support in pvpanic
fixes, cleanups all over the place.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
# -----BEGIN PGP SIGNATURE-----
#
# iQFDBAABCAAtFiEEXQn9CHHI+FuUyooNKB8NuNKNVGkFAmXw0TMPHG1zdEByZWRo
# YXQuY29tAAoJECgfDbjSjVRp8x4H+gLMoGwaGAX7gDGPgn2Ix4j/3kO77ZJ9X9k/
# 1KqZu/9eMS1j2Ei+vZqf05w7qRjxxhwDq3ilEXF/+UFqgAehLqpRRB8j5inqvzYt
# +jv0DbL11PBp/oFjWcytm5CbiVsvq8KlqCF29VNzc162XdtcduUOWagL96y8lJfZ
# uPrOoyeR7SMH9lp3LLLHWgu+9W4nOS03RroZ6Umj40y5B7yR0Rrppz8lMw5AoQtr
# 0gMRnFhYXeiW6CXdz+Tzcr7XfvkkYDi/j7ibiNSURLBfOpZa6Y8+kJGKxz5H1K1G
# 6ZY4PBcOpQzl+NMrktPHogczgJgOK10t+1i/R3bGZYw2Qn/93Eg=
# =C0UU
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 12 Mar 2024 22:03:31 GMT
# gpg: using RSA key 5D09FD0871C8F85B94CA8A0D281F0DB8D28D5469
# gpg: issuer "mst@redhat.com"
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>" [full]
# gpg: aka "Michael S. Tsirkin <mst@redhat.com>" [full]
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67
# Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469
* tag 'for_upstream' of https://git.kernel.org/pub/scm/virt/kvm/mst/qemu: (68 commits)
docs/specs/pvpanic: document shutdown event
hw/cxl: Fix missing reserved data in CXL Device DVSEC
hmat acpi: Fix out of bounds access due to missing use of indirection
hmat acpi: Do not add Memory Proximity Domain Attributes Structure targetting non existent memory.
qemu-options.hx: Document the virtio-iommu-pci aw-bits option
hw/arm/virt: Set virtio-iommu aw-bits default value to 48
hw/i386/q35: Set virtio-iommu aw-bits default value to 39
virtio-iommu: Add an option to define the input range width
virtio-iommu: Trace domain range limits as unsigned int
qemu-options.hx: Document the virtio-iommu-pci granule option
virtio-iommu: Change the default granule to the host page size
virtio-iommu: Add a granule property
hw/i386/acpi-build: Add support for SRAT Generic Initiator structures
hw/acpi: Implement the SRAT GI affinity structure
qom: new object to associate device to NUMA node
hw/i386/pc: Inline pc_cmos_init() into pc_cmos_init_late() and remove it
hw/i386/pc: Set "normal" boot device order in pc_basic_device_init()
hw/i386/pc: Avoid one use of the current_machine global
hw/i386/pc: Remove "rtc_state" link again
Revert "hw/i386/pc: Confine system flash handling to pc_sysfw"
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
# Conflicts:
# hw/core/machine.c
Diffstat (limited to 'hw')
-rw-r--r-- | hw/acpi/acpi_generic_initiator.c | 148 | ||||
-rw-r--r-- | hw/acpi/hmat.c | 15 | ||||
-rw-r--r-- | hw/acpi/meson.build | 1 | ||||
-rw-r--r-- | hw/arm/virt-acpi-build.c | 3 | ||||
-rw-r--r-- | hw/arm/virt.c | 17 | ||||
-rw-r--r-- | hw/audio/virtio-snd.c | 7 | ||||
-rw-r--r-- | hw/core/machine.c | 3 | ||||
-rw-r--r-- | hw/core/numa.c | 3 | ||||
-rw-r--r-- | hw/core/qdev-properties-system.c | 16 | ||||
-rw-r--r-- | hw/cxl/cxl-component-utils.c | 21 | ||||
-rw-r--r-- | hw/i386/acpi-build.c | 3 | ||||
-rw-r--r-- | hw/i386/pc.c | 30 | ||||
-rw-r--r-- | hw/i386/pc_piix.c | 3 | ||||
-rw-r--r-- | hw/i386/pc_q35.c | 14 | ||||
-rw-r--r-- | hw/i386/pc_sysfw.c | 17 | ||||
-rw-r--r-- | hw/net/igb.c | 2 | ||||
-rw-r--r-- | hw/net/virtio-net.c | 16 | ||||
-rw-r--r-- | hw/nvme/ctrl.c | 30 | ||||
-rw-r--r-- | hw/pci-bridge/pci_expander_bridge.c | 2 | ||||
-rw-r--r-- | hw/pci/pci.c | 1 | ||||
-rw-r--r-- | hw/pci/pcie.c | 8 | ||||
-rw-r--r-- | hw/pci/pcie_sriov.c | 32 | ||||
-rw-r--r-- | hw/smbios/smbios.c | 142 | ||||
-rw-r--r-- | hw/virtio/trace-events | 7 | ||||
-rw-r--r-- | hw/virtio/vhost-user.c | 21 | ||||
-rw-r--r-- | hw/virtio/vhost-vdpa.c | 44 | ||||
-rw-r--r-- | hw/virtio/virtio-iommu.c | 36 | ||||
-rw-r--r-- | hw/virtio/virtio-pci.c | 189 | ||||
-rw-r--r-- | hw/virtio/virtio.c | 39 |
29 files changed, 770 insertions, 100 deletions
diff --git a/hw/acpi/acpi_generic_initiator.c b/hw/acpi/acpi_generic_initiator.c new file mode 100644 index 0000000000..17b9a052f5 --- /dev/null +++ b/hw/acpi/acpi_generic_initiator.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include "qemu/osdep.h" +#include "hw/acpi/acpi_generic_initiator.h" +#include "hw/acpi/aml-build.h" +#include "hw/boards.h" +#include "hw/pci/pci_device.h" +#include "qemu/error-report.h" + +typedef struct AcpiGenericInitiatorClass { + ObjectClass parent_class; +} AcpiGenericInitiatorClass; + +OBJECT_DEFINE_TYPE_WITH_INTERFACES(AcpiGenericInitiator, acpi_generic_initiator, + ACPI_GENERIC_INITIATOR, OBJECT, + { TYPE_USER_CREATABLE }, + { NULL }) + +OBJECT_DECLARE_SIMPLE_TYPE(AcpiGenericInitiator, ACPI_GENERIC_INITIATOR) + +static void acpi_generic_initiator_init(Object *obj) +{ + AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj); + + gi->node = MAX_NODES; + gi->pci_dev = NULL; +} + +static void acpi_generic_initiator_finalize(Object *obj) +{ + AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj); + + g_free(gi->pci_dev); +} + +static void acpi_generic_initiator_set_pci_device(Object *obj, const char *val, + Error **errp) +{ + AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj); + + gi->pci_dev = g_strdup(val); +} + +static void acpi_generic_initiator_set_node(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj); + MachineState *ms = MACHINE(qdev_get_machine()); + uint32_t value; + + if (!visit_type_uint32(v, name, &value, errp)) { + return; + } + + if (value >= MAX_NODES) { + error_printf("%s: Invalid NUMA node specified\n", + TYPE_ACPI_GENERIC_INITIATOR); + exit(1); + } + + gi->node = value; + ms->numa_state->nodes[gi->node].has_gi = true; +} + +static void acpi_generic_initiator_class_init(ObjectClass *oc, void *data) +{ + object_class_property_add_str(oc, "pci-dev", NULL, + acpi_generic_initiator_set_pci_device); + object_class_property_add(oc, "node", "int", NULL, + acpi_generic_initiator_set_node, NULL, NULL); +} + +/* + * ACPI 6.3: + * Table 5-78 Generic Initiator Affinity Structure + */ +static void +build_srat_generic_pci_initiator_affinity(GArray *table_data, int node, + PCIDeviceHandle *handle) +{ + uint8_t index; + + build_append_int_noprefix(table_data, 5, 1); /* Type */ + build_append_int_noprefix(table_data, 32, 1); /* Length */ + build_append_int_noprefix(table_data, 0, 1); /* Reserved */ + build_append_int_noprefix(table_data, 1, 1); /* Device Handle Type: PCI */ + build_append_int_noprefix(table_data, node, 4); /* Proximity Domain */ + + /* Device Handle - PCI */ + build_append_int_noprefix(table_data, handle->segment, 2); + build_append_int_noprefix(table_data, handle->bdf, 2); + for (index = 0; index < 12; index++) { + build_append_int_noprefix(table_data, 0, 1); + } + + build_append_int_noprefix(table_data, GEN_AFFINITY_ENABLED, 4); /* Flags */ + build_append_int_noprefix(table_data, 0, 4); /* Reserved */ +} + +static int build_all_acpi_generic_initiators(Object *obj, void *opaque) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + AcpiGenericInitiator *gi; + GArray *table_data = opaque; + PCIDeviceHandle dev_handle; + PCIDevice *pci_dev; + Object *o; + + if (!object_dynamic_cast(obj, TYPE_ACPI_GENERIC_INITIATOR)) { + return 0; + } + + gi = ACPI_GENERIC_INITIATOR(obj); + if (gi->node >= ms->numa_state->num_nodes) { + error_printf("%s: Specified node %d is invalid.\n", + TYPE_ACPI_GENERIC_INITIATOR, gi->node); + exit(1); + } + + o = object_resolve_path_type(gi->pci_dev, TYPE_PCI_DEVICE, NULL); + if (!o) { + error_printf("%s: Specified device must be a PCI device.\n", + TYPE_ACPI_GENERIC_INITIATOR); + exit(1); + } + + pci_dev = PCI_DEVICE(o); + + dev_handle.segment = 0; + dev_handle.bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), + pci_dev->devfn); + + build_srat_generic_pci_initiator_affinity(table_data, + gi->node, &dev_handle); + + return 0; +} + +void build_srat_generic_pci_initiator(GArray *table_data) +{ + object_child_foreach_recursive(object_get_root(), + build_all_acpi_generic_initiators, + table_data); +} diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c index 3042d223c8..9b1662b6b8 100644 --- a/hw/acpi/hmat.c +++ b/hw/acpi/hmat.c @@ -78,6 +78,7 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, uint32_t *initiator_list) { int i, index; + uint32_t initiator_to_index[MAX_NODES] = {}; HMAT_LB_Data *lb_data; uint16_t *entry_list; uint32_t base; @@ -121,6 +122,8 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, /* Initiator Proximity Domain List */ for (i = 0; i < num_initiator; i++) { build_append_int_noprefix(table_data, initiator_list[i], 4); + /* Reverse mapping for array possitions */ + initiator_to_index[initiator_list[i]] = i; } /* Target Proximity Domain List */ @@ -132,7 +135,8 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, entry_list = g_new0(uint16_t, num_initiator * num_target); for (i = 0; i < hmat_lb->list->len; i++) { lb_data = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); - index = lb_data->initiator * num_target + lb_data->target; + index = initiator_to_index[lb_data->initiator] * num_target + + lb_data->target; entry_list[index] = (uint16_t)(lb_data->data / hmat_lb->base); } @@ -204,6 +208,13 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) build_append_int_noprefix(table_data, 0, 4); /* Reserved */ for (i = 0; i < numa_state->num_nodes; i++) { + /* + * Linux rejects whole HMAT table if a node with no memory + * has one of these structures listing it as a target. + */ + if (!numa_state->nodes[i].node_mem) { + continue; + } flags = 0; if (numa_state->nodes[i].initiator < MAX_NODES) { @@ -214,7 +225,7 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) } for (i = 0; i < numa_state->num_nodes; i++) { - if (numa_state->nodes[i].has_cpu) { + if (numa_state->nodes[i].has_cpu || numa_state->nodes[i].has_gi) { initiator_list[num_initiator++] = i; } } diff --git a/hw/acpi/meson.build b/hw/acpi/meson.build index 5441c9b1e4..fa5c07db90 100644 --- a/hw/acpi/meson.build +++ b/hw/acpi/meson.build @@ -1,5 +1,6 @@ acpi_ss = ss.source_set() acpi_ss.add(files( + 'acpi_generic_initiator.c', 'acpi_interface.c', 'aml-build.c', 'bios-linker-loader.c', diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 6a1bde61ce..c3ccfef026 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -57,6 +57,7 @@ #include "migration/vmstate.h" #include "hw/acpi/ghes.h" #include "hw/acpi/viot.h" +#include "hw/acpi/acpi_generic_initiator.h" #include "hw/virtio/virtio-acpi.h" #include "target/arm/multiprocessing.h" @@ -504,6 +505,8 @@ build_srat(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) } } + build_srat_generic_pci_initiator(table_data); + if (ms->nvdimms_state->is_enabled) { nvdimm_build_srat(table_data); } diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 0af1943697..e5cd935232 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -85,11 +85,28 @@ #include "hw/char/pl011.h" #include "qemu/guest-random.h" +static GlobalProperty arm_virt_compat[] = { + { TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "48" }, +}; +static const size_t arm_virt_compat_len = G_N_ELEMENTS(arm_virt_compat); + +/* + * This cannot be called from the virt_machine_class_init() because + * TYPE_VIRT_MACHINE is abstract and mc->compat_props g_ptr_array_new() + * only is called on virt non abstract class init. + */ +static void arm_virt_compat_set(MachineClass *mc) +{ + compat_props_add(mc->compat_props, arm_virt_compat, + arm_virt_compat_len); +} + #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \ static void virt_##major##_##minor##_class_init(ObjectClass *oc, \ void *data) \ { \ MachineClass *mc = MACHINE_CLASS(oc); \ + arm_virt_compat_set(mc); \ virt_machine_##major##_##minor##_options(mc); \ mc->desc = "QEMU " # major "." # minor " ARM Virtual Machine"; \ if (latest) { \ diff --git a/hw/audio/virtio-snd.c b/hw/audio/virtio-snd.c index ea2aeaef14..e604d8f30c 100644 --- a/hw/audio/virtio-snd.c +++ b/hw/audio/virtio-snd.c @@ -243,12 +243,13 @@ static void virtio_snd_handle_pcm_info(VirtIOSound *s, memset(&pcm_info[i].padding, 0, 5); } + cmd->payload_size = sizeof(virtio_snd_pcm_info) * count; cmd->resp.code = cpu_to_le32(VIRTIO_SND_S_OK); iov_from_buf(cmd->elem->in_sg, cmd->elem->in_num, sizeof(virtio_snd_hdr), pcm_info, - sizeof(virtio_snd_pcm_info) * count); + cmd->payload_size); } /* @@ -749,7 +750,8 @@ process_cmd(VirtIOSound *s, virtio_snd_ctrl_command *cmd) 0, &cmd->resp, sizeof(virtio_snd_hdr)); - virtqueue_push(cmd->vq, cmd->elem, sizeof(virtio_snd_hdr)); + virtqueue_push(cmd->vq, cmd->elem, + sizeof(virtio_snd_hdr) + cmd->payload_size); virtio_notify(VIRTIO_DEVICE(s), cmd->vq); } @@ -808,6 +810,7 @@ static void virtio_snd_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) cmd->elem = elem; cmd->vq = vq; cmd->resp.code = cpu_to_le32(VIRTIO_SND_S_OK); + /* implicit cmd->payload_size = 0; */ QTAILQ_INSERT_TAIL(&s->cmdq, cmd, next); elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); } diff --git a/hw/core/machine.c b/hw/core/machine.c index e483b34459..37ede0e7d4 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -30,10 +30,13 @@ #include "exec/confidential-guest-support.h" #include "hw/virtio/virtio-pci.h" #include "hw/virtio/virtio-net.h" +#include "hw/virtio/virtio-iommu.h" #include "audio/audio.h" GlobalProperty hw_compat_8_2[] = { { "migration", "zero-page-detection", "legacy"}, + { TYPE_VIRTIO_IOMMU_PCI, "granule", "4k" }, + { TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "64" }, }; const size_t hw_compat_8_2_len = G_N_ELEMENTS(hw_compat_8_2); diff --git a/hw/core/numa.c b/hw/core/numa.c index 81d2124349..f8ce332cfe 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -227,7 +227,8 @@ void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, node->target, numa_state->num_nodes); return; } - if (!numa_info[node->initiator].has_cpu) { + if (!numa_info[node->initiator].has_cpu && + !numa_info[node->initiator].has_gi) { error_setg(errp, "Invalid initiator=%d, it isn't an " "initiator proximity domain", node->initiator); return; diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index f52073b7c8..d79d6f4b53 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -966,7 +966,7 @@ const PropertyInfo qdev_prop_off_auto_pcibar = { .set_default_value = qdev_propinfo_set_default_value_enum, }; -/* --- PCIELinkSpeed 2_5/5/8/16 -- */ +/* --- PCIELinkSpeed 2_5/5/8/16/32/64 -- */ static void get_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) @@ -988,6 +988,12 @@ static void get_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name, case QEMU_PCI_EXP_LNK_16GT: speed = PCIE_LINK_SPEED_16; break; + case QEMU_PCI_EXP_LNK_32GT: + speed = PCIE_LINK_SPEED_32; + break; + case QEMU_PCI_EXP_LNK_64GT: + speed = PCIE_LINK_SPEED_64; + break; default: /* Unreachable */ abort(); @@ -1021,6 +1027,12 @@ static void set_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name, case PCIE_LINK_SPEED_16: *p = QEMU_PCI_EXP_LNK_16GT; break; + case PCIE_LINK_SPEED_32: + *p = QEMU_PCI_EXP_LNK_32GT; + break; + case PCIE_LINK_SPEED_64: + *p = QEMU_PCI_EXP_LNK_64GT; + break; default: /* Unreachable */ abort(); @@ -1029,7 +1041,7 @@ static void set_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name, const PropertyInfo qdev_prop_pcie_link_speed = { .name = "PCIELinkSpeed", - .description = "2_5/5/8/16", + .description = "2_5/5/8/16/32/64", .enum_table = &PCIELinkSpeed_lookup, .get = get_prop_pcielinkspeed, .set = set_prop_pcielinkspeed, diff --git a/hw/cxl/cxl-component-utils.c b/hw/cxl/cxl-component-utils.c index 84ab503325..cd116c0401 100644 --- a/hw/cxl/cxl-component-utils.c +++ b/hw/cxl/cxl-component-utils.c @@ -297,6 +297,7 @@ void cxl_component_register_init_common(uint32_t *reg_state, caps = 3; break; case CXL2_ROOT_PORT: + case CXL2_RC: /* + Extended Security, + Snoop */ caps = 5; break; @@ -326,8 +327,19 @@ void cxl_component_register_init_common(uint32_t *reg_state, CXL_##reg##_REGISTERS_OFFSET); \ } while (0) + switch (type) { + case CXL2_DEVICE: + case CXL2_TYPE3_DEVICE: + case CXL2_LOGICAL_DEVICE: + case CXL2_ROOT_PORT: + case CXL2_UPSTREAM_PORT: + case CXL2_DOWNSTREAM_PORT: init_cap_reg(RAS, 2, CXL_RAS_CAPABILITY_VERSION); - ras_init_common(reg_state, write_msk); + ras_init_common(reg_state, write_msk); + break; + default: + break; + } init_cap_reg(LINK, 4, CXL_LINK_CAPABILITY_VERSION); @@ -335,9 +347,10 @@ void cxl_component_register_init_common(uint32_t *reg_state, return; } - init_cap_reg(HDM, 5, CXL_HDM_CAPABILITY_VERSION); - hdm_init_common(reg_state, write_msk, type); - + if (type != CXL2_ROOT_PORT) { + init_cap_reg(HDM, 5, CXL_HDM_CAPABILITY_VERSION); + hdm_init_common(reg_state, write_msk, type); + } if (caps < 5) { return; } diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 15242b9096..53f804ac16 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -68,6 +68,7 @@ #include "hw/acpi/utils.h" #include "hw/acpi/pci.h" #include "hw/acpi/cxl.h" +#include "hw/acpi/acpi_generic_initiator.h" #include "qom/qom-qobject.h" #include "hw/i386/amd_iommu.h" @@ -2046,6 +2047,8 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) build_srat_memory(table_data, 0, 0, 0, MEM_AFFINITY_NOFLAGS); } + build_srat_generic_pci_initiator(table_data); + /* * Entry is required for Windows to enable memory hotplug in OS * and for Linux to enable SWIOTLB when booted with less than diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 4f322e0856..feb7a93083 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -425,9 +425,10 @@ static void set_boot_dev(PCMachineState *pcms, MC146818RtcState *s, static void pc_boot_set(void *opaque, const char *boot_device, Error **errp) { - PCMachineState *pcms = PC_MACHINE(current_machine); + PCMachineState *pcms = opaque; + X86MachineState *x86ms = X86_MACHINE(pcms); - set_boot_dev(pcms, opaque, boot_device, errp); + set_boot_dev(pcms, MC146818_RTC(x86ms->rtc), boot_device, errp); } static void pc_cmos_init_floppy(MC146818RtcState *rtc_state, ISADevice *floppy) @@ -569,14 +570,6 @@ static void pc_cmos_init_late(PCMachineState *pcms) mc146818rtc_set_cmos_data(s, 0x39, val); pc_cmos_init_floppy(s, pc_find_fdc0()); -} - -void pc_cmos_init(PCMachineState *pcms, - ISADevice *rtc) -{ - int val; - X86MachineState *x86ms = X86_MACHINE(pcms); - MC146818RtcState *s = MC146818_RTC(rtc); /* various important CMOS locations needed by PC/Bochs bios */ @@ -613,22 +606,10 @@ void pc_cmos_init(PCMachineState *pcms, mc146818rtc_set_cmos_data(s, 0x5c, val >> 8); mc146818rtc_set_cmos_data(s, 0x5d, val >> 16); - object_property_add_link(OBJECT(pcms), "rtc_state", - TYPE_ISA_DEVICE, - (Object **)&x86ms->rtc, - object_property_allow_set_link, - OBJ_PROP_LINK_STRONG); - object_property_set_link(OBJECT(pcms), "rtc_state", OBJECT(s), - &error_abort); - - set_boot_dev(pcms, s, MACHINE(pcms)->boot_config.order, &error_fatal); - val = 0; val |= 0x02; /* FPU is there */ val |= 0x04; /* PS/2 mouse installed */ mc146818rtc_set_cmos_data(s, REG_EQUIPMENT_BYTE, val); - - /* hard drives and FDC are handled by pc_cmos_init_late() */ } static void handle_a20_line_change(void *opaque, int irq, int level) @@ -1261,7 +1242,9 @@ void pc_basic_device_init(struct PCMachineState *pcms, } #endif - qemu_register_boot_set(pc_boot_set, rtc_state); + qemu_register_boot_set(pc_boot_set, pcms); + set_boot_dev(pcms, MC146818_RTC(rtc_state), + MACHINE(pcms)->boot_config.order, &error_fatal); if (!xen_enabled() && (x86ms->pit == ON_OFF_AUTO_AUTO || x86ms->pit == ON_OFF_AUTO_ON)) { @@ -1751,6 +1734,7 @@ static void pc_machine_initfn(Object *obj) pcms->fd_bootchk = true; pcms->default_bus_bypass_iommu = false; + pc_system_flash_create(pcms); pcms->pcspk = isa_new(TYPE_PC_SPEAKER); object_property_add_alias(OBJECT(pcms), "pcspk-audiodev", OBJECT(pcms->pcspk), "audiodev"); diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 319bc4b180..c9a6c0aa68 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -228,6 +228,7 @@ static void pc_init1(MachineState *machine, const char *pci_type) assert(machine->ram_size == x86ms->below_4g_mem_size + x86ms->above_4g_mem_size); + pc_system_flash_cleanup_unused(pcms); if (machine->kernel_filename != NULL) { /* For xen HVM direct kernel boot, load linux here */ xen_load_linux(pcms); @@ -343,8 +344,6 @@ static void pc_init1(MachineState *machine, const char *pci_type) } #endif - pc_cmos_init(pcms, x86ms->rtc); - if (piix4_pm) { smi_irq = qemu_allocate_irq(pc_acpi_smi_interrupt, first_cpu, 0); diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 45a4102e75..8a427c4647 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -45,6 +45,7 @@ #include "hw/i386/pc.h" #include "hw/i386/amd_iommu.h" #include "hw/i386/intel_iommu.h" +#include "hw/virtio/virtio-iommu.h" #include "hw/display/ramfb.h" #include "hw/ide/pci.h" #include "hw/ide/ahci-pci.h" @@ -63,6 +64,12 @@ /* ICH9 AHCI has 6 ports */ #define MAX_SATA_PORTS 6 +static GlobalProperty pc_q35_compat_defaults[] = { + { TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "39" }, +}; +static const size_t pc_q35_compat_defaults_len = + G_N_ELEMENTS(pc_q35_compat_defaults); + struct ehci_companions { const char *name; int func; @@ -311,8 +318,6 @@ static void pc_q35_init(MachineState *machine) smbus_eeprom_init(pcms->smbus, 8, NULL, 0); } - pc_cmos_init(pcms, x86ms->rtc); - /* the rest devices to which pci devfn is automatically assigned */ pc_vga_init(isa_bus, pcms->pcibus); pc_nic_init(pcmc, isa_bus, pcms->pcibus); @@ -350,12 +355,14 @@ static void pc_q35_machine_options(MachineClass *m) m->default_nic = "e1000e"; m->default_kernel_irqchip_split = false; m->no_floppy = 1; - m->max_cpus = 1024; + m->max_cpus = 4096; m->no_parallel = !module_object_class_by_name(TYPE_ISA_PARALLEL); machine_class_allow_dynamic_sysbus_dev(m, TYPE_AMD_IOMMU_DEVICE); machine_class_allow_dynamic_sysbus_dev(m, TYPE_INTEL_IOMMU_DEVICE); machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE); machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE); + compat_props_add(m->compat_props, + pc_q35_compat_defaults, pc_q35_compat_defaults_len); } static void pc_q35_9_0_machine_options(MachineClass *m) @@ -371,6 +378,7 @@ static void pc_q35_8_2_machine_options(MachineClass *m) { pc_q35_9_0_machine_options(m); m->alias = NULL; + m->max_cpus = 1024; compat_props_add(m->compat_props, hw_compat_8_2, hw_compat_8_2_len); compat_props_add(m->compat_props, pc_compat_8_2, pc_compat_8_2_len); } diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c index b02e285579..3efabbbab2 100644 --- a/hw/i386/pc_sysfw.c +++ b/hw/i386/pc_sysfw.c @@ -91,7 +91,19 @@ static PFlashCFI01 *pc_pflash_create(PCMachineState *pcms, return PFLASH_CFI01(dev); } -static void pc_system_flash_cleanup_unused(PCMachineState *pcms) +void pc_system_flash_create(PCMachineState *pcms) +{ + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + + if (pcmc->pci_enabled) { + pcms->flash[0] = pc_pflash_create(pcms, "system.flash0", + "pflash0"); + pcms->flash[1] = pc_pflash_create(pcms, "system.flash1", + "pflash1"); + } +} + +void pc_system_flash_cleanup_unused(PCMachineState *pcms) { char *prop_name; int i; @@ -198,9 +210,6 @@ void pc_system_firmware_init(PCMachineState *pcms, return; } - pcms->flash[0] = pc_pflash_create(pcms, "system.flash0", "pflash0"); - pcms->flash[1] = pc_pflash_create(pcms, "system.flash1", "pflash1"); - /* Map legacy -drive if=pflash to machine properties */ for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) { pflash_cfi01_legacy_drive(pcms->flash[i], diff --git a/hw/net/igb.c b/hw/net/igb.c index 0b5c31a58b..9b37523d6d 100644 --- a/hw/net/igb.c +++ b/hw/net/igb.c @@ -488,12 +488,10 @@ static void igb_pci_uninit(PCIDevice *pci_dev) static void igb_qdev_reset_hold(Object *obj) { - PCIDevice *d = PCI_DEVICE(obj); IGBState *s = IGB(obj); trace_e1000e_cb_qdev_reset_hold(); - pcie_sriov_pf_disable_vfs(d); igb_core_reset(&s->core); } diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 403a693baf..9959f1932b 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -2039,6 +2039,22 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, goto err; } + /* Mark dirty page's bitmap of guest memory */ + if (vdev->lm_logging_ctrl == LM_ENABLE) { + uint64_t chunk = elem->in_addr[i] / VHOST_LOG_CHUNK; + /* Get chunk index */ + BitmapMemoryRegionCaches *caches = qatomic_rcu_read(&vdev->caches); + uint64_t index = chunk / 8; + uint64_t shift = chunk % 8; + uint8_t val = 0; + address_space_read_cached(&caches->bitmap, index, &val, + sizeof(val)); + val |= 1 << shift; + address_space_write_cached(&caches->bitmap, index, &val, + sizeof(val)); + address_space_cache_invalidate(&caches->bitmap, index, sizeof(val)); + } + elems[i] = elem; lens[i] = total; i++; diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 036b15403a..c2b17de987 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -7126,10 +7126,6 @@ static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst) sctrl = &n->sec_ctrl_list.sec[i]; nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false); } - - if (rst != NVME_RESET_CONTROLLER) { - pcie_sriov_pf_disable_vfs(pci_dev); - } } if (rst != NVME_RESET_CONTROLLER) { @@ -8509,36 +8505,26 @@ static void nvme_pci_reset(DeviceState *qdev) nvme_ctrl_reset(n, NVME_RESET_FUNCTION); } -static void nvme_sriov_pre_write_ctrl(PCIDevice *dev, uint32_t address, - uint32_t val, int len) +static void nvme_sriov_post_write_config(PCIDevice *dev, uint16_t old_num_vfs) { NvmeCtrl *n = NVME(dev); NvmeSecCtrlEntry *sctrl; - uint16_t sriov_cap = dev->exp.sriov_cap; - uint32_t off = address - sriov_cap; - int i, num_vfs; + int i; - if (!sriov_cap) { - return; - } - - if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) { - if (!(val & PCI_SRIOV_CTRL_VFE)) { - num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF); - for (i = 0; i < num_vfs; i++) { - sctrl = &n->sec_ctrl_list.sec[i]; - nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false); - } - } + for (i = pcie_sriov_num_vfs(dev); i < old_num_vfs; i++) { + sctrl = &n->sec_ctrl_list.sec[i]; + nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false); } } static void nvme_pci_write_config(PCIDevice *dev, uint32_t address, uint32_t val, int len) { - nvme_sriov_pre_write_ctrl(dev, address, val, len); + uint16_t old_num_vfs = pcie_sriov_num_vfs(dev); + pci_default_write_config(dev, address, val, len); pcie_cap_flr_write_config(dev, address, val, len); + nvme_sriov_post_write_config(dev, old_num_vfs); } static const VMStateDescription nvme_vmstate = { diff --git a/hw/pci-bridge/pci_expander_bridge.c b/hw/pci-bridge/pci_expander_bridge.c index 535889f7c2..0411ad31ea 100644 --- a/hw/pci-bridge/pci_expander_bridge.c +++ b/hw/pci-bridge/pci_expander_bridge.c @@ -290,7 +290,7 @@ static void pxb_cxl_dev_reset(DeviceState *dev) uint32_t *write_msk = cxl_cstate->crb.cache_mem_regs_write_mask; int dsp_count = 0; - cxl_component_register_init_common(reg_state, write_msk, CXL2_ROOT_PORT); + cxl_component_register_init_common(reg_state, write_msk, CXL2_RC); /* * The CXL specification allows for host bridges with no HDM decoders * if they only have a single root port. diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 6496d027ca..e7a39cb203 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -409,6 +409,7 @@ static void pci_do_device_reset(PCIDevice *dev) msi_reset(dev); msix_reset(dev); + pcie_sriov_pf_reset(dev); } /* diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index f56079acf5..4b2f0805c6 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -171,6 +171,14 @@ static void pcie_cap_fill_slot_lnk(PCIDevice *dev) pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2, PCI_EXP_LNKCAP2_SLS_16_0GB); } + if (s->speed > QEMU_PCI_EXP_LNK_16GT) { + pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2, + PCI_EXP_LNKCAP2_SLS_32_0GB); + } + if (s->speed > QEMU_PCI_EXP_LNK_32GT) { + pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2, + PCI_EXP_LNKCAP2_SLS_64_0GB); + } } } diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c index a1fe65f5d8..e9b23221d7 100644 --- a/hw/pci/pcie_sriov.c +++ b/hw/pci/pcie_sriov.c @@ -176,6 +176,9 @@ static void register_vfs(PCIDevice *dev) assert(sriov_cap > 0); num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF); + if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) { + return; + } dev->exp.sriov_pf.vf = g_new(PCIDevice *, num_vfs); @@ -212,7 +215,6 @@ static void unregister_vfs(PCIDevice *dev) g_free(dev->exp.sriov_pf.vf); dev->exp.sriov_pf.vf = NULL; dev->exp.sriov_pf.num_vfs = 0; - pci_set_word(dev->config + dev->exp.sriov_cap + PCI_SRIOV_NUM_VF, 0); } void pcie_sriov_config_write(PCIDevice *dev, uint32_t address, @@ -246,16 +248,28 @@ void pcie_sriov_config_write(PCIDevice *dev, uint32_t address, } -/* Reset SR/IOV VF Enable bit to trigger an unregister of all VFs */ -void pcie_sriov_pf_disable_vfs(PCIDevice *dev) +/* Reset SR/IOV */ +void pcie_sriov_pf_reset(PCIDevice *dev) { uint16_t sriov_cap = dev->exp.sriov_cap; - if (sriov_cap) { - uint32_t val = pci_get_byte(dev->config + sriov_cap + PCI_SRIOV_CTRL); - if (val & PCI_SRIOV_CTRL_VFE) { - val &= ~PCI_SRIOV_CTRL_VFE; - pcie_sriov_config_write(dev, sriov_cap + PCI_SRIOV_CTRL, val, 1); - } + if (!sriov_cap) { + return; + } + + pci_set_word(dev->config + sriov_cap + PCI_SRIOV_CTRL, 0); + unregister_vfs(dev); + + pci_set_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF, 0); + + /* + * Default is to use 4K pages, software can modify it + * to any of the supported bits + */ + pci_set_word(dev->config + sriov_cap + PCI_SRIOV_SYS_PGSIZE, 0x1); + + for (uint16_t i = 0; i < PCI_NUM_REGIONS; i++) { + pci_set_quad(dev->config + sriov_cap + PCI_SRIOV_BAR + i * 4, + dev->exp.sriov_pf.vf_bar_type[i]); } } diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c index a3c4e52ce9..e3d5d8f2e2 100644 --- a/hw/smbios/smbios.c +++ b/hw/smbios/smbios.c @@ -121,6 +121,16 @@ struct type8_instance { }; static QTAILQ_HEAD(, type8_instance) type8 = QTAILQ_HEAD_INITIALIZER(type8); +/* type 9 instance for parsing */ +struct type9_instance { + const char *slot_designation, *pcidev; + uint8_t slot_type, slot_data_bus_width, current_usage, slot_length, + slot_characteristics1, slot_characteristics2; + uint16_t slot_id; + QTAILQ_ENTRY(type9_instance) next; +}; +static QTAILQ_HEAD(, type9_instance) type9 = QTAILQ_HEAD_INITIALIZER(type9); + static struct { size_t nvalues; char **values; @@ -380,6 +390,59 @@ static const QemuOptDesc qemu_smbios_type8_opts[] = { { /* end of list */ } }; +static const QemuOptDesc qemu_smbios_type9_opts[] = { + { + .name = "type", + .type = QEMU_OPT_NUMBER, + .help = "SMBIOS element type", + }, + { + .name = "slot_designation", + .type = QEMU_OPT_STRING, + .help = "string number for reference designation", + }, + { + .name = "slot_type", + .type = QEMU_OPT_NUMBER, + .help = "connector type", + }, + { + .name = "slot_data_bus_width", + .type = QEMU_OPT_NUMBER, + .help = "port type", + }, + { + .name = "current_usage", + .type = QEMU_OPT_NUMBER, + .help = "current usage", + }, + { + .name = "slot_length", + .type = QEMU_OPT_NUMBER, + .help = "system slot length", + }, + { + .name = "slot_id", + .type = QEMU_OPT_NUMBER, + .help = "system slot id", + }, + { + .name = "slot_characteristics1", + .type = QEMU_OPT_NUMBER, + .help = "slot characteristics1, see the spec", + }, + { + .name = "slot_characteristics2", + .type = QEMU_OPT_NUMBER, + .help = "slot characteristics2, see the spec", + }, + { + .name = "pci_device", + .type = QEMU_OPT_STRING, + .help = "PCI device, if provided." + } +}; + static const QemuOptDesc qemu_smbios_type11_opts[] = { { .name = "type", @@ -609,6 +672,7 @@ bool smbios_skip_table(uint8_t type, bool required_table) #define T2_BASE 0x200 #define T3_BASE 0x300 #define T4_BASE 0x400 +#define T9_BASE 0x900 #define T11_BASE 0xe00 #define T16_BASE 0x1000 @@ -807,6 +871,65 @@ static void smbios_build_type_8_table(void) } } +static void smbios_build_type_9_table(Error **errp) +{ + unsigned instance = 0; + struct type9_instance *t9; + + QTAILQ_FOREACH(t9, &type9, next) { + SMBIOS_BUILD_TABLE_PRE(9, T9_BASE + instance, true); + + SMBIOS_TABLE_SET_STR(9, slot_designation, t9->slot_designation); + t->slot_type = t9->slot_type; + t->slot_data_bus_width = t9->slot_data_bus_width; + t->current_usage = t9->current_usage; + t->slot_length = t9->slot_length; + t->slot_id = t9->slot_id; + t->slot_characteristics1 = t9->slot_characteristics1; + t->slot_characteristics2 = t9->slot_characteristics2; + + if (t9->pcidev) { + PCIDevice *pdev = NULL; + int rc = pci_qdev_find_device(t9->pcidev, &pdev); + if (rc != 0) { + error_setg(errp, + "No PCI device %s for SMBIOS type 9 entry %s", + t9->pcidev, t9->slot_designation); + return; + } + /* + * We only handle the case were the device is attached to + * the PCI root bus. The general case is more complex as + * bridges are enumerated later and the table would need + * to be updated at this moment. + */ + if (!pci_bus_is_root(pci_get_bus(pdev))) { + error_setg(errp, + "Cannot create type 9 entry for PCI device %s: " + "not attached to the root bus", + t9->pcidev); + return; + } + t->segment_group_number = cpu_to_le16(0); + t->bus_number = pci_dev_bus_num(pdev); + t->device_number = pdev->devfn; + } else { + /* + * Per SMBIOS spec, For slots that are not of the PCI, AGP, PCI-X, + * or PCI-Express type that do not have bus/device/function + * information, 0FFh should be populated in the fields of Segment + * Group Number, Bus Number, Device/Function Number. + */ + t->segment_group_number = 0xff; + t->bus_number = 0xff; + t->device_number = 0xff; + } + + SMBIOS_BUILD_TABLE_POST; + instance++; + } +} + static void smbios_build_type_11_table(void) { char count_str[128]; @@ -1126,6 +1249,7 @@ void smbios_get_tables(MachineState *ms, } smbios_build_type_8_table(); + smbios_build_type_9_table(errp); smbios_build_type_11_table(); #define MAX_DIMM_SZ (16 * GiB) @@ -1460,6 +1584,24 @@ void smbios_entry_add(QemuOpts *opts, Error **errp) t8_i->port_type = qemu_opt_get_number(opts, "port_type", 0); QTAILQ_INSERT_TAIL(&type8, t8_i, next); return; + case 9: { + if (!qemu_opts_validate(opts, qemu_smbios_type9_opts, errp)) { + return; + } + struct type9_instance *t; + t = g_new0(struct type9_instance, 1); + save_opt(&t->slot_designation, opts, "slot_designation"); + t->slot_type = qemu_opt_get_number(opts, "slot_type", 0); + t->slot_data_bus_width = qemu_opt_get_number(opts, "slot_data_bus_width", 0); + t->current_usage = qemu_opt_get_number(opts, "current_usage", 0); + t->slot_length = qemu_opt_get_number(opts, "slot_length", 0); + t->slot_id = qemu_opt_get_number(opts, "slot_id", 0); + t->slot_characteristics1 = qemu_opt_get_number(opts, "slot_characteristics1", 0); + t->slot_characteristics2 = qemu_opt_get_number(opts, "slot_characteristics2", 0); + save_opt(&t->pcidev, opts, "pcidev"); + QTAILQ_INSERT_TAIL(&type9, t, next); + return; + } case 11: if (!qemu_opts_validate(opts, qemu_smbios_type11_opts, errp)) { return; diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 77905d1994..13b6991179 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -30,6 +30,7 @@ vhost_user_write(uint32_t req, uint32_t flags) "req:%d flags:0x%"PRIx32"" vhost_user_create_notifier(int idx, void *n) "idx:%d n:%p" # vhost-vdpa.c +vhost_vdpa_skipped_memory_section(int is_ram, int is_iommu, int is_protected, int is_ram_device, uint64_t first, uint64_t last, int page_mask) "is_ram=%d, is_iommu=%d, is_protected=%d, is_ram_device=%d iova_min=0x%"PRIx64" iova_last=0x%"PRIx64" page_mask=0x%x" vhost_vdpa_dma_map(void *vdpa, int fd, uint32_t msg_type, uint32_t asid, uint64_t iova, uint64_t size, uint64_t uaddr, uint8_t perm, uint8_t type) "vdpa_shared:%p fd: %d msg_type: %"PRIu32" asid: %"PRIu32" iova: 0x%"PRIx64" size: 0x%"PRIx64" uaddr: 0x%"PRIx64" perm: 0x%"PRIx8" type: %"PRIu8 vhost_vdpa_dma_unmap(void *vdpa, int fd, uint32_t msg_type, uint32_t asid, uint64_t iova, uint64_t size, uint8_t type) "vdpa_shared:%p fd: %d msg_type: %"PRIu32" asid: %"PRIu32" iova: 0x%"PRIx64" size: 0x%"PRIx64" type: %"PRIu8 vhost_vdpa_listener_begin_batch(void *v, int fd, uint32_t msg_type, uint8_t type) "vdpa_shared:%p fd: %d msg_type: %"PRIu32" type: %"PRIu8 @@ -57,8 +58,8 @@ vhost_vdpa_dev_start(void *dev, bool started) "dev: %p started: %d" vhost_vdpa_set_log_base(void *dev, uint64_t base, unsigned long long size, int refcnt, int fd, void *log) "dev: %p base: 0x%"PRIx64" size: %llu refcnt: %d fd: %d log: %p" vhost_vdpa_set_vring_addr(void *dev, unsigned int index, unsigned int flags, uint64_t desc_user_addr, uint64_t used_user_addr, uint64_t avail_user_addr, uint64_t log_guest_addr) "dev: %p index: %u flags: 0x%x desc_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" log_guest_addr: 0x%"PRIx64 vhost_vdpa_set_vring_num(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u" -vhost_vdpa_set_vring_base(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u" -vhost_vdpa_get_vring_base(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u" +vhost_vdpa_set_dev_vring_base(void *dev, unsigned int index, unsigned int num, bool svq) "dev: %p index: %u num: %u svq: %d" +vhost_vdpa_get_vring_base(void *dev, unsigned int index, unsigned int num, bool svq) "dev: %p index: %u num: %u svq: %d" vhost_vdpa_set_vring_kick(void *dev, unsigned int index, int fd) "dev: %p index: %u fd: %d" vhost_vdpa_set_vring_call(void *dev, unsigned int index, int fd) "dev: %p index: %u fd: %d" vhost_vdpa_get_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRIx64 @@ -111,7 +112,7 @@ virtio_iommu_device_reset(void) "reset!" virtio_iommu_system_reset(void) "system reset!" virtio_iommu_get_features(uint64_t features) "device supports features=0x%"PRIx64 virtio_iommu_device_status(uint8_t status) "driver status = %d" -virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, uint32_t domain_start, uint32_t domain_end, uint32_t probe_size, uint8_t bypass) "page_size_mask=0x%"PRIx64" input range start=0x%"PRIx64" input range end=0x%"PRIx64" domain range start=%d domain range end=%d probe_size=0x%x bypass=0x%x" +virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, uint32_t domain_start, uint32_t domain_end, uint32_t probe_size, uint8_t bypass) "page_size_mask=0x%"PRIx64" input range start=0x%"PRIx64" input range end=0x%"PRIx64" domain range start=%u domain range end=%u probe_size=0x%x bypass=0x%x" virtio_iommu_set_config(uint8_t bypass) "bypass=0x%x" virtio_iommu_attach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" virtio_iommu_detach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 1af8621481..cdf9af4a4b 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -1610,11 +1610,27 @@ vhost_user_backend_handle_shared_object_add(struct vhost_dev *dev, } static int -vhost_user_backend_handle_shared_object_remove(VhostUserShared *object) +vhost_user_backend_handle_shared_object_remove(struct vhost_dev *dev, + VhostUserShared *object) { QemuUUID uuid; memcpy(uuid.data, object->uuid, sizeof(object->uuid)); + switch (virtio_object_type(&uuid)) { + case TYPE_VHOST_DEV: + { + struct vhost_dev *owner = virtio_lookup_vhost_device(&uuid); + if (dev != owner) { + /* Not allowed to remove non-owned entries */ + return 0; + } + break; + } + default: + /* Not allowed to remove non-owned entries */ + return 0; + } + return virtio_remove_resource(&uuid); } @@ -1793,7 +1809,8 @@ static gboolean backend_read(QIOChannel *ioc, GIOCondition condition, ret = vhost_user_backend_handle_shared_object_add(dev, &payload.object); break; case VHOST_USER_BACKEND_SHARED_OBJECT_REMOVE: - ret = vhost_user_backend_handle_shared_object_remove(&payload.object); + ret = vhost_user_backend_handle_shared_object_remove(dev, + &payload.object); break; case VHOST_USER_BACKEND_SHARED_OBJECT_LOOKUP: ret = vhost_user_backend_handle_shared_object_lookup(dev->opaque, ioc, diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index ddae494ca8..3bcd05cc22 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -47,12 +47,17 @@ static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section, int page_mask) { Int128 llend; + bool is_ram = memory_region_is_ram(section->mr); + bool is_iommu = memory_region_is_iommu(section->mr); + bool is_protected = memory_region_is_protected(section->mr); - if ((!memory_region_is_ram(section->mr) && - !memory_region_is_iommu(section->mr)) || - memory_region_is_protected(section->mr) || - /* vhost-vDPA doesn't allow MMIO to be mapped */ - memory_region_is_ram_device(section->mr)) { + /* vhost-vDPA doesn't allow MMIO to be mapped */ + bool is_ram_device = memory_region_is_ram_device(section->mr); + + if ((!is_ram && !is_iommu) || is_protected || is_ram_device) { + trace_vhost_vdpa_skipped_memory_section(is_ram, is_iommu, is_protected, + is_ram_device, iova_min, + iova_max, page_mask); return true; } @@ -69,7 +74,7 @@ static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section, * size that maps to the kernel */ - if (!memory_region_is_iommu(section->mr)) { + if (!is_iommu) { llend = vhost_vdpa_section_end(section, page_mask); if (int128_gt(llend, int128_make64(iova_max))) { error_report("RAM section out of device range (max=0x%" PRIx64 @@ -555,6 +560,11 @@ static bool vhost_vdpa_first_dev(struct vhost_dev *dev) return v->index == 0; } +static bool vhost_vdpa_last_dev(struct vhost_dev *dev) +{ + return dev->vq_index + dev->nvqs == dev->vq_index_end; +} + static int vhost_vdpa_get_dev_features(struct vhost_dev *dev, uint64_t *features) { @@ -965,7 +975,10 @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config, static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev, struct vhost_vring_state *ring) { - trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num); + struct vhost_vdpa *v = dev->opaque; + + trace_vhost_vdpa_set_dev_vring_base(dev, ring->index, ring->num, + v->shadow_vqs_enabled); return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring); } @@ -1315,7 +1328,7 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); } - if (dev->vq_index + dev->nvqs != dev->vq_index_end) { + if (!vhost_vdpa_last_dev(dev)) { return 0; } @@ -1337,7 +1350,7 @@ static void vhost_vdpa_reset_status(struct vhost_dev *dev) { struct vhost_vdpa *v = dev->opaque; - if (dev->vq_index + dev->nvqs != dev->vq_index_end) { + if (!vhost_vdpa_last_dev(dev)) { return; } @@ -1407,6 +1420,7 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, if (v->shadow_vqs_enabled) { ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index); + trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num, true); return 0; } @@ -1419,7 +1433,7 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, } ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring); - trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num); + trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num, false); return ret; } @@ -1447,7 +1461,15 @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, /* Remember last call fd because we can switch to SVQ anytime. */ vhost_svq_set_svq_call_fd(svq, file->fd); - if (v->shadow_vqs_enabled) { + /* + * When SVQ is transitioning to off, shadow_vqs_enabled has + * not been set back to false yet, but the underlying call fd + * will have to switch back to the guest notifier to signal the + * passthrough virtqueues. In other situations, SVQ's own call + * fd shall be used to signal the device model. + */ + if (v->shadow_vqs_enabled && + v->shared->svq_switching != SVQ_TSTATE_DISABLING) { return 0; } diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 86623d55a5..1326c6ec41 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -29,6 +29,7 @@ #include "sysemu/reset.h" #include "sysemu/sysemu.h" #include "qemu/reserved-region.h" +#include "qemu/units.h" #include "qapi/error.h" #include "qemu/error-report.h" #include "trace.h" @@ -1115,8 +1116,8 @@ static int virtio_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu_mr, } /* - * The default mask (TARGET_PAGE_MASK) is the smallest supported guest granule, - * for example 0xfffffffffffff000. When an assigned device has page size + * The default mask depends on the "granule" property. For example, with + * 4k granule, it is -(4 * KiB). When an assigned device has page size * restrictions due to the hardware IOMMU configuration, apply this restriction * to the mask. */ @@ -1313,8 +1314,32 @@ static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) * in vfio realize */ s->config.bypass = s->boot_bypass; - s->config.page_size_mask = qemu_target_page_mask(); - s->config.input_range.end = UINT64_MAX; + if (s->aw_bits < 32 || s->aw_bits > 64) { + error_setg(errp, "aw-bits must be within [32,64]"); + return; + } + s->config.input_range.end = + s->aw_bits == 64 ? UINT64_MAX : BIT_ULL(s->aw_bits) - 1; + + switch (s->granule_mode) { + case GRANULE_MODE_4K: + s->config.page_size_mask = -(4 * KiB); + break; + case GRANULE_MODE_8K: + s->config.page_size_mask = -(8 * KiB); + break; + case GRANULE_MODE_16K: + s->config.page_size_mask = -(16 * KiB); + break; + case GRANULE_MODE_64K: + s->config.page_size_mask = -(64 * KiB); + break; + case GRANULE_MODE_HOST: + s->config.page_size_mask = qemu_real_host_page_mask(); + break; + default: + error_setg(errp, "Unsupported granule mode"); + } s->config.domain_range.end = UINT32_MAX; s->config.probe_size = VIOMMU_PROBE_SIZE; @@ -1522,6 +1547,9 @@ static Property virtio_iommu_properties[] = { DEFINE_PROP_LINK("primary-bus", VirtIOIOMMU, primary_bus, TYPE_PCI_BUS, PCIBus *), DEFINE_PROP_BOOL("boot-bypass", VirtIOIOMMU, boot_bypass, true), + DEFINE_PROP_GRANULE_MODE("granule", VirtIOIOMMU, granule_mode, + GRANULE_MODE_HOST), + DEFINE_PROP_UINT8("aw-bits", VirtIOIOMMU, aw_bits, 64), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index cb6940fc0e..eaaf86402c 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1442,6 +1442,155 @@ int virtio_pci_add_shm_cap(VirtIOPCIProxy *proxy, return virtio_pci_add_mem_cap(proxy, &cap.cap); } +/* Called within call_rcu(). */ +static void bitmap_free_region_cache(BitmapMemoryRegionCaches *caches) +{ + assert(caches != NULL); + address_space_cache_destroy(&caches->bitmap); + g_free(caches); +} + +static void lm_disable(VirtIODevice *vdev) +{ + BitmapMemoryRegionCaches *caches; + caches = qatomic_read(&vdev->caches); + qatomic_rcu_set(&vdev->caches, NULL); + if (caches) { + call_rcu(caches, bitmap_free_region_cache, rcu); + } +} + +static void lm_enable(VirtIODevice *vdev) +{ + BitmapMemoryRegionCaches *old = vdev->caches; + BitmapMemoryRegionCaches *new = NULL; + hwaddr addr, end, size; + int64_t len; + + addr = vdev->lm_base_addr_low | ((hwaddr)(vdev->lm_base_addr_high) << 32); + end = vdev->lm_end_addr_low | ((hwaddr)(vdev->lm_end_addr_high) << 32); + size = end - addr; + if (size <= 0) { + error_report("Invalid lm size."); + return; + } + + new = g_new0(BitmapMemoryRegionCaches, 1); + len = address_space_cache_init(&new->bitmap, vdev->dma_as, addr, size, + true); + if (len < size) { + virtio_error(vdev, "Cannot map bitmap"); + goto err_bitmap; + } + qatomic_rcu_set(&vdev->caches, new); + + if (old) { + call_rcu(old, bitmap_free_region_cache, rcu); + } + + return; + +err_bitmap: + address_space_cache_destroy(&new->bitmap); + g_free(new); +} + +static uint64_t virtio_pci_lm_read(void *opaque, hwaddr addr, + unsigned size) +{ + VirtIOPCIProxy *proxy = opaque; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + hwaddr offset_end = LM_VRING_STATE_OFFSET + + virtio_pci_queue_mem_mult(proxy) * VIRTIO_QUEUE_MAX; + uint32_t val; + int qid; + + if (vdev == NULL) { + return UINT64_MAX; + } + switch (addr) { + case LM_LOGGING_CTRL: + val = vdev->lm_logging_ctrl; + break; + case LM_BASE_ADDR_LOW: + val = vdev->lm_base_addr_low; + break; + case LM_BASE_ADDR_HIGH: + val = vdev->lm_base_addr_high; + break; + case LM_END_ADDR_LOW: + val = vdev->lm_end_addr_low; + break; + case LM_END_ADDR_HIGH: + val = vdev->lm_end_addr_high; + break; + default: + if (addr >= LM_VRING_STATE_OFFSET && addr <= offset_end) { + qid = (addr - LM_VRING_STATE_OFFSET) / + virtio_pci_queue_mem_mult(proxy); + val = virtio_queue_get_vring_states(vdev, qid); + } else + val = 0; + + break; + } + + return val; +} + +static void virtio_pci_lm_write(void *opaque, hwaddr addr, + uint64_t val, unsigned size) +{ + VirtIOPCIProxy *proxy = opaque; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + hwaddr offset_end = LM_VRING_STATE_OFFSET + + virtio_pci_queue_mem_mult(proxy) * VIRTIO_QUEUE_MAX; + int qid; + + if (vdev == NULL) { + return; + } + + switch (addr) { + case LM_LOGGING_CTRL: + vdev->lm_logging_ctrl = val; + switch (val) { + case LM_DISABLE: + lm_disable(vdev); + break; + case LM_ENABLE: + lm_enable(vdev); + break; + default: + virtio_error(vdev, "Unsupport LM_LOGGING_CTRL value: %"PRIx64, + val); + break; + }; + + break; + case LM_BASE_ADDR_LOW: + vdev->lm_base_addr_low = val; + break; + case LM_BASE_ADDR_HIGH: + vdev->lm_base_addr_high = val; + break; + case LM_END_ADDR_LOW: + vdev->lm_end_addr_low = val; + break; + case LM_END_ADDR_HIGH: + vdev->lm_end_addr_high = val; + break; + default: + if (addr >= LM_VRING_STATE_OFFSET && addr <= offset_end) { + qid = (addr - LM_VRING_STATE_OFFSET) / + virtio_pci_queue_mem_mult(proxy); + virtio_queue_set_vring_states(vdev, qid, val); + } else + virtio_error(vdev, "Unsupport addr: %"PRIx64, addr); + break; + } +} + static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr, unsigned size) { @@ -1823,6 +1972,15 @@ static void virtio_pci_modern_regions_init(VirtIOPCIProxy *proxy, }, .endianness = DEVICE_LITTLE_ENDIAN, }; + static const MemoryRegionOps lm_ops = { + .read = virtio_pci_lm_read, + .write = virtio_pci_lm_write, + .impl = { + .min_access_size = 1, + .max_access_size = 4, + }, + .endianness = DEVICE_LITTLE_ENDIAN, + }; g_autoptr(GString) name = g_string_new(NULL); g_string_printf(name, "virtio-pci-common-%s", vdev_name); @@ -1859,6 +2017,14 @@ static void virtio_pci_modern_regions_init(VirtIOPCIProxy *proxy, proxy, name->str, proxy->notify_pio.size); + if (proxy->flags & VIRTIO_PCI_FLAG_VDPA) { + g_string_printf(name, "virtio-pci-lm-%s", vdev_name); + memory_region_init_io(&proxy->lm.mr, OBJECT(proxy), + &lm_ops, + proxy, + name->str, + proxy->lm.size); + } } static void virtio_pci_modern_region_map(VirtIOPCIProxy *proxy, @@ -2021,6 +2187,10 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp) virtio_pci_modern_mem_region_map(proxy, &proxy->isr, &cap); virtio_pci_modern_mem_region_map(proxy, &proxy->device, &cap); virtio_pci_modern_mem_region_map(proxy, &proxy->notify, ¬ify.cap); + if (proxy->flags & VIRTIO_PCI_FLAG_VDPA) { + memory_region_add_subregion(&proxy->modern_bar, + proxy->lm.offset, &proxy->lm.mr); + } if (modern_pio) { memory_region_init(&proxy->io_bar, OBJECT(proxy), @@ -2090,6 +2260,9 @@ static void virtio_pci_device_unplugged(DeviceState *d) virtio_pci_modern_mem_region_unmap(proxy, &proxy->isr); virtio_pci_modern_mem_region_unmap(proxy, &proxy->device); virtio_pci_modern_mem_region_unmap(proxy, &proxy->notify); + if (proxy->flags & VIRTIO_PCI_FLAG_VDPA) { + memory_region_del_subregion(&proxy->modern_bar, &proxy->lm.mr); + } if (modern_pio) { virtio_pci_modern_io_region_unmap(proxy, &proxy->notify_pio); } @@ -2144,9 +2317,17 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) proxy->notify_pio.type = VIRTIO_PCI_CAP_NOTIFY_CFG; /* subclasses can enforce modern, so do this unconditionally */ - memory_region_init(&proxy->modern_bar, OBJECT(proxy), "virtio-pci", - /* PCI BAR regions must be powers of 2 */ - pow2ceil(proxy->notify.offset + proxy->notify.size)); + if (!(proxy->flags & VIRTIO_PCI_FLAG_VDPA)) { + memory_region_init(&proxy->modern_bar, OBJECT(proxy), "virtio-pci", + /* PCI BAR regions must be powers of 2 */ + pow2ceil(proxy->notify.offset + proxy->notify.size)); + } else { + proxy->lm.offset = proxy->notify.offset + proxy->notify.size; + proxy->lm.size = 0x20 + VIRTIO_QUEUE_MAX * 4; + memory_region_init(&proxy->modern_bar, OBJECT(proxy), "virtio-pci", + /* PCI BAR regions must be powers of 2 */ + pow2ceil(proxy->lm.offset + proxy->lm.size)); + } if (proxy->disable_legacy == ON_OFF_AUTO_AUTO) { proxy->disable_legacy = pcie_port ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; @@ -2301,6 +2482,8 @@ static Property virtio_pci_properties[] = { VIRTIO_PCI_FLAG_INIT_FLR_BIT, true), DEFINE_PROP_BIT("aer", VirtIOPCIProxy, flags, VIRTIO_PCI_FLAG_AER_BIT, false), + DEFINE_PROP_BIT("vdpa", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_VDPA_BIT, false), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index d229755eae..fb6b4ccd83 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -3368,6 +3368,18 @@ static uint16_t virtio_queue_split_get_last_avail_idx(VirtIODevice *vdev, return vdev->vq[n].last_avail_idx; } +static uint32_t virtio_queue_split_get_vring_states(VirtIODevice *vdev, + int n) +{ + struct VirtQueue *vq = &vdev->vq[n]; + uint16_t avail, used; + + avail = vq->last_avail_idx; + used = vq->used_idx; + + return avail | (uint32_t)used << 16; +} + unsigned int virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n) { if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) { @@ -3377,6 +3389,33 @@ unsigned int virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n) } } +unsigned int virtio_queue_get_vring_states(VirtIODevice *vdev, int n) +{ + if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) { + return -1; + } else { + return virtio_queue_split_get_vring_states(vdev, n); + } +} + +static void virtio_queue_split_set_vring_states(VirtIODevice *vdev, + int n, uint32_t idx) +{ + struct VirtQueue *vq = &vdev->vq[n]; + vq->last_avail_idx = (uint16_t)(idx & 0xffff); + vq->shadow_avail_idx = (uint16_t)(idx & 0xffff); + vq->used_idx = (uint16_t)(idx >> 16); +} + +void virtio_queue_set_vring_states(VirtIODevice *vdev, int n, uint32_t idx) +{ + if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) { + return; + } else { + virtio_queue_split_set_vring_states(vdev, n, idx); + } +} + static void virtio_queue_packed_set_last_avail_idx(VirtIODevice *vdev, int n, unsigned int idx) { |