56 files changed, 1425 insertions, 381 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index a3130f64fd..409d7db4d4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2370,6 +2370,11 @@ F: hw/virtio/vhost-user-scmi* F: include/hw/virtio/vhost-user-scmi.h F: tests/qtest/libqos/virtio-scmi.* +vdpa-net +M: Hao Chen <chenh@yusur.tech> +S: Maintained +F: docs/system/devices/vdpa-net.rst + virtio-crypto M: Gonglei <arei.gonglei@huawei.com> S: Supported diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst index d1ed39dfa0..d8419fd2f1 100644 --- a/docs/interop/vhost-user.rst +++ b/docs/interop/vhost-user.rst @@ -1839,7 +1839,9 @@ is sent by the front-end. When the ``VHOST_USER_PROTOCOL_F_SHARED_OBJECT`` protocol feature has been successfully negotiated, this message can be submitted by the backend to remove itself from the virtio-dmabuf shared - table API. The shared table will remove the back-end device associated with + table API. Only the back-end owning the entry (i.e., the one that first added + it) has permission to remove it. Otherwise, the message is ignored. + The shared table will remove the back-end device associated with the UUID. If ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, and the back-end sets the ``VHOST_USER_NEED_REPLY`` flag, the front-end must respond with zero when the operation completes successfully, or non-zero otherwise. diff --git a/docs/specs/pvpanic.rst b/docs/specs/pvpanic.rst index f894bc1955..61a80480ed 100644 --- a/docs/specs/pvpanic.rst +++ b/docs/specs/pvpanic.rst @@ -29,6 +29,8 @@ bit 1 a guest panic has happened and will be handled by the guest; the host should record it or report it, but should not affect the execution of the guest. +bit 2 + a regular guest shutdown has happened and should be processed by the host PCI Interface ------------- diff --git a/docs/system/device-emulation.rst b/docs/system/device-emulation.rst index f19777411c..e4a27f53c8 100644 --- a/docs/system/device-emulation.rst +++ b/docs/system/device-emulation.rst @@ -99,3 +99,4 @@ Emulated Devices devices/canokey.rst devices/usb-u2f.rst devices/igb.rst + devices/vdpa-net.rst diff --git a/docs/system/devices/vdpa-net.rst b/docs/system/devices/vdpa-net.rst new file mode 100644 index 0000000000..323d8c926a --- /dev/null +++ b/docs/system/devices/vdpa-net.rst @@ -0,0 +1,121 @@ +vdpa net +============ + +This document explains the setup and usage of the vdpa network device. +The vdpa network device is a paravirtualized device that provides vDPA emulation. + +Description +----------- + +vDPA net devices support dirty page bitmap marking and vring state saving and recovery. + +Users can use this vDPA device to simulate and test live migration in a nested virtualization environment. + +Registers layout +---------------- + +The vdpa device adds live migration registers, laid out as follows::

+      Offset       Register Name         Bitwidth   Associated vq
+      0x0          LM_LOGGING_CTRL       4bits
+      0x10         LM_BASE_ADDR_LOW      32bits
+      0x14         LM_BASE_ADDR_HIGH     32bits
+      0x18         LM_END_ADDR_LOW       32bits
+      0x1c         LM_END_ADDR_HIGH      32bits
+      0x20         LM_RING_STATE_OFFSET  32bits     vq0
+      0x24         LM_RING_STATE_OFFSET  32bits     vq1
+      0x28         LM_RING_STATE_OFFSET  32bits     vq2
+      ......
+      0x20+1023*4  LM_RING_STATE_OFFSET  32bits     vq1023

+These registers are appended at the end of the notify BAR space.
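+For illustration only, the register addresses and dirty-bitmap positions
+implied by the layout above can be derived as in the following C sketch;
+the helper names are hypothetical and not part of the device interface::
+
+    #include <stdint.h>
+
+    /* Per-queue state register: 0x20 + vq * 4, for vq0..vq1023. */
+    static inline uint64_t lm_ring_state_reg(unsigned int vq)
+    {
+        return 0x20 + (uint64_t)vq * 4;
+    }
+
+    /* The 64-bit bitmap base/end addresses are split into 32-bit halves. */
+    static inline uint64_t lm_addr64(uint32_t low, uint32_t high)
+    {
+        return ((uint64_t)high << 32) | low;
+    }
+
+    /* One dirty bit per chunk of guest memory: byte index and bit position. */
+    static inline void lm_bitmap_pos(uint64_t gpa, uint64_t chunk_size,
+                                     uint64_t *byte, unsigned int *bit)
+    {
+        uint64_t chunk = gpa / chunk_size;
+        *byte = chunk / 8;
+        *bit = (unsigned int)(chunk % 8);
+    }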
+ +Architecture diagram +-------------------- +:: + + |------------------------------------------------------------------------| + | guest-L1-user-space | + | | + | |----------------------------------------| + | | [virtio-net driver] | + | | ^ guest-L2-src(iommu=on) | + | |--------------|-------------------------| + | | | qemu-L2-src(viommu) | + | [dpdk-vdpa]<->[vhost socket]<-+->[vhost-user backend(iommu=on)] | + -------------------------------------------------------------------------- + -------------------------------------------------------------------------- + | ^ guest-L1-kernel-space | + | | | + | [VFIO] | + | ^ | + | | guest-L1-src(iommu=on) | + --------|----------------------------------------------------------------- + --------|----------------------------------------------------------------- + | [vdpa net device(iommu=on)] [manager nic device] | + | | | | + | | | | + | [tap device] qemu-L1-src(viommu) | | + ------------------------------------------------+------------------------- + | + | + --------------------- | + | kernel net bridge |<----- + | virbr0 |<---------------------------------- + --------------------- | + | + | + -------------------------------------------------------------------------- | + | guest-L1-user-space | | + | | | + | |----------------------------------------| | + | | [virtio-net driver] | | + | | ^ guest-L2-dst(iommu=on) | | + | |--------------|-------------------------| | + | | | qemu-L2-dst(viommu) | | + | [dpdk-vdpa]<->[vhost socket]<-+->[vhost-user backend(iommu=on)] | | + -------------------------------------------------------------------------- | + -------------------------------------------------------------------------- | + | ^ guest-L1-kernel-space | | + | | | | + | [VFIO] | | + | ^ | | + | | guest-L1-dst(iommu=on) | | + --------|----------------------------------------------------------------- | + --------|----------------------------------------------------------------- | + | [vdpa net device(iommu=on)] [manager nic device]----------------+---- + | | | + | | | + | [tap device] qemu-L1-dst(viommu) | + -------------------------------------------------------------------------- + +Device properties +----------------- + +The Virtio vdpa device can be configured with the following properties: + + * ``vdpa=on`` enables the emulated vdpa device. + +Usages +-------- +This patch adds virtio SR-IOV support and vdpa live migration support. +You can enable vdpa by setting the libvirt XML file as follows:: + + <qemu:commandline xmlns:qemu='http://libvirt.org/schemas/domain/qemu/1.0'> + <qemu:arg value='-device'/> + <qemu:arg value='intel-iommu,intremap=on,device-iotlb=on,aw-bits=48'/> + <qemu:arg value='-netdev'/> + <qemu:arg value='tap,id=hostnet1,script=no,downscript=no,vhost=off'/> + <qemu:arg value='-device'/> + <qemu:arg value='virtio-net-pci,netdev=hostnet1,id=net1,mac=56:4a:b7:4f:4d:a9,bus=pci.6,addr=0x0,iommu_platform=on,ats=on,vdpa=on'/> + </qemu:commandline> + +Limitations +----------- +1. Depends on a tap device with ``vhost=off``. +2. The nested virtualization environment only supports ``q35`` machines. +3. Currently, only split vring live migration is supported. + + + diff --git a/hw/acpi/acpi_generic_initiator.c b/hw/acpi/acpi_generic_initiator.c new file mode 100644 index 0000000000..17b9a052f5 --- /dev/null +++ b/hw/acpi/acpi_generic_initiator.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
All rights reserved + */ + +#include "qemu/osdep.h" +#include "hw/acpi/acpi_generic_initiator.h" +#include "hw/acpi/aml-build.h" +#include "hw/boards.h" +#include "hw/pci/pci_device.h" +#include "qemu/error-report.h" + +typedef struct AcpiGenericInitiatorClass { + ObjectClass parent_class; +} AcpiGenericInitiatorClass; + +OBJECT_DEFINE_TYPE_WITH_INTERFACES(AcpiGenericInitiator, acpi_generic_initiator, + ACPI_GENERIC_INITIATOR, OBJECT, + { TYPE_USER_CREATABLE }, + { NULL }) + +OBJECT_DECLARE_SIMPLE_TYPE(AcpiGenericInitiator, ACPI_GENERIC_INITIATOR) + +static void acpi_generic_initiator_init(Object *obj) +{ + AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj); + + gi->node = MAX_NODES; + gi->pci_dev = NULL; +} + +static void acpi_generic_initiator_finalize(Object *obj) +{ + AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj); + + g_free(gi->pci_dev); +} + +static void acpi_generic_initiator_set_pci_device(Object *obj, const char *val, + Error **errp) +{ + AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj); + + gi->pci_dev = g_strdup(val); +} + +static void acpi_generic_initiator_set_node(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj); + MachineState *ms = MACHINE(qdev_get_machine()); + uint32_t value; + + if (!visit_type_uint32(v, name, &value, errp)) { + return; + } + + if (value >= MAX_NODES) { + error_printf("%s: Invalid NUMA node specified\n", + TYPE_ACPI_GENERIC_INITIATOR); + exit(1); + } + + gi->node = value; + ms->numa_state->nodes[gi->node].has_gi = true; +} + +static void acpi_generic_initiator_class_init(ObjectClass *oc, void *data) +{ + object_class_property_add_str(oc, "pci-dev", NULL, + acpi_generic_initiator_set_pci_device); + object_class_property_add(oc, "node", "int", NULL, + acpi_generic_initiator_set_node, NULL, NULL); +} + +/* + * ACPI 6.3: + * Table 5-78 Generic Initiator Affinity Structure + */ +static void +build_srat_generic_pci_initiator_affinity(GArray *table_data, int node, + PCIDeviceHandle *handle) +{ + uint8_t index; + + build_append_int_noprefix(table_data, 5, 1); /* Type */ + build_append_int_noprefix(table_data, 32, 1); /* Length */ + build_append_int_noprefix(table_data, 0, 1); /* Reserved */ + build_append_int_noprefix(table_data, 1, 1); /* Device Handle Type: PCI */ + build_append_int_noprefix(table_data, node, 4); /* Proximity Domain */ + + /* Device Handle - PCI */ + build_append_int_noprefix(table_data, handle->segment, 2); + build_append_int_noprefix(table_data, handle->bdf, 2); + for (index = 0; index < 12; index++) { + build_append_int_noprefix(table_data, 0, 1); + } + + build_append_int_noprefix(table_data, GEN_AFFINITY_ENABLED, 4); /* Flags */ + build_append_int_noprefix(table_data, 0, 4); /* Reserved */ +} + +static int build_all_acpi_generic_initiators(Object *obj, void *opaque) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + AcpiGenericInitiator *gi; + GArray *table_data = opaque; + PCIDeviceHandle dev_handle; + PCIDevice *pci_dev; + Object *o; + + if (!object_dynamic_cast(obj, TYPE_ACPI_GENERIC_INITIATOR)) { + return 0; + } + + gi = ACPI_GENERIC_INITIATOR(obj); + if (gi->node >= ms->numa_state->num_nodes) { + error_printf("%s: Specified node %d is invalid.\n", + TYPE_ACPI_GENERIC_INITIATOR, gi->node); + exit(1); + } + + o = object_resolve_path_type(gi->pci_dev, TYPE_PCI_DEVICE, NULL); + if (!o) { + error_printf("%s: Specified device must be a PCI device.\n", + TYPE_ACPI_GENERIC_INITIATOR); + exit(1); + } + + pci_dev = PCI_DEVICE(o); + + 
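+    /*
+     * Encode the ACPI Device Handle: the PCI segment is hardcoded to 0,
+     * and the BDF word packs the device's current bus number together
+     * with its devfn.
+     */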
dev_handle.segment = 0; + dev_handle.bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), + pci_dev->devfn); + + build_srat_generic_pci_initiator_affinity(table_data, + gi->node, &dev_handle); + + return 0; +} + +void build_srat_generic_pci_initiator(GArray *table_data) +{ + object_child_foreach_recursive(object_get_root(), + build_all_acpi_generic_initiators, + table_data); +} diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c index 3042d223c8..9b1662b6b8 100644 --- a/hw/acpi/hmat.c +++ b/hw/acpi/hmat.c @@ -78,6 +78,7 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, uint32_t *initiator_list) { int i, index; + uint32_t initiator_to_index[MAX_NODES] = {}; HMAT_LB_Data *lb_data; uint16_t *entry_list; uint32_t base; @@ -121,6 +122,8 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, /* Initiator Proximity Domain List */ for (i = 0; i < num_initiator; i++) { build_append_int_noprefix(table_data, initiator_list[i], 4); + /* Reverse mapping for array positions */ + initiator_to_index[initiator_list[i]] = i; } /* Target Proximity Domain List */ @@ -132,7 +135,8 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, entry_list = g_new0(uint16_t, num_initiator * num_target); for (i = 0; i < hmat_lb->list->len; i++) { lb_data = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); - index = lb_data->initiator * num_target + lb_data->target; + index = initiator_to_index[lb_data->initiator] * num_target + + lb_data->target; entry_list[index] = (uint16_t)(lb_data->data / hmat_lb->base); } @@ -204,6 +208,13 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) build_append_int_noprefix(table_data, 0, 4); /* Reserved */ for (i = 0; i < numa_state->num_nodes; i++) { + /* + * Linux rejects the whole HMAT table if a node with no memory + * has one of these structures listing it as a target.
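+ * Skipping such nodes here keeps the rest of the table usable.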
+ */ + if (!numa_state->nodes[i].node_mem) { + continue; + } flags = 0; if (numa_state->nodes[i].initiator < MAX_NODES) { @@ -214,7 +225,7 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) } for (i = 0; i < numa_state->num_nodes; i++) { - if (numa_state->nodes[i].has_cpu) { + if (numa_state->nodes[i].has_cpu || numa_state->nodes[i].has_gi) { initiator_list[num_initiator++] = i; } } diff --git a/hw/acpi/meson.build b/hw/acpi/meson.build index 5441c9b1e4..fa5c07db90 100644 --- a/hw/acpi/meson.build +++ b/hw/acpi/meson.build @@ -1,5 +1,6 @@ acpi_ss = ss.source_set() acpi_ss.add(files( + 'acpi_generic_initiator.c', 'acpi_interface.c', 'aml-build.c', 'bios-linker-loader.c', diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 6a1bde61ce..c3ccfef026 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -57,6 +57,7 @@ #include "migration/vmstate.h" #include "hw/acpi/ghes.h" #include "hw/acpi/viot.h" +#include "hw/acpi/acpi_generic_initiator.h" #include "hw/virtio/virtio-acpi.h" #include "target/arm/multiprocessing.h" @@ -504,6 +505,8 @@ build_srat(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) } } + build_srat_generic_pci_initiator(table_data); + if (ms->nvdimms_state->is_enabled) { nvdimm_build_srat(table_data); } diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 0af1943697..e5cd935232 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -85,11 +85,28 @@ #include "hw/char/pl011.h" #include "qemu/guest-random.h" +static GlobalProperty arm_virt_compat[] = { + { TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "48" }, +}; +static const size_t arm_virt_compat_len = G_N_ELEMENTS(arm_virt_compat); + +/* + * This cannot be called from virt_machine_class_init() because + * TYPE_VIRT_MACHINE is abstract, and mc->compat_props g_ptr_array_new() + * is only called when the non-abstract virt machine classes are + * initialized. + */ +static void arm_virt_compat_set(MachineClass *mc) +{ + compat_props_add(mc->compat_props, arm_virt_compat, + arm_virt_compat_len); +} + #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \ static void virt_##major##_##minor##_class_init(ObjectClass *oc, \ void *data) \ { \ MachineClass *mc = MACHINE_CLASS(oc); \ + arm_virt_compat_set(mc); \ virt_machine_##major##_##minor##_options(mc); \ mc->desc = "QEMU " # major "."
# minor " ARM Virtual Machine"; \ if (latest) { \ diff --git a/hw/audio/virtio-snd.c b/hw/audio/virtio-snd.c index ea2aeaef14..e604d8f30c 100644 --- a/hw/audio/virtio-snd.c +++ b/hw/audio/virtio-snd.c @@ -243,12 +243,13 @@ static void virtio_snd_handle_pcm_info(VirtIOSound *s, memset(&pcm_info[i].padding, 0, 5); } + cmd->payload_size = sizeof(virtio_snd_pcm_info) * count; cmd->resp.code = cpu_to_le32(VIRTIO_SND_S_OK); iov_from_buf(cmd->elem->in_sg, cmd->elem->in_num, sizeof(virtio_snd_hdr), pcm_info, - sizeof(virtio_snd_pcm_info) * count); + cmd->payload_size); } /* @@ -749,7 +750,8 @@ process_cmd(VirtIOSound *s, virtio_snd_ctrl_command *cmd) 0, &cmd->resp, sizeof(virtio_snd_hdr)); - virtqueue_push(cmd->vq, cmd->elem, sizeof(virtio_snd_hdr)); + virtqueue_push(cmd->vq, cmd->elem, + sizeof(virtio_snd_hdr) + cmd->payload_size); virtio_notify(VIRTIO_DEVICE(s), cmd->vq); } @@ -808,6 +810,7 @@ static void virtio_snd_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) cmd->elem = elem; cmd->vq = vq; cmd->resp.code = cpu_to_le32(VIRTIO_SND_S_OK); + /* implicit cmd->payload_size = 0; */ QTAILQ_INSERT_TAIL(&s->cmdq, cmd, next); elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); } diff --git a/hw/core/machine.c b/hw/core/machine.c index e483b34459..37ede0e7d4 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -30,10 +30,13 @@ #include "exec/confidential-guest-support.h" #include "hw/virtio/virtio-pci.h" #include "hw/virtio/virtio-net.h" +#include "hw/virtio/virtio-iommu.h" #include "audio/audio.h" GlobalProperty hw_compat_8_2[] = { { "migration", "zero-page-detection", "legacy"}, + { TYPE_VIRTIO_IOMMU_PCI, "granule", "4k" }, + { TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "64" }, }; const size_t hw_compat_8_2_len = G_N_ELEMENTS(hw_compat_8_2); diff --git a/hw/core/numa.c b/hw/core/numa.c index 81d2124349..f8ce332cfe 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -227,7 +227,8 @@ void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, node->target, numa_state->num_nodes); return; } - if (!numa_info[node->initiator].has_cpu) { + if (!numa_info[node->initiator].has_cpu && + !numa_info[node->initiator].has_gi) { error_setg(errp, "Invalid initiator=%d, it isn't an " "initiator proximity domain", node->initiator); return; diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index f52073b7c8..d79d6f4b53 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -966,7 +966,7 @@ const PropertyInfo qdev_prop_off_auto_pcibar = { .set_default_value = qdev_propinfo_set_default_value_enum, }; -/* --- PCIELinkSpeed 2_5/5/8/16 -- */ +/* --- PCIELinkSpeed 2_5/5/8/16/32/64 -- */ static void get_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) @@ -988,6 +988,12 @@ static void get_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name, case QEMU_PCI_EXP_LNK_16GT: speed = PCIE_LINK_SPEED_16; break; + case QEMU_PCI_EXP_LNK_32GT: + speed = PCIE_LINK_SPEED_32; + break; + case QEMU_PCI_EXP_LNK_64GT: + speed = PCIE_LINK_SPEED_64; + break; default: /* Unreachable */ abort(); @@ -1021,6 +1027,12 @@ static void set_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name, case PCIE_LINK_SPEED_16: *p = QEMU_PCI_EXP_LNK_16GT; break; + case PCIE_LINK_SPEED_32: + *p = QEMU_PCI_EXP_LNK_32GT; + break; + case PCIE_LINK_SPEED_64: + *p = QEMU_PCI_EXP_LNK_64GT; + break; default: /* Unreachable */ abort(); @@ -1029,7 +1041,7 @@ static void set_prop_pcielinkspeed(Object *obj, Visitor *v, const char *name, const 
PropertyInfo qdev_prop_pcie_link_speed = { .name = "PCIELinkSpeed", - .description = "2_5/5/8/16", + .description = "2_5/5/8/16/32/64", .enum_table = &PCIELinkSpeed_lookup, .get = get_prop_pcielinkspeed, .set = set_prop_pcielinkspeed, diff --git a/hw/cxl/cxl-component-utils.c b/hw/cxl/cxl-component-utils.c index 84ab503325..cd116c0401 100644 --- a/hw/cxl/cxl-component-utils.c +++ b/hw/cxl/cxl-component-utils.c @@ -297,6 +297,7 @@ void cxl_component_register_init_common(uint32_t *reg_state, caps = 3; break; case CXL2_ROOT_PORT: + case CXL2_RC: /* + Extended Security, + Snoop */ caps = 5; break; @@ -326,8 +327,19 @@ void cxl_component_register_init_common(uint32_t *reg_state, CXL_##reg##_REGISTERS_OFFSET); \ } while (0) + switch (type) { + case CXL2_DEVICE: + case CXL2_TYPE3_DEVICE: + case CXL2_LOGICAL_DEVICE: + case CXL2_ROOT_PORT: + case CXL2_UPSTREAM_PORT: + case CXL2_DOWNSTREAM_PORT: init_cap_reg(RAS, 2, CXL_RAS_CAPABILITY_VERSION); - ras_init_common(reg_state, write_msk); + ras_init_common(reg_state, write_msk); + break; + default: + break; + } init_cap_reg(LINK, 4, CXL_LINK_CAPABILITY_VERSION); @@ -335,9 +347,10 @@ void cxl_component_register_init_common(uint32_t *reg_state, return; } - init_cap_reg(HDM, 5, CXL_HDM_CAPABILITY_VERSION); - hdm_init_common(reg_state, write_msk, type); - + if (type != CXL2_ROOT_PORT) { + init_cap_reg(HDM, 5, CXL_HDM_CAPABILITY_VERSION); + hdm_init_common(reg_state, write_msk, type); + } if (caps < 5) { return; } diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 15242b9096..53f804ac16 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -68,6 +68,7 @@ #include "hw/acpi/utils.h" #include "hw/acpi/pci.h" #include "hw/acpi/cxl.h" +#include "hw/acpi/acpi_generic_initiator.h" #include "qom/qom-qobject.h" #include "hw/i386/amd_iommu.h" @@ -2046,6 +2047,8 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) build_srat_memory(table_data, 0, 0, 0, MEM_AFFINITY_NOFLAGS); } + build_srat_generic_pci_initiator(table_data); + /* * Entry is required for Windows to enable memory hotplug in OS * and for Linux to enable SWIOTLB when booted with less than diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 4f322e0856..feb7a93083 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -425,9 +425,10 @@ static void set_boot_dev(PCMachineState *pcms, MC146818RtcState *s, static void pc_boot_set(void *opaque, const char *boot_device, Error **errp) { - PCMachineState *pcms = PC_MACHINE(current_machine); + PCMachineState *pcms = opaque; + X86MachineState *x86ms = X86_MACHINE(pcms); - set_boot_dev(pcms, opaque, boot_device, errp); + set_boot_dev(pcms, MC146818_RTC(x86ms->rtc), boot_device, errp); } static void pc_cmos_init_floppy(MC146818RtcState *rtc_state, ISADevice *floppy) @@ -569,14 +570,6 @@ static void pc_cmos_init_late(PCMachineState *pcms) mc146818rtc_set_cmos_data(s, 0x39, val); pc_cmos_init_floppy(s, pc_find_fdc0()); -} - -void pc_cmos_init(PCMachineState *pcms, - ISADevice *rtc) -{ - int val; - X86MachineState *x86ms = X86_MACHINE(pcms); - MC146818RtcState *s = MC146818_RTC(rtc); /* various important CMOS locations needed by PC/Bochs bios */ @@ -613,22 +606,10 @@ void pc_cmos_init(PCMachineState *pcms, mc146818rtc_set_cmos_data(s, 0x5c, val >> 8); mc146818rtc_set_cmos_data(s, 0x5d, val >> 16); - object_property_add_link(OBJECT(pcms), "rtc_state", - TYPE_ISA_DEVICE, - (Object **)&x86ms->rtc, - object_property_allow_set_link, - OBJ_PROP_LINK_STRONG); - object_property_set_link(OBJECT(pcms), "rtc_state", OBJECT(s), - &error_abort); - - 
set_boot_dev(pcms, s, MACHINE(pcms)->boot_config.order, &error_fatal); - val = 0; val |= 0x02; /* FPU is there */ val |= 0x04; /* PS/2 mouse installed */ mc146818rtc_set_cmos_data(s, REG_EQUIPMENT_BYTE, val); - - /* hard drives and FDC are handled by pc_cmos_init_late() */ } static void handle_a20_line_change(void *opaque, int irq, int level) @@ -1261,7 +1242,9 @@ void pc_basic_device_init(struct PCMachineState *pcms, } #endif - qemu_register_boot_set(pc_boot_set, rtc_state); + qemu_register_boot_set(pc_boot_set, pcms); + set_boot_dev(pcms, MC146818_RTC(rtc_state), + MACHINE(pcms)->boot_config.order, &error_fatal); if (!xen_enabled() && (x86ms->pit == ON_OFF_AUTO_AUTO || x86ms->pit == ON_OFF_AUTO_ON)) { @@ -1751,6 +1734,7 @@ static void pc_machine_initfn(Object *obj) pcms->fd_bootchk = true; pcms->default_bus_bypass_iommu = false; + pc_system_flash_create(pcms); pcms->pcspk = isa_new(TYPE_PC_SPEAKER); object_property_add_alias(OBJECT(pcms), "pcspk-audiodev", OBJECT(pcms->pcspk), "audiodev"); diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 319bc4b180..c9a6c0aa68 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -228,6 +228,7 @@ static void pc_init1(MachineState *machine, const char *pci_type) assert(machine->ram_size == x86ms->below_4g_mem_size + x86ms->above_4g_mem_size); + pc_system_flash_cleanup_unused(pcms); if (machine->kernel_filename != NULL) { /* For xen HVM direct kernel boot, load linux here */ xen_load_linux(pcms); @@ -343,8 +344,6 @@ static void pc_init1(MachineState *machine, const char *pci_type) } #endif - pc_cmos_init(pcms, x86ms->rtc); - if (piix4_pm) { smi_irq = qemu_allocate_irq(pc_acpi_smi_interrupt, first_cpu, 0); diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 45a4102e75..8a427c4647 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -45,6 +45,7 @@ #include "hw/i386/pc.h" #include "hw/i386/amd_iommu.h" #include "hw/i386/intel_iommu.h" +#include "hw/virtio/virtio-iommu.h" #include "hw/display/ramfb.h" #include "hw/ide/pci.h" #include "hw/ide/ahci-pci.h" @@ -63,6 +64,12 @@ /* ICH9 AHCI has 6 ports */ #define MAX_SATA_PORTS 6 +static GlobalProperty pc_q35_compat_defaults[] = { + { TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "39" }, +}; +static const size_t pc_q35_compat_defaults_len = + G_N_ELEMENTS(pc_q35_compat_defaults); + struct ehci_companions { const char *name; int func; @@ -311,8 +318,6 @@ static void pc_q35_init(MachineState *machine) smbus_eeprom_init(pcms->smbus, 8, NULL, 0); } - pc_cmos_init(pcms, x86ms->rtc); - /* the rest devices to which pci devfn is automatically assigned */ pc_vga_init(isa_bus, pcms->pcibus); pc_nic_init(pcmc, isa_bus, pcms->pcibus); @@ -350,12 +355,14 @@ static void pc_q35_machine_options(MachineClass *m) m->default_nic = "e1000e"; m->default_kernel_irqchip_split = false; m->no_floppy = 1; - m->max_cpus = 1024; + m->max_cpus = 4096; m->no_parallel = !module_object_class_by_name(TYPE_ISA_PARALLEL); machine_class_allow_dynamic_sysbus_dev(m, TYPE_AMD_IOMMU_DEVICE); machine_class_allow_dynamic_sysbus_dev(m, TYPE_INTEL_IOMMU_DEVICE); machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE); machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE); + compat_props_add(m->compat_props, + pc_q35_compat_defaults, pc_q35_compat_defaults_len); } static void pc_q35_9_0_machine_options(MachineClass *m) @@ -371,6 +378,7 @@ static void pc_q35_8_2_machine_options(MachineClass *m) { pc_q35_9_0_machine_options(m); m->alias = NULL; + m->max_cpus = 1024; compat_props_add(m->compat_props, hw_compat_8_2, hw_compat_8_2_len); 
compat_props_add(m->compat_props, pc_compat_8_2, pc_compat_8_2_len); } diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c index b02e285579..3efabbbab2 100644 --- a/hw/i386/pc_sysfw.c +++ b/hw/i386/pc_sysfw.c @@ -91,7 +91,19 @@ static PFlashCFI01 *pc_pflash_create(PCMachineState *pcms, return PFLASH_CFI01(dev); } -static void pc_system_flash_cleanup_unused(PCMachineState *pcms) +void pc_system_flash_create(PCMachineState *pcms) +{ + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + + if (pcmc->pci_enabled) { + pcms->flash[0] = pc_pflash_create(pcms, "system.flash0", + "pflash0"); + pcms->flash[1] = pc_pflash_create(pcms, "system.flash1", + "pflash1"); + } +} + +void pc_system_flash_cleanup_unused(PCMachineState *pcms) { char *prop_name; int i; @@ -198,9 +210,6 @@ void pc_system_firmware_init(PCMachineState *pcms, return; } - pcms->flash[0] = pc_pflash_create(pcms, "system.flash0", "pflash0"); - pcms->flash[1] = pc_pflash_create(pcms, "system.flash1", "pflash1"); - /* Map legacy -drive if=pflash to machine properties */ for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) { pflash_cfi01_legacy_drive(pcms->flash[i], diff --git a/hw/net/igb.c b/hw/net/igb.c index 0b5c31a58b..9b37523d6d 100644 --- a/hw/net/igb.c +++ b/hw/net/igb.c @@ -488,12 +488,10 @@ static void igb_pci_uninit(PCIDevice *pci_dev) static void igb_qdev_reset_hold(Object *obj) { - PCIDevice *d = PCI_DEVICE(obj); IGBState *s = IGB(obj); trace_e1000e_cb_qdev_reset_hold(); - pcie_sriov_pf_disable_vfs(d); igb_core_reset(&s->core); } diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 403a693baf..9959f1932b 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -2039,6 +2039,22 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, goto err; } + /* Mark dirty page's bitmap of guest memory */ + if (vdev->lm_logging_ctrl == LM_ENABLE) { + uint64_t chunk = elem->in_addr[i] / VHOST_LOG_CHUNK; + /* Get chunk index */ + BitmapMemoryRegionCaches *caches = qatomic_rcu_read(&vdev->caches); + uint64_t index = chunk / 8; + uint64_t shift = chunk % 8; + uint8_t val = 0; + address_space_read_cached(&caches->bitmap, index, &val, + sizeof(val)); + val |= 1 << shift; + address_space_write_cached(&caches->bitmap, index, &val, + sizeof(val)); + address_space_cache_invalidate(&caches->bitmap, index, sizeof(val)); + } + elems[i] = elem; lens[i] = total; i++; diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 036b15403a..c2b17de987 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -7126,10 +7126,6 @@ static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst) sctrl = &n->sec_ctrl_list.sec[i]; nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false); } - - if (rst != NVME_RESET_CONTROLLER) { - pcie_sriov_pf_disable_vfs(pci_dev); - } } if (rst != NVME_RESET_CONTROLLER) { @@ -8509,36 +8505,26 @@ static void nvme_pci_reset(DeviceState *qdev) nvme_ctrl_reset(n, NVME_RESET_FUNCTION); } -static void nvme_sriov_pre_write_ctrl(PCIDevice *dev, uint32_t address, - uint32_t val, int len) +static void nvme_sriov_post_write_config(PCIDevice *dev, uint16_t old_num_vfs) { NvmeCtrl *n = NVME(dev); NvmeSecCtrlEntry *sctrl; - uint16_t sriov_cap = dev->exp.sriov_cap; - uint32_t off = address - sriov_cap; - int i, num_vfs; + int i; - if (!sriov_cap) { - return; - } - - if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) { - if (!(val & PCI_SRIOV_CTRL_VFE)) { - num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF); - for (i = 0; i < num_vfs; i++) { - sctrl = &n->sec_ctrl_list.sec[i]; - nvme_virt_set_state(n, 
le16_to_cpu(sctrl->scid), false); - } - } + for (i = pcie_sriov_num_vfs(dev); i < old_num_vfs; i++) { + sctrl = &n->sec_ctrl_list.sec[i]; + nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false); } } static void nvme_pci_write_config(PCIDevice *dev, uint32_t address, uint32_t val, int len) { - nvme_sriov_pre_write_ctrl(dev, address, val, len); + uint16_t old_num_vfs = pcie_sriov_num_vfs(dev); + pci_default_write_config(dev, address, val, len); pcie_cap_flr_write_config(dev, address, val, len); + nvme_sriov_post_write_config(dev, old_num_vfs); } static const VMStateDescription nvme_vmstate = { diff --git a/hw/pci-bridge/pci_expander_bridge.c b/hw/pci-bridge/pci_expander_bridge.c index 535889f7c2..0411ad31ea 100644 --- a/hw/pci-bridge/pci_expander_bridge.c +++ b/hw/pci-bridge/pci_expander_bridge.c @@ -290,7 +290,7 @@ static void pxb_cxl_dev_reset(DeviceState *dev) uint32_t *write_msk = cxl_cstate->crb.cache_mem_regs_write_mask; int dsp_count = 0; - cxl_component_register_init_common(reg_state, write_msk, CXL2_ROOT_PORT); + cxl_component_register_init_common(reg_state, write_msk, CXL2_RC); /* * The CXL specification allows for host bridges with no HDM decoders * if they only have a single root port. diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 6496d027ca..e7a39cb203 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -409,6 +409,7 @@ static void pci_do_device_reset(PCIDevice *dev) msi_reset(dev); msix_reset(dev); + pcie_sriov_pf_reset(dev); } /* diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index f56079acf5..4b2f0805c6 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -171,6 +171,14 @@ static void pcie_cap_fill_slot_lnk(PCIDevice *dev) pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2, PCI_EXP_LNKCAP2_SLS_16_0GB); } + if (s->speed > QEMU_PCI_EXP_LNK_16GT) { + pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2, + PCI_EXP_LNKCAP2_SLS_32_0GB); + } + if (s->speed > QEMU_PCI_EXP_LNK_32GT) { + pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2, + PCI_EXP_LNKCAP2_SLS_64_0GB); + } } } diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c index a1fe65f5d8..e9b23221d7 100644 --- a/hw/pci/pcie_sriov.c +++ b/hw/pci/pcie_sriov.c @@ -176,6 +176,9 @@ static void register_vfs(PCIDevice *dev) assert(sriov_cap > 0); num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF); + if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) { + return; + } dev->exp.sriov_pf.vf = g_new(PCIDevice *, num_vfs); @@ -212,7 +215,6 @@ static void unregister_vfs(PCIDevice *dev) g_free(dev->exp.sriov_pf.vf); dev->exp.sriov_pf.vf = NULL; dev->exp.sriov_pf.num_vfs = 0; - pci_set_word(dev->config + dev->exp.sriov_cap + PCI_SRIOV_NUM_VF, 0); } void pcie_sriov_config_write(PCIDevice *dev, uint32_t address, @@ -246,16 +248,28 @@ void pcie_sriov_config_write(PCIDevice *dev, uint32_t address, } -/* Reset SR/IOV VF Enable bit to trigger an unregister of all VFs */ -void pcie_sriov_pf_disable_vfs(PCIDevice *dev) +/* Reset SR/IOV */ +void pcie_sriov_pf_reset(PCIDevice *dev) { uint16_t sriov_cap = dev->exp.sriov_cap; - if (sriov_cap) { - uint32_t val = pci_get_byte(dev->config + sriov_cap + PCI_SRIOV_CTRL); - if (val & PCI_SRIOV_CTRL_VFE) { - val &= ~PCI_SRIOV_CTRL_VFE; - pcie_sriov_config_write(dev, sriov_cap + PCI_SRIOV_CTRL, val, 1); - } + if (!sriov_cap) { + return; + } + + pci_set_word(dev->config + sriov_cap + PCI_SRIOV_CTRL, 0); + unregister_vfs(dev); + + pci_set_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF, 0); + + /* + * Default is to use 4K pages, software can modify it + * to any of the supported 
bits + */ + pci_set_word(dev->config + sriov_cap + PCI_SRIOV_SYS_PGSIZE, 0x1); + + for (uint16_t i = 0; i < PCI_NUM_REGIONS; i++) { + pci_set_quad(dev->config + sriov_cap + PCI_SRIOV_BAR + i * 4, + dev->exp.sriov_pf.vf_bar_type[i]); + } } diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c index a3c4e52ce9..e3d5d8f2e2 100644 --- a/hw/smbios/smbios.c +++ b/hw/smbios/smbios.c @@ -121,6 +121,16 @@ struct type8_instance { }; static QTAILQ_HEAD(, type8_instance) type8 = QTAILQ_HEAD_INITIALIZER(type8); +/* type 9 instance for parsing */ +struct type9_instance { + const char *slot_designation, *pcidev; + uint8_t slot_type, slot_data_bus_width, current_usage, slot_length, + slot_characteristics1, slot_characteristics2; + uint16_t slot_id; + QTAILQ_ENTRY(type9_instance) next; +}; +static QTAILQ_HEAD(, type9_instance) type9 = QTAILQ_HEAD_INITIALIZER(type9); + static struct { size_t nvalues; char **values; @@ -380,6 +390,59 @@ static const QemuOptDesc qemu_smbios_type8_opts[] = { { /* end of list */ } }; +static const QemuOptDesc qemu_smbios_type9_opts[] = { + { + .name = "type", + .type = QEMU_OPT_NUMBER, + .help = "SMBIOS element type", + }, + { + .name = "slot_designation", + .type = QEMU_OPT_STRING, + .help = "string number for reference designation", + }, + { + .name = "slot_type", + .type = QEMU_OPT_NUMBER, + .help = "connector type", + }, + { + .name = "slot_data_bus_width", + .type = QEMU_OPT_NUMBER, + .help = "port type", + }, + { + .name = "current_usage", + .type = QEMU_OPT_NUMBER, + .help = "current usage", + }, + { + .name = "slot_length", + .type = QEMU_OPT_NUMBER, + .help = "system slot length", + }, + { + .name = "slot_id", + .type = QEMU_OPT_NUMBER, + .help = "system slot id", + }, + { + .name = "slot_characteristics1", + .type = QEMU_OPT_NUMBER, + .help = "slot characteristics1, see the spec", + }, + { + .name = "slot_characteristics2", + .type = QEMU_OPT_NUMBER, + .help = "slot characteristics2, see the spec", + }, + { + .name = "pcidev", + .type = QEMU_OPT_STRING, + .help = "PCI device, if provided." + }, + { /* end of list */ } +}; + static const QemuOptDesc qemu_smbios_type11_opts[] = { { .name = "type", @@ -609,6 +672,7 @@ bool smbios_skip_table(uint8_t type, bool required_table) #define T2_BASE 0x200 #define T3_BASE 0x300 #define T4_BASE 0x400 +#define T9_BASE 0x900 #define T11_BASE 0xe00 #define T16_BASE 0x1000 @@ -807,6 +871,65 @@ static void smbios_build_type_8_table(void) } } +static void smbios_build_type_9_table(Error **errp) +{ + unsigned instance = 0; + struct type9_instance *t9; + + QTAILQ_FOREACH(t9, &type9, next) { + SMBIOS_BUILD_TABLE_PRE(9, T9_BASE + instance, true); + + SMBIOS_TABLE_SET_STR(9, slot_designation, t9->slot_designation); + t->slot_type = t9->slot_type; + t->slot_data_bus_width = t9->slot_data_bus_width; + t->current_usage = t9->current_usage; + t->slot_length = t9->slot_length; + t->slot_id = t9->slot_id; + t->slot_characteristics1 = t9->slot_characteristics1; + t->slot_characteristics2 = t9->slot_characteristics2; + + if (t9->pcidev) { + PCIDevice *pdev = NULL; + int rc = pci_qdev_find_device(t9->pcidev, &pdev); + if (rc != 0) { + error_setg(errp, + "No PCI device %s for SMBIOS type 9 entry %s", + t9->pcidev, t9->slot_designation); + return; + } + /* + * We only handle the case where the device is attached to + * the PCI root bus. The general case is more complex as + * bridges are enumerated later and the table would need + * to be updated at this moment.
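+ * Hence only devices attached directly to the root bus are accepted below.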
+ */ + if (!pci_bus_is_root(pci_get_bus(pdev))) { + error_setg(errp, + "Cannot create type 9 entry for PCI device %s: " + "not attached to the root bus", + t9->pcidev); + return; + } + t->segment_group_number = cpu_to_le16(0); + t->bus_number = pci_dev_bus_num(pdev); + t->device_number = pdev->devfn; + } else { + /* + * Per the SMBIOS spec, for slots that are not of the PCI, AGP, + * PCI-X, or PCI-Express type and do not have bus/device/function + * information, 0FFh should be populated in the fields of Segment + * Group Number, Bus Number, Device/Function Number. + */ + t->segment_group_number = 0xff; + t->bus_number = 0xff; + t->device_number = 0xff; + } + + SMBIOS_BUILD_TABLE_POST; + instance++; + } +} + static void smbios_build_type_11_table(void) { char count_str[128]; @@ -1126,6 +1249,7 @@ void smbios_get_tables(MachineState *ms, } smbios_build_type_8_table(); + smbios_build_type_9_table(errp); smbios_build_type_11_table(); #define MAX_DIMM_SZ (16 * GiB) @@ -1460,6 +1584,24 @@ void smbios_entry_add(QemuOpts *opts, Error **errp) t8_i->port_type = qemu_opt_get_number(opts, "port_type", 0); QTAILQ_INSERT_TAIL(&type8, t8_i, next); return; + case 9: { + if (!qemu_opts_validate(opts, qemu_smbios_type9_opts, errp)) { + return; + } + struct type9_instance *t; + t = g_new0(struct type9_instance, 1); + save_opt(&t->slot_designation, opts, "slot_designation"); + t->slot_type = qemu_opt_get_number(opts, "slot_type", 0); + t->slot_data_bus_width = qemu_opt_get_number(opts, "slot_data_bus_width", 0); + t->current_usage = qemu_opt_get_number(opts, "current_usage", 0); + t->slot_length = qemu_opt_get_number(opts, "slot_length", 0); + t->slot_id = qemu_opt_get_number(opts, "slot_id", 0); + t->slot_characteristics1 = qemu_opt_get_number(opts, "slot_characteristics1", 0); + t->slot_characteristics2 = qemu_opt_get_number(opts, "slot_characteristics2", 0); + save_opt(&t->pcidev, opts, "pcidev"); + QTAILQ_INSERT_TAIL(&type9, t, next); + return; + } case 11: if (!qemu_opts_validate(opts, qemu_smbios_type11_opts, errp)) { return; diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 77905d1994..13b6991179 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -30,6 +30,7 @@ vhost_user_write(uint32_t req, uint32_t flags) "req:%d flags:0x%"PRIx32"" vhost_user_create_notifier(int idx, void *n) "idx:%d n:%p" # vhost-vdpa.c +vhost_vdpa_skipped_memory_section(int is_ram, int is_iommu, int is_protected, int is_ram_device, uint64_t first, uint64_t last, int page_mask) "is_ram=%d, is_iommu=%d, is_protected=%d, is_ram_device=%d iova_min=0x%"PRIx64" iova_last=0x%"PRIx64" page_mask=0x%x" vhost_vdpa_dma_map(void *vdpa, int fd, uint32_t msg_type, uint32_t asid, uint64_t iova, uint64_t size, uint64_t uaddr, uint8_t perm, uint8_t type) "vdpa_shared:%p fd: %d msg_type: %"PRIu32" asid: %"PRIu32" iova: 0x%"PRIx64" size: 0x%"PRIx64" uaddr: 0x%"PRIx64" perm: 0x%"PRIx8" type: %"PRIu8 vhost_vdpa_dma_unmap(void *vdpa, int fd, uint32_t msg_type, uint32_t asid, uint64_t iova, uint64_t size, uint8_t type) "vdpa_shared:%p fd: %d msg_type: %"PRIu32" asid: %"PRIu32" iova: 0x%"PRIx64" size: 0x%"PRIx64" type: %"PRIu8 vhost_vdpa_listener_begin_batch(void *v, int fd, uint32_t msg_type, uint8_t type) "vdpa_shared:%p fd: %d msg_type: %"PRIu32" type: %"PRIu8 @@ -57,8 +58,8 @@ vhost_vdpa_dev_start(void *dev, bool started) "dev: %p started: %d" vhost_vdpa_set_log_base(void *dev, uint64_t base, unsigned long long size, int refcnt, int fd, void *log) "dev: %p base: 0x%"PRIx64" size: %llu refcnt: %d fd: %d log: %p"
vhost_vdpa_set_vring_addr(void *dev, unsigned int index, unsigned int flags, uint64_t desc_user_addr, uint64_t used_user_addr, uint64_t avail_user_addr, uint64_t log_guest_addr) "dev: %p index: %u flags: 0x%x desc_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" log_guest_addr: 0x%"PRIx64 vhost_vdpa_set_vring_num(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u" -vhost_vdpa_set_vring_base(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u" -vhost_vdpa_get_vring_base(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u" +vhost_vdpa_set_dev_vring_base(void *dev, unsigned int index, unsigned int num, bool svq) "dev: %p index: %u num: %u svq: %d" +vhost_vdpa_get_vring_base(void *dev, unsigned int index, unsigned int num, bool svq) "dev: %p index: %u num: %u svq: %d" vhost_vdpa_set_vring_kick(void *dev, unsigned int index, int fd) "dev: %p index: %u fd: %d" vhost_vdpa_set_vring_call(void *dev, unsigned int index, int fd) "dev: %p index: %u fd: %d" vhost_vdpa_get_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRIx64 @@ -111,7 +112,7 @@ virtio_iommu_device_reset(void) "reset!" virtio_iommu_system_reset(void) "system reset!" virtio_iommu_get_features(uint64_t features) "device supports features=0x%"PRIx64 virtio_iommu_device_status(uint8_t status) "driver status = %d" -virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, uint32_t domain_start, uint32_t domain_end, uint32_t probe_size, uint8_t bypass) "page_size_mask=0x%"PRIx64" input range start=0x%"PRIx64" input range end=0x%"PRIx64" domain range start=%d domain range end=%d probe_size=0x%x bypass=0x%x" +virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, uint32_t domain_start, uint32_t domain_end, uint32_t probe_size, uint8_t bypass) "page_size_mask=0x%"PRIx64" input range start=0x%"PRIx64" input range end=0x%"PRIx64" domain range start=%u domain range end=%u probe_size=0x%x bypass=0x%x" virtio_iommu_set_config(uint8_t bypass) "bypass=0x%x" virtio_iommu_attach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" virtio_iommu_detach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 1af8621481..cdf9af4a4b 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -1610,11 +1610,27 @@ vhost_user_backend_handle_shared_object_add(struct vhost_dev *dev, } static int -vhost_user_backend_handle_shared_object_remove(VhostUserShared *object) +vhost_user_backend_handle_shared_object_remove(struct vhost_dev *dev, + VhostUserShared *object) { QemuUUID uuid; memcpy(uuid.data, object->uuid, sizeof(object->uuid)); + switch (virtio_object_type(&uuid)) { + case TYPE_VHOST_DEV: + { + struct vhost_dev *owner = virtio_lookup_vhost_device(&uuid); + if (dev != owner) { + /* Not allowed to remove non-owned entries */ + return 0; + } + break; + } + default: + /* Not allowed to remove non-owned entries */ + return 0; + } + return virtio_remove_resource(&uuid); } @@ -1793,7 +1809,8 @@ static gboolean backend_read(QIOChannel *ioc, GIOCondition condition, ret = vhost_user_backend_handle_shared_object_add(dev, &payload.object); break; case VHOST_USER_BACKEND_SHARED_OBJECT_REMOVE: - ret = vhost_user_backend_handle_shared_object_remove(&payload.object); + ret = vhost_user_backend_handle_shared_object_remove(dev, + &payload.object); break; case VHOST_USER_BACKEND_SHARED_OBJECT_LOOKUP: ret = 
vhost_user_backend_handle_shared_object_lookup(dev->opaque, ioc, diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index ddae494ca8..3bcd05cc22 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -47,12 +47,17 @@ static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section, int page_mask) { Int128 llend; + bool is_ram = memory_region_is_ram(section->mr); + bool is_iommu = memory_region_is_iommu(section->mr); + bool is_protected = memory_region_is_protected(section->mr); - if ((!memory_region_is_ram(section->mr) && - !memory_region_is_iommu(section->mr)) || - memory_region_is_protected(section->mr) || - /* vhost-vDPA doesn't allow MMIO to be mapped */ - memory_region_is_ram_device(section->mr)) { + /* vhost-vDPA doesn't allow MMIO to be mapped */ + bool is_ram_device = memory_region_is_ram_device(section->mr); + + if ((!is_ram && !is_iommu) || is_protected || is_ram_device) { + trace_vhost_vdpa_skipped_memory_section(is_ram, is_iommu, is_protected, + is_ram_device, iova_min, + iova_max, page_mask); return true; } @@ -69,7 +74,7 @@ static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section, * size that maps to the kernel */ - if (!memory_region_is_iommu(section->mr)) { + if (!is_iommu) { llend = vhost_vdpa_section_end(section, page_mask); if (int128_gt(llend, int128_make64(iova_max))) { error_report("RAM section out of device range (max=0x%" PRIx64 @@ -555,6 +560,11 @@ static bool vhost_vdpa_first_dev(struct vhost_dev *dev) return v->index == 0; } +static bool vhost_vdpa_last_dev(struct vhost_dev *dev) +{ + return dev->vq_index + dev->nvqs == dev->vq_index_end; +} + static int vhost_vdpa_get_dev_features(struct vhost_dev *dev, uint64_t *features) { @@ -965,7 +975,10 @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config, static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev, struct vhost_vring_state *ring) { - trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num); + struct vhost_vdpa *v = dev->opaque; + + trace_vhost_vdpa_set_dev_vring_base(dev, ring->index, ring->num, + v->shadow_vqs_enabled); return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring); } @@ -1315,7 +1328,7 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); } - if (dev->vq_index + dev->nvqs != dev->vq_index_end) { + if (!vhost_vdpa_last_dev(dev)) { return 0; } @@ -1337,7 +1350,7 @@ static void vhost_vdpa_reset_status(struct vhost_dev *dev) { struct vhost_vdpa *v = dev->opaque; - if (dev->vq_index + dev->nvqs != dev->vq_index_end) { + if (!vhost_vdpa_last_dev(dev)) { return; } @@ -1407,6 +1420,7 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, if (v->shadow_vqs_enabled) { ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index); + trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num, true); return 0; } @@ -1419,7 +1433,7 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, } ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring); - trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num); + trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num, false); return ret; } @@ -1447,7 +1461,15 @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, /* Remember last call fd because we can switch to SVQ anytime. 
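 * vhost_svq_set_svq_call_fd() stores that notifier so the shadow
 * virtqueue can keep signalling the guest once SVQ is enabled.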
*/ vhost_svq_set_svq_call_fd(svq, file->fd); - if (v->shadow_vqs_enabled) { + /* + * When SVQ is transitioning to off, shadow_vqs_enabled has + * not been set back to false yet, but the underlying call fd + * will have to switch back to the guest notifier to signal the + * passthrough virtqueues. In other situations, SVQ's own call + * fd shall be used to signal the device model. + */ + if (v->shadow_vqs_enabled && + v->shared->svq_switching != SVQ_TSTATE_DISABLING) { return 0; } diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 86623d55a5..1326c6ec41 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -29,6 +29,7 @@ #include "sysemu/reset.h" #include "sysemu/sysemu.h" #include "qemu/reserved-region.h" +#include "qemu/units.h" #include "qapi/error.h" #include "qemu/error-report.h" #include "trace.h" @@ -1115,8 +1116,8 @@ static int virtio_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu_mr, } /* - * The default mask (TARGET_PAGE_MASK) is the smallest supported guest granule, - * for example 0xfffffffffffff000. When an assigned device has page size + * The default mask depends on the "granule" property. For example, with + * 4k granule, it is -(4 * KiB). When an assigned device has page size * restrictions due to the hardware IOMMU configuration, apply this restriction * to the mask. */ @@ -1313,8 +1314,32 @@ static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) * in vfio realize */ s->config.bypass = s->boot_bypass; - s->config.page_size_mask = qemu_target_page_mask(); - s->config.input_range.end = UINT64_MAX; + if (s->aw_bits < 32 || s->aw_bits > 64) { + error_setg(errp, "aw-bits must be within [32,64]"); + return; + } + s->config.input_range.end = + s->aw_bits == 64 ? UINT64_MAX : BIT_ULL(s->aw_bits) - 1; + + switch (s->granule_mode) { + case GRANULE_MODE_4K: + s->config.page_size_mask = -(4 * KiB); + break; + case GRANULE_MODE_8K: + s->config.page_size_mask = -(8 * KiB); + break; + case GRANULE_MODE_16K: + s->config.page_size_mask = -(16 * KiB); + break; + case GRANULE_MODE_64K: + s->config.page_size_mask = -(64 * KiB); + break; + case GRANULE_MODE_HOST: + s->config.page_size_mask = qemu_real_host_page_mask(); + break; + default: + error_setg(errp, "Unsupported granule mode"); + } s->config.domain_range.end = UINT32_MAX; s->config.probe_size = VIOMMU_PROBE_SIZE; @@ -1522,6 +1547,9 @@ static Property virtio_iommu_properties[] = { DEFINE_PROP_LINK("primary-bus", VirtIOIOMMU, primary_bus, TYPE_PCI_BUS, PCIBus *), DEFINE_PROP_BOOL("boot-bypass", VirtIOIOMMU, boot_bypass, true), + DEFINE_PROP_GRANULE_MODE("granule", VirtIOIOMMU, granule_mode, + GRANULE_MODE_HOST), + DEFINE_PROP_UINT8("aw-bits", VirtIOIOMMU, aw_bits, 64), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index cb6940fc0e..eaaf86402c 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1442,6 +1442,155 @@ int virtio_pci_add_shm_cap(VirtIOPCIProxy *proxy, return virtio_pci_add_mem_cap(proxy, &cap.cap); } +/* Called within call_rcu(). 
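+ * By the time this runs, the RCU grace period has elapsed, so no
+ * reader can still hold a reference to the caches being freed.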
*/ +static void bitmap_free_region_cache(BitmapMemoryRegionCaches *caches) +{ + assert(caches != NULL); + address_space_cache_destroy(&caches->bitmap); + g_free(caches); +} + +static void lm_disable(VirtIODevice *vdev) +{ + BitmapMemoryRegionCaches *caches; + caches = qatomic_read(&vdev->caches); + qatomic_rcu_set(&vdev->caches, NULL); + if (caches) { + call_rcu(caches, bitmap_free_region_cache, rcu); + } +} + +static void lm_enable(VirtIODevice *vdev) +{ + BitmapMemoryRegionCaches *old = vdev->caches; + BitmapMemoryRegionCaches *new = NULL; + hwaddr addr, end, size; + int64_t len; + + addr = vdev->lm_base_addr_low | ((hwaddr)(vdev->lm_base_addr_high) << 32); + end = vdev->lm_end_addr_low | ((hwaddr)(vdev->lm_end_addr_high) << 32); + /* hwaddr is unsigned, so validate the range before computing the size */ + if (end <= addr) { + error_report("Invalid lm size."); + return; + } + size = end - addr; + + new = g_new0(BitmapMemoryRegionCaches, 1); + len = address_space_cache_init(&new->bitmap, vdev->dma_as, addr, size, + true); + if (len < size) { + virtio_error(vdev, "Cannot map bitmap"); + goto err_bitmap; + } + qatomic_rcu_set(&vdev->caches, new); + + if (old) { + call_rcu(old, bitmap_free_region_cache, rcu); + } + + return; + +err_bitmap: + address_space_cache_destroy(&new->bitmap); + g_free(new); +} + +static uint64_t virtio_pci_lm_read(void *opaque, hwaddr addr, + unsigned size) +{ + VirtIOPCIProxy *proxy = opaque; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + hwaddr offset_end = LM_VRING_STATE_OFFSET + + virtio_pci_queue_mem_mult(proxy) * VIRTIO_QUEUE_MAX; + uint32_t val; + int qid; + + if (vdev == NULL) { + return UINT64_MAX; + } + switch (addr) { + case LM_LOGGING_CTRL: + val = vdev->lm_logging_ctrl; + break; + case LM_BASE_ADDR_LOW: + val = vdev->lm_base_addr_low; + break; + case LM_BASE_ADDR_HIGH: + val = vdev->lm_base_addr_high; + break; + case LM_END_ADDR_LOW: + val = vdev->lm_end_addr_low; + break; + case LM_END_ADDR_HIGH: + val = vdev->lm_end_addr_high; + break; + default: + if (addr >= LM_VRING_STATE_OFFSET && addr <= offset_end) { + qid = (addr - LM_VRING_STATE_OFFSET) / + virtio_pci_queue_mem_mult(proxy); + val = virtio_queue_get_vring_states(vdev, qid); + } else { + val = 0; + } + break; + } + + return val; +} + +static void virtio_pci_lm_write(void *opaque, hwaddr addr, + uint64_t val, unsigned size) +{ + VirtIOPCIProxy *proxy = opaque; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + hwaddr offset_end = LM_VRING_STATE_OFFSET + + virtio_pci_queue_mem_mult(proxy) * VIRTIO_QUEUE_MAX; + int qid; + + if (vdev == NULL) { + return; + } + + switch (addr) { + case LM_LOGGING_CTRL: + vdev->lm_logging_ctrl = val; + switch (val) { + case LM_DISABLE: + lm_disable(vdev); + break; + case LM_ENABLE: + lm_enable(vdev); + break; + default: + virtio_error(vdev, "Unsupported LM_LOGGING_CTRL value: %"PRIx64, + val); + break; + } + + break; + case LM_BASE_ADDR_LOW: + vdev->lm_base_addr_low = val; + break; + case LM_BASE_ADDR_HIGH: + vdev->lm_base_addr_high = val; + break; + case LM_END_ADDR_LOW: + vdev->lm_end_addr_low = val; + break; + case LM_END_ADDR_HIGH: + vdev->lm_end_addr_high = val; + break; + default: + if (addr >= LM_VRING_STATE_OFFSET && addr <= offset_end) { + qid = (addr - LM_VRING_STATE_OFFSET) / + virtio_pci_queue_mem_mult(proxy); + virtio_queue_set_vring_states(vdev, qid, val); + } else { + virtio_error(vdev, "Unsupported addr: %"PRIx64, addr); + } + break; + } +} + static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr, unsigned size) { @@ -1823,6 +1972,15 @@ static void virtio_pci_modern_regions_init(VirtIOPCIProxy *proxy, },
.endianness = DEVICE_LITTLE_ENDIAN, }; + static const MemoryRegionOps lm_ops = { + .read = virtio_pci_lm_read, + .write = virtio_pci_lm_write, + .impl = { + .min_access_size = 1, + .max_access_size = 4, + }, + .endianness = DEVICE_LITTLE_ENDIAN, + }; g_autoptr(GString) name = g_string_new(NULL); g_string_printf(name, "virtio-pci-common-%s", vdev_name); @@ -1859,6 +2017,14 @@ static void virtio_pci_modern_regions_init(VirtIOPCIProxy *proxy, proxy, name->str, proxy->notify_pio.size); + if (proxy->flags & VIRTIO_PCI_FLAG_VDPA) { + g_string_printf(name, "virtio-pci-lm-%s", vdev_name); + memory_region_init_io(&proxy->lm.mr, OBJECT(proxy), + &lm_ops, + proxy, + name->str, + proxy->lm.size); + } } static void virtio_pci_modern_region_map(VirtIOPCIProxy *proxy, @@ -2021,6 +2187,10 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp) virtio_pci_modern_mem_region_map(proxy, &proxy->isr, &cap); virtio_pci_modern_mem_region_map(proxy, &proxy->device, &cap); virtio_pci_modern_mem_region_map(proxy, &proxy->notify, ¬ify.cap); + if (proxy->flags & VIRTIO_PCI_FLAG_VDPA) { + memory_region_add_subregion(&proxy->modern_bar, + proxy->lm.offset, &proxy->lm.mr); + } if (modern_pio) { memory_region_init(&proxy->io_bar, OBJECT(proxy), @@ -2090,6 +2260,9 @@ static void virtio_pci_device_unplugged(DeviceState *d) virtio_pci_modern_mem_region_unmap(proxy, &proxy->isr); virtio_pci_modern_mem_region_unmap(proxy, &proxy->device); virtio_pci_modern_mem_region_unmap(proxy, &proxy->notify); + if (proxy->flags & VIRTIO_PCI_FLAG_VDPA) { + memory_region_del_subregion(&proxy->modern_bar, &proxy->lm.mr); + } if (modern_pio) { virtio_pci_modern_io_region_unmap(proxy, &proxy->notify_pio); } @@ -2144,9 +2317,17 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) proxy->notify_pio.type = VIRTIO_PCI_CAP_NOTIFY_CFG; /* subclasses can enforce modern, so do this unconditionally */ - memory_region_init(&proxy->modern_bar, OBJECT(proxy), "virtio-pci", - /* PCI BAR regions must be powers of 2 */ - pow2ceil(proxy->notify.offset + proxy->notify.size)); + if (!(proxy->flags & VIRTIO_PCI_FLAG_VDPA)) { + memory_region_init(&proxy->modern_bar, OBJECT(proxy), "virtio-pci", + /* PCI BAR regions must be powers of 2 */ + pow2ceil(proxy->notify.offset + proxy->notify.size)); + } else { + proxy->lm.offset = proxy->notify.offset + proxy->notify.size; + proxy->lm.size = 0x20 + VIRTIO_QUEUE_MAX * 4; + memory_region_init(&proxy->modern_bar, OBJECT(proxy), "virtio-pci", + /* PCI BAR regions must be powers of 2 */ + pow2ceil(proxy->lm.offset + proxy->lm.size)); + } if (proxy->disable_legacy == ON_OFF_AUTO_AUTO) { proxy->disable_legacy = pcie_port ? 
ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; @@ -2301,6 +2482,8 @@ static Property virtio_pci_properties[] = { VIRTIO_PCI_FLAG_INIT_FLR_BIT, true), DEFINE_PROP_BIT("aer", VirtIOPCIProxy, flags, VIRTIO_PCI_FLAG_AER_BIT, false), + DEFINE_PROP_BIT("vdpa", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_VDPA_BIT, false), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index d229755eae..fb6b4ccd83 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -3368,6 +3368,18 @@ static uint16_t virtio_queue_split_get_last_avail_idx(VirtIODevice *vdev, return vdev->vq[n].last_avail_idx; } +static uint32_t virtio_queue_split_get_vring_states(VirtIODevice *vdev, + int n) +{ + struct VirtQueue *vq = &vdev->vq[n]; + uint16_t avail, used; + + avail = vq->last_avail_idx; + used = vq->used_idx; + + return avail | (uint32_t)used << 16; +} + unsigned int virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n) { if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) { @@ -3377,6 +3389,33 @@ unsigned int virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n) } } +unsigned int virtio_queue_get_vring_states(VirtIODevice *vdev, int n) +{ + if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) { + return -1; + } else { + return virtio_queue_split_get_vring_states(vdev, n); + } +} + +static void virtio_queue_split_set_vring_states(VirtIODevice *vdev, + int n, uint32_t idx) +{ + struct VirtQueue *vq = &vdev->vq[n]; + vq->last_avail_idx = (uint16_t)(idx & 0xffff); + vq->shadow_avail_idx = (uint16_t)(idx & 0xffff); + vq->used_idx = (uint16_t)(idx >> 16); +} + +void virtio_queue_set_vring_states(VirtIODevice *vdev, int n, uint32_t idx) +{ + if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) { + return; + } else { + virtio_queue_split_set_vring_states(vdev, n, idx); + } +} + static void virtio_queue_packed_set_last_avail_idx(VirtIODevice *vdev, int n, unsigned int idx) { diff --git a/include/hw/acpi/acpi_generic_initiator.h b/include/hw/acpi/acpi_generic_initiator.h new file mode 100644 index 0000000000..a304bad73e --- /dev/null +++ b/include/hw/acpi/acpi_generic_initiator.h @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#ifndef ACPI_GENERIC_INITIATOR_H +#define ACPI_GENERIC_INITIATOR_H + +#include "qom/object_interfaces.h" + +#define TYPE_ACPI_GENERIC_INITIATOR "acpi-generic-initiator" + +typedef struct AcpiGenericInitiator { + /* private */ + Object parent; + + /* public */ + char *pci_dev; + uint16_t node; +} AcpiGenericInitiator; + +/* + * ACPI 6.3: + * Table 5-81 Flags – Generic Initiator Affinity Structure + */ +typedef enum { + /* + * If clear, the OSPM ignores the contents of the Generic + * Initiator/Port Affinity Structure. This allows system firmware + * to populate the SRAT with a static number of structures, but only + * enable them as necessary. 
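+ * QEMU always sets this flag in the structures it generates.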
+ */ + GEN_AFFINITY_ENABLED = (1 << 0), +} GenericAffinityFlags; + +/* + * ACPI 6.3: + * Table 5-80 Device Handle - PCI + */ +typedef struct PCIDeviceHandle { + uint16_t segment; + uint16_t bdf; +} PCIDeviceHandle; + +void build_srat_generic_pci_initiator(GArray *table_data); + +#endif diff --git a/include/hw/audio/virtio-snd.h b/include/hw/audio/virtio-snd.h index c3767f442b..3d79181364 100644 --- a/include/hw/audio/virtio-snd.h +++ b/include/hw/audio/virtio-snd.h @@ -230,6 +230,7 @@ struct virtio_snd_ctrl_command { VirtQueue *vq; virtio_snd_hdr ctrl; virtio_snd_hdr resp; + size_t payload_size; QTAILQ_ENTRY(virtio_snd_ctrl_command) next; }; #endif diff --git a/include/hw/cxl/cxl_component.h b/include/hw/cxl/cxl_component.h index 0e5d35c263..5012fab6f7 100644 --- a/include/hw/cxl/cxl_component.h +++ b/include/hw/cxl/cxl_component.h @@ -25,6 +25,7 @@ enum reg_type { CXL2_TYPE3_DEVICE, CXL2_LOGICAL_DEVICE, CXL2_ROOT_PORT, + CXL2_RC, CXL2_UPSTREAM_PORT, CXL2_DOWNSTREAM_PORT, CXL3_SWITCH_MAILBOX_CCI, diff --git a/include/hw/cxl/cxl_pci.h b/include/hw/cxl/cxl_pci.h index 265db6c407..d0855ed78b 100644 --- a/include/hw/cxl/cxl_pci.h +++ b/include/hw/cxl/cxl_pci.h @@ -92,8 +92,9 @@ typedef struct CXLDVSECDevice { uint32_t range2_base_hi; uint32_t range2_base_lo; uint16_t cap3; + uint16_t resv; } QEMU_PACKED CXLDVSECDevice; -QEMU_BUILD_BUG_ON(sizeof(CXLDVSECDevice) != 0x3A); +QEMU_BUILD_BUG_ON(sizeof(CXLDVSECDevice) != PCIE_CXL_DEVICE_DVSEC_LENGTH); /* * CXL r3.1 Section 8.1.5: CXL Extensions DVSEC for Ports diff --git a/include/hw/firmware/smbios.h b/include/hw/firmware/smbios.h index 6e514982d4..c21b8d3285 100644 --- a/include/hw/firmware/smbios.h +++ b/include/hw/firmware/smbios.h @@ -211,6 +211,23 @@ struct smbios_type_8 { uint8_t port_type; } QEMU_PACKED; +/* SMBIOS type 9 - System Slots (v2.1+) */ +struct smbios_type_9 { + struct smbios_structure_header header; + uint8_t slot_designation; + uint8_t slot_type; + uint8_t slot_data_bus_width; + uint8_t current_usage; + uint8_t slot_length; + uint16_t slot_id; + uint8_t slot_characteristics1; + uint8_t slot_characteristics2; + /* SMBIOS spec v2.6+ */ + uint16_t segment_group_number; + uint8_t bus_number; + uint8_t device_number; +} QEMU_PACKED; + /* SMBIOS type 11 - OEM strings */ struct smbios_type_11 { struct smbios_structure_header header; diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index b958023187..27a68071d7 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -178,8 +178,6 @@ void pc_basic_device_init(struct PCMachineState *pcms, ISADevice *rtc_state, bool create_fdctrl, uint32_t hpet_irqs); -void pc_cmos_init(PCMachineState *pcms, - ISADevice *s); void pc_nic_init(PCMachineClass *pcmc, ISABus *isa_bus, PCIBus *pci_bus); void pc_i8259_create(ISABus *isa_bus, qemu_irq *i8259_irqs); @@ -190,6 +188,8 @@ void pc_i8259_create(ISABus *isa_bus, qemu_irq *i8259_irqs); #define TYPE_PORT92 "port92" /* pc_sysfw.c */ +void pc_system_flash_create(PCMachineState *pcms); +void pc_system_flash_cleanup_unused(PCMachineState *pcms); void pc_system_firmware_init(PCMachineState *pcms, MemoryRegion *rom_memory); bool pc_system_ovmf_table_find(const char *entry, uint8_t **data, int *data_len); diff --git a/include/hw/pci/pcie_regs.h b/include/hw/pci/pcie_regs.h index 4972106c42..9d3b6868dc 100644 --- a/include/hw/pci/pcie_regs.h +++ b/include/hw/pci/pcie_regs.h @@ -39,6 +39,8 @@ typedef enum PCIExpLinkSpeed { QEMU_PCI_EXP_LNK_5GT, QEMU_PCI_EXP_LNK_8GT, QEMU_PCI_EXP_LNK_16GT, + QEMU_PCI_EXP_LNK_32GT, + QEMU_PCI_EXP_LNK_64GT, } 
PCIExpLinkSpeed; #define QEMU_PCI_EXP_LNKCAP_MLS(speed) (speed) diff --git a/include/hw/pci/pcie_sriov.h b/include/hw/pci/pcie_sriov.h index 095fb0c9ed..b77eb7bf58 100644 --- a/include/hw/pci/pcie_sriov.h +++ b/include/hw/pci/pcie_sriov.h @@ -58,8 +58,8 @@ void pcie_sriov_pf_add_sup_pgsize(PCIDevice *dev, uint16_t opt_sup_pgsize); void pcie_sriov_config_write(PCIDevice *dev, uint32_t address, uint32_t val, int len); -/* Reset SR/IOV VF Enable bit to unregister all VFs */ -void pcie_sriov_pf_disable_vfs(PCIDevice *dev); +/* Reset SR/IOV */ +void pcie_sriov_pf_reset(PCIDevice *dev); /* Get logical VF number of a VF - only valid for VFs */ uint16_t pcie_sriov_vf_number(PCIDevice *dev); diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h index 8f54e5edd4..0a9575b469 100644 --- a/include/hw/virtio/vhost-vdpa.h +++ b/include/hw/virtio/vhost-vdpa.h @@ -30,6 +30,12 @@ typedef struct VhostVDPAHostNotifier { void *addr; } VhostVDPAHostNotifier; +typedef enum SVQTransitionState { + SVQ_TSTATE_DISABLING = -1, + SVQ_TSTATE_DONE, + SVQ_TSTATE_ENABLING +} SVQTransitionState; + /* Info shared by all vhost_vdpa device models */ typedef struct vhost_vdpa_shared { int device_fd; @@ -47,6 +53,9 @@ typedef struct vhost_vdpa_shared { /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA */ bool shadow_data; + + /* SVQ switching is in progress, or already completed? */ + SVQTransitionState svq_switching; } VhostVDPAShared; typedef struct vhost_vdpa { diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h index 781ebaea8f..83a52cc446 100644 --- a/include/hw/virtio/virtio-iommu.h +++ b/include/hw/virtio/virtio-iommu.h @@ -24,6 +24,7 @@ #include "hw/virtio/virtio.h" #include "hw/pci/pci.h" #include "qom/object.h" +#include "qapi/qapi-types-virtio.h" #define TYPE_VIRTIO_IOMMU "virtio-iommu-device" #define TYPE_VIRTIO_IOMMU_PCI "virtio-iommu-pci" @@ -66,6 +67,8 @@ struct VirtIOIOMMU { bool boot_bypass; Notifier machine_done; bool granule_frozen; + GranuleMode granule_mode; + uint8_t aw_bits; }; #endif diff --git a/include/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h index 59d88018c1..4d57a9c751 100644 --- a/include/hw/virtio/virtio-pci.h +++ b/include/hw/virtio/virtio-pci.h @@ -43,6 +43,7 @@ enum { VIRTIO_PCI_FLAG_INIT_FLR_BIT, VIRTIO_PCI_FLAG_AER_BIT, VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED_BIT, + VIRTIO_PCI_FLAG_VDPA_BIT, }; /* Need to activate work-arounds for buggy guests at vmstate load. 
*/ @@ -89,6 +90,9 @@ enum { #define VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED \ (1 << VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED_BIT) +/* VDPA supported flags */ +#define VIRTIO_PCI_FLAG_VDPA (1 << VIRTIO_PCI_FLAG_VDPA_BIT) + typedef struct { MSIMessage msg; int virq; @@ -140,6 +144,7 @@ struct VirtIOPCIProxy { }; VirtIOPCIRegion regs[5]; }; + VirtIOPCIRegion lm; MemoryRegion modern_bar; MemoryRegion io_bar; uint32_t legacy_io_bar_idx; diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index c8f72850bc..b3c74a1bca 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -35,6 +35,9 @@ (0x1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \ (0x1ULL << VIRTIO_F_ANY_LAYOUT)) +#define LM_DISABLE 0x00 +#define LM_ENABLE 0x01 + struct VirtQueue; static inline hwaddr vring_align(hwaddr addr, @@ -95,6 +98,11 @@ enum virtio_device_endian { VIRTIO_DEVICE_ENDIAN_BIG, }; +typedef struct BitmapMemoryRegionCaches { + struct rcu_head rcu; + MemoryRegionCache bitmap; +} BitmapMemoryRegionCaches; + /** * struct VirtIODevice - common VirtIO structure * @name: name of the device @@ -128,6 +136,14 @@ struct VirtIODevice uint32_t generation; int nvectors; VirtQueue *vq; + uint8_t lm_logging_ctrl; + uint32_t lm_base_addr_low; + uint32_t lm_base_addr_high; + uint32_t lm_end_addr_low; + uint32_t lm_end_addr_high; + + BitmapMemoryRegionCaches *caches; + MemoryListener listener; uint16_t device_id; /* @vm_running: current VM running state via virtio_vmstate_change() */ @@ -379,8 +395,11 @@ hwaddr virtio_queue_get_desc_size(VirtIODevice *vdev, int n); hwaddr virtio_queue_get_avail_size(VirtIODevice *vdev, int n); hwaddr virtio_queue_get_used_size(VirtIODevice *vdev, int n); unsigned int virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n); +unsigned int virtio_queue_get_vring_states(VirtIODevice *vdev, int n); void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, unsigned int idx); +void virtio_queue_set_vring_states(VirtIODevice *vdev, int n, + unsigned int idx); void virtio_queue_restore_last_avail_idx(VirtIODevice *vdev, int n); void virtio_queue_invalidate_signalled_used(VirtIODevice *vdev, int n); void virtio_queue_update_used_idx(VirtIODevice *vdev, int n); diff --git a/include/standard-headers/linux/virtio_pci.h b/include/standard-headers/linux/virtio_pci.h index 3e2bc2c97e..86733278ba 100644 --- a/include/standard-headers/linux/virtio_pci.h +++ b/include/standard-headers/linux/virtio_pci.h @@ -221,6 +221,13 @@ struct virtio_pci_cfg_cap { #define VIRTIO_PCI_COMMON_ADM_Q_IDX 60 #define VIRTIO_PCI_COMMON_ADM_Q_NUM 62 +#define LM_LOGGING_CTRL 0 +#define LM_BASE_ADDR_LOW 4 +#define LM_BASE_ADDR_HIGH 8 +#define LM_END_ADDR_LOW 12 +#define LM_END_ADDR_HIGH 16 +#define LM_VRING_STATE_OFFSET 0x20 + #endif /* VIRTIO_PCI_NO_MODERN */ /* Admin command status. 
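The LM_* offsets added to include/standard-headers/linux/virtio_pci.h above describe the live-migration register block that virtio_pci_realize() appends after the notify region; each queue gets one 32-bit ring-state register starting at LM_VRING_STATE_OFFSET. A sketch of the offset arithmetic (a VIRTIO_QUEUE_MAX of 1024 is assumed, per the lm.size computation earlier)::

    #include <assert.h>
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define LM_VRING_STATE_OFFSET 0x20
    #define VIRTIO_QUEUE_MAX      1024   /* assumed compile-time limit */

    /* Byte offset of queue n's ring-state register within the lm region. */
    static uint64_t lm_vring_state_reg(unsigned int n)
    {
        assert(n < VIRTIO_QUEUE_MAX);
        return LM_VRING_STATE_OFFSET + (uint64_t)n * 4;
    }

    int main(void)
    {
        /* vq2 lands at offset 0x20 + 2 * 4 = 0x28. */
        printf("vq2 ring-state register: 0x%" PRIx64 "\n",
               lm_vring_state_reg(2));
        return 0;
    }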
*/ diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h index 4173ef2afa..825cfe86bc 100644 --- a/include/sysemu/numa.h +++ b/include/sysemu/numa.h @@ -41,6 +41,7 @@ struct NodeInfo { struct HostMemoryBackend *node_memdev; bool present; bool has_cpu; + bool has_gi; uint8_t lb_info_provided; uint16_t initiator; uint8_t distance[MAX_NODES]; diff --git a/net/trace-events b/net/trace-events index 823a071bdc..cda960f42b 100644 --- a/net/trace-events +++ b/net/trace-events @@ -23,3 +23,9 @@ colo_compare_tcp_info(const char *pkt, uint32_t seq, uint32_t ack, int hdlen, in # filter-rewriter.c colo_filter_rewriter_pkt_info(const char *func, const char *src, const char *dst, uint32_t seq, uint32_t ack, uint32_t flag) "%s: src/dst: %s/%s p: seq/ack=%u/%u flags=0x%x" colo_filter_rewriter_conn_offset(uint32_t offset) ": offset=%u" + +# vhost-vdpa.c +vhost_vdpa_set_address_space_id(void *v, unsigned vq_group, unsigned asid_num) "vhost_vdpa: %p vq_group: %u asid: %u" +vhost_vdpa_net_load_cmd(void *s, uint8_t class, uint8_t cmd, int data_num, int data_size) "vdpa state: %p class: %u cmd: %u sg_num: %d size: %d" +vhost_vdpa_net_load_cmd_retval(void *s, uint8_t class, uint8_t cmd, int r) "vdpa state: %p class: %u cmd: %u retval: %d" +vhost_vdpa_net_load_mq(void *s, int ncurqps) "vdpa state: %p current_qpairs: %d" diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index 2a9ddb4552..85e73dd6a7 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -28,6 +28,7 @@ #include "monitor/monitor.h" #include "migration/misc.h" #include "hw/virtio/vhost.h" +#include "trace.h" /* Todo:need to add the multiqueue support here */ typedef struct VhostVDPAState { @@ -286,6 +287,21 @@ static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf, return size; } + +/** From any vdpa net client, get the netclient of the i-th queue pair */ +static VhostVDPAState *vhost_vdpa_net_get_nc_vdpa(VhostVDPAState *s, int i) +{ + NICState *nic = qemu_get_nic(s->nc.peer); + NetClientState *nc_i = qemu_get_peer(nic->ncs, i); + + return DO_UPCAST(VhostVDPAState, nc, nc_i); +} + +static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s) +{ + return vhost_vdpa_net_get_nc_vdpa(s, 0); +} + static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable) { struct vhost_vdpa *v = &s->vhost_vdpa; @@ -307,6 +323,8 @@ static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable) data_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1; cvq = virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) ? n->max_ncs - n->max_queue_pairs : 0; + v->shared->svq_switching = enable ? + SVQ_TSTATE_ENABLING : SVQ_TSTATE_DISABLING; /* * TODO: vhost_net_stop does suspend, get_base and reset. 
We can be smarter * in the future and resume the device if read-only operations between @@ -319,6 +337,7 @@ static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable) if (unlikely(r < 0)) { error_report("unable to start vhost net: %s(%d)", g_strerror(-r), -r); } + v->shared->svq_switching = SVQ_TSTATE_DONE; } static int vdpa_net_migration_state_notifier(NotifierWithReturn *notifier, @@ -444,6 +463,8 @@ static int vhost_vdpa_set_address_space_id(struct vhost_vdpa *v, }; int r; + trace_vhost_vdpa_set_address_space_id(v, vq_group, asid_num); + r = ioctl(v->shared->device_fd, VHOST_VDPA_SET_GROUP_ASID, &asid); if (unlikely(r < 0)) { error_report("Can't set vq group %u asid %u, errno=%d (%s)", @@ -510,7 +531,7 @@ dma_map_err: static int vhost_vdpa_net_cvq_start(NetClientState *nc) { - VhostVDPAState *s; + VhostVDPAState *s, *s0; struct vhost_vdpa *v; int64_t cvq_group; int r; @@ -521,7 +542,8 @@ static int vhost_vdpa_net_cvq_start(NetClientState *nc) s = DO_UPCAST(VhostVDPAState, nc, nc); v = &s->vhost_vdpa; - v->shadow_vqs_enabled = v->shared->shadow_data; + s0 = vhost_vdpa_net_first_nc_vdpa(s); + v->shadow_vqs_enabled = s0->vhost_vdpa.shadow_vqs_enabled; s->vhost_vdpa.address_space_id = VHOST_VDPA_GUEST_PA_ASID; if (v->shared->shadow_data) { @@ -695,6 +717,7 @@ static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s, assert(data_size < vhost_vdpa_net_cvq_cmd_page_len() - sizeof(ctrl)); cmd_size = sizeof(ctrl) + data_size; + trace_vhost_vdpa_net_load_cmd(s, class, cmd, data_num, data_size); if (vhost_svq_available_slots(svq) < 2 || iov_size(out_cursor, 1) < cmd_size) { /* @@ -726,6 +749,7 @@ static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s, r = vhost_vdpa_net_cvq_add(s, &out, 1, &in, 1); if (unlikely(r < 0)) { + trace_vhost_vdpa_net_load_cmd_retval(s, class, cmd, r); return r; } @@ -917,6 +941,8 @@ static int vhost_vdpa_net_load_mq(VhostVDPAState *s, return 0; } + trace_vhost_vdpa_net_load_mq(s, n->curr_queue_pairs); + mq.virtqueue_pairs = cpu_to_le16(n->curr_queue_pairs); const struct iovec data = { .iov_base = &mq, diff --git a/qapi/common.json b/qapi/common.json index f1bb841951..867a9ad9b0 100644 --- a/qapi/common.json +++ b/qapi/common.json @@ -107,10 +107,14 @@ # # @16: 16.0GT/s # +# @32: 32.0GT/s +# +# @64: 64.0GT/s +# # Since: 4.0 ## { 'enum': 'PCIELinkSpeed', - 'data': [ '2_5', '5', '8', '16' ] } + 'data': [ '2_5', '5', '8', '16', '32', '64' ] } ## # @PCIELinkWidth: diff --git a/qapi/qom.json b/qapi/qom.json index 032c6fa037..baae3a183f 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -812,6 +812,21 @@ 'data': { '*fd': 'str' } } ## +# @AcpiGenericInitiatorProperties: +# +# Properties for acpi-generic-initiator objects. +# +# @pci-dev: PCI device ID to be associated with the node +# +# @node: NUMA node associated with the PCI device +# +# Since: 9.0 +## +{ 'struct': 'AcpiGenericInitiatorProperties', + 'data': { 'pci-dev': 'str', + 'node': 'uint32' } } + +## # @RngProperties: # # Properties for objects of classes derived from rng. 
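The svq_switching hunks above bracket the vhost_net stop/start cycle with a tri-state marker, so concurrent code can distinguish an in-flight shadow-virtqueue transition from a completed one. A minimal standalone sketch of the pattern (not the QEMU API)::

    #include <stdio.h>

    typedef enum SVQTransitionState {
        SVQ_TSTATE_DISABLING = -1,
        SVQ_TSTATE_DONE,
        SVQ_TSTATE_ENABLING
    } SVQTransitionState;

    static SVQTransitionState svq_switching = SVQ_TSTATE_DONE;

    static void log_global_enable(int enable)
    {
        svq_switching = enable ? SVQ_TSTATE_ENABLING : SVQ_TSTATE_DISABLING;
        /* ...stop the device, migrate vring state, restart it... */
        svq_switching = SVQ_TSTATE_DONE;
    }

    int main(void)
    {
        log_global_enable(1);
        printf("transition complete: %d\n", svq_switching == SVQ_TSTATE_DONE);
        return 0;
    }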
@@ -928,6 +943,7 @@ ## { 'enum': 'ObjectType', 'data': [ + 'acpi-generic-initiator', 'authz-list', 'authz-listfile', 'authz-pam', @@ -999,6 +1015,7 @@ 'id': 'str' }, 'discriminator': 'qom-type', 'data': { + 'acpi-generic-initiator': 'AcpiGenericInitiatorProperties', 'authz-list': 'AuthZListProperties', 'authz-listfile': 'AuthZListFileProperties', 'authz-pam': 'AuthZPAMProperties', diff --git a/qemu-options.hx b/qemu-options.hx index ac4a30fa83..7fd1713fa8 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -1172,6 +1172,17 @@ SRST Please also refer to the wiki page for general scenarios of VT-d emulation in QEMU: https://wiki.qemu.org/Features/VT-d. +``-device virtio-iommu-pci[,option=...]`` + This is only supported by ``-machine q35`` (x86_64) and ``-machine virt`` (ARM). + It supports the following options: + + ``granule=val`` (possible values are 4k, 8k, 16k, 64k and host; default: host) + This decides the default granule to be exposed by the + virtio-iommu. If host, the granule matches the host page size. + + ``aw-bits=val`` (val between 32 and 64, default depends on machine) + This decides the address width of the IOVA address space. + ERST DEF("name", HAS_ARG, QEMU_OPTION_name, @@ -2718,6 +2729,9 @@ SRST ``-smbios type=4[,sock_pfx=str][,manufacturer=str][,version=str][,serial=str][,asset=str][,part=str][,processor-family=%d][,processor-id=%d]`` Specify SMBIOS type 4 fields +``-smbios type=9[,slot_designation=str][,slot_type=%d][,slot_data_bus_width=%d][,current_usage=%d][,slot_length=%d][,slot_id=%d][,slot_characteristics1=%d][,slot_characteristics2=%d][,pci_device=str]`` + Specify SMBIOS type 9 fields + ``-smbios type=11[,value=str][,path=filename]`` Specify SMBIOS type 11 fields diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index a3b158c671..a879149fef 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -43,6 +43,8 @@ #include <fcntl.h> #include <sys/ioctl.h> #include <linux/vhost.h> +#include <sys/vfs.h> +#include <linux/magic.h> #ifdef __NR_userfaultfd #include <linux/userfaultfd.h> @@ -195,30 +197,58 @@ vu_panic(VuDev *dev, const char *msg, ...) */ } +/* Search for a memory region that covers this guest physical address. */ +static VuDevRegion * +vu_gpa_to_mem_region(VuDev *dev, uint64_t guest_addr) +{ + int low = 0; + int high = dev->nregions - 1; + + /* + * Memory regions cannot overlap in guest physical address space. Each + * GPA belongs to exactly one memory region, so there can only be one + * match. + * + * We store our memory regions ordered by GPA and can simply perform a + * binary search. + */ + while (low <= high) { + unsigned int mid = low + (high - low) / 2; + VuDevRegion *cur = &dev->regions[mid]; + + if (guest_addr >= cur->gpa && guest_addr < cur->gpa + cur->size) { + return cur; + } + if (guest_addr >= cur->gpa + cur->size) { + low = mid + 1; + } + if (guest_addr < cur->gpa) { + high = mid - 1; + } + } + return NULL; +} + /* Translate guest physical address to our virtual address. */ void * vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr) { - unsigned int i; + VuDevRegion *r; if (*plen == 0) { return NULL; } - /* Find matching memory region.
*/ - for (i = 0; i < dev->nregions; i++) { - VuDevRegion *r = &dev->regions[i]; - - if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) { - if ((guest_addr + *plen) > (r->gpa + r->size)) { - *plen = r->gpa + r->size - guest_addr; - } - return (void *)(uintptr_t) - guest_addr - r->gpa + r->mmap_addr + r->mmap_offset; - } + r = vu_gpa_to_mem_region(dev, guest_addr); + if (!r) { + return NULL; } - return NULL; + if ((guest_addr + *plen) > (r->gpa + r->size)) { + *plen = r->gpa + r->size - guest_addr; + } + return (void *)(uintptr_t)guest_addr - r->gpa + r->mmap_addr + + r->mmap_offset; } /* Translate qemu virtual address to our virtual address. */ @@ -241,6 +271,221 @@ qva_to_va(VuDev *dev, uint64_t qemu_addr) } static void +vu_remove_all_mem_regs(VuDev *dev) +{ + unsigned int i; + + for (i = 0; i < dev->nregions; i++) { + VuDevRegion *r = &dev->regions[i]; + + munmap((void *)(uintptr_t)r->mmap_addr, r->size + r->mmap_offset); + } + dev->nregions = 0; +} + +static bool +map_ring(VuDev *dev, VuVirtq *vq) +{ + vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr); + vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr); + vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr); + + DPRINT("Setting virtq addresses:\n"); + DPRINT(" vring_desc at %p\n", vq->vring.desc); + DPRINT(" vring_used at %p\n", vq->vring.used); + DPRINT(" vring_avail at %p\n", vq->vring.avail); + + return !(vq->vring.desc && vq->vring.used && vq->vring.avail); +} + +static bool +vu_is_vq_usable(VuDev *dev, VuVirtq *vq) +{ + if (unlikely(dev->broken)) { + return false; + } + + if (likely(vq->vring.avail)) { + return true; + } + + /* + * In corner cases, we might temporarily remove a memory region that + * mapped a ring. When removing a memory region we make sure to + * unmap any rings that would be impacted. Let's try to remap if we + * already succeeded mapping this ring once. 
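The new vu_gpa_to_mem_region() lookup relies on two invariants kept by _vu_add_mem_reg(): regions are sorted by GPA and never overlap. A self-contained sketch of the same interval binary search, using hypothetical region values::

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct Region {
        uint64_t gpa;
        uint64_t size;
    } Region;

    /* Binary search over non-overlapping regions sorted by gpa. */
    static Region *find_region(Region *regs, size_t n, uint64_t addr)
    {
        size_t low = 0, high = n;   /* half-open interval [low, high) */

        while (low < high) {
            size_t mid = low + (high - low) / 2;

            if (addr >= regs[mid].gpa + regs[mid].size) {
                low = mid + 1;
            } else if (addr < regs[mid].gpa) {
                high = mid;
            } else {
                return &regs[mid];
            }
        }
        return NULL;
    }

    int main(void)
    {
        Region regs[] = {
            { 0x0000, 0x1000 }, { 0x2000, 0x1000 }, { 0x4000, 0x4000 },
        };
        Region *r = find_region(regs, sizeof(regs) / sizeof(regs[0]), 0x2800);

        /* 0x2800 falls inside [0x2000, 0x3000). */
        printf("hit: %s\n", r ? "yes" : "no");
        return 0;
    }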
+ */ + if (!vq->vra.desc_user_addr || !vq->vra.used_user_addr || + !vq->vra.avail_user_addr) { + return false; + } + if (map_ring(dev, vq)) { + vu_panic(dev, "remapping queue on access"); + return false; + } + return true; +} + +static void +unmap_rings(VuDev *dev, VuDevRegion *r) +{ + int i; + + for (i = 0; i < dev->max_queues; i++) { + VuVirtq *vq = &dev->vq[i]; + const uintptr_t desc = (uintptr_t)vq->vring.desc; + const uintptr_t used = (uintptr_t)vq->vring.used; + const uintptr_t avail = (uintptr_t)vq->vring.avail; + + if (desc < r->mmap_addr || desc >= r->mmap_addr + r->size) { + continue; + } + if (used < r->mmap_addr || used >= r->mmap_addr + r->size) { + continue; + } + if (avail < r->mmap_addr || avail >= r->mmap_addr + r->size) { + continue; + } + + DPRINT("Unmapping rings of queue %d\n", i); + vq->vring.desc = NULL; + vq->vring.used = NULL; + vq->vring.avail = NULL; + } +} + +static size_t +get_fd_hugepagesize(int fd) +{ +#if defined(__linux__) + struct statfs fs; + int ret; + + do { + ret = fstatfs(fd, &fs); + } while (ret != 0 && errno == EINTR); + + if (!ret && (unsigned int)fs.f_type == HUGETLBFS_MAGIC) { + return fs.f_bsize; + } +#endif + return 0; +} + +static void +_vu_add_mem_reg(VuDev *dev, VhostUserMemoryRegion *msg_region, int fd) +{ + const uint64_t start_gpa = msg_region->guest_phys_addr; + const uint64_t end_gpa = start_gpa + msg_region->memory_size; + int prot = PROT_READ | PROT_WRITE; + uint64_t mmap_offset, fd_offset; + size_t hugepagesize; + VuDevRegion *r; + void *mmap_addr; + int low = 0; + int high = dev->nregions - 1; + unsigned int idx; + + DPRINT("Adding region %d\n", dev->nregions); + DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", + msg_region->guest_phys_addr); + DPRINT(" memory_size: 0x%016"PRIx64"\n", + msg_region->memory_size); + DPRINT(" userspace_addr: 0x%016"PRIx64"\n", + msg_region->userspace_addr); + DPRINT(" old mmap_offset: 0x%016"PRIx64"\n", + msg_region->mmap_offset); + + if (dev->postcopy_listening) { + /* + * In postcopy we're using PROT_NONE here to catch anyone + * accessing it before we userfault + */ + prot = PROT_NONE; + } + + /* + * We will add memory regions into the array sorted by GPA. Perform a + * binary search to locate the insertion point: it will be at the low + * index. + */ + while (low <= high) { + unsigned int mid = low + (high - low) / 2; + VuDevRegion *cur = &dev->regions[mid]; + + /* Overlap of GPA addresses. */ + if (start_gpa < cur->gpa + cur->size && cur->gpa < end_gpa) { + vu_panic(dev, "regions with overlapping guest physical addresses"); + return; + } + if (start_gpa >= cur->gpa + cur->size) { + low = mid + 1; + } + if (start_gpa < cur->gpa) { + high = mid - 1; + } + } + idx = low; + + /* + * Convert most of msg_region->mmap_offset to fd_offset. In almost all + * cases, this will leave us with mmap_offset == 0, mmap()'ing only + * what we really need. Only if a memory region would partially cover + * hugetlb pages, we'd get mmap_offset != 0, which usually doesn't happen + * anymore (i.e., modern QEMU). + * + * Note that mmap() with hugetlb would fail if the offset into the file + * is not aligned to the huge page size. 
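A worked example of the offset split described in the comment above, assuming 2 MiB huge pages and a hypothetical mmap_offset: mmap() needs an fd offset aligned to the huge page size, so only the unaligned remainder stays in mmap_offset::

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Same shape as QEMU's ALIGN_DOWN(). */
    #define ALIGN_DOWN(n, m) ((n) / (m) * (m))

    int main(void)
    {
        uint64_t hugepagesize = 2ULL * 1024 * 1024;   /* assumed 2 MiB */
        uint64_t msg_mmap_offset = 0x3ff000;          /* hypothetical */
        uint64_t fd_offset = ALIGN_DOWN(msg_mmap_offset, hugepagesize);
        uint64_t mmap_offset = msg_mmap_offset - fd_offset;

        /* fd_offset = 0x200000 (aligned), mmap_offset = 0x1ff000. */
        printf("fd_offset=0x%" PRIx64 " mmap_offset=0x%" PRIx64 "\n",
               fd_offset, mmap_offset);
        return 0;
    }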
+ */ + hugepagesize = get_fd_hugepagesize(fd); + if (hugepagesize) { + fd_offset = ALIGN_DOWN(msg_region->mmap_offset, hugepagesize); + mmap_offset = msg_region->mmap_offset - fd_offset; + } else { + fd_offset = msg_region->mmap_offset; + mmap_offset = 0; + } + + DPRINT(" fd_offset: 0x%016"PRIx64"\n", + fd_offset); + DPRINT(" new mmap_offset: 0x%016"PRIx64"\n", + mmap_offset); + + mmap_addr = mmap(0, msg_region->memory_size + mmap_offset, + prot, MAP_SHARED | MAP_NORESERVE, fd, fd_offset); + if (mmap_addr == MAP_FAILED) { + vu_panic(dev, "region mmap error: %s", strerror(errno)); + return; + } + DPRINT(" mmap_addr: 0x%016"PRIx64"\n", + (uint64_t)(uintptr_t)mmap_addr); + +#if defined(__linux__) + /* Don't include all guest memory in a coredump. */ + madvise(mmap_addr, msg_region->memory_size + mmap_offset, + MADV_DONTDUMP); +#endif + + /* Shift all affected entries by 1 to open a hole at idx. */ + r = &dev->regions[idx]; + memmove(r + 1, r, sizeof(VuDevRegion) * (dev->nregions - idx)); + r->gpa = msg_region->guest_phys_addr; + r->size = msg_region->memory_size; + r->qva = msg_region->userspace_addr; + r->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; + r->mmap_offset = mmap_offset; + dev->nregions++; + + if (dev->postcopy_listening) { + /* + * Return the address to QEMU so that it can translate the ufd + * fault addresses back. + */ + msg_region->userspace_addr = r->mmap_addr + r->mmap_offset; + } +} + +static void vmsg_close_fds(VhostUserMsg *vmsg) { int i; @@ -613,21 +858,6 @@ vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg) } static bool -map_ring(VuDev *dev, VuVirtq *vq) -{ - vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr); - vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr); - vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr); - - DPRINT("Setting virtq addresses:\n"); - DPRINT(" vring_desc at %p\n", vq->vring.desc); - DPRINT(" vring_used at %p\n", vq->vring.used); - DPRINT(" vring_avail at %p\n", vq->vring.avail); - - return !(vq->vring.desc && vq->vring.used && vq->vring.avail); -} - -static bool generate_faults(VuDev *dev) { unsigned int i; for (i = 0; i < dev->nregions; i++) { @@ -710,11 +940,7 @@ generate_faults(VuDev *dev) { static bool vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { - int i; - bool track_ramblocks = dev->postcopy_listening; VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m; - VuDevRegion *dev_region = &dev->regions[dev->nregions]; - void *mmap_addr; if (vmsg->fd_num != 1) { vmsg_close_fds(vmsg); @@ -744,84 +970,24 @@ vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { * we know all the postcopy client bases have been received, and we * should start generating faults. */ - if (track_ramblocks && + if (dev->postcopy_listening && vmsg->size == sizeof(vmsg->payload.u64) && vmsg->payload.u64 == 0) { (void)generate_faults(dev); return false; } - DPRINT("Adding region: %u\n", dev->nregions); - DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", - msg_region->guest_phys_addr); - DPRINT(" memory_size: 0x%016"PRIx64"\n", - msg_region->memory_size); - DPRINT(" userspace_addr 0x%016"PRIx64"\n", - msg_region->userspace_addr); - DPRINT(" mmap_offset 0x%016"PRIx64"\n", - msg_region->mmap_offset); - - dev_region->gpa = msg_region->guest_phys_addr; - dev_region->size = msg_region->memory_size; - dev_region->qva = msg_region->userspace_addr; - dev_region->mmap_offset = msg_region->mmap_offset; - - /* - * We don't use offset argument of mmap() since the - * mapped address has to be page aligned, and we use huge - * pages. 
- */ - if (track_ramblocks) { - /* - * In postcopy we're using PROT_NONE here to catch anyone - * accessing it before we userfault. - */ - mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, - PROT_NONE, MAP_SHARED | MAP_NORESERVE, - vmsg->fds[0], 0); - } else { - mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, - vmsg->fds[0], 0); - } - - if (mmap_addr == MAP_FAILED) { - vu_panic(dev, "region mmap error: %s", strerror(errno)); - } else { - dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; - DPRINT(" mmap_addr: 0x%016"PRIx64"\n", - dev_region->mmap_addr); - } - + _vu_add_mem_reg(dev, msg_region, vmsg->fds[0]); close(vmsg->fds[0]); - if (track_ramblocks) { - /* - * Return the address to QEMU so that it can translate the ufd - * fault addresses back. - */ - msg_region->userspace_addr = (uintptr_t)(mmap_addr + - dev_region->mmap_offset); - + if (dev->postcopy_listening) { /* Send the message back to qemu with the addresses filled in. */ vmsg->fd_num = 0; DPRINT("Successfully added new region in postcopy\n"); - dev->nregions++; return true; - } else { - for (i = 0; i < dev->max_queues; i++) { - if (dev->vq[i].vring.desc) { - if (map_ring(dev, &dev->vq[i])) { - vu_panic(dev, "remapping queue %d for new memory region", - i); - } - } - } - - DPRINT("Successfully added new region\n"); - dev->nregions++; - return false; } + DPRINT("Successfully added new region\n"); + return false; } static inline bool reg_equal(VuDevRegion *vudev_reg, @@ -839,8 +1005,8 @@ static inline bool reg_equal(VuDevRegion *vudev_reg, static bool vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m; - unsigned int i; - bool found = false; + unsigned int idx; + VuDevRegion *r; if (vmsg->fd_num > 1) { vmsg_close_fds(vmsg); @@ -867,35 +1033,31 @@ vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { DPRINT(" mmap_offset 0x%016"PRIx64"\n", msg_region->mmap_offset); - for (i = 0; i < dev->nregions; i++) { - if (reg_equal(&dev->regions[i], msg_region)) { - VuDevRegion *r = &dev->regions[i]; - void *ma = (void *) (uintptr_t) r->mmap_addr; - - if (ma) { - munmap(ma, r->size + r->mmap_offset); - } - - /* - * Shift all affected entries by 1 to close the hole at index i and - * zero out the last entry. - */ - memmove(dev->regions + i, dev->regions + i + 1, - sizeof(VuDevRegion) * (dev->nregions - i - 1)); - memset(dev->regions + dev->nregions - 1, 0, sizeof(VuDevRegion)); - DPRINT("Successfully removed a region\n"); - dev->nregions--; - i--; + r = vu_gpa_to_mem_region(dev, msg_region->guest_phys_addr); + if (!r || !reg_equal(r, msg_region)) { + vmsg_close_fds(vmsg); + vu_panic(dev, "Specified region not found\n"); + return false; + } - found = true; + /* + * There might be valid cases where we temporarily remove memory regions + * to readd them again, or remove memory regions and don't use the rings + * anymore before we set the ring addresses and restart the device. + * + * Unmap all affected rings, remapping them on demand later. This should + * be a corner case. + */ + unmap_rings(dev, r); - /* Continue the search for eventual duplicates. */ - } - } + munmap((void *)(uintptr_t)r->mmap_addr, r->size + r->mmap_offset); - if (!found) { - vu_panic(dev, "Specified region not found\n"); - } + idx = r - dev->regions; + assert(idx < dev->nregions); + /* Shift all affected entries by 1 to close the hole. 
*/ + memmove(r, r + 1, sizeof(VuDevRegion) * (dev->nregions - idx - 1)); + DPRINT("Successfully removed a region\n"); + dev->nregions--; vmsg_close_fds(vmsg); @@ -921,139 +1083,41 @@ vu_get_shared_object(VuDev *dev, VhostUserMsg *vmsg) } static bool -vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg) -{ - unsigned int i; - VhostUserMemory m = vmsg->payload.memory, *memory = &m; - dev->nregions = memory->nregions; - - DPRINT("Nregions: %u\n", memory->nregions); - for (i = 0; i < dev->nregions; i++) { - void *mmap_addr; - VhostUserMemoryRegion *msg_region = &memory->regions[i]; - VuDevRegion *dev_region = &dev->regions[i]; - - DPRINT("Region %d\n", i); - DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", - msg_region->guest_phys_addr); - DPRINT(" memory_size: 0x%016"PRIx64"\n", - msg_region->memory_size); - DPRINT(" userspace_addr 0x%016"PRIx64"\n", - msg_region->userspace_addr); - DPRINT(" mmap_offset 0x%016"PRIx64"\n", - msg_region->mmap_offset); - - dev_region->gpa = msg_region->guest_phys_addr; - dev_region->size = msg_region->memory_size; - dev_region->qva = msg_region->userspace_addr; - dev_region->mmap_offset = msg_region->mmap_offset; - - /* We don't use offset argument of mmap() since the - * mapped address has to be page aligned, and we use huge - * pages. - * In postcopy we're using PROT_NONE here to catch anyone - * accessing it before we userfault - */ - mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, - PROT_NONE, MAP_SHARED | MAP_NORESERVE, - vmsg->fds[i], 0); - - if (mmap_addr == MAP_FAILED) { - vu_panic(dev, "region mmap error: %s", strerror(errno)); - } else { - dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; - DPRINT(" mmap_addr: 0x%016"PRIx64"\n", - dev_region->mmap_addr); - } - - /* Return the address to QEMU so that it can translate the ufd - * fault addresses back. - */ - msg_region->userspace_addr = (uintptr_t)(mmap_addr + - dev_region->mmap_offset); - close(vmsg->fds[i]); - } - - /* Send the message back to qemu with the addresses filled in */ - vmsg->fd_num = 0; - if (!vu_send_reply(dev, dev->sock, vmsg)) { - vu_panic(dev, "failed to respond to set-mem-table for postcopy"); - return false; - } - - /* Wait for QEMU to confirm that it's registered the handler for the - * faults. 
- */ - if (!dev->read_msg(dev, dev->sock, vmsg) || - vmsg->size != sizeof(vmsg->payload.u64) || - vmsg->payload.u64 != 0) { - vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table"); - return false; - } - - /* OK, now we can go and register the memory and generate faults */ - (void)generate_faults(dev); - - return false; -} - -static bool vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) { - unsigned int i; VhostUserMemory m = vmsg->payload.memory, *memory = &m; + unsigned int i; - for (i = 0; i < dev->nregions; i++) { - VuDevRegion *r = &dev->regions[i]; - void *ma = (void *) (uintptr_t) r->mmap_addr; + vu_remove_all_mem_regs(dev); - if (ma) { - munmap(ma, r->size + r->mmap_offset); - } + DPRINT("Nregions: %u\n", memory->nregions); + for (i = 0; i < memory->nregions; i++) { + _vu_add_mem_reg(dev, &memory->regions[i], vmsg->fds[i]); + close(vmsg->fds[i]); } - dev->nregions = memory->nregions; if (dev->postcopy_listening) { - return vu_set_mem_table_exec_postcopy(dev, vmsg); - } - - DPRINT("Nregions: %u\n", memory->nregions); - for (i = 0; i < dev->nregions; i++) { - void *mmap_addr; - VhostUserMemoryRegion *msg_region = &memory->regions[i]; - VuDevRegion *dev_region = &dev->regions[i]; + /* Send the message back to qemu with the addresses filled in */ + vmsg->fd_num = 0; + if (!vu_send_reply(dev, dev->sock, vmsg)) { + vu_panic(dev, "failed to respond to set-mem-table for postcopy"); + return false; + } - DPRINT("Region %d\n", i); - DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", - msg_region->guest_phys_addr); - DPRINT(" memory_size: 0x%016"PRIx64"\n", - msg_region->memory_size); - DPRINT(" userspace_addr 0x%016"PRIx64"\n", - msg_region->userspace_addr); - DPRINT(" mmap_offset 0x%016"PRIx64"\n", - msg_region->mmap_offset); - - dev_region->gpa = msg_region->guest_phys_addr; - dev_region->size = msg_region->memory_size; - dev_region->qva = msg_region->userspace_addr; - dev_region->mmap_offset = msg_region->mmap_offset; - - /* We don't use offset argument of mmap() since the - * mapped address has to be page aligned, and we use huge - * pages. */ - mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, - vmsg->fds[i], 0); - - if (mmap_addr == MAP_FAILED) { - vu_panic(dev, "region mmap error: %s", strerror(errno)); - } else { - dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; - DPRINT(" mmap_addr: 0x%016"PRIx64"\n", - dev_region->mmap_addr); + /* + * Wait for QEMU to confirm that it's registered the handler for the + * faults. 
+ */ + if (!dev->read_msg(dev, dev->sock, vmsg) || + vmsg->size != sizeof(vmsg->payload.u64) || + vmsg->payload.u64 != 0) { + vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table"); + return false; } - close(vmsg->fds[i]); + /* OK, now we can go and register the memory and generate faults */ + (void)generate_faults(dev); + return false; } for (i = 0; i < dev->max_queues; i++) { @@ -2112,14 +2176,7 @@ vu_deinit(VuDev *dev) { unsigned int i; - for (i = 0; i < dev->nregions; i++) { - VuDevRegion *r = &dev->regions[i]; - void *m = (void *) (uintptr_t) r->mmap_addr; - if (m != MAP_FAILED) { - munmap(m, r->size + r->mmap_offset); - } - } - dev->nregions = 0; + vu_remove_all_mem_regs(dev); for (i = 0; i < dev->max_queues; i++) { VuVirtq *vq = &dev->vq[i]; @@ -2171,6 +2228,8 @@ vu_deinit(VuDev *dev) free(dev->vq); dev->vq = NULL; + free(dev->regions); + dev->regions = NULL; } bool @@ -2205,9 +2264,17 @@ vu_init(VuDev *dev, dev->backend_fd = -1; dev->max_queues = max_queues; + dev->regions = malloc(VHOST_USER_MAX_RAM_SLOTS * sizeof(dev->regions[0])); + if (!dev->regions) { + DPRINT("%s: failed to malloc mem regions\n", __func__); + return false; + } + dev->vq = malloc(max_queues * sizeof(dev->vq[0])); if (!dev->vq) { DPRINT("%s: failed to malloc virtqueues\n", __func__); + free(dev->regions); + dev->regions = NULL; return false; } @@ -2374,8 +2441,7 @@ vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes, idx = vq->last_avail_idx; total_bufs = in_total = out_total = 0; - if (unlikely(dev->broken) || - unlikely(!vq->vring.avail)) { + if (!vu_is_vq_usable(dev, vq)) { goto done; } @@ -2490,8 +2556,7 @@ vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes, bool vu_queue_empty(VuDev *dev, VuVirtq *vq) { - if (unlikely(dev->broken) || - unlikely(!vq->vring.avail)) { + if (!vu_is_vq_usable(dev, vq)) { return true; } @@ -2530,8 +2595,7 @@ vring_notify(VuDev *dev, VuVirtq *vq) static void _vu_queue_notify(VuDev *dev, VuVirtq *vq, bool sync) { - if (unlikely(dev->broken) || - unlikely(!vq->vring.avail)) { + if (!vu_is_vq_usable(dev, vq)) { return; } @@ -2856,8 +2920,7 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz) unsigned int head; VuVirtqElement *elem; - if (unlikely(dev->broken) || - unlikely(!vq->vring.avail)) { + if (!vu_is_vq_usable(dev, vq)) { return NULL; } @@ -3014,8 +3077,7 @@ vu_queue_fill(VuDev *dev, VuVirtq *vq, { struct vring_used_elem uelem; - if (unlikely(dev->broken) || - unlikely(!vq->vring.avail)) { + if (!vu_is_vq_usable(dev, vq)) { return; } @@ -3044,8 +3106,7 @@ vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count) { uint16_t old, new; - if (unlikely(dev->broken) || - unlikely(!vq->vring.avail)) { + if (!vu_is_vq_usable(dev, vq)) { return; } diff --git a/subprojects/libvhost-user/libvhost-user.h b/subprojects/libvhost-user/libvhost-user.h index c2352904f0..deb40e77b3 100644 --- a/subprojects/libvhost-user/libvhost-user.h +++ b/subprojects/libvhost-user/libvhost-user.h @@ -31,10 +31,12 @@ #define VHOST_MEMORY_BASELINE_NREGIONS 8 /* - * Set a reasonable maximum number of ram slots, which will be supported by - * any architecture. + * vhost in the kernel usually supports 509 mem slots. 509 used to be the + * KVM limit, it supported 512, but 3 were used for internal purposes. This + * limit is sufficient to support many DIMMs and virtio-mem in + * "dynamic-memslots" mode. 
*/ -#define VHOST_USER_MAX_RAM_SLOTS 32 +#define VHOST_USER_MAX_RAM_SLOTS 509 #define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64) @@ -398,7 +400,7 @@ typedef struct VuDevInflightInfo { struct VuDev { int sock; uint32_t nregions; - VuDevRegion regions[VHOST_USER_MAX_RAM_SLOTS]; + VuDevRegion *regions; VuVirtq *vq; VuDevInflightInfo inflight_info; int log_call_fd; diff --git a/tests/qtest/virtio-iommu-test.c b/tests/qtest/virtio-iommu-test.c index 068e7a9e6c..afb225971d 100644 --- a/tests/qtest/virtio-iommu-test.c +++ b/tests/qtest/virtio-iommu-test.c @@ -34,7 +34,7 @@ static void pci_config(void *obj, void *data, QGuestAllocator *t_alloc) uint8_t bypass = qvirtio_config_readb(dev, 36); g_assert_cmpint(input_range_start, ==, 0); - g_assert_cmphex(input_range_end, ==, UINT64_MAX); + g_assert_cmphex(input_range_end, >=, UINT32_MAX); g_assert_cmpint(domain_range_start, ==, 0); g_assert_cmpint(domain_range_end, ==, UINT32_MAX); g_assert_cmpint(bypass, ==, 1);
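For scale: with the slot limit raised to 509 and VuDevRegion holding five 8-byte fields, the regions array alone is roughly 20 KiB, which is why it moves from an inline array to a heap allocation in vu_init(). A standalone sketch of the cost (the struct mirrors the libvhost-user layout)::

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Mirrors libvhost-user's VuDevRegion: five uint64_t fields, 40 bytes. */
    typedef struct VuDevRegion {
        uint64_t gpa;
        uint64_t size;
        uint64_t qva;
        uint64_t mmap_addr;
        uint64_t mmap_offset;
    } VuDevRegion;

    #define VHOST_USER_MAX_RAM_SLOTS 509

    int main(void)
    {
        VuDevRegion *regions = malloc(VHOST_USER_MAX_RAM_SLOTS *
                                      sizeof(regions[0]));

        if (!regions) {
            return EXIT_FAILURE;
        }
        /* 509 * 40 = 20360 bytes. */
        printf("regions array: %zu bytes\n",
               VHOST_USER_MAX_RAM_SLOTS * sizeof(regions[0]));
        free(regions);
        return EXIT_SUCCESS;
    }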