From 722f8c51d8af223751dfb1d02de40043e8ba067e Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin"
Date: Mon, 9 Dec 2019 11:46:13 -0500
Subject: virtio: add ability to delete vq through a pointer

Devices tend to maintain vq pointers, so allow deleting a queue through
a vq pointer.

Signed-off-by: Michael S. Tsirkin
Reviewed-by: David Hildenbrand
---
 include/hw/virtio/virtio.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index c32a815303..e18756d50d 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -183,6 +183,8 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
 
 void virtio_del_queue(VirtIODevice *vdev, int n);
 
+void virtio_delete_queue(VirtQueue *vq);
+
 void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem,
                     unsigned int len);
 void virtqueue_flush(VirtQueue *vq, unsigned int count);
--
cgit v1.2.3
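As a usage sketch (hypothetical device code, not part of this patch): a
device that keeps its VirtQueue pointers can now tear them down directly,
instead of recovering each queue index for virtio_del_queue(). MyDevice
and its vq fields below are illustrative.

    /* Hypothetical unrealize path using the new API. */
    static void my_device_unrealize(DeviceState *dev)
    {
        MyDevice *s = MY_DEVICE(dev);

        virtio_delete_queue(s->req_vq);   /* delete through the pointer */
        virtio_delete_queue(s->ctrl_vq);
        virtio_cleanup(VIRTIO_DEVICE(dev));
    }
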
From 9d7bd0826f2d19f88631ad7078662668148f7b5f Mon Sep 17 00:00:00 2001
From: Michael Roth
Date: Tue, 19 Nov 2019 18:50:03 -0600
Subject: virtio-pci: disable vring processing when bus-mastering is disabled

Currently the SLOF firmware for pseries guests will disable/re-enable
a PCI device multiple times via the IO/MEM/MASTER bits of the
PCI_COMMAND register after the initial probe/feature negotiation, since
it tends to work with a single device at a time at various stages, like
probing and running block/network bootloaders, without doing a full
reset in between.

In QEMU, when PCI_COMMAND_MASTER is disabled we disable the
corresponding IOMMU memory region, so DMA accesses (including to vring
fields like idx/flags) will no longer undergo the necessary
translation. Normally we wouldn't expect this to happen, since it would
be misbehavior on the driver side to continue driving DMA requests.

However, in the case of pseries, with iommu_platform=on, we trigger the
following sequence when tearing down the virtio-blk dataplane ioeventfd
in response to the guest unsetting PCI_COMMAND_MASTER:

  #2  0x0000555555922651 in virtqueue_map_desc (vdev=vdev@entry=0x555556dbcfb0,
      p_num_sg=p_num_sg@entry=0x7fffe657e1a8, addr=addr@entry=0x7fffe657e240,
      iov=iov@entry=0x7fffe6580240, max_num_sg=max_num_sg@entry=1024,
      is_write=is_write@entry=false, pa=0, sz=0)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:757
  #3  0x0000555555922a89 in virtqueue_pop (vq=vq@entry=0x555556dc8660, sz=sz@entry=184)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:950
  #4  0x00005555558d3eca in virtio_blk_get_request (vq=0x555556dc8660, s=0x555556dbcfb0)
      at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:255
  #5  0x00005555558d3eca in virtio_blk_handle_vq (s=0x555556dbcfb0, vq=0x555556dc8660)
      at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:776
  #6  0x000055555591dd66 in virtio_queue_notify_aio_vq (vq=vq@entry=0x555556dc8660)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1550
  #7  0x000055555591ecef in virtio_queue_notify_aio_vq (vq=0x555556dc8660)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1546
  #8  0x000055555591ecef in virtio_queue_host_notifier_aio_poll (opaque=0x555556dc86c8)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:2527
  #9  0x0000555555d02164 in run_poll_handlers_once (ctx=ctx@entry=0x55555688bfc0,
      timeout=timeout@entry=0x7fffe65844a8)
      at /home/mdroth/w/qemu.git/util/aio-posix.c:520
  #10 0x0000555555d02d1b in try_poll_mode (timeout=0x7fffe65844a8, ctx=0x55555688bfc0)
      at /home/mdroth/w/qemu.git/util/aio-posix.c:607
  #11 0x0000555555d02d1b in aio_poll (ctx=ctx@entry=0x55555688bfc0,
      blocking=blocking@entry=true)
      at /home/mdroth/w/qemu.git/util/aio-posix.c:639
  #12 0x0000555555d0004d in aio_wait_bh_oneshot (ctx=0x55555688bfc0,
      cb=cb@entry=0x5555558d5130 <virtio_blk_data_plane_stop_bh>,
      opaque=opaque@entry=0x555556de86f0)
      at /home/mdroth/w/qemu.git/util/aio-wait.c:71
  #13 0x00005555558d59bf in virtio_blk_data_plane_stop (vdev=<optimized out>)
      at /home/mdroth/w/qemu.git/hw/block/dataplane/virtio-blk.c:288
  #14 0x0000555555b906a1 in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:245
  #15 0x0000555555b90dbb in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:237
  #16 0x0000555555b92a8e in virtio_pci_stop_ioeventfd (proxy=0x555556db4e40)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:292
  #17 0x0000555555b92a8e in virtio_write_config (pci_dev=0x555556db4e40,
      address=<optimized out>, val=1048832, len=<optimized out>)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:613

I.e. the calling code is only scheduling a one-shot BH for
virtio_blk_data_plane_stop_bh, but somehow we end up trying to process
an additional virtqueue entry before we get there. This is likely due
to the following check in virtio_queue_host_notifier_aio_poll:

  static bool virtio_queue_host_notifier_aio_poll(void *opaque)
  {
      EventNotifier *n = opaque;
      VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
      bool progress;

      if (!vq->vring.desc || virtio_queue_empty(vq)) {
          return false;
      }

      progress = virtio_queue_notify_aio_vq(vq);

namely the call to virtio_queue_empty(). In this case, since no new
requests have actually been issued, shadow_avail_idx == last_avail_idx,
so we actually try to access the vring via vring_avail_idx() to get
the latest non-shadowed idx:

  int virtio_queue_empty(VirtQueue *vq)
  {
      bool empty;
      ...
      if (vq->shadow_avail_idx != vq->last_avail_idx) {
          return 0;
      }

      rcu_read_lock();
      empty = vring_avail_idx(vq) == vq->last_avail_idx;
      rcu_read_unlock();
      return empty;

but since the IOMMU region has been disabled we get a bogus value (0
usually), which causes virtio_queue_empty() to falsely report that
there are entries to be processed, which causes errors such as:

  "virtio: zero sized buffers are not allowed"

or

  "virtio-blk missing headers"

and puts the device in an error state.

This patch works around the issue by introducing virtio_set_disabled(),
which sets a 'disabled' flag to bypass checks like virtio_queue_empty()
when bus-mastering is disabled. Since we'd check this flag at all the
same sites as vdev->broken, we replace those checks with an inline
function which checks for either vdev->broken or vdev->disabled.

The 'disabled' flag is only migrated when set, which should be fairly
rare, but to maintain migration compatibility we disable its use for
older machine types. Users requiring the use of the flag in conjunction
with older machine types can set it explicitly as a virtio-device
option.

NOTES:

 - This leaves some other oddities in play, like the fact that
   DRIVER_OK also gets unset in response to bus-mastering being
   disabled, but not restored (however the device seems to continue
   working).
 - Similarly, we disable the host notifier via
   virtio_bus_stop_ioeventfd(), which seems to move the handling out of
   virtio-blk dataplane and back into the main IO thread, and it ends
   up staying there till a reset (but otherwise continues working
   normally).

Cc: David Gibson
Cc: Alexey Kardashevskiy
Cc: "Michael S. Tsirkin"
Signed-off-by: Michael Roth
Message-Id: <20191120005003.27035-1-mdroth@linux.vnet.ibm.com>
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 include/hw/virtio/virtio.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index e18756d50d..777772475c 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -100,6 +100,8 @@ struct VirtIODevice
     uint16_t device_id;
     bool vm_running;
     bool broken; /* device in invalid state, needs reset */
+    bool use_disabled_flag; /* allow use of 'disable' flag when needed */
+    bool disabled; /* device in temporarily disabled state */
     bool use_started;
     bool started;
     bool start_on_kick; /* when virtio 1.0 feature has not been negotiated */
@@ -380,4 +382,17 @@ static inline void virtio_set_started(VirtIODevice *vdev, bool started)
         vdev->started = started;
     }
 }
+
+static inline void virtio_set_disabled(VirtIODevice *vdev, bool disable)
+{
+    if (vdev->use_disabled_flag) {
+        vdev->disabled = disable;
+    }
+}
+
+static inline bool virtio_device_disabled(VirtIODevice *vdev)
+{
+    return unlikely(vdev->disabled || vdev->broken);
+}
+
 #endif
--
cgit v1.2.3
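For illustration, a minimal sketch of what a converted check site looks
like (the real call-site changes live in hw/virtio/virtio.c and are not
part of this header-only diff; the body below is abridged):

    /* Abridged, illustrative version of a vring accessor guarded by
     * the new predicate. */
    int virtio_queue_empty(VirtQueue *vq)
    {
        if (virtio_device_disabled(vq->vdev)) {
            /* Bus-mastering is off (or the device is broken): the IOMMU
             * region may be disabled, so vring_avail_idx() would read a
             * bogus value.  Report "empty" without touching the vring. */
            return 1;
        }
        ...
    }
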
From d0435bc513e23a4961b6af20164d1c6c219eb4ea Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi
Date: Mon, 9 Dec 2019 21:09:57 +0000
Subject: virtio: don't enable notifications during polling

Virtqueue notifications are not necessary during polling, so we disable
them. This allows the guest driver to avoid MMIO vmexits.
Unfortunately the virtio-blk and virtio-scsi handler functions
re-enable notifications, defeating this optimization. Fix virtio-blk
and virtio-scsi emulation so they leave notifications disabled.

The key thing to remember for correctness is that polling always checks
one last time after ending its loop, therefore it's safe to lose the
race when re-enabling notifications at the end of polling.

There is a measurable performance improvement of 5-10% with the null-co
block driver. Real-life storage configurations will see a smaller
improvement because the MMIO vmexit overhead contributes less to
latency.

Signed-off-by: Stefan Hajnoczi
Message-Id: <20191209210957.65087-1-stefanha@redhat.com>
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 include/hw/virtio/virtio.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 777772475c..b69d517496 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -228,6 +228,7 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id);
 
 void virtio_notify_config(VirtIODevice *vdev);
 
+bool virtio_queue_get_notification(VirtQueue *vq);
 void virtio_queue_set_notification(VirtQueue *vq, int enable);
 
 int virtio_queue_ready(VirtQueue *vq);
--
cgit v1.2.3
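The handler-side pattern this implies, sketched from the description
above (abridged and illustrative; the actual changes are in
hw/block/virtio-blk.c and hw/scsi/virtio-scsi.c): toggle notifications
only if they were enabled on entry, so the polling path, which runs with
notifications already disabled, leaves them disabled.

    void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
    {
        /* True only on the non-polling (ioeventfd) path: notifications
         * were enabled when the handler was entered. */
        bool suppress_notifications = virtio_queue_get_notification(vq);

        do {
            if (suppress_notifications) {
                virtio_queue_set_notification(vq, 0);
            }

            /* ... pop and process requests ... */

            if (suppress_notifications) {
                virtio_queue_set_notification(vq, 1);
            }
        } while (!virtio_queue_empty(vq));
    }
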
From 244b3f4485a07c7ce4b7123d6ce9d8c6012756e8 Mon Sep 17 00:00:00 2001
From: Tao Xu
Date: Fri, 13 Dec 2019 09:19:22 +0800
Subject: numa: Extend CLI to provide initiator information for numa nodes

In ACPI 6.3 chapter 5.2.27 Heterogeneous Memory Attribute Table (HMAT),
an initiator represents a processor that accesses memory. In 5.2.27.3
Memory Proximity Domain Attributes Structure, the attached initiator is
defined as the proximity domain where the memory controller responsible
for a memory proximity domain resides. With attached initiator
information, the topology of heterogeneous memory can be described.

Add a new machine property 'hmat' to enable all HMAT specific options.
Extend the "-numa node" CLI option to indicate the initiator numa
node-id. In the Linux kernel, the code in drivers/acpi/hmat/hmat.c
parses and reports the platform's HMAT tables.

Before using the initiator option, enable HMAT with -machine hmat=on.

Acked-by: Markus Armbruster
Reviewed-by: Igor Mammedov
Reviewed-by: Jingqi Liu
Suggested-by: Dan Williams
Signed-off-by: Tao Xu
Message-Id: <20191213011929.2520-2-tao3.xu@intel.com>
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 include/sysemu/numa.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
index ae9c41d02b..788cbec7a2 100644
--- a/include/sysemu/numa.h
+++ b/include/sysemu/numa.h
@@ -18,6 +18,8 @@ struct NodeInfo {
     uint64_t node_mem;
     struct HostMemoryBackend *node_memdev;
     bool present;
+    bool has_cpu;
+    uint16_t initiator;
     uint8_t distance[MAX_NODES];
 };
 
@@ -33,6 +35,9 @@ struct NumaState {
     /* Allow setting NUMA distance for different NUMA nodes */
     bool have_numa_distance;
 
+    /* Detect if HMAT support is enabled. */
+    bool hmat_enabled;
+
     /* NUMA nodes information */
     NodeInfo nodes[MAX_NODES];
 };
--
cgit v1.2.3

From 9b12dfa03a94d7f7a4b54eb67229a31e58193384 Mon Sep 17 00:00:00 2001
From: Liu Jingqi
Date: Fri, 13 Dec 2019 09:19:23 +0800
Subject: numa: Extend CLI to provide memory latency and bandwidth information

Add the -numa hmat-lb option to provide System Locality Latency and
Bandwidth Information. These memory attributes help to build System
Locality Latency and Bandwidth Information Structure(s) in the ACPI
Heterogeneous Memory Attribute Table (HMAT).

Before using the hmat-lb option, enable HMAT with -machine hmat=on.

Acked-by: Markus Armbruster
Signed-off-by: Liu Jingqi
Signed-off-by: Tao Xu
Message-Id: <20191213011929.2520-3-tao3.xu@intel.com>
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
Reviewed-by: Igor Mammedov
---
 include/sysemu/numa.h | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

(limited to 'include')

diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
index 788cbec7a2..70f93c83d7 100644
--- a/include/sysemu/numa.h
+++ b/include/sysemu/numa.h
@@ -14,11 +14,34 @@ struct CPUArchId;
 #define NUMA_DISTANCE_MAX 254
 #define NUMA_DISTANCE_UNREACHABLE 255
 
+/* the value of AcpiHmatLBInfo flags */
+enum {
+    HMAT_LB_MEM_MEMORY = 0,
+    HMAT_LB_MEM_CACHE_1ST_LEVEL = 1,
+    HMAT_LB_MEM_CACHE_2ND_LEVEL = 2,
+    HMAT_LB_MEM_CACHE_3RD_LEVEL = 3,
+    HMAT_LB_LEVELS /* must be the last entry */
+};
+
+/* the value of AcpiHmatLBInfo data type */
+enum {
+    HMAT_LB_DATA_ACCESS_LATENCY = 0,
+    HMAT_LB_DATA_READ_LATENCY = 1,
+    HMAT_LB_DATA_WRITE_LATENCY = 2,
+    HMAT_LB_DATA_ACCESS_BANDWIDTH = 3,
+    HMAT_LB_DATA_READ_BANDWIDTH = 4,
+    HMAT_LB_DATA_WRITE_BANDWIDTH = 5,
+    HMAT_LB_TYPES /* must be the last entry */
+};
+
+#define UINT16_BITS 16
+
 struct NodeInfo {
     uint64_t node_mem;
     struct HostMemoryBackend *node_memdev;
     bool present;
     bool has_cpu;
+    uint8_t lb_info_provided;
     uint16_t initiator;
     uint8_t distance[MAX_NODES];
 };
@@ -28,6 +51,31 @@ struct NumaNodeMem {
     uint64_t node_plugged_mem;
 };
 
+struct HMAT_LB_Data {
+    uint8_t initiator;
+    uint8_t target;
+    uint64_t data;
+};
+typedef struct HMAT_LB_Data HMAT_LB_Data;
+
+struct HMAT_LB_Info {
+    /* Indicates it's memory or the specified level memory side cache. */
+    uint8_t hierarchy;
+
+    /* Present the type of data, access/read/write latency or bandwidth. */
+    uint8_t data_type;
+
+    /* The range bitmap of bandwidth for calculating common base */
+    uint64_t range_bitmap;
+
+    /* The common base unit for latencies or bandwidths */
+    uint64_t base;
+
+    /* Array to store the latencies or bandwidths */
+    GArray *list;
+};
+typedef struct HMAT_LB_Info HMAT_LB_Info;
+
 struct NumaState {
     /* Number of NUMA nodes */
     int num_nodes;
@@ -40,11 +88,16 @@ struct NumaState {
 
     /* NUMA nodes information */
     NodeInfo nodes[MAX_NODES];
+
+    /* NUMA nodes HMAT Locality Latency and Bandwidth Information */
+    HMAT_LB_Info *hmat_lb[HMAT_LB_LEVELS][HMAT_LB_TYPES];
 };
 typedef struct NumaState NumaState;
 
 void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp);
 void parse_numa_opts(MachineState *ms);
+void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node,
+                        Error **errp);
 void numa_complete_configuration(MachineState *ms);
 void query_numa_node_mem(NumaNodeMem node_mem[], MachineState *ms);
 extern QemuOptsList qemu_numa_opts;
--
cgit v1.2.3
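A combined usage sketch for the two options above (node layout, sizes,
and latency/bandwidth values are illustrative, not from these patches;
node 1 is a memory-only node whose attached initiator is node 0):

    qemu-system-x86_64 -machine hmat=on -m 2G -smp 1 \
        -object memory-backend-ram,size=1G,id=m0 \
        -object memory-backend-ram,size=1G,id=m1 \
        -numa node,nodeid=0,memdev=m0 \
        -numa node,nodeid=1,memdev=m1,initiator=0 \
        -numa cpu,node-id=0,socket-id=0 \
        -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,latency=5 \
        -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,bandwidth=200M \
        -numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,latency=10 \
        -numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=100M
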
From c412a48d4d91e8f8b89aae02de0f44f1f0b729e5 Mon Sep 17 00:00:00 2001
From: Liu Jingqi
Date: Fri, 13 Dec 2019 09:19:24 +0800
Subject: numa: Extend CLI to provide memory side cache information

Add the -numa hmat-cache option to provide Memory Side Cache
Information. These memory attributes help to build Memory Side Cache
Information Structure(s) in the ACPI Heterogeneous Memory Attribute
Table (HMAT).

Before using the hmat-cache option, enable HMAT with -machine hmat=on.

Acked-by: Markus Armbruster
Signed-off-by: Liu Jingqi
Signed-off-by: Tao Xu
Message-Id: <20191213011929.2520-4-tao3.xu@intel.com>
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
Reviewed-by: Igor Mammedov
---
 include/sysemu/numa.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
index 70f93c83d7..ba693cc80b 100644
--- a/include/sysemu/numa.h
+++ b/include/sysemu/numa.h
@@ -91,6 +91,9 @@ struct NumaState {
 
     /* NUMA nodes HMAT Locality Latency and Bandwidth Information */
     HMAT_LB_Info *hmat_lb[HMAT_LB_LEVELS][HMAT_LB_TYPES];
+
+    /* Memory Side Cache Information Structure */
+    NumaHmatCacheOptions *hmat_cache[MAX_NODES][HMAT_LB_LEVELS];
 };
 typedef struct NumaState NumaState;
 
@@ -98,6 +101,8 @@ void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp);
 void parse_numa_opts(MachineState *ms);
 void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node,
                         Error **errp);
+void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node,
+                           Error **errp);
 void numa_complete_configuration(MachineState *ms);
 void query_numa_node_mem(NumaNodeMem node_mem[], MachineState *ms);
 extern QemuOptsList qemu_numa_opts;
--
cgit v1.2.3
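Continuing the command-line sketch above, a memory side cache for node 0
might be described roughly as follows (all attribute values are
illustrative; consult the QEMU invocation documentation for the
authoritative option syntax):

    -numa hmat-cache,node-id=0,size=10K,level=1,associativity=direct,policy=write-back,line=8
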
From f2a7e8f170252081ff48b99f34dbd1f0211d7938 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?=
Date: Mon, 16 Dec 2019 01:21:34 +0100
Subject: hw/pci/pci_host: Let pci_data_[read/write] use unsigned 'size' argument
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both functions are called by MemoryRegionOps.[read/write] handlers with
an unsigned 'size' argument. Both functions call
pci_host_config_[read/write]_common(), which expects a uint32_t 'len'
parameter (also unsigned). Since it is pointless (and confusing) to use
a signed value, use an unsigned type.

Signed-off-by: Philippe Mathieu-Daudé
Message-Id: <20191216002134.18279-3-philmd@redhat.com>
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 include/hw/pci/pci_host.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/hw/pci/pci_host.h b/include/hw/pci/pci_host.h
index ba31595fc7..9ce088bd13 100644
--- a/include/hw/pci/pci_host.h
+++ b/include/hw/pci/pci_host.h
@@ -62,8 +62,8 @@ void pci_host_config_write_common(PCIDevice *pci_dev, uint32_t addr,
 uint32_t pci_host_config_read_common(PCIDevice *pci_dev, uint32_t addr,
                                      uint32_t limit, uint32_t len);
 
-void pci_data_write(PCIBus *s, uint32_t addr, uint32_t val, int len);
-uint32_t pci_data_read(PCIBus *s, uint32_t addr, int len);
+void pci_data_write(PCIBus *s, uint32_t addr, uint32_t val, unsigned len);
+uint32_t pci_data_read(PCIBus *s, uint32_t addr, unsigned len);
 
 extern const MemoryRegionOps pci_host_conf_le_ops;
 extern const MemoryRegionOps pci_host_conf_be_ops;
--
cgit v1.2.3

From 1bf8a989a566b2ba41c197004ec2a02562a766a4 Mon Sep 17 00:00:00 2001
From: Denis Plotnikov
Date: Fri, 20 Dec 2019 17:09:04 +0300
Subject: virtio: make seg_max virtqueue size dependent

Before this patch, the seg_max parameter was immutable and hardcoded to
126 (128 - 2) regardless of queue size. This has two negative effects:

1. when queue size is < 128, we have a Virtio 1.1 specification
   violation (2.6.5.3.1 Driver Requirements): seg_max must be <=
   queue_size. This violation affects old Linux guests (ver < 4.14).
   These guests crash on such queue_size setups.

2. when queue_size is > 128, as was pointed out by Denis Lunev, seg_max
   restricts the guest's block request length, which hurts guest
   performance by making it issue more block requests than needed.
   https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html

To mitigate these two effects, the patch adds a property that adjusts
seg_max to the queue size automatically. Since seg_max is a
guest-visible parameter, the property is managed per machine type and
allows choosing between the old (seg_max = 126 always) and new
(seg_max = queue_size - 2) behaviors.

To keep the behavior of older VMs unchanged, the default seg_max_adjust
value is not set for older machine types.

Reviewed-by: Stefan Hajnoczi
Signed-off-by: Denis Plotnikov
Message-Id: <20191220140905.1718-2-dplotnikov@virtuozzo.com>
Reviewed-by: Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin
---
 include/hw/virtio/virtio-blk.h  | 1 +
 include/hw/virtio/virtio-scsi.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index 9c19f5b634..1e62f869b2 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -38,6 +38,7 @@ struct VirtIOBlkConf
     uint32_t request_merging;
     uint16_t num_queues;
     uint16_t queue_size;
+    bool seg_max_adjust;
     uint32_t max_discard_sectors;
     uint32_t max_write_zeroes_sectors;
     bool x_enable_wce_if_config_wce;

diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
index 122f7c4b6f..24e768909d 100644
--- a/include/hw/virtio/virtio-scsi.h
+++ b/include/hw/virtio/virtio-scsi.h
@@ -48,6 +48,7 @@ typedef struct virtio_scsi_config VirtIOSCSIConfig;
 struct VirtIOSCSIConf {
     uint32_t num_queues;
     uint32_t virtqueue_size;
+    bool seg_max_adjust;
     uint32_t max_sectors;
     uint32_t cmd_per_lun;
 #ifdef CONFIG_VHOST_SCSI
--
cgit v1.2.3
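A sketch of the behavior the new property selects when the device fills
in its config space (abridged from the shape of the virtio-blk change
described above; illustrative, not the full device code):

    /* seg_max_adjust on: track the queue size, preserving the spec's
     * seg_max <= queue_size requirement; off: legacy fixed 126. */
    virtio_stl_p(vdev, &blkcfg.seg_max,
                 s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 128 - 2);
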