diff options
45 files changed, 1797 insertions, 134 deletions
diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index ae61034656..6fd91c7e99 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -576,70 +576,90 @@ vub_new(char *blk_file) return vdev_blk; } +static int opt_fdnum = -1; +static char *opt_socket_path; +static char *opt_blk_file; +static gboolean opt_print_caps; +static gboolean opt_read_only; + +static GOptionEntry entries[] = { + { "print-capabilities", 'c', 0, G_OPTION_ARG_NONE, &opt_print_caps, + "Print capabilities", NULL }, + { "fd", 'f', 0, G_OPTION_ARG_INT, &opt_fdnum, + "Use inherited fd socket", "FDNUM" }, + { "socket-path", 's', 0, G_OPTION_ARG_FILENAME, &opt_socket_path, + "Use UNIX socket path", "PATH" }, + {"blk-file", 'b', 0, G_OPTION_ARG_FILENAME, &opt_blk_file, + "block device or file path", "PATH"}, + { "read-only", 'r', 0, G_OPTION_ARG_NONE, &opt_read_only, + "Enable read-only", NULL } +}; + int main(int argc, char **argv) { - int opt; - char *unix_socket = NULL; - char *blk_file = NULL; - bool enable_ro = false; int lsock = -1, csock = -1; VubDev *vdev_blk = NULL; - - while ((opt = getopt(argc, argv, "b:rs:h")) != -1) { - switch (opt) { - case 'b': - blk_file = g_strdup(optarg); - break; - case 's': - unix_socket = g_strdup(optarg); - break; - case 'r': - enable_ro = true; - break; - case 'h': - default: - printf("Usage: %s [ -b block device or file, -s UNIX domain socket" - " | -r Enable read-only ] | [ -h ]\n", argv[0]); - return 0; + GError *error = NULL; + GOptionContext *context; + + context = g_option_context_new(NULL); + g_option_context_add_main_entries(context, entries, NULL); + if (!g_option_context_parse(context, &argc, &argv, &error)) { + g_printerr("Option parsing failed: %s\n", error->message); + exit(EXIT_FAILURE); + } + if (opt_print_caps) { + g_print("{\n"); + g_print(" \"type\": \"block\",\n"); + g_print(" \"features\": [\n"); + g_print(" \"read-only\",\n"); + g_print(" \"blk-file\"\n"); + g_print(" ]\n"); + g_print("}\n"); + exit(EXIT_SUCCESS); + } + + if (!opt_blk_file) { + g_print("%s\n", g_option_context_get_help(context, true, NULL)); + exit(EXIT_FAILURE); + } + + if (opt_socket_path) { + lsock = unix_sock_new(opt_socket_path); + if (lsock < 0) { + exit(EXIT_FAILURE); } + } else if (opt_fdnum < 0) { + g_print("%s\n", g_option_context_get_help(context, true, NULL)); + exit(EXIT_FAILURE); + } else { + lsock = opt_fdnum; } - if (!unix_socket || !blk_file) { - printf("Usage: %s [ -b block device or file, -s UNIX domain socket" - " | -r Enable read-only ] | [ -h ]\n", argv[0]); - return -1; - } - - lsock = unix_sock_new(unix_socket); - if (lsock < 0) { - goto err; - } - - csock = accept(lsock, (void *)0, (void *)0); + csock = accept(lsock, NULL, NULL); if (csock < 0) { - fprintf(stderr, "Accept error %s\n", strerror(errno)); - goto err; + g_printerr("Accept error %s\n", strerror(errno)); + exit(EXIT_FAILURE); } - vdev_blk = vub_new(blk_file); + vdev_blk = vub_new(opt_blk_file); if (!vdev_blk) { - goto err; + exit(EXIT_FAILURE); } - if (enable_ro) { + if (opt_read_only) { vdev_blk->enable_ro = true; } if (!vug_init(&vdev_blk->parent, VHOST_USER_BLK_MAX_QUEUES, csock, vub_panic_cb, &vub_iface)) { - fprintf(stderr, "Failed to initialized libvhost-user-glib\n"); - goto err; + g_printerr("Failed to initialize libvhost-user-glib\n"); + exit(EXIT_FAILURE); } g_main_loop_run(vdev_blk->loop); - + g_main_loop_unref(vdev_blk->loop); + g_option_context_free(context); vug_deinit(&vdev_blk->parent); - -err: vub_free(vdev_blk); if (csock >= 0) { close(csock); @@ -647,8 +667,8 @@ err: if (lsock >= 0) { close(lsock); } - g_free(unix_socket); - g_free(blk_file); + g_free(opt_socket_path); + g_free(opt_blk_file); return 0; } diff --git a/docs/interop/vhost-user.json b/docs/interop/vhost-user.json index da6aaf51c8..ce0ef74db5 100644 --- a/docs/interop/vhost-user.json +++ b/docs/interop/vhost-user.json @@ -55,6 +55,37 @@ } ## +# @VHostUserBackendBlockFeature: +# +# List of vhost user "block" features. +# +# @read-only: The --read-only command line option is supported. +# @blk-file: The --blk-file command line option is supported. +# +# Since: 5.0 +## +{ + 'enum': 'VHostUserBackendBlockFeature', + 'data': [ 'read-only', 'blk-file' ] +} + +## +# @VHostUserBackendCapabilitiesBlock: +# +# Capabilities reported by vhost user "block" backends +# +# @features: list of supported features. +# +# Since: 5.0 +## +{ + 'struct': 'VHostUserBackendCapabilitiesBlock', + 'data': { + 'features': [ 'VHostUserBackendBlockFeature' ] + } +} + +## # @VHostUserBackendInputFeature: # # List of vhost user "input" features. diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst index 7827b710aa..5f8b3a456b 100644 --- a/docs/interop/vhost-user.rst +++ b/docs/interop/vhost-user.rst @@ -785,6 +785,7 @@ Protocol features #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10 #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11 #define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12 + #define VHOST_USER_PROTOCOL_F_RESET_DEVICE 13 Master message types -------------------- @@ -1190,6 +1191,20 @@ Master message types ancillary data. The GPU protocol is used to inform the master of rendering state and updates. See vhost-user-gpu.rst for details. +``VHOST_USER_RESET_DEVICE`` + :id: 34 + :equivalent ioctl: N/A + :master payload: N/A + :slave payload: N/A + + Ask the vhost user backend to disable all rings and reset all + internal device state to the initial state, ready to be + reinitialized. The backend retains ownership of the device + throughout the reset operation. + + Only valid if the ``VHOST_USER_PROTOCOL_F_RESET_DEVICE`` protocol + feature is set by the backend. + Slave message types ------------------- @@ -1376,3 +1391,20 @@ Command line options: Enable virgl rendering support. (optional) + +vhost-user-blk +-------------- + +Command line options: + +--blk-file=PATH + + Specify block device or file path. + + (optional) + +--read-only + + Enable read-only. + + (optional) diff --git a/hw/acpi/Kconfig b/hw/acpi/Kconfig index 12e3f1e86e..54209c6f2f 100644 --- a/hw/acpi/Kconfig +++ b/hw/acpi/Kconfig @@ -7,6 +7,7 @@ config ACPI_X86 select ACPI_NVDIMM select ACPI_CPU_HOTPLUG select ACPI_MEMORY_HOTPLUG + select ACPI_HMAT config ACPI_X86_ICH bool @@ -23,6 +24,10 @@ config ACPI_NVDIMM bool depends on ACPI +config ACPI_HMAT + bool + depends on ACPI + config ACPI_PCI bool depends on ACPI && PCI @@ -33,5 +38,3 @@ config ACPI_VMGENID depends on PC config ACPI_HW_REDUCED - bool - depends on ACPI diff --git a/hw/acpi/Makefile.objs b/hw/acpi/Makefile.objs index 99253057e1..777da07f4d 100644 --- a/hw/acpi/Makefile.objs +++ b/hw/acpi/Makefile.objs @@ -7,6 +7,7 @@ common-obj-$(CONFIG_ACPI_CPU_HOTPLUG) += cpu.o common-obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o common-obj-$(CONFIG_ACPI_VMGENID) += vmgenid.o common-obj-$(CONFIG_ACPI_HW_REDUCED) += generic_event_device.o +common-obj-$(CONFIG_ACPI_HMAT) += hmat.o common-obj-$(call lnot,$(CONFIG_ACPI_X86)) += acpi-stub.o common-obj-$(call lnot,$(CONFIG_PC)) += acpi-x86-stub.o diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c new file mode 100644 index 0000000000..7c24bb5371 --- /dev/null +++ b/hw/acpi/hmat.c @@ -0,0 +1,268 @@ +/* + * HMAT ACPI Implementation + * + * Copyright(C) 2019 Intel Corporation. + * + * Author: + * Liu jingqi <jingqi.liu@linux.intel.com> + * Tao Xu <tao3.xu@intel.com> + * + * HMAT is defined in ACPI 6.3: 5.2.27 Heterogeneous Memory Attribute Table + * (HMAT) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/> + */ + +#include "qemu/osdep.h" +#include "qemu/units.h" +#include "sysemu/numa.h" +#include "hw/acpi/hmat.h" + +/* + * ACPI 6.3: + * 5.2.27.3 Memory Proximity Domain Attributes Structure: Table 5-145 + */ +static void build_hmat_mpda(GArray *table_data, uint16_t flags, + uint32_t initiator, uint32_t mem_node) +{ + + /* Memory Proximity Domain Attributes Structure */ + /* Type */ + build_append_int_noprefix(table_data, 0, 2); + /* Reserved */ + build_append_int_noprefix(table_data, 0, 2); + /* Length */ + build_append_int_noprefix(table_data, 40, 4); + /* Flags */ + build_append_int_noprefix(table_data, flags, 2); + /* Reserved */ + build_append_int_noprefix(table_data, 0, 2); + /* Proximity Domain for the Attached Initiator */ + build_append_int_noprefix(table_data, initiator, 4); + /* Proximity Domain for the Memory */ + build_append_int_noprefix(table_data, mem_node, 4); + /* Reserved */ + build_append_int_noprefix(table_data, 0, 4); + /* + * Reserved: + * Previously defined as the Start Address of the System Physical + * Address Range. Deprecated since ACPI Spec 6.3. + */ + build_append_int_noprefix(table_data, 0, 8); + /* + * Reserved: + * Previously defined as the Range Length of the region in bytes. + * Deprecated since ACPI Spec 6.3. + */ + build_append_int_noprefix(table_data, 0, 8); +} + +/* + * ACPI 6.3: 5.2.27.4 System Locality Latency and Bandwidth Information + * Structure: Table 5-146 + */ +static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, + uint32_t num_initiator, uint32_t num_target, + uint32_t *initiator_list) +{ + int i, index; + HMAT_LB_Data *lb_data; + uint16_t *entry_list; + uint32_t base; + /* Length in bytes for entire structure */ + uint32_t lb_length + = 32 /* Table length upto and including Entry Base Unit */ + + 4 * num_initiator /* Initiator Proximity Domain List */ + + 4 * num_target /* Target Proximity Domain List */ + + 2 * num_initiator * num_target; /* Latency or Bandwidth Entries */ + + /* Type */ + build_append_int_noprefix(table_data, 1, 2); + /* Reserved */ + build_append_int_noprefix(table_data, 0, 2); + /* Length */ + build_append_int_noprefix(table_data, lb_length, 4); + /* Flags: Bits [3:0] Memory Hierarchy, Bits[7:4] Reserved */ + assert(!(hmat_lb->hierarchy >> 4)); + build_append_int_noprefix(table_data, hmat_lb->hierarchy, 1); + /* Data Type */ + build_append_int_noprefix(table_data, hmat_lb->data_type, 1); + /* Reserved */ + build_append_int_noprefix(table_data, 0, 2); + /* Number of Initiator Proximity Domains (s) */ + build_append_int_noprefix(table_data, num_initiator, 4); + /* Number of Target Proximity Domains (t) */ + build_append_int_noprefix(table_data, num_target, 4); + /* Reserved */ + build_append_int_noprefix(table_data, 0, 4); + + /* Entry Base Unit */ + if (hmat_lb->data_type <= HMAT_LB_DATA_WRITE_LATENCY) { + /* Convert latency base from nanoseconds to picosecond */ + base = hmat_lb->base * 1000; + } else { + /* Convert bandwidth base from Byte to Megabyte */ + base = hmat_lb->base / MiB; + } + build_append_int_noprefix(table_data, base, 8); + + /* Initiator Proximity Domain List */ + for (i = 0; i < num_initiator; i++) { + build_append_int_noprefix(table_data, initiator_list[i], 4); + } + + /* Target Proximity Domain List */ + for (i = 0; i < num_target; i++) { + build_append_int_noprefix(table_data, i, 4); + } + + /* Latency or Bandwidth Entries */ + entry_list = g_malloc0(num_initiator * num_target * sizeof(uint16_t)); + for (i = 0; i < hmat_lb->list->len; i++) { + lb_data = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); + index = lb_data->initiator * num_target + lb_data->target; + + entry_list[index] = (uint16_t)(lb_data->data / hmat_lb->base); + } + + for (i = 0; i < num_initiator * num_target; i++) { + build_append_int_noprefix(table_data, entry_list[i], 2); + } + + g_free(entry_list); +} + +/* ACPI 6.3: 5.2.27.5 Memory Side Cache Information Structure: Table 5-147 */ +static void build_hmat_cache(GArray *table_data, uint8_t total_levels, + NumaHmatCacheOptions *hmat_cache) +{ + /* + * Cache Attributes: Bits [3:0] – Total Cache Levels + * for this Memory Proximity Domain + */ + uint32_t cache_attr = total_levels; + + /* Bits [7:4] : Cache Level described in this structure */ + cache_attr |= (uint32_t) hmat_cache->level << 4; + + /* Bits [11:8] - Cache Associativity */ + cache_attr |= (uint32_t) hmat_cache->associativity << 8; + + /* Bits [15:12] - Write Policy */ + cache_attr |= (uint32_t) hmat_cache->policy << 12; + + /* Bits [31:16] - Cache Line size in bytes */ + cache_attr |= (uint32_t) hmat_cache->line << 16; + + /* Type */ + build_append_int_noprefix(table_data, 2, 2); + /* Reserved */ + build_append_int_noprefix(table_data, 0, 2); + /* Length */ + build_append_int_noprefix(table_data, 32, 4); + /* Proximity Domain for the Memory */ + build_append_int_noprefix(table_data, hmat_cache->node_id, 4); + /* Reserved */ + build_append_int_noprefix(table_data, 0, 4); + /* Memory Side Cache Size */ + build_append_int_noprefix(table_data, hmat_cache->size, 8); + /* Cache Attributes */ + build_append_int_noprefix(table_data, cache_attr, 4); + /* Reserved */ + build_append_int_noprefix(table_data, 0, 2); + /* + * Number of SMBIOS handles (n) + * Linux kernel uses Memory Side Cache Information Structure + * without SMBIOS entries for now, so set Number of SMBIOS handles + * as 0. + */ + build_append_int_noprefix(table_data, 0, 2); +} + +/* Build HMAT sub table structures */ +static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) +{ + uint16_t flags; + uint32_t num_initiator = 0; + uint32_t initiator_list[MAX_NODES]; + int i, hierarchy, type, cache_level, total_levels; + HMAT_LB_Info *hmat_lb; + NumaHmatCacheOptions *hmat_cache; + + for (i = 0; i < numa_state->num_nodes; i++) { + flags = 0; + + if (numa_state->nodes[i].initiator < MAX_NODES) { + flags |= HMAT_PROXIMITY_INITIATOR_VALID; + } + + build_hmat_mpda(table_data, flags, numa_state->nodes[i].initiator, i); + } + + for (i = 0; i < numa_state->num_nodes; i++) { + if (numa_state->nodes[i].has_cpu) { + initiator_list[num_initiator++] = i; + } + } + + /* + * ACPI 6.3: 5.2.27.4 System Locality Latency and Bandwidth Information + * Structure: Table 5-146 + */ + for (hierarchy = HMAT_LB_MEM_MEMORY; + hierarchy <= HMAT_LB_MEM_CACHE_3RD_LEVEL; hierarchy++) { + for (type = HMAT_LB_DATA_ACCESS_LATENCY; + type <= HMAT_LB_DATA_WRITE_BANDWIDTH; type++) { + hmat_lb = numa_state->hmat_lb[hierarchy][type]; + + if (hmat_lb && hmat_lb->list->len) { + build_hmat_lb(table_data, hmat_lb, num_initiator, + numa_state->num_nodes, initiator_list); + } + } + } + + /* + * ACPI 6.3: 5.2.27.5 Memory Side Cache Information Structure: + * Table 5-147 + */ + for (i = 0; i < numa_state->num_nodes; i++) { + total_levels = 0; + for (cache_level = 1; cache_level < HMAT_LB_LEVELS; cache_level++) { + if (numa_state->hmat_cache[i][cache_level]) { + total_levels++; + } + } + for (cache_level = 0; cache_level <= total_levels; cache_level++) { + hmat_cache = numa_state->hmat_cache[i][cache_level]; + if (hmat_cache) { + build_hmat_cache(table_data, total_levels, hmat_cache); + } + } + } +} + +void build_hmat(GArray *table_data, BIOSLinker *linker, NumaState *numa_state) +{ + int hmat_start = table_data->len; + + /* reserve space for HMAT header */ + acpi_data_push(table_data, 40); + + hmat_build_table_structs(table_data, numa_state); + + build_header(linker, table_data, + (void *)(table_data->data + hmat_start), + "HMAT", table_data->len - hmat_start, 2, NULL, NULL); +} diff --git a/hw/acpi/hmat.h b/hw/acpi/hmat.h new file mode 100644 index 0000000000..437dbc6872 --- /dev/null +++ b/hw/acpi/hmat.h @@ -0,0 +1,42 @@ +/* + * HMAT ACPI Implementation Header + * + * Copyright(C) 2019 Intel Corporation. + * + * Author: + * Liu jingqi <jingqi.liu@linux.intel.com> + * Tao Xu <tao3.xu@intel.com> + * + * HMAT is defined in ACPI 6.3: 5.2.27 Heterogeneous Memory Attribute Table + * (HMAT) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/> + */ + +#ifndef HMAT_H +#define HMAT_H + +#include "hw/acpi/aml-build.h" + +/* + * ACPI 6.3: 5.2.27.3 Memory Proximity Domain Attributes Structure, + * Table 5-145, Field "flag", Bit [0]: set to 1 to indicate that data in + * the Proximity Domain for the Attached Initiator field is valid. + * Other bits reserved. + */ +#define HMAT_PROXIMITY_INITIATOR_VALID 0x1 + +void build_hmat(GArray *table_data, BIOSLinker *linker, NumaState *numa_state); + +#endif diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index d62e6377c2..9bee514c4e 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -764,13 +764,16 @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq) { VirtIOBlockReq *req; MultiReqBuffer mrb = {}; + bool suppress_notifications = virtio_queue_get_notification(vq); bool progress = false; aio_context_acquire(blk_get_aio_context(s->blk)); blk_io_plug(s->blk); do { - virtio_queue_set_notification(vq, 0); + if (suppress_notifications) { + virtio_queue_set_notification(vq, 0); + } while ((req = virtio_blk_get_request(s, vq))) { progress = true; @@ -781,7 +784,9 @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq) } } - virtio_queue_set_notification(vq, 1); + if (suppress_notifications) { + virtio_queue_set_notification(vq, 1); + } } while (!virtio_queue_empty(vq)); if (mrb.num_reqs) { @@ -908,7 +913,8 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config) blk_get_geometry(s->blk, &capacity); memset(&blkcfg, 0, sizeof(blkcfg)); virtio_stq_p(vdev, &blkcfg.capacity, capacity); - virtio_stl_p(vdev, &blkcfg.seg_max, 128 - 2); + virtio_stl_p(vdev, &blkcfg.seg_max, + s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 128 - 2); virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls); virtio_stl_p(vdev, &blkcfg.blk_size, blk_size); virtio_stw_p(vdev, &blkcfg.min_io_size, conf->min_io_size / blk_size); @@ -1133,6 +1139,11 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp) error_setg(errp, "num-queues property must be larger than 0"); return; } + if (conf->queue_size <= 2) { + error_setg(errp, "invalid queue-size property (%" PRIu16 "), " + "must be > 2", conf->queue_size); + return; + } if (!is_power_of_2(conf->queue_size) || conf->queue_size > VIRTQUEUE_MAX_SIZE) { error_setg(errp, "invalid queue-size property (%" PRIu16 "), " @@ -1262,6 +1273,7 @@ static Property virtio_blk_properties[] = { true), DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1), DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 128), + DEFINE_PROP_BOOL("seg-max-adjust", VirtIOBlock, conf.seg_max_adjust, true), DEFINE_PROP_LINK("iothread", VirtIOBlock, conf.iothread, TYPE_IOTHREAD, IOThread *), DEFINE_PROP_BIT64("discard", VirtIOBlock, host_features, diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c index 33259042a9..e1cbce3ba3 100644 --- a/hw/char/virtio-serial-bus.c +++ b/hw/char/virtio-serial-bus.c @@ -1126,9 +1126,17 @@ static void virtio_serial_device_unrealize(DeviceState *dev, Error **errp) { VirtIODevice *vdev = VIRTIO_DEVICE(dev); VirtIOSerial *vser = VIRTIO_SERIAL(dev); + int i; QLIST_REMOVE(vser, next); + virtio_delete_queue(vser->c_ivq); + virtio_delete_queue(vser->c_ovq); + for (i = 0; i < vser->bus.max_nr_ports; i++) { + virtio_delete_queue(vser->ivqs[i]); + virtio_delete_queue(vser->ovqs[i]); + } + g_free(vser->ivqs); g_free(vser->ovqs); g_free(vser->ports_map); diff --git a/hw/core/machine.c b/hw/core/machine.c index 73bf1f8572..4f30fb5646 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -29,11 +29,15 @@ GlobalProperty hw_compat_4_2[] = { { "virtio-blk-device", "x-enable-wce-if-config-wce", "off" }, + { "virtio-blk-device", "seg-max-adjust", "off"}, + { "virtio-scsi-device", "seg_max_adjust", "off"}, + { "vhost-blk-device", "seg_max_adjust", "off"}, }; const size_t hw_compat_4_2_len = G_N_ELEMENTS(hw_compat_4_2); GlobalProperty hw_compat_4_1[] = { { "virtio-pci", "x-pcie-flr-init", "off" }, + { "virtio-device", "use-disabled-flag", "false" }, }; const size_t hw_compat_4_1_len = G_N_ELEMENTS(hw_compat_4_1); @@ -429,6 +433,20 @@ static void machine_set_nvdimm(Object *obj, bool value, Error **errp) ms->nvdimms_state->is_enabled = value; } +static bool machine_get_hmat(Object *obj, Error **errp) +{ + MachineState *ms = MACHINE(obj); + + return ms->numa_state->hmat_enabled; +} + +static void machine_set_hmat(Object *obj, bool value, Error **errp) +{ + MachineState *ms = MACHINE(obj); + + ms->numa_state->hmat_enabled = value; +} + static char *machine_get_nvdimm_persistence(Object *obj, Error **errp) { MachineState *ms = MACHINE(obj); @@ -556,6 +574,7 @@ void machine_set_cpu_numa_node(MachineState *machine, const CpuInstanceProperties *props, Error **errp) { MachineClass *mc = MACHINE_GET_CLASS(machine); + NodeInfo *numa_info = machine->numa_state->nodes; bool match = false; int i; @@ -625,6 +644,17 @@ void machine_set_cpu_numa_node(MachineState *machine, match = true; slot->props.node_id = props->node_id; slot->props.has_node_id = props->has_node_id; + + if (machine->numa_state->hmat_enabled) { + if ((numa_info[props->node_id].initiator < MAX_NODES) && + (props->node_id != numa_info[props->node_id].initiator)) { + error_setg(errp, "The initiator of CPU NUMA node %" PRId64 + " should be itself", props->node_id); + return; + } + numa_info[props->node_id].has_cpu = true; + numa_info[props->node_id].initiator = props->node_id; + } } if (!match) { @@ -845,6 +875,13 @@ static void machine_initfn(Object *obj) if (mc->cpu_index_to_instance_props && mc->get_default_cpu_node_id) { ms->numa_state = g_new0(NumaState, 1); + object_property_add_bool(obj, "hmat", + machine_get_hmat, machine_set_hmat, + &error_abort); + object_property_set_description(obj, "hmat", + "Set on/off to enable/disable " + "ACPI Heterogeneous Memory Attribute " + "Table (HMAT)", NULL); } /* Register notifier when init is done for sysbus sanity checks */ @@ -912,6 +949,32 @@ static char *cpu_slot_to_string(const CPUArchId *cpu) return g_string_free(s, false); } +static void numa_validate_initiator(NumaState *numa_state) +{ + int i; + NodeInfo *numa_info = numa_state->nodes; + + for (i = 0; i < numa_state->num_nodes; i++) { + if (numa_info[i].initiator == MAX_NODES) { + error_report("The initiator of NUMA node %d is missing, use " + "'-numa node,initiator' option to declare it", i); + exit(1); + } + + if (!numa_info[numa_info[i].initiator].present) { + error_report("NUMA node %" PRIu16 " is missing, use " + "'-numa node' option to declare it first", + numa_info[i].initiator); + exit(1); + } + + if (!numa_info[numa_info[i].initiator].has_cpu) { + error_report("The initiator of NUMA node %d is invalid", i); + exit(1); + } + } +} + static void machine_numa_finish_cpu_init(MachineState *machine) { int i; @@ -952,6 +1015,11 @@ static void machine_numa_finish_cpu_init(MachineState *machine) machine_set_cpu_numa_node(machine, &props, &error_fatal); } } + + if (machine->numa_state->hmat_enabled) { + numa_validate_initiator(machine->numa_state); + } + if (s->len && !qtest_enabled()) { warn_report("CPU(s) not present in any NUMA nodes: %s", s->str); diff --git a/hw/core/numa.c b/hw/core/numa.c index 19f082de12..0d1b4be76a 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -23,6 +23,7 @@ */ #include "qemu/osdep.h" +#include "qemu/units.h" #include "sysemu/hostmem.h" #include "sysemu/numa.h" #include "sysemu/sysemu.h" @@ -129,6 +130,29 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node, numa_info[nodenr].node_mem = object_property_get_uint(o, "size", NULL); numa_info[nodenr].node_memdev = MEMORY_BACKEND(o); } + + /* + * If not set the initiator, set it to MAX_NODES. And if + * HMAT is enabled and this node has no cpus, QEMU will raise error. + */ + numa_info[nodenr].initiator = MAX_NODES; + if (node->has_initiator) { + if (!ms->numa_state->hmat_enabled) { + error_setg(errp, "ACPI Heterogeneous Memory Attribute Table " + "(HMAT) is disabled, enable it with -machine hmat=on " + "before using any of hmat specific options"); + return; + } + + if (node->initiator >= MAX_NODES) { + error_report("The initiator id %" PRIu16 " expects an integer " + "between 0 and %d", node->initiator, + MAX_NODES - 1); + return; + } + + numa_info[nodenr].initiator = node->initiator; + } numa_info[nodenr].present = true; max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1); ms->numa_state->num_nodes++; @@ -171,6 +195,253 @@ void parse_numa_distance(MachineState *ms, NumaDistOptions *dist, Error **errp) ms->numa_state->have_numa_distance = true; } +void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, + Error **errp) +{ + int i, first_bit, last_bit; + uint64_t max_entry, temp_base, bitmap_copy; + NodeInfo *numa_info = numa_state->nodes; + HMAT_LB_Info *hmat_lb = + numa_state->hmat_lb[node->hierarchy][node->data_type]; + HMAT_LB_Data lb_data = {}; + HMAT_LB_Data *lb_temp; + + /* Error checking */ + if (node->initiator > numa_state->num_nodes) { + error_setg(errp, "Invalid initiator=%d, it should be less than %d", + node->initiator, numa_state->num_nodes); + return; + } + if (node->target > numa_state->num_nodes) { + error_setg(errp, "Invalid target=%d, it should be less than %d", + node->target, numa_state->num_nodes); + return; + } + if (!numa_info[node->initiator].has_cpu) { + error_setg(errp, "Invalid initiator=%d, it isn't an " + "initiator proximity domain", node->initiator); + return; + } + if (!numa_info[node->target].present) { + error_setg(errp, "The target=%d should point to an existing node", + node->target); + return; + } + + if (!hmat_lb) { + hmat_lb = g_malloc0(sizeof(*hmat_lb)); + numa_state->hmat_lb[node->hierarchy][node->data_type] = hmat_lb; + hmat_lb->list = g_array_new(false, true, sizeof(HMAT_LB_Data)); + } + hmat_lb->hierarchy = node->hierarchy; + hmat_lb->data_type = node->data_type; + lb_data.initiator = node->initiator; + lb_data.target = node->target; + + if (node->data_type <= HMATLB_DATA_TYPE_WRITE_LATENCY) { + /* Input latency data */ + + if (!node->has_latency) { + error_setg(errp, "Missing 'latency' option"); + return; + } + if (node->has_bandwidth) { + error_setg(errp, "Invalid option 'bandwidth' since " + "the data type is latency"); + return; + } + + /* Detect duplicate configuration */ + for (i = 0; i < hmat_lb->list->len; i++) { + lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); + + if (node->initiator == lb_temp->initiator && + node->target == lb_temp->target) { + error_setg(errp, "Duplicate configuration of the latency for " + "initiator=%d and target=%d", node->initiator, + node->target); + return; + } + } + + hmat_lb->base = hmat_lb->base ? hmat_lb->base : UINT64_MAX; + + if (node->latency) { + /* Calculate the temporary base and compressed latency */ + max_entry = node->latency; + temp_base = 1; + while (QEMU_IS_ALIGNED(max_entry, 10)) { + max_entry /= 10; + temp_base *= 10; + } + + /* Calculate the max compressed latency */ + temp_base = MIN(hmat_lb->base, temp_base); + max_entry = node->latency / hmat_lb->base; + max_entry = MAX(hmat_lb->range_bitmap, max_entry); + + /* + * For latency hmat_lb->range_bitmap record the max compressed + * latency which should be less than 0xFFFF (UINT16_MAX) + */ + if (max_entry >= UINT16_MAX) { + error_setg(errp, "Latency %" PRIu64 " between initiator=%d and " + "target=%d should not differ from previously entered " + "min or max values on more than %d", node->latency, + node->initiator, node->target, UINT16_MAX - 1); + return; + } else { + hmat_lb->base = temp_base; + hmat_lb->range_bitmap = max_entry; + } + + /* + * Set lb_info_provided bit 0 as 1, + * latency information is provided + */ + numa_info[node->target].lb_info_provided |= BIT(0); + } + lb_data.data = node->latency; + } else if (node->data_type >= HMATLB_DATA_TYPE_ACCESS_BANDWIDTH) { + /* Input bandwidth data */ + if (!node->has_bandwidth) { + error_setg(errp, "Missing 'bandwidth' option"); + return; + } + if (node->has_latency) { + error_setg(errp, "Invalid option 'latency' since " + "the data type is bandwidth"); + return; + } + if (!QEMU_IS_ALIGNED(node->bandwidth, MiB)) { + error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d and " + "target=%d should be 1MB aligned", node->bandwidth, + node->initiator, node->target); + return; + } + + /* Detect duplicate configuration */ + for (i = 0; i < hmat_lb->list->len; i++) { + lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); + + if (node->initiator == lb_temp->initiator && + node->target == lb_temp->target) { + error_setg(errp, "Duplicate configuration of the bandwidth for " + "initiator=%d and target=%d", node->initiator, + node->target); + return; + } + } + + hmat_lb->base = hmat_lb->base ? hmat_lb->base : 1; + + if (node->bandwidth) { + /* Keep bitmap unchanged when bandwidth out of range */ + bitmap_copy = hmat_lb->range_bitmap; + bitmap_copy |= node->bandwidth; + first_bit = ctz64(bitmap_copy); + temp_base = UINT64_C(1) << first_bit; + max_entry = node->bandwidth / temp_base; + last_bit = 64 - clz64(bitmap_copy); + + /* + * For bandwidth, first_bit record the base unit of bandwidth bits, + * last_bit record the last bit of the max bandwidth. The max + * compressed bandwidth should be less than 0xFFFF (UINT16_MAX) + */ + if ((last_bit - first_bit) > UINT16_BITS || + max_entry >= UINT16_MAX) { + error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d " + "and target=%d should not differ from previously " + "entered values on more than %d", node->bandwidth, + node->initiator, node->target, UINT16_MAX - 1); + return; + } else { + hmat_lb->base = temp_base; + hmat_lb->range_bitmap = bitmap_copy; + } + + /* + * Set lb_info_provided bit 1 as 1, + * bandwidth information is provided + */ + numa_info[node->target].lb_info_provided |= BIT(1); + } + lb_data.data = node->bandwidth; + } else { + assert(0); + } + + g_array_append_val(hmat_lb->list, lb_data); +} + +void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node, + Error **errp) +{ + int nb_numa_nodes = ms->numa_state->num_nodes; + NodeInfo *numa_info = ms->numa_state->nodes; + NumaHmatCacheOptions *hmat_cache = NULL; + + if (node->node_id >= nb_numa_nodes) { + error_setg(errp, "Invalid node-id=%" PRIu32 ", it should be less " + "than %d", node->node_id, nb_numa_nodes); + return; + } + + if (numa_info[node->node_id].lb_info_provided != (BIT(0) | BIT(1))) { + error_setg(errp, "The latency and bandwidth information of " + "node-id=%" PRIu32 " should be provided before memory side " + "cache attributes", node->node_id); + return; + } + + if (node->level < 1 || node->level >= HMAT_LB_LEVELS) { + error_setg(errp, "Invalid level=%" PRIu8 ", it should be larger than 0 " + "and less than or equal to %d", node->level, + HMAT_LB_LEVELS - 1); + return; + } + + assert(node->associativity < HMAT_CACHE_ASSOCIATIVITY__MAX); + assert(node->policy < HMAT_CACHE_WRITE_POLICY__MAX); + if (ms->numa_state->hmat_cache[node->node_id][node->level]) { + error_setg(errp, "Duplicate configuration of the side cache for " + "node-id=%" PRIu32 " and level=%" PRIu8, + node->node_id, node->level); + return; + } + + if ((node->level > 1) && + ms->numa_state->hmat_cache[node->node_id][node->level - 1] && + (node->size >= + ms->numa_state->hmat_cache[node->node_id][node->level - 1]->size)) { + error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8 + " should be less than the size(%" PRIu64 ") of " + "level=%u", node->size, node->level, + ms->numa_state->hmat_cache[node->node_id] + [node->level - 1]->size, + node->level - 1); + return; + } + + if ((node->level < HMAT_LB_LEVELS - 1) && + ms->numa_state->hmat_cache[node->node_id][node->level + 1] && + (node->size <= + ms->numa_state->hmat_cache[node->node_id][node->level + 1]->size)) { + error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8 + " should be larger than the size(%" PRIu64 ") of " + "level=%u", node->size, node->level, + ms->numa_state->hmat_cache[node->node_id] + [node->level + 1]->size, + node->level + 1); + return; + } + + hmat_cache = g_malloc0(sizeof(*hmat_cache)); + memcpy(hmat_cache, node, sizeof(*hmat_cache)); + ms->numa_state->hmat_cache[node->node_id][node->level] = hmat_cache; +} + void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp) { Error *err = NULL; @@ -208,6 +479,32 @@ void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp) machine_set_cpu_numa_node(ms, qapi_NumaCpuOptions_base(&object->u.cpu), &err); break; + case NUMA_OPTIONS_TYPE_HMAT_LB: + if (!ms->numa_state->hmat_enabled) { + error_setg(errp, "ACPI Heterogeneous Memory Attribute Table " + "(HMAT) is disabled, enable it with -machine hmat=on " + "before using any of hmat specific options"); + return; + } + + parse_numa_hmat_lb(ms->numa_state, &object->u.hmat_lb, &err); + if (err) { + goto end; + } + break; + case NUMA_OPTIONS_TYPE_HMAT_CACHE: + if (!ms->numa_state->hmat_enabled) { + error_setg(errp, "ACPI Heterogeneous Memory Attribute Table " + "(HMAT) is disabled, enable it with -machine hmat=on " + "before using any of hmat specific options"); + return; + } + + parse_numa_hmat_cache(ms, &object->u.hmat_cache, &err); + if (err) { + goto end; + } + break; default: abort(); } diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 7b8da62d41..e25df838f0 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -68,6 +68,7 @@ #include "hw/i386/intel_iommu.h" #include "hw/acpi/ipmi.h" +#include "hw/acpi/hmat.h" /* These are used to size the ACPI tables for -M pc-i440fx-1.7 and * -M pc-i440fx-2.0. Even if the actual amount of AML generated grows @@ -2835,6 +2836,10 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine) acpi_add_table(table_offsets, tables_blob); build_slit(tables_blob, tables->linker, machine); } + if (machine->numa_state->hmat_enabled) { + acpi_add_table(table_offsets, tables_blob); + build_hmat(tables_blob, tables->linker, machine->numa_state); + } } if (acpi_get_mcfg(&mcfg)) { acpi_add_table(table_offsets, tables_blob); diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 43c94b993b..a523ef0e65 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -686,9 +686,18 @@ static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu, return true; } -static int vtd_get_pasid_dire(dma_addr_t pasid_dir_base, - uint32_t pasid, - VTDPASIDDirEntry *pdire) +static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire) +{ + return pdire->val & 1; +} + +/** + * Caller of this function should check present bit if wants + * to use pdir entry for futher usage except for fpd bit check. + */ +static int vtd_get_pdire_from_pdir_table(dma_addr_t pasid_dir_base, + uint32_t pasid, + VTDPASIDDirEntry *pdire) { uint32_t index; dma_addr_t addr, entry_size; @@ -703,18 +712,22 @@ static int vtd_get_pasid_dire(dma_addr_t pasid_dir_base, return 0; } -static int vtd_get_pasid_entry(IntelIOMMUState *s, - uint32_t pasid, - VTDPASIDDirEntry *pdire, - VTDPASIDEntry *pe) +static inline bool vtd_pe_present(VTDPASIDEntry *pe) +{ + return pe->val[0] & VTD_PASID_ENTRY_P; +} + +static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState *s, + uint32_t pasid, + dma_addr_t addr, + VTDPASIDEntry *pe) { uint32_t index; - dma_addr_t addr, entry_size; + dma_addr_t entry_size; X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); index = VTD_PASID_TABLE_INDEX(pasid); entry_size = VTD_PASID_ENTRY_SIZE; - addr = pdire->val & VTD_PASID_TABLE_BASE_ADDR_MASK; addr = addr + index * entry_size; if (dma_memory_read(&address_space_memory, addr, pe, entry_size)) { return -VTD_FR_PASID_TABLE_INV; @@ -732,25 +745,54 @@ static int vtd_get_pasid_entry(IntelIOMMUState *s, return 0; } -static int vtd_get_pasid_entry_from_pasid(IntelIOMMUState *s, - dma_addr_t pasid_dir_base, - uint32_t pasid, - VTDPASIDEntry *pe) +/** + * Caller of this function should check present bit if wants + * to use pasid entry for futher usage except for fpd bit check. + */ +static int vtd_get_pe_from_pdire(IntelIOMMUState *s, + uint32_t pasid, + VTDPASIDDirEntry *pdire, + VTDPASIDEntry *pe) +{ + dma_addr_t addr = pdire->val & VTD_PASID_TABLE_BASE_ADDR_MASK; + + return vtd_get_pe_in_pasid_leaf_table(s, pasid, addr, pe); +} + +/** + * This function gets a pasid entry from a specified pasid + * table (includes dir and leaf table) with a specified pasid. + * Sanity check should be done to ensure return a present + * pasid entry to caller. + */ +static int vtd_get_pe_from_pasid_table(IntelIOMMUState *s, + dma_addr_t pasid_dir_base, + uint32_t pasid, + VTDPASIDEntry *pe) { int ret; VTDPASIDDirEntry pdire; - ret = vtd_get_pasid_dire(pasid_dir_base, pasid, &pdire); + ret = vtd_get_pdire_from_pdir_table(pasid_dir_base, + pasid, &pdire); if (ret) { return ret; } - ret = vtd_get_pasid_entry(s, pasid, &pdire, pe); + if (!vtd_pdire_present(&pdire)) { + return -VTD_FR_PASID_TABLE_INV; + } + + ret = vtd_get_pe_from_pdire(s, pasid, &pdire, pe); if (ret) { return ret; } - return ret; + if (!vtd_pe_present(pe)) { + return -VTD_FR_PASID_TABLE_INV; + } + + return 0; } static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s, @@ -763,7 +805,7 @@ static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s, pasid = VTD_CE_GET_RID2PASID(ce); pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce); - ret = vtd_get_pasid_entry_from_pasid(s, pasid_dir_base, pasid, pe); + ret = vtd_get_pe_from_pasid_table(s, pasid_dir_base, pasid, pe); return ret; } @@ -781,7 +823,11 @@ static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s, pasid = VTD_CE_GET_RID2PASID(ce); pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce); - ret = vtd_get_pasid_dire(pasid_dir_base, pasid, &pdire); + /* + * No present bit check since fpd is meaningful even + * if the present bit is clear. + */ + ret = vtd_get_pdire_from_pdir_table(pasid_dir_base, pasid, &pdire); if (ret) { return ret; } @@ -791,7 +837,15 @@ static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s, return 0; } - ret = vtd_get_pasid_entry(s, pasid, &pdire, &pe); + if (!vtd_pdire_present(&pdire)) { + return -VTD_FR_PASID_TABLE_INV; + } + + /* + * No present bit check since fpd is meaningful even + * if the present bit is clear. + */ + ret = vtd_get_pe_from_pdire(s, pasid, &pdire, &pe); if (ret) { return ret; } @@ -948,6 +1002,7 @@ static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) return vtd_bus; } } + vtd_bus = NULL; } return vtd_bus; } @@ -2610,16 +2665,15 @@ static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) switch (addr) { /* Root Table Address Register, 64-bit */ case DMAR_RTADDR_REG: + val = vtd_get_quad_raw(s, DMAR_RTADDR_REG); if (size == 4) { - val = s->root & ((1ULL << 32) - 1); - } else { - val = s->root; + val = val & ((1ULL << 32) - 1); } break; case DMAR_RTADDR_REG_HI: assert(size == 4); - val = s->root >> 32; + val = vtd_get_quad_raw(s, DMAR_RTADDR_REG) >> 32; break; /* Invalidation Queue Address Register, 64-bit */ diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index edcf9fc9bb..862033ebe6 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -479,6 +479,7 @@ typedef struct VTDRootEntry VTDRootEntry; #define VTD_PASID_ENTRY_FPD (1ULL << 1) /* Fault Processing Disable */ /* PASID Granular Translation Type Mask */ +#define VTD_PASID_ENTRY_P 1ULL #define VTD_SM_PASID_ENTRY_PGTT (7ULL << 6) #define VTD_SM_PASID_ENTRY_FLT (1ULL << 6) #define VTD_SM_PASID_ENTRY_SLT (2ULL << 6) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 721c7aa64e..fa12203079 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -425,7 +425,6 @@ static void pc_i440fx_5_0_machine_options(MachineClass *m) m->alias = "pc"; m->is_default = 1; pcmc->default_cpu_version = 1; - compat_props_add(m->compat_props, hw_compat_4_2, hw_compat_4_2_len); } DEFINE_I440FX_MACHINE(v5_0, "pc-i440fx-5.0", NULL, diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 52f45735e4..84cf925cf4 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -354,7 +354,6 @@ static void pc_q35_5_0_machine_options(MachineClass *m) pc_q35_machine_options(m); m->alias = "q35"; pcmc->default_cpu_version = 1; - compat_props_add(m->compat_props, hw_compat_4_2, hw_compat_4_2_len); } DEFINE_Q35_MACHINE(v5_0, "pc-q35-5.0", NULL, diff --git a/hw/input/virtio-input.c b/hw/input/virtio-input.c index ec54e46ad6..9c013afddb 100644 --- a/hw/input/virtio-input.c +++ b/hw/input/virtio-input.c @@ -280,6 +280,7 @@ static void virtio_input_device_unrealize(DeviceState *dev, Error **errp) { VirtIOInputClass *vic = VIRTIO_INPUT_GET_CLASS(dev); VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtIOInput *vinput = VIRTIO_INPUT(dev); Error *local_err = NULL; if (vic->unrealize) { @@ -289,8 +290,8 @@ static void virtio_input_device_unrealize(DeviceState *dev, Error **errp) return; } } - virtio_del_queue(vdev, 0); - virtio_del_queue(vdev, 1); + virtio_delete_queue(vinput->evt); + virtio_delete_queue(vinput->sts); virtio_cleanup(vdev); } diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 777d62d3c8..d7d3ad6dc7 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -3102,7 +3102,8 @@ static void virtio_net_device_unrealize(DeviceState *dev, Error **errp) for (i = 0; i < max_queues; i++) { virtio_net_del_queue(n, i); } - + /* delete also control vq */ + virtio_del_queue(vdev, max_queues * 2); qemu_announce_timer_del(&n->announce_timer, false); g_free(n->vqs); qemu_del_nic(n->nic); diff --git a/hw/pci/pci_host.c b/hw/pci/pci_host.c index c5f9244934..ce7bcdb1d5 100644 --- a/hw/pci/pci_host.c +++ b/hw/pci/pci_host.c @@ -106,7 +106,7 @@ uint32_t pci_host_config_read_common(PCIDevice *pci_dev, uint32_t addr, return ret; } -void pci_data_write(PCIBus *s, uint32_t addr, uint32_t val, int len) +void pci_data_write(PCIBus *s, uint32_t addr, uint32_t val, unsigned len) { PCIDevice *pci_dev = pci_dev_find_by_addr(s, addr); uint32_t config_addr = addr & (PCI_CONFIG_SPACE_SIZE - 1); @@ -115,28 +115,21 @@ void pci_data_write(PCIBus *s, uint32_t addr, uint32_t val, int len) return; } - PCI_DPRINTF("%s: %s: addr=%02" PRIx32 " val=%08" PRIx32 " len=%d\n", - __func__, pci_dev->name, config_addr, val, len); pci_host_config_write_common(pci_dev, config_addr, PCI_CONFIG_SPACE_SIZE, val, len); } -uint32_t pci_data_read(PCIBus *s, uint32_t addr, int len) +uint32_t pci_data_read(PCIBus *s, uint32_t addr, unsigned len) { PCIDevice *pci_dev = pci_dev_find_by_addr(s, addr); uint32_t config_addr = addr & (PCI_CONFIG_SPACE_SIZE - 1); - uint32_t val; if (!pci_dev) { return ~0x0; } - val = pci_host_config_read_common(pci_dev, config_addr, - PCI_CONFIG_SPACE_SIZE, len); - PCI_DPRINTF("%s: %s: addr=%02"PRIx32" val=%08"PRIx32" len=%d\n", - __func__, pci_dev->name, config_addr, val, len); - - return val; + return pci_host_config_read_common(pci_dev, config_addr, + PCI_CONFIG_SPACE_SIZE, len); } static void pci_host_config_write(void *opaque, hwaddr addr, @@ -167,8 +160,7 @@ static void pci_host_data_write(void *opaque, hwaddr addr, uint64_t val, unsigned len) { PCIHostState *s = opaque; - PCI_DPRINTF("write addr " TARGET_FMT_plx " len %d val %x\n", - addr, len, (unsigned)val); + if (s->config_reg & (1u << 31)) pci_data_write(s->bus, s->config_reg | (addr & 3), val, len); } @@ -177,14 +169,11 @@ static uint64_t pci_host_data_read(void *opaque, hwaddr addr, unsigned len) { PCIHostState *s = opaque; - uint32_t val; + if (!(s->config_reg & (1U << 31))) { return 0xffffffff; } - val = pci_data_read(s->bus, s->config_reg | (addr & 3), len); - PCI_DPRINTF("read addr " TARGET_FMT_plx " len %d val %x\n", - addr, len, val); - return val; + return pci_data_read(s->bus, s->config_reg | (addr & 3), len); } const MemoryRegionOps pci_host_conf_le_ops = { diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c index c693fc748a..26f710d3ec 100644 --- a/hw/scsi/vhost-scsi.c +++ b/hw/scsi/vhost-scsi.c @@ -275,6 +275,8 @@ static Property vhost_scsi_properties[] = { DEFINE_PROP_UINT32("num_queues", VirtIOSCSICommon, conf.num_queues, 1), DEFINE_PROP_UINT32("virtqueue_size", VirtIOSCSICommon, conf.virtqueue_size, 128), + DEFINE_PROP_BOOL("seg_max_adjust", VirtIOSCSICommon, conf.seg_max_adjust, + true), DEFINE_PROP_UINT32("max_sectors", VirtIOSCSICommon, conf.max_sectors, 0xFFFF), DEFINE_PROP_UINT32("cmd_per_lun", VirtIOSCSICommon, conf.cmd_per_lun, 128), diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c index 6a6c15dd32..23f972df59 100644 --- a/hw/scsi/vhost-user-scsi.c +++ b/hw/scsi/vhost-user-scsi.c @@ -39,6 +39,10 @@ static const int user_feature_bits[] = { VHOST_INVALID_FEATURE_BIT }; +enum VhostUserProtocolFeature { + VHOST_USER_PROTOCOL_F_RESET_DEVICE = 13, +}; + static void vhost_user_scsi_set_status(VirtIODevice *vdev, uint8_t status) { VHostUserSCSI *s = (VHostUserSCSI *)vdev; @@ -62,6 +66,25 @@ static void vhost_user_scsi_set_status(VirtIODevice *vdev, uint8_t status) } } +static void vhost_user_scsi_reset(VirtIODevice *vdev) +{ + VHostSCSICommon *vsc = VHOST_SCSI_COMMON(vdev); + struct vhost_dev *dev = &vsc->dev; + + /* + * Historically, reset was not implemented so only reset devices + * that are expecting it. + */ + if (!virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_RESET_DEVICE)) { + return; + } + + if (dev->vhost_ops->vhost_reset_device) { + dev->vhost_ops->vhost_reset_device(dev); + } +} + static void vhost_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq) { } @@ -182,6 +205,7 @@ static void vhost_user_scsi_class_init(ObjectClass *klass, void *data) vdc->get_features = vhost_scsi_common_get_features; vdc->set_config = vhost_scsi_common_set_config; vdc->set_status = vhost_user_scsi_set_status; + vdc->reset = vhost_user_scsi_reset; fwc->get_dev_path = vhost_scsi_common_get_fw_dev_path; } diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index e8b2b64d09..4bc73a370e 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -597,12 +597,15 @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq) { VirtIOSCSIReq *req, *next; int ret = 0; + bool suppress_notifications = virtio_queue_get_notification(vq); bool progress = false; QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs); do { - virtio_queue_set_notification(vq, 0); + if (suppress_notifications) { + virtio_queue_set_notification(vq, 0); + } while ((req = virtio_scsi_pop_req(s, vq))) { progress = true; @@ -622,7 +625,9 @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq) } } - virtio_queue_set_notification(vq, 1); + if (suppress_notifications) { + virtio_queue_set_notification(vq, 1); + } } while (ret != -EINVAL && !virtio_queue_empty(vq)); QTAILQ_FOREACH_SAFE(req, &reqs, next, next) { @@ -654,7 +659,8 @@ static void virtio_scsi_get_config(VirtIODevice *vdev, VirtIOSCSICommon *s = VIRTIO_SCSI_COMMON(vdev); virtio_stl_p(vdev, &scsiconf->num_queues, s->conf.num_queues); - virtio_stl_p(vdev, &scsiconf->seg_max, 128 - 2); + virtio_stl_p(vdev, &scsiconf->seg_max, + s->conf.seg_max_adjust ? s->conf.virtqueue_size - 2 : 128 - 2); virtio_stl_p(vdev, &scsiconf->max_sectors, s->conf.max_sectors); virtio_stl_p(vdev, &scsiconf->cmd_per_lun, s->conf.cmd_per_lun); virtio_stl_p(vdev, &scsiconf->event_info_size, sizeof(VirtIOSCSIEvent)); @@ -893,6 +899,11 @@ void virtio_scsi_common_realize(DeviceState *dev, virtio_cleanup(vdev); return; } + if (s->conf.virtqueue_size <= 2) { + error_setg(errp, "invalid virtqueue_size property (= %" PRIu32 "), " + "must be > 2", s->conf.virtqueue_size); + return; + } s->cmd_vqs = g_new0(VirtQueue *, s->conf.num_queues); s->sense_size = VIRTIO_SCSI_SENSE_DEFAULT_SIZE; s->cdb_size = VIRTIO_SCSI_CDB_DEFAULT_SIZE; @@ -949,6 +960,8 @@ static Property virtio_scsi_properties[] = { DEFINE_PROP_UINT32("num_queues", VirtIOSCSI, parent_obj.conf.num_queues, 1), DEFINE_PROP_UINT32("virtqueue_size", VirtIOSCSI, parent_obj.conf.virtqueue_size, 128), + DEFINE_PROP_BOOL("seg_max_adjust", VirtIOSCSI, + parent_obj.conf.seg_max_adjust, true), DEFINE_PROP_UINT32("max_sectors", VirtIOSCSI, parent_obj.conf.max_sectors, 0xFFFF), DEFINE_PROP_UINT32("cmd_per_lun", VirtIOSCSI, parent_obj.conf.cmd_per_lun, diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 02a9b25199..d27a10fcc6 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -58,6 +58,7 @@ enum VhostUserProtocolFeature { VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10, VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12, + VHOST_USER_PROTOCOL_F_RESET_DEVICE = 13, VHOST_USER_PROTOCOL_F_MAX }; @@ -98,6 +99,7 @@ typedef enum VhostUserRequest { VHOST_USER_GET_INFLIGHT_FD = 31, VHOST_USER_SET_INFLIGHT_FD = 32, VHOST_USER_GPU_SET_SOCKET = 33, + VHOST_USER_RESET_DEVICE = 34, VHOST_USER_MAX } VhostUserRequest; @@ -890,10 +892,14 @@ static int vhost_user_set_owner(struct vhost_dev *dev) static int vhost_user_reset_device(struct vhost_dev *dev) { VhostUserMsg msg = { - .hdr.request = VHOST_USER_RESET_OWNER, .hdr.flags = VHOST_USER_VERSION, }; + msg.hdr.request = virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_RESET_DEVICE) + ? VHOST_USER_RESET_DEVICE + : VHOST_USER_RESET_OWNER; + if (vhost_user_write(dev, &msg, NULL, 0) < 0) { return -1; } diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 40b04f5180..57f3b9f22d 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -831,6 +831,13 @@ static void virtio_balloon_device_unrealize(DeviceState *dev, Error **errp) } balloon_stats_destroy_timer(s); qemu_remove_balloon_handler(s); + + virtio_delete_queue(s->ivq); + virtio_delete_queue(s->dvq); + virtio_delete_queue(s->svq); + if (s->free_page_vq) { + virtio_delete_queue(s->free_page_vq); + } virtio_cleanup(vdev); } diff --git a/hw/virtio/virtio-mmio.c b/hw/virtio/virtio-mmio.c index 94d934c44b..872f2cd237 100644 --- a/hw/virtio/virtio-mmio.c +++ b/hw/virtio/virtio-mmio.c @@ -65,6 +65,19 @@ static void virtio_mmio_stop_ioeventfd(VirtIOMMIOProxy *proxy) virtio_bus_stop_ioeventfd(&proxy->bus); } +static void virtio_mmio_soft_reset(VirtIOMMIOProxy *proxy) +{ + int i; + + if (proxy->legacy) { + return; + } + + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { + proxy->vqs[i].enabled = 0; + } +} + static uint64_t virtio_mmio_read(void *opaque, hwaddr offset, unsigned size) { VirtIOMMIOProxy *proxy = (VirtIOMMIOProxy *)opaque; @@ -295,8 +308,9 @@ static void virtio_mmio_write(void *opaque, hwaddr offset, uint64_t value, break; case VIRTIO_MMIO_QUEUE_NUM: trace_virtio_mmio_queue_write(value, VIRTQUEUE_MAX_SIZE); + virtio_queue_set_num(vdev, vdev->queue_sel, value); + if (proxy->legacy) { - virtio_queue_set_num(vdev, vdev->queue_sel, value); virtio_queue_update_rings(vdev, vdev->queue_sel); } else { proxy->vqs[vdev->queue_sel].num = value; @@ -378,6 +392,7 @@ static void virtio_mmio_write(void *opaque, hwaddr offset, uint64_t value, if (vdev->status == 0) { virtio_reset(vdev); + virtio_mmio_soft_reset(proxy); } break; case VIRTIO_MMIO_QUEUE_DESC_LOW: diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index c6b47a9c73..f723b9f631 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -608,10 +608,14 @@ static void virtio_write_config(PCIDevice *pci_dev, uint32_t address, pcie_cap_flr_write_config(pci_dev, address, val, len); } - if (range_covers_byte(address, len, PCI_COMMAND) && - !(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) { - virtio_pci_stop_ioeventfd(proxy); - virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK); + if (range_covers_byte(address, len, PCI_COMMAND)) { + if (!(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) { + virtio_set_disabled(vdev, true); + virtio_pci_stop_ioeventfd(proxy); + virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK); + } else { + virtio_set_disabled(vdev, false); + } } if (proxy->config_cap && @@ -1256,6 +1260,8 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr, break; case VIRTIO_PCI_COMMON_Q_SIZE: proxy->vqs[vdev->queue_sel].num = val; + virtio_queue_set_num(vdev, vdev->queue_sel, + proxy->vqs[vdev->queue_sel].num); break; case VIRTIO_PCI_COMMON_Q_MSIX: msix_vector_unuse(&proxy->pci_dev, diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 04716b5f6c..7b861e0ca0 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -432,6 +432,11 @@ static void virtio_queue_packed_set_notification(VirtQueue *vq, int enable) } } +bool virtio_queue_get_notification(VirtQueue *vq) +{ + return vq->notification; +} + void virtio_queue_set_notification(VirtQueue *vq, int enable) { vq->notification = enable; @@ -546,7 +551,7 @@ static inline bool is_desc_avail(uint16_t flags, bool wrap_counter) * Called within rcu_read_lock(). */ static int virtio_queue_empty_rcu(VirtQueue *vq) { - if (unlikely(vq->vdev->broken)) { + if (virtio_device_disabled(vq->vdev)) { return 1; } @@ -565,7 +570,7 @@ static int virtio_queue_split_empty(VirtQueue *vq) { bool empty; - if (unlikely(vq->vdev->broken)) { + if (virtio_device_disabled(vq->vdev)) { return 1; } @@ -783,7 +788,7 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, virtqueue_unmap_sg(vq, elem, len); - if (unlikely(vq->vdev->broken)) { + if (virtio_device_disabled(vq->vdev)) { return; } @@ -839,7 +844,7 @@ static void virtqueue_packed_flush(VirtQueue *vq, unsigned int count) void virtqueue_flush(VirtQueue *vq, unsigned int count) { - if (unlikely(vq->vdev->broken)) { + if (virtio_device_disabled(vq->vdev)) { vq->inuse -= count; return; } @@ -1602,7 +1607,7 @@ err_undo_map: void *virtqueue_pop(VirtQueue *vq, size_t sz) { - if (unlikely(vq->vdev->broken)) { + if (virtio_device_disabled(vq->vdev)) { return NULL; } @@ -1698,7 +1703,7 @@ unsigned int virtqueue_drop_all(VirtQueue *vq) { struct VirtIODevice *vdev = vq->vdev; - if (unlikely(vdev->broken)) { + if (virtio_device_disabled(vq->vdev)) { return 0; } @@ -1816,7 +1821,7 @@ static void virtio_notify_vector(VirtIODevice *vdev, uint16_t vector) BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); - if (unlikely(vdev->broken)) { + if (virtio_device_disabled(vdev)) { return; } @@ -1920,6 +1925,7 @@ void virtio_reset(void *opaque) vdev->guest_features = 0; vdev->queue_sel = 0; vdev->status = 0; + vdev->disabled = false; atomic_set(&vdev->isr, 0); vdev->config_vector = VIRTIO_NO_VECTOR; virtio_notify_vector(vdev, vdev->config_vector); @@ -2330,17 +2336,24 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, return &vdev->vq[i]; } +void virtio_delete_queue(VirtQueue *vq) +{ + vq->vring.num = 0; + vq->vring.num_default = 0; + vq->handle_output = NULL; + vq->handle_aio_output = NULL; + g_free(vq->used_elems); + vq->used_elems = NULL; + virtio_virtqueue_reset_region_cache(vq); +} + void virtio_del_queue(VirtIODevice *vdev, int n) { if (n < 0 || n >= VIRTIO_QUEUE_MAX) { abort(); } - vdev->vq[n].vring.num = 0; - vdev->vq[n].vring.num_default = 0; - vdev->vq[n].handle_output = NULL; - vdev->vq[n].handle_aio_output = NULL; - g_free(vdev->vq[n].used_elems); + virtio_delete_queue(&vdev->vq[n]); } static void virtio_set_isr(VirtIODevice *vdev, int value) @@ -2553,6 +2566,13 @@ static bool virtio_started_needed(void *opaque) return vdev->started; } +static bool virtio_disabled_needed(void *opaque) +{ + VirtIODevice *vdev = opaque; + + return vdev->disabled; +} + static const VMStateDescription vmstate_virtqueue = { .name = "virtqueue_state", .version_id = 1, @@ -2718,6 +2738,17 @@ static const VMStateDescription vmstate_virtio_started = { } }; +static const VMStateDescription vmstate_virtio_disabled = { + .name = "virtio/disabled", + .version_id = 1, + .minimum_version_id = 1, + .needed = &virtio_disabled_needed, + .fields = (VMStateField[]) { + VMSTATE_BOOL(disabled, VirtIODevice), + VMSTATE_END_OF_LIST() + } +}; + static const VMStateDescription vmstate_virtio = { .name = "virtio", .version_id = 1, @@ -2735,6 +2766,7 @@ static const VMStateDescription vmstate_virtio = { &vmstate_virtio_extra_state, &vmstate_virtio_started, &vmstate_virtio_packed_virtqueues, + &vmstate_virtio_disabled, NULL } }; @@ -3384,17 +3416,12 @@ static bool virtio_queue_host_notifier_aio_poll(void *opaque) { EventNotifier *n = opaque; VirtQueue *vq = container_of(n, VirtQueue, host_notifier); - bool progress; if (!vq->vring.desc || virtio_queue_empty(vq)) { return false; } - progress = virtio_queue_notify_aio_vq(vq); - - /* In case the handler function re-enabled notifications */ - virtio_queue_set_notification(vq, 0); - return progress; + return virtio_queue_notify_aio_vq(vq); } static void virtio_queue_host_notifier_aio_poll_end(EventNotifier *n) @@ -3569,6 +3596,7 @@ static void virtio_device_instance_finalize(Object *obj) static Property virtio_properties[] = { DEFINE_VIRTIO_COMMON_FEATURES(VirtIODevice, host_features), DEFINE_PROP_BOOL("use-started", VirtIODevice, use_started, true), + DEFINE_PROP_BOOL("use-disabled-flag", VirtIODevice, use_disabled_flag, true), DEFINE_PROP_END_OF_LIST(), }; diff --git a/include/hw/pci/pci_host.h b/include/hw/pci/pci_host.h index ba31595fc7..9ce088bd13 100644 --- a/include/hw/pci/pci_host.h +++ b/include/hw/pci/pci_host.h @@ -62,8 +62,8 @@ void pci_host_config_write_common(PCIDevice *pci_dev, uint32_t addr, uint32_t pci_host_config_read_common(PCIDevice *pci_dev, uint32_t addr, uint32_t limit, uint32_t len); -void pci_data_write(PCIBus *s, uint32_t addr, uint32_t val, int len); -uint32_t pci_data_read(PCIBus *s, uint32_t addr, int len); +void pci_data_write(PCIBus *s, uint32_t addr, uint32_t val, unsigned len); +uint32_t pci_data_read(PCIBus *s, uint32_t addr, unsigned len); extern const MemoryRegionOps pci_host_conf_le_ops; extern const MemoryRegionOps pci_host_conf_be_ops; diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h index 9c19f5b634..1e62f869b2 100644 --- a/include/hw/virtio/virtio-blk.h +++ b/include/hw/virtio/virtio-blk.h @@ -38,6 +38,7 @@ struct VirtIOBlkConf uint32_t request_merging; uint16_t num_queues; uint16_t queue_size; + bool seg_max_adjust; uint32_t max_discard_sectors; uint32_t max_write_zeroes_sectors; bool x_enable_wce_if_config_wce; diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h index 122f7c4b6f..24e768909d 100644 --- a/include/hw/virtio/virtio-scsi.h +++ b/include/hw/virtio/virtio-scsi.h @@ -48,6 +48,7 @@ typedef struct virtio_scsi_config VirtIOSCSIConfig; struct VirtIOSCSIConf { uint32_t num_queues; uint32_t virtqueue_size; + bool seg_max_adjust; uint32_t max_sectors; uint32_t cmd_per_lun; #ifdef CONFIG_VHOST_SCSI diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index c32a815303..b69d517496 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -100,6 +100,8 @@ struct VirtIODevice uint16_t device_id; bool vm_running; bool broken; /* device in invalid state, needs reset */ + bool use_disabled_flag; /* allow use of 'disable' flag when needed */ + bool disabled; /* device in temporarily disabled state */ bool use_started; bool started; bool start_on_kick; /* when virtio 1.0 feature has not been negotiated */ @@ -183,6 +185,8 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, void virtio_del_queue(VirtIODevice *vdev, int n); +void virtio_delete_queue(VirtQueue *vq); + void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem, unsigned int len); void virtqueue_flush(VirtQueue *vq, unsigned int count); @@ -224,6 +228,7 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id); void virtio_notify_config(VirtIODevice *vdev); +bool virtio_queue_get_notification(VirtQueue *vq); void virtio_queue_set_notification(VirtQueue *vq, int enable); int virtio_queue_ready(VirtQueue *vq); @@ -378,4 +383,17 @@ static inline void virtio_set_started(VirtIODevice *vdev, bool started) vdev->started = started; } } + +static inline void virtio_set_disabled(VirtIODevice *vdev, bool disable) +{ + if (vdev->use_disabled_flag) { + vdev->disabled = disable; + } +} + +static inline bool virtio_device_disabled(VirtIODevice *vdev) +{ + return unlikely(vdev->disabled || vdev->broken); +} + #endif diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h index ae9c41d02b..ba693cc80b 100644 --- a/include/sysemu/numa.h +++ b/include/sysemu/numa.h @@ -14,10 +14,35 @@ struct CPUArchId; #define NUMA_DISTANCE_MAX 254 #define NUMA_DISTANCE_UNREACHABLE 255 +/* the value of AcpiHmatLBInfo flags */ +enum { + HMAT_LB_MEM_MEMORY = 0, + HMAT_LB_MEM_CACHE_1ST_LEVEL = 1, + HMAT_LB_MEM_CACHE_2ND_LEVEL = 2, + HMAT_LB_MEM_CACHE_3RD_LEVEL = 3, + HMAT_LB_LEVELS /* must be the last entry */ +}; + +/* the value of AcpiHmatLBInfo data type */ +enum { + HMAT_LB_DATA_ACCESS_LATENCY = 0, + HMAT_LB_DATA_READ_LATENCY = 1, + HMAT_LB_DATA_WRITE_LATENCY = 2, + HMAT_LB_DATA_ACCESS_BANDWIDTH = 3, + HMAT_LB_DATA_READ_BANDWIDTH = 4, + HMAT_LB_DATA_WRITE_BANDWIDTH = 5, + HMAT_LB_TYPES /* must be the last entry */ +}; + +#define UINT16_BITS 16 + struct NodeInfo { uint64_t node_mem; struct HostMemoryBackend *node_memdev; bool present; + bool has_cpu; + uint8_t lb_info_provided; + uint16_t initiator; uint8_t distance[MAX_NODES]; }; @@ -26,6 +51,31 @@ struct NumaNodeMem { uint64_t node_plugged_mem; }; +struct HMAT_LB_Data { + uint8_t initiator; + uint8_t target; + uint64_t data; +}; +typedef struct HMAT_LB_Data HMAT_LB_Data; + +struct HMAT_LB_Info { + /* Indicates it's memory or the specified level memory side cache. */ + uint8_t hierarchy; + + /* Present the type of data, access/read/write latency or bandwidth. */ + uint8_t data_type; + + /* The range bitmap of bandwidth for calculating common base */ + uint64_t range_bitmap; + + /* The common base unit for latencies or bandwidths */ + uint64_t base; + + /* Array to store the latencies or bandwidths */ + GArray *list; +}; +typedef struct HMAT_LB_Info HMAT_LB_Info; + struct NumaState { /* Number of NUMA nodes */ int num_nodes; @@ -33,13 +83,26 @@ struct NumaState { /* Allow setting NUMA distance for different NUMA nodes */ bool have_numa_distance; + /* Detect if HMAT support is enabled. */ + bool hmat_enabled; + /* NUMA nodes information */ NodeInfo nodes[MAX_NODES]; + + /* NUMA nodes HMAT Locality Latency and Bandwidth Information */ + HMAT_LB_Info *hmat_lb[HMAT_LB_LEVELS][HMAT_LB_TYPES]; + + /* Memory Side Cache Information Structure */ + NumaHmatCacheOptions *hmat_cache[MAX_NODES][HMAT_LB_LEVELS]; }; typedef struct NumaState NumaState; void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp); void parse_numa_opts(MachineState *ms); +void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, + Error **errp); +void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node, + Error **errp); void numa_complete_configuration(MachineState *ms); void query_numa_node_mem(NumaNodeMem node_mem[], MachineState *ms); extern QemuOptsList qemu_numa_opts; diff --git a/qapi/machine.json b/qapi/machine.json index ca26779f1a..b3d30bc816 100644 --- a/qapi/machine.json +++ b/qapi/machine.json @@ -426,10 +426,14 @@ # # @cpu: property based CPU(s) to node mapping (Since: 2.10) # +# @hmat-lb: memory latency and bandwidth information (Since: 5.0) +# +# @hmat-cache: memory side cache information (Since: 5.0) +# # Since: 2.1 ## { 'enum': 'NumaOptionsType', - 'data': [ 'node', 'dist', 'cpu' ] } + 'data': [ 'node', 'dist', 'cpu', 'hmat-lb', 'hmat-cache' ] } ## # @NumaOptions: @@ -444,7 +448,9 @@ 'data': { 'node': 'NumaNodeOptions', 'dist': 'NumaDistOptions', - 'cpu': 'NumaCpuOptions' }} + 'cpu': 'NumaCpuOptions', + 'hmat-lb': 'NumaHmatLBOptions', + 'hmat-cache': 'NumaHmatCacheOptions' }} ## # @NumaNodeOptions: @@ -463,6 +469,13 @@ # @memdev: memory backend object. If specified for one node, # it must be specified for all nodes. # +# @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145, +# points to the nodeid which has the memory controller +# responsible for this NUMA node. This field provides +# additional information as to the initiator node that +# is closest (as in directly attached) to this node, and +# therefore has the best performance (since 5.0) +# # Since: 2.1 ## { 'struct': 'NumaNodeOptions', @@ -470,7 +483,8 @@ '*nodeid': 'uint16', '*cpus': ['uint16'], '*mem': 'size', - '*memdev': 'str' }} + '*memdev': 'str', + '*initiator': 'uint16' }} ## # @NumaDistOptions: @@ -550,6 +564,166 @@ 'data' : {} } ## +# @HmatLBMemoryHierarchy: +# +# The memory hierarchy in the System Locality Latency and Bandwidth +# Information Structure of HMAT (Heterogeneous Memory Attribute Table) +# +# For more information about @HmatLBMemoryHierarchy, see chapter +# 5.2.27.4: Table 5-146: Field "Flags" of ACPI 6.3 spec. +# +# @memory: the structure represents the memory performance +# +# @first-level: first level of memory side cache +# +# @second-level: second level of memory side cache +# +# @third-level: third level of memory side cache +# +# Since: 5.0 +## +{ 'enum': 'HmatLBMemoryHierarchy', + 'data': [ 'memory', 'first-level', 'second-level', 'third-level' ] } + +## +# @HmatLBDataType: +# +# Data type in the System Locality Latency and Bandwidth +# Information Structure of HMAT (Heterogeneous Memory Attribute Table) +# +# For more information about @HmatLBDataType, see chapter +# 5.2.27.4: Table 5-146: Field "Data Type" of ACPI 6.3 spec. +# +# @access-latency: access latency (nanoseconds) +# +# @read-latency: read latency (nanoseconds) +# +# @write-latency: write latency (nanoseconds) +# +# @access-bandwidth: access bandwidth (Bytes per second) +# +# @read-bandwidth: read bandwidth (Bytes per second) +# +# @write-bandwidth: write bandwidth (Bytes per second) +# +# Since: 5.0 +## +{ 'enum': 'HmatLBDataType', + 'data': [ 'access-latency', 'read-latency', 'write-latency', + 'access-bandwidth', 'read-bandwidth', 'write-bandwidth' ] } + +## +# @NumaHmatLBOptions: +# +# Set the system locality latency and bandwidth information +# between Initiator and Target proximity Domains. +# +# For more information about @NumaHmatLBOptions, see chapter +# 5.2.27.4: Table 5-146 of ACPI 6.3 spec. +# +# @initiator: the Initiator Proximity Domain. +# +# @target: the Target Proximity Domain. +# +# @hierarchy: the Memory Hierarchy. Indicates the performance +# of memory or side cache. +# +# @data-type: presents the type of data, access/read/write +# latency or hit latency. +# +# @latency: the value of latency from @initiator to @target +# proximity domain, the latency unit is "ns(nanosecond)". +# +# @bandwidth: the value of bandwidth between @initiator and @target +# proximity domain, the bandwidth unit is +# "Bytes per second". +# +# Since: 5.0 +## +{ 'struct': 'NumaHmatLBOptions', + 'data': { + 'initiator': 'uint16', + 'target': 'uint16', + 'hierarchy': 'HmatLBMemoryHierarchy', + 'data-type': 'HmatLBDataType', + '*latency': 'uint64', + '*bandwidth': 'size' }} + +## +# @HmatCacheAssociativity: +# +# Cache associativity in the Memory Side Cache Information Structure +# of HMAT +# +# For more information of @HmatCacheAssociativity, see chapter +# 5.2.27.5: Table 5-147 of ACPI 6.3 spec. +# +# @none: None (no memory side cache in this proximity domain, +# or cache associativity unknown) +# +# @direct: Direct Mapped +# +# @complex: Complex Cache Indexing (implementation specific) +# +# Since: 5.0 +## +{ 'enum': 'HmatCacheAssociativity', + 'data': [ 'none', 'direct', 'complex' ] } + +## +# @HmatCacheWritePolicy: +# +# Cache write policy in the Memory Side Cache Information Structure +# of HMAT +# +# For more information of @HmatCacheWritePolicy, see chapter +# 5.2.27.5: Table 5-147: Field "Cache Attributes" of ACPI 6.3 spec. +# +# @none: None (no memory side cache in this proximity domain, +# or cache write policy unknown) +# +# @write-back: Write Back (WB) +# +# @write-through: Write Through (WT) +# +# Since: 5.0 +## +{ 'enum': 'HmatCacheWritePolicy', + 'data': [ 'none', 'write-back', 'write-through' ] } + +## +# @NumaHmatCacheOptions: +# +# Set the memory side cache information for a given memory domain. +# +# For more information of @NumaHmatCacheOptions, see chapter +# 5.2.27.5: Table 5-147: Field "Cache Attributes" of ACPI 6.3 spec. +# +# @node-id: the memory proximity domain to which the memory belongs. +# +# @size: the size of memory side cache in bytes. +# +# @level: the cache level described in this structure. +# +# @associativity: the cache associativity, +# none/direct-mapped/complex(complex cache indexing). +# +# @policy: the write policy, none/write-back/write-through. +# +# @line: the cache Line size in bytes. +# +# Since: 5.0 +## +{ 'struct': 'NumaHmatCacheOptions', + 'data': { + 'node-id': 'uint32', + 'size': 'size', + 'level': 'uint8', + 'associativity': 'HmatCacheAssociativity', + 'policy': 'HmatCacheWritePolicy', + 'line': 'uint16' }} + +## # @HostMemPolicy: # # Host memory policy types diff --git a/qemu-options.hx b/qemu-options.hx index e9d6231438..d4b73ef60c 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -40,7 +40,8 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \ " suppress-vmdesc=on|off disables self-describing migration (default=off)\n" " nvdimm=on|off controls NVDIMM support (default=off)\n" " enforce-config-section=on|off enforce configuration section migration (default=off)\n" - " memory-encryption=@var{} memory encryption object to use (default=none)\n", + " memory-encryption=@var{} memory encryption object to use (default=none)\n" + " hmat=on|off controls ACPI HMAT support (default=off)\n", QEMU_ARCH_ALL) STEXI @item -machine [type=]@var{name}[,prop=@var{value}[,...]] @@ -94,6 +95,9 @@ NOTE: this parameter is deprecated. Please use @option{-global} @option{migration.send-configuration}=@var{on|off} instead. @item memory-encryption=@var{} Memory encryption object to use. The default is none. +@item hmat=on|off +Enables or disables ACPI Heterogeneous Memory Attribute Table (HMAT) support. +The default is off. @end table ETEXI @@ -168,19 +172,24 @@ If any on the three values is given, the total number of CPUs @var{n} can be omi ETEXI DEF("numa", HAS_ARG, QEMU_OPTION_numa, - "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n" - "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n" + "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" + "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" "-numa dist,src=source,dst=destination,val=distance\n" - "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n", + "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n" + "-numa hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n" + "-numa hmat-cache,node-id=node,size=size,level=level[,associativity=none|direct|complex][,policy=none|write-back|write-through][,line=size]\n", QEMU_ARCH_ALL) STEXI -@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}] -@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}] +@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}] +@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}] @itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance} @itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}] +@itemx -numa hmat-lb,initiator=@var{node},target=@var{node},hierarchy=@var{hierarchy},data-type=@var{tpye}[,latency=@var{lat}][,bandwidth=@var{bw}] +@itemx -numa hmat-cache,node-id=@var{node},size=@var{size},level=@var{level}[,associativity=@var{str}][,policy=@var{str}][,line=@var{size}] @findex -numa Define a NUMA node and assign RAM and VCPUs to it. Set the NUMA distance from a source node to a destination node. +Set the ACPI Heterogeneous Memory Attributes for the given nodes. Legacy VCPU assignment uses @samp{cpus} option where @var{firstcpu} and @var{lastcpu} are CPU indexes. Each @@ -222,6 +231,27 @@ split equally between them. @samp{mem} and @samp{memdev} are mutually exclusive. Furthermore, if one node uses @samp{memdev}, all of them have to use it. +@samp{initiator} is an additional option that points to an @var{initiator} +NUMA node that has best performance (the lowest latency or largest bandwidth) +to this NUMA @var{node}. Note that this option can be set only when +the machine property 'hmat' is set to 'on'. + +Following example creates a machine with 2 NUMA nodes, node 0 has CPU. +node 1 has only memory, and its initiator is node 0. Note that because +node 0 has CPU, by default the initiator of node 0 is itself and must be +itself. +@example +-machine hmat=on \ +-m 2G,slots=2,maxmem=4G \ +-object memory-backend-ram,size=1G,id=m0 \ +-object memory-backend-ram,size=1G,id=m1 \ +-numa node,nodeid=0,memdev=m0 \ +-numa node,nodeid=1,memdev=m1,initiator=0 \ +-smp 2,sockets=2,maxcpus=2 \ +-numa cpu,node-id=0,socket-id=0 \ +-numa cpu,node-id=0,socket-id=1 +@end example + @var{source} and @var{destination} are NUMA node IDs. @var{distance} is the NUMA distance from @var{source} to @var{destination}. The distance from a node to itself is always 10. If any pair of nodes is @@ -238,6 +268,59 @@ specified resources, it just assigns existing resources to NUMA nodes. This means that one still has to use the @option{-m}, @option{-smp} options to allocate RAM and VCPUs respectively. +Use @samp{hmat-lb} to set System Locality Latency and Bandwidth Information +between initiator and target NUMA nodes in ACPI Heterogeneous Attribute Memory Table (HMAT). +Initiator NUMA node can create memory requests, usually it has one or more processors. +Target NUMA node contains addressable memory. + +In @samp{hmat-lb} option, @var{node} are NUMA node IDs. @var{hierarchy} is the memory +hierarchy of the target NUMA node: if @var{hierarchy} is 'memory', the structure +represents the memory performance; if @var{hierarchy} is 'first-level|second-level|third-level', +this structure represents aggregated performance of memory side caches for each domain. +@var{type} of 'data-type' is type of data represented by this structure instance: +if 'hierarchy' is 'memory', 'data-type' is 'access|read|write' latency or 'access|read|write' +bandwidth of the target memory; if 'hierarchy' is 'first-level|second-level|third-level', +'data-type' is 'access|read|write' hit latency or 'access|read|write' hit bandwidth of the +target memory side cache. + +@var{lat} is latency value in nanoseconds. @var{bw} is bandwidth value, +the possible value and units are NUM[M|G|T], mean that the bandwidth value are +NUM byte per second (or MB/s, GB/s or TB/s depending on used suffix). +Note that if latency or bandwidth value is 0, means the corresponding latency or +bandwidth information is not provided. + +In @samp{hmat-cache} option, @var{node-id} is the NUMA-id of the memory belongs. +@var{size} is the size of memory side cache in bytes. @var{level} is the cache +level described in this structure, note that the cache level 0 should not be used +with @samp{hmat-cache} option. @var{associativity} is the cache associativity, +the possible value is 'none/direct(direct-mapped)/complex(complex cache indexing)'. +@var{policy} is the write policy. @var{line} is the cache Line size in bytes. + +For example, the following options describe 2 NUMA nodes. Node 0 has 2 cpus and +a ram, node 1 has only a ram. The processors in node 0 access memory in node +0 with access-latency 5 nanoseconds, access-bandwidth is 200 MB/s; +The processors in NUMA node 0 access memory in NUMA node 1 with access-latency 10 +nanoseconds, access-bandwidth is 100 MB/s. +And for memory side cache information, NUMA node 0 and 1 both have 1 level memory +cache, size is 10KB, policy is write-back, the cache Line size is 8 bytes: +@example +-machine hmat=on \ +-m 2G \ +-object memory-backend-ram,size=1G,id=m0 \ +-object memory-backend-ram,size=1G,id=m1 \ +-smp 2 \ +-numa node,nodeid=0,memdev=m0 \ +-numa node,nodeid=1,memdev=m1,initiator=0 \ +-numa cpu,node-id=0,socket-id=0 \ +-numa cpu,node-id=0,socket-id=1 \ +-numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,latency=5 \ +-numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,bandwidth=200M \ +-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,latency=10 \ +-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=100M \ +-numa hmat-cache,node-id=0,size=10K,level=1,associativity=direct,policy=write-back,line=8 \ +-numa hmat-cache,node-id=1,size=10K,level=1,associativity=direct,policy=write-back,line=8 +@end example + ETEXI DEF("add-fd", HAS_ARG, QEMU_OPTION_add_fd, diff --git a/tests/acceptance/virtio_seg_max_adjust.py b/tests/acceptance/virtio_seg_max_adjust.py new file mode 100755 index 0000000000..5458573138 --- /dev/null +++ b/tests/acceptance/virtio_seg_max_adjust.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python +# +# Test virtio-scsi and virtio-blk queue settings for all machine types +# +# Copyright (c) 2019 Virtuozzo International GmbH +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +import sys +import os +import re + +sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'python')) +from qemu.machine import QEMUMachine +from avocado_qemu import Test + +#list of machine types and virtqueue properties to test +VIRTIO_SCSI_PROPS = {'seg_max_adjust': 'seg_max_adjust'} +VIRTIO_BLK_PROPS = {'seg_max_adjust': 'seg-max-adjust'} + +DEV_TYPES = {'virtio-scsi-pci': VIRTIO_SCSI_PROPS, + 'virtio-blk-pci': VIRTIO_BLK_PROPS} + +VM_DEV_PARAMS = {'virtio-scsi-pci': ['-device', 'virtio-scsi-pci,id=scsi0'], + 'virtio-blk-pci': ['-device', + 'virtio-blk-pci,id=scsi0,drive=drive0', + '-drive', + 'driver=null-co,id=drive0,if=none']} + + +class VirtioMaxSegSettingsCheck(Test): + @staticmethod + def make_pattern(props): + pattern_items = ['{0} = \w+'.format(prop) for prop in props] + return '|'.join(pattern_items) + + def query_virtqueue(self, vm, dev_type_name): + query_ok = False + error = None + props = None + + output = vm.command('human-monitor-command', + command_line = 'info qtree') + props_list = DEV_TYPES[dev_type_name].values(); + pattern = self.make_pattern(props_list) + res = re.findall(pattern, output) + + if len(res) != len(props_list): + props_list = set(props_list) + res = set(res) + not_found = props_list.difference(res) + not_found = ', '.join(not_found) + error = '({0}): The following properties not found: {1}'\ + .format(dev_type_name, not_found) + else: + query_ok = True + props = dict() + for prop in res: + p = prop.split(' = ') + props[p[0]] = p[1] + return query_ok, props, error + + def check_mt(self, mt, dev_type_name): + with QEMUMachine(self.qemu_bin) as vm: + vm.set_machine(mt["name"]) + for s in VM_DEV_PARAMS[dev_type_name]: + vm.add_args(s) + vm.launch() + query_ok, props, error = self.query_virtqueue(vm, dev_type_name) + + if not query_ok: + self.fail('machine type {0}: {1}'.format(mt['name'], error)) + + for prop_name, prop_val in props.items(): + expected_val = mt[prop_name] + self.assertEqual(expected_val, prop_val) + + @staticmethod + def seg_max_adjust_enabled(mt): + # machine types >= 5.0 should have seg_max_adjust = true + # others seg_max_adjust = false + mt = mt.split("-") + + # machine types with one line name and name like pc-x.x + if len(mt) <= 2: + return False + + # machine types like pc-<chip_name>-x.x[.x] + ver = mt[2] + ver = ver.split("."); + + # versions >= 5.0 goes with seg_max_adjust enabled + major = int(ver[0]) + + if major >= 5: + return True + return False + + def test_machine_types(self): + # collect all machine types except 'none', 'isapc', 'microvm' + with QEMUMachine(self.qemu_bin) as vm: + vm.launch() + machines = [m['name'] for m in vm.command('query-machines')] + vm.shutdown() + machines.remove('none') + machines.remove('isapc') + machines.remove('microvm') + + for dev_type in DEV_TYPES: + # create the list of machine types and their parameters. + mtypes = list() + for m in machines: + if self.seg_max_adjust_enabled(m): + enabled = 'true' + else: + enabled = 'false' + mtypes.append({'name': m, + DEV_TYPES[dev_type]['seg_max_adjust']: enabled}) + + # test each machine type for a device type + for mt in mtypes: + self.check_mt(mt, dev_type) diff --git a/tests/bios-tables-test.c b/tests/bios-tables-test.c index bc0ad594a1..f1ac2d7e96 100644 --- a/tests/bios-tables-test.c +++ b/tests/bios-tables-test.c @@ -947,6 +947,48 @@ static void test_acpi_virt_tcg_numamem(void) } +static void test_acpi_tcg_acpi_hmat(const char *machine) +{ + test_data data; + + memset(&data, 0, sizeof(data)); + data.machine = machine; + data.variant = ".acpihmat"; + test_acpi_one(" -machine hmat=on" + " -smp 2,sockets=2" + " -m 128M,slots=2,maxmem=1G" + " -object memory-backend-ram,size=64M,id=m0" + " -object memory-backend-ram,size=64M,id=m1" + " -numa node,nodeid=0,memdev=m0" + " -numa node,nodeid=1,memdev=m1,initiator=0" + " -numa cpu,node-id=0,socket-id=0" + " -numa cpu,node-id=0,socket-id=1" + " -numa hmat-lb,initiator=0,target=0,hierarchy=memory," + "data-type=access-latency,latency=1" + " -numa hmat-lb,initiator=0,target=0,hierarchy=memory," + "data-type=access-bandwidth,bandwidth=65534M" + " -numa hmat-lb,initiator=0,target=1,hierarchy=memory," + "data-type=access-latency,latency=65534" + " -numa hmat-lb,initiator=0,target=1,hierarchy=memory," + "data-type=access-bandwidth,bandwidth=32767M" + " -numa hmat-cache,node-id=0,size=10K,level=1," + "associativity=direct,policy=write-back,line=8" + " -numa hmat-cache,node-id=1,size=10K,level=1," + "associativity=direct,policy=write-back,line=8", + &data); + free_test_data(&data); +} + +static void test_acpi_q35_tcg_acpi_hmat(void) +{ + test_acpi_tcg_acpi_hmat(MACHINE_Q35); +} + +static void test_acpi_piix4_tcg_acpi_hmat(void) +{ + test_acpi_tcg_acpi_hmat(MACHINE_PC); +} + static void test_acpi_virt_tcg(void) { test_data data = { @@ -991,6 +1033,8 @@ int main(int argc, char *argv[]) qtest_add_func("acpi/q35/numamem", test_acpi_q35_tcg_numamem); qtest_add_func("acpi/piix4/dimmpxm", test_acpi_piix4_tcg_dimm_pxm); qtest_add_func("acpi/q35/dimmpxm", test_acpi_q35_tcg_dimm_pxm); + qtest_add_func("acpi/piix4/acpihmat", test_acpi_piix4_tcg_acpi_hmat); + qtest_add_func("acpi/q35/acpihmat", test_acpi_q35_tcg_acpi_hmat); } else if (strcmp(arch, "aarch64") == 0) { qtest_add_func("acpi/virt", test_acpi_virt_tcg); qtest_add_func("acpi/virt/numamem", test_acpi_virt_tcg_numamem); diff --git a/tests/data/acpi/pc/APIC.acpihmat b/tests/data/acpi/pc/APIC.acpihmat Binary files differnew file mode 100644 index 0000000000..a21f164699 --- /dev/null +++ b/tests/data/acpi/pc/APIC.acpihmat diff --git a/tests/data/acpi/pc/DSDT.acpihmat b/tests/data/acpi/pc/DSDT.acpihmat Binary files differnew file mode 100644 index 0000000000..ad890e09aa --- /dev/null +++ b/tests/data/acpi/pc/DSDT.acpihmat diff --git a/tests/data/acpi/pc/HMAT.acpihmat b/tests/data/acpi/pc/HMAT.acpihmat Binary files differnew file mode 100644 index 0000000000..c00f7ba6cd --- /dev/null +++ b/tests/data/acpi/pc/HMAT.acpihmat diff --git a/tests/data/acpi/pc/SRAT.acpihmat b/tests/data/acpi/pc/SRAT.acpihmat Binary files differnew file mode 100644 index 0000000000..1dcae90aec --- /dev/null +++ b/tests/data/acpi/pc/SRAT.acpihmat diff --git a/tests/data/acpi/q35/APIC.acpihmat b/tests/data/acpi/q35/APIC.acpihmat Binary files differnew file mode 100644 index 0000000000..a21f164699 --- /dev/null +++ b/tests/data/acpi/q35/APIC.acpihmat diff --git a/tests/data/acpi/q35/DSDT.acpihmat b/tests/data/acpi/q35/DSDT.acpihmat Binary files differnew file mode 100644 index 0000000000..30e3717b5b --- /dev/null +++ b/tests/data/acpi/q35/DSDT.acpihmat diff --git a/tests/data/acpi/q35/HMAT.acpihmat b/tests/data/acpi/q35/HMAT.acpihmat Binary files differnew file mode 100644 index 0000000000..c00f7ba6cd --- /dev/null +++ b/tests/data/acpi/q35/HMAT.acpihmat diff --git a/tests/data/acpi/q35/SRAT.acpihmat b/tests/data/acpi/q35/SRAT.acpihmat Binary files differnew file mode 100644 index 0000000000..1dcae90aec --- /dev/null +++ b/tests/data/acpi/q35/SRAT.acpihmat diff --git a/tests/numa-test.c b/tests/numa-test.c index 8de8581231..17dd807d2a 100644 --- a/tests/numa-test.c +++ b/tests/numa-test.c @@ -327,6 +327,216 @@ static void pc_dynamic_cpu_cfg(const void *data) qtest_quit(qs); } +static void pc_hmat_build_cfg(const void *data) +{ + QTestState *qs = qtest_initf("%s -nodefaults --preconfig -machine hmat=on " + "-smp 2,sockets=2 " + "-m 128M,slots=2,maxmem=1G " + "-object memory-backend-ram,size=64M,id=m0 " + "-object memory-backend-ram,size=64M,id=m1 " + "-numa node,nodeid=0,memdev=m0 " + "-numa node,nodeid=1,memdev=m1,initiator=0 " + "-numa cpu,node-id=0,socket-id=0 " + "-numa cpu,node-id=0,socket-id=1", + data ? (char *)data : ""); + + /* Fail: Initiator should be less than the number of nodes */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 2, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); + + /* Fail: Target should be less than the number of nodes */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 2," + " 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); + + /* Fail: Initiator should contain cpu */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 1, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); + + /* Fail: Data-type mismatch */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"write-latency\"," + " 'bandwidth': 524288000 } }"))); + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"read-bandwidth\"," + " 'latency': 5 } }"))); + + /* Fail: Bandwidth should be 1MB (1048576) aligned */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," + " 'bandwidth': 1048575 } }"))); + + /* Configuring HMAT bandwidth and latency details */ + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," + " 'latency': 1 } }"))); /* 1 ns */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," + " 'latency': 5 } }"))); /* Fail: Duplicate configuration */ + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," + " 'bandwidth': 68717379584 } }"))); /* 65534 MB/s */ + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 1," + " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," + " 'latency': 65534 } }"))); /* 65534 ns */ + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 1," + " 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," + " 'bandwidth': 34358689792 } }"))); /* 32767 MB/s */ + + /* Fail: node_id should be less than the number of nodes */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 2, 'size': 10240," + " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," + " 'line': 8 } }"))); + + /* Fail: level should be less than HMAT_LB_LEVELS (4) */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," + " 'level': 4, 'associativity': \"direct\", 'policy': \"write-back\"," + " 'line': 8 } }"))); + + /* Fail: associativity option should be 'none', if level is 0 */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," + " 'level': 0, 'associativity': \"direct\", 'policy': \"none\"," + " 'line': 0 } }"))); + /* Fail: policy option should be 'none', if level is 0 */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," + " 'level': 0, 'associativity': \"none\", 'policy': \"write-back\"," + " 'line': 0 } }"))); + /* Fail: line option should be 0, if level is 0 */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," + " 'level': 0, 'associativity': \"none\", 'policy': \"none\"," + " 'line': 8 } }"))); + + /* Configuring HMAT memory side cache attributes */ + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," + " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," + " 'line': 8 } }"))); + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," + " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," + " 'line': 8 } }"))); /* Fail: Duplicate configuration */ + /* Fail: The size of level 2 size should be small than level 1 */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," + " 'level': 2, 'associativity': \"direct\", 'policy': \"write-back\"," + " 'line': 8 } }"))); + /* Fail: The size of level 0 size should be larger than level 1 */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," + " 'level': 0, 'associativity': \"direct\", 'policy': \"write-back\"," + " 'line': 8 } }"))); + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 1, 'size': 10240," + " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," + " 'line': 8 } }"))); + + /* let machine initialization to complete and run */ + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, + "{ 'execute': 'x-exit-preconfig' }"))); + qtest_qmp_eventwait(qs, "RESUME"); + + qtest_quit(qs); +} + +static void pc_hmat_off_cfg(const void *data) +{ + QTestState *qs = qtest_initf("%s -nodefaults --preconfig " + "-smp 2,sockets=2 " + "-m 128M,slots=2,maxmem=1G " + "-object memory-backend-ram,size=64M,id=m0 " + "-object memory-backend-ram,size=64M,id=m1 " + "-numa node,nodeid=0,memdev=m0", + data ? (char *)data : ""); + + /* + * Fail: Enable HMAT with -machine hmat=on + * before using any of hmat specific options + */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'node', 'nodeid': 1, 'memdev': \"m1\"," + " 'initiator': 0 } }"))); + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'node', 'nodeid': 1, 'memdev': \"m1\" } }"))); + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," + " 'latency': 1 } }"))); + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," + " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," + " 'line': 8 } }"))); + + /* let machine initialization to complete and run */ + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, + "{ 'execute': 'x-exit-preconfig' }"))); + qtest_qmp_eventwait(qs, "RESUME"); + + qtest_quit(qs); +} + +static void pc_hmat_erange_cfg(const void *data) +{ + QTestState *qs = qtest_initf("%s -nodefaults --preconfig -machine hmat=on " + "-smp 2,sockets=2 " + "-m 128M,slots=2,maxmem=1G " + "-object memory-backend-ram,size=64M,id=m0 " + "-object memory-backend-ram,size=64M,id=m1 " + "-numa node,nodeid=0,memdev=m0 " + "-numa node,nodeid=1,memdev=m1,initiator=0 " + "-numa cpu,node-id=0,socket-id=0 " + "-numa cpu,node-id=0,socket-id=1", + data ? (char *)data : ""); + + /* Can't store the compressed latency */ + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," + " 'latency': 1 } }"))); /* 1 ns */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 1," + " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," + " 'latency': 65535 } }"))); /* 65535 ns */ + + /* Test the 0 input (bandwidth not provided) */ + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," + " 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," + " 'bandwidth': 0 } }"))); /* 0 MB/s */ + /* Fail: bandwidth should be provided before memory side cache attributes */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," + " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," + " 'line': 8 } }"))); + + /* Can't store the compressed bandwidth */ + g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," + " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 1," + " 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," + " 'bandwidth': 68718428160 } }"))); /* 65535 MB/s */ + + /* let machine initialization to complete and run */ + g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, + "{ 'execute': 'x-exit-preconfig' }"))); + qtest_qmp_eventwait(qs, "RESUME"); + + qtest_quit(qs); +} + int main(int argc, char **argv) { const char *args = NULL; @@ -346,6 +556,9 @@ int main(int argc, char **argv) if (!strcmp(arch, "i386") || !strcmp(arch, "x86_64")) { qtest_add_data_func("/numa/pc/cpu/explicit", args, pc_numa_cpu); qtest_add_data_func("/numa/pc/dynamic/cpu", args, pc_dynamic_cpu_cfg); + qtest_add_data_func("/numa/pc/hmat/build", args, pc_hmat_build_cfg); + qtest_add_data_func("/numa/pc/hmat/off", args, pc_hmat_off_cfg); + qtest_add_data_func("/numa/pc/hmat/erange", args, pc_hmat_erange_cfg); } if (!strcmp(arch, "ppc64")) { |