92 files changed, 3034 insertions(+), 566 deletions(-)
@@ -305,7 +305,7 @@ endif @echo 'Test targets:' $(call print-help,check,Run all tests (check-help for details)) $(call print-help,bench,Run all benchmarks) - $(call print-help,docker,Help about targets running tests inside containers) + $(call print-help,docker-help,Help about targets running tests inside containers) $(call print-help,vm-help,Help about targets running tests inside VM) @echo '' @echo 'Documentation targets:' diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 5164d838b9..e72a19aaf8 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -123,10 +123,6 @@ struct KVMState KVMMemoryListener memory_listener; QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus; - /* memory encryption */ - void *memcrypt_handle; - int (*memcrypt_encrypt_data)(void *handle, uint8_t *ptr, uint64_t len); - /* For "info mtree -f" to tell if an MR is registered in KVM */ int nr_as; struct KVMAs { @@ -225,26 +221,6 @@ int kvm_get_max_memslots(void) return s->nr_slots; } -bool kvm_memcrypt_enabled(void) -{ - if (kvm_state && kvm_state->memcrypt_handle) { - return true; - } - - return false; -} - -int kvm_memcrypt_encrypt_data(uint8_t *ptr, uint64_t len) -{ - if (kvm_state->memcrypt_handle && - kvm_state->memcrypt_encrypt_data) { - return kvm_state->memcrypt_encrypt_data(kvm_state->memcrypt_handle, - ptr, len); - } - - return 1; -} - /* Called with KVMMemoryListener.slots_lock held */ static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml) { @@ -2204,20 +2180,6 @@ static int kvm_init(MachineState *ms) kvm_state = s; - /* - * if memory encryption object is specified then initialize the memory - * encryption context. - */ - if (ms->memory_encryption) { - kvm_state->memcrypt_handle = sev_guest_init(ms->memory_encryption); - if (!kvm_state->memcrypt_handle) { - ret = -1; - goto err; - } - - kvm_state->memcrypt_encrypt_data = sev_encrypt_data; - } - ret = kvm_arch_init(ms, s); if (ret < 0) { goto err; diff --git a/accel/kvm/sev-stub.c b/accel/kvm/sev-stub.c index 4f97452585..9587d1b2a3 100644 --- a/accel/kvm/sev-stub.c +++ b/accel/kvm/sev-stub.c @@ -15,12 +15,8 @@ #include "qemu-common.h" #include "sysemu/sev.h" -int sev_encrypt_data(void *handle, uint8_t *ptr, uint64_t len) +int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) { - abort(); -} - -void *sev_guest_init(const char *id) -{ - return NULL; + /* If we get here, cgs must be some non-SEV thing */ + return 0; } diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c index 680e099463..0f17acfac0 100644 --- a/accel/stubs/kvm-stub.c +++ b/accel/stubs/kvm-stub.c @@ -81,16 +81,6 @@ int kvm_on_sigbus(int code, void *addr) return 1; } -bool kvm_memcrypt_enabled(void) -{ - return false; -} - -int kvm_memcrypt_encrypt_data(uint8_t *ptr, uint64_t len) -{ - return 1; -} - #ifndef CONFIG_USER_ONLY int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) { diff --git a/backends/confidential-guest-support.c b/backends/confidential-guest-support.c new file mode 100644 index 0000000000..052fde8db0 --- /dev/null +++ b/backends/confidential-guest-support.c @@ -0,0 +1,33 @@ +/* + * QEMU Confidential Guest support + * + * Copyright Red Hat. + * + * Authors: + * David Gibson <david@gibson.dropbear.id.au> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. 
+ * + */ + +#include "qemu/osdep.h" + +#include "exec/confidential-guest-support.h" + +OBJECT_DEFINE_ABSTRACT_TYPE(ConfidentialGuestSupport, + confidential_guest_support, + CONFIDENTIAL_GUEST_SUPPORT, + OBJECT) + +static void confidential_guest_support_class_init(ObjectClass *oc, void *data) +{ +} + +static void confidential_guest_support_init(Object *obj) +{ +} + +static void confidential_guest_support_finalize(Object *obj) +{ +} diff --git a/backends/meson.build b/backends/meson.build index 484456ece7..d4221831fc 100644 --- a/backends/meson.build +++ b/backends/meson.build @@ -6,6 +6,7 @@ softmmu_ss.add([files( 'rng-builtin.c', 'rng-egd.c', 'rng.c', + 'confidential-guest-support.c', ), numa]) softmmu_ss.add(when: 'CONFIG_POSIX', if_true: files('rng-random.c')) diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index afd75ab628..75d7fa9510 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -900,10 +900,11 @@ void hmp_info_snapshots(Monitor *mon, const QDict *qdict) ImageEntry *image_entry, *next_ie; SnapshotEntry *snapshot_entry; + Error *err = NULL; - bs = bdrv_all_find_vmstate_bs(); + bs = bdrv_all_find_vmstate_bs(NULL, false, NULL, &err); if (!bs) { - monitor_printf(mon, "No available block device supports snapshots\n"); + error_report_err(err); return; } aio_context = bdrv_get_aio_context(bs); @@ -953,7 +954,7 @@ void hmp_info_snapshots(Monitor *mon, const QDict *qdict) total = 0; for (i = 0; i < nb_sns; i++) { SnapshotEntry *next_sn; - if (bdrv_all_find_snapshot(sn_tab[i].name, &bs1) == 0) { + if (bdrv_all_has_snapshot(sn_tab[i].name, false, NULL, NULL) == 1) { global_snapshots[total] = i; total++; QTAILQ_FOREACH(image_entry, &image_list, next) { diff --git a/block/snapshot.c b/block/snapshot.c index a2bf3a54eb..e8ae9a28c1 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -447,6 +447,41 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs, return ret; } + +static int bdrv_all_get_snapshot_devices(bool has_devices, strList *devices, + GList **all_bdrvs, + Error **errp) +{ + g_autoptr(GList) bdrvs = NULL; + + if (has_devices) { + if (!devices) { + error_setg(errp, "At least one device is required for snapshot"); + return -1; + } + + while (devices) { + BlockDriverState *bs = bdrv_find_node(devices->value); + if (!bs) { + error_setg(errp, "No block device node '%s'", devices->value); + return -1; + } + bdrvs = g_list_append(bdrvs, bs); + devices = devices->next; + } + } else { + BlockDriverState *bs; + BdrvNextIterator it; + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + bdrvs = g_list_append(bdrvs, bs); + } + } + + *all_bdrvs = g_steal_pointer(&bdrvs); + return 0; +} + + static bool bdrv_all_snapshots_includes_bs(BlockDriverState *bs) { if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { @@ -462,44 +497,59 @@ static bool bdrv_all_snapshots_includes_bs(BlockDriverState *bs) * These functions will properly handle dataplane (take aio_context_acquire * when appropriate for appropriate block drivers) */ -bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs) +bool bdrv_all_can_snapshot(bool has_devices, strList *devices, + Error **errp) { - bool ok = true; - BlockDriverState *bs; - BdrvNextIterator it; + g_autoptr(GList) bdrvs = NULL; + GList *iterbdrvs; + + if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) { + return false; + } - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + iterbdrvs = bdrvs; + while (iterbdrvs) { + BlockDriverState *bs = iterbdrvs->data; 
AioContext *ctx = bdrv_get_aio_context(bs); + bool ok = true; aio_context_acquire(ctx); - if (bdrv_all_snapshots_includes_bs(bs)) { + if (devices || bdrv_all_snapshots_includes_bs(bs)) { ok = bdrv_can_snapshot(bs); } aio_context_release(ctx); if (!ok) { - bdrv_next_cleanup(&it); - goto fail; + error_setg(errp, "Device '%s' is writable but does not support " + "snapshots", bdrv_get_device_or_node_name(bs)); + return false; } + + iterbdrvs = iterbdrvs->next; } -fail: - *first_bad_bs = bs; - return ok; + return true; } -int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs, +int bdrv_all_delete_snapshot(const char *name, + bool has_devices, strList *devices, Error **errp) { - int ret = 0; - BlockDriverState *bs; - BdrvNextIterator it; - QEMUSnapshotInfo sn1, *snapshot = &sn1; + g_autoptr(GList) bdrvs = NULL; + GList *iterbdrvs; + + if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) { + return -1; + } - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + iterbdrvs = bdrvs; + while (iterbdrvs) { + BlockDriverState *bs = iterbdrvs->data; AioContext *ctx = bdrv_get_aio_context(bs); + QEMUSnapshotInfo sn1, *snapshot = &sn1; + int ret = 0; aio_context_acquire(ctx); - if (bdrv_all_snapshots_includes_bs(bs) && + if ((devices || bdrv_all_snapshots_includes_bs(bs)) && bdrv_snapshot_find(bs, snapshot, name) >= 0) { ret = bdrv_snapshot_delete(bs, snapshot->id_str, @@ -507,118 +557,180 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs, } aio_context_release(ctx); if (ret < 0) { - bdrv_next_cleanup(&it); - goto fail; + error_prepend(errp, "Could not delete snapshot '%s' on '%s': ", + name, bdrv_get_device_or_node_name(bs)); + return -1; } + + iterbdrvs = iterbdrvs->next; } -fail: - *first_bad_bs = bs; - return ret; + return 0; } -int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs, +int bdrv_all_goto_snapshot(const char *name, + bool has_devices, strList *devices, Error **errp) { - int ret = 0; - BlockDriverState *bs; - BdrvNextIterator it; + g_autoptr(GList) bdrvs = NULL; + GList *iterbdrvs; - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) { + return -1; + } + + iterbdrvs = bdrvs; + while (iterbdrvs) { + BlockDriverState *bs = iterbdrvs->data; AioContext *ctx = bdrv_get_aio_context(bs); + int ret = 0; aio_context_acquire(ctx); - if (bdrv_all_snapshots_includes_bs(bs)) { + if (devices || bdrv_all_snapshots_includes_bs(bs)) { ret = bdrv_snapshot_goto(bs, name, errp); } aio_context_release(ctx); if (ret < 0) { - bdrv_next_cleanup(&it); - goto fail; + error_prepend(errp, "Could not load snapshot '%s' on '%s': ", + name, bdrv_get_device_or_node_name(bs)); + return -1; } + + iterbdrvs = iterbdrvs->next; } -fail: - *first_bad_bs = bs; - return ret; + return 0; } -int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs) +int bdrv_all_has_snapshot(const char *name, + bool has_devices, strList *devices, + Error **errp) { - QEMUSnapshotInfo sn; - int err = 0; - BlockDriverState *bs; - BdrvNextIterator it; + g_autoptr(GList) bdrvs = NULL; + GList *iterbdrvs; - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) { + return -1; + } + + iterbdrvs = bdrvs; + while (iterbdrvs) { + BlockDriverState *bs = iterbdrvs->data; AioContext *ctx = bdrv_get_aio_context(bs); + QEMUSnapshotInfo sn; + int ret = 0; aio_context_acquire(ctx); - if 
(bdrv_all_snapshots_includes_bs(bs)) { - err = bdrv_snapshot_find(bs, &sn, name); + if (devices || bdrv_all_snapshots_includes_bs(bs)) { + ret = bdrv_snapshot_find(bs, &sn, name); } aio_context_release(ctx); - if (err < 0) { - bdrv_next_cleanup(&it); - goto fail; + if (ret < 0) { + if (ret == -ENOENT) { + return 0; + } else { + error_setg_errno(errp, errno, + "Could not check snapshot '%s' on '%s'", + name, bdrv_get_device_or_node_name(bs)); + return -1; + } } + + iterbdrvs = iterbdrvs->next; } -fail: - *first_bad_bs = bs; - return err; + return 1; } int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn, BlockDriverState *vm_state_bs, uint64_t vm_state_size, - BlockDriverState **first_bad_bs) + bool has_devices, strList *devices, + Error **errp) { - int err = 0; - BlockDriverState *bs; - BdrvNextIterator it; + g_autoptr(GList) bdrvs = NULL; + GList *iterbdrvs; + + if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) { + return -1; + } - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + iterbdrvs = bdrvs; + while (iterbdrvs) { + BlockDriverState *bs = iterbdrvs->data; AioContext *ctx = bdrv_get_aio_context(bs); + int ret = 0; aio_context_acquire(ctx); if (bs == vm_state_bs) { sn->vm_state_size = vm_state_size; - err = bdrv_snapshot_create(bs, sn); - } else if (bdrv_all_snapshots_includes_bs(bs)) { + ret = bdrv_snapshot_create(bs, sn); + } else if (devices || bdrv_all_snapshots_includes_bs(bs)) { sn->vm_state_size = 0; - err = bdrv_snapshot_create(bs, sn); + ret = bdrv_snapshot_create(bs, sn); } aio_context_release(ctx); - if (err < 0) { - bdrv_next_cleanup(&it); - goto fail; + if (ret < 0) { + error_setg(errp, "Could not create snapshot '%s' on '%s'", + sn->name, bdrv_get_device_or_node_name(bs)); + return -1; } + + iterbdrvs = iterbdrvs->next; } -fail: - *first_bad_bs = bs; - return err; + return 0; } -BlockDriverState *bdrv_all_find_vmstate_bs(void) + +BlockDriverState *bdrv_all_find_vmstate_bs(const char *vmstate_bs, + bool has_devices, strList *devices, + Error **errp) { - BlockDriverState *bs; - BdrvNextIterator it; + g_autoptr(GList) bdrvs = NULL; + GList *iterbdrvs; + + if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) { + return NULL; + } - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + iterbdrvs = bdrvs; + while (iterbdrvs) { + BlockDriverState *bs = iterbdrvs->data; AioContext *ctx = bdrv_get_aio_context(bs); - bool found; + bool found = false; aio_context_acquire(ctx); - found = bdrv_all_snapshots_includes_bs(bs) && bdrv_can_snapshot(bs); + found = (devices || bdrv_all_snapshots_includes_bs(bs)) && + bdrv_can_snapshot(bs); aio_context_release(ctx); - if (found) { - bdrv_next_cleanup(&it); - break; + if (vmstate_bs) { + if (g_str_equal(vmstate_bs, + bdrv_get_node_name(bs))) { + if (found) { + return bs; + } else { + error_setg(errp, + "vmstate block device '%s' does not support snapshots", + vmstate_bs); + return NULL; + } + } + } else if (found) { + return bs; } + + iterbdrvs = iterbdrvs->next; + } + + if (vmstate_bs) { + error_setg(errp, + "vmstate block device '%s' does not exist", vmstate_bs); + } else { + error_setg(errp, + "no block device can store vmstate for snapshot"); } - return bs; + return NULL; } @@ -198,8 +198,8 @@ has() { } version_ge () { - local_ver1=`echo $1 | tr . ' '` - local_ver2=`echo $2 | tr . ' '` + local_ver1=$(expr "$1" : '\([0-9.]*\)' | tr . ' ') + local_ver2=$(echo "$2" | tr . 
' ') while true; do set x $local_ver1 local_first=${2-0} @@ -6115,7 +6115,7 @@ fi if test -n "$gdb_bin"; then gdb_version=$($gdb_bin --version | head -n 1) - if version_ge ${gdb_version##* } 8.3.1; then + if version_ge ${gdb_version##* } 9.1; then echo "HAVE_GDB_BIN=$gdb_bin" >> $config_host_mak fi fi diff --git a/docs/amd-memory-encryption.txt b/docs/amd-memory-encryption.txt index 80b8eb00e9..145896aec7 100644 --- a/docs/amd-memory-encryption.txt +++ b/docs/amd-memory-encryption.txt @@ -73,7 +73,7 @@ complete flow chart. To launch a SEV guest # ${QEMU} \ - -machine ...,memory-encryption=sev0 \ + -machine ...,confidential-guest-support=sev0 \ -object sev-guest,id=sev0,cbitpos=47,reduced-phys-bits=1 Debugging diff --git a/docs/confidential-guest-support.txt b/docs/confidential-guest-support.txt new file mode 100644 index 0000000000..71d07ba57a --- /dev/null +++ b/docs/confidential-guest-support.txt @@ -0,0 +1,49 @@ +Confidential Guest Support +========================== + +Traditionally, hypervisors such as QEMU have complete access to a +guest's memory and other state, meaning that a compromised hypervisor +can compromise any of its guests. A number of platforms have added +mechanisms in hardware and/or firmware which give guests at least some +protection from a compromised hypervisor. This is obviously +especially desirable for public cloud environments. + +These mechanisms have different names and different modes of +operation, but are often referred to as Secure Guests or Confidential +Guests. We use the term "Confidential Guest Support" to distinguish +this from other aspects of guest security (such as security against +attacks from other guests, or from network sources). + +Running a Confidential Guest +---------------------------- + +To run a confidential guest you need to add two command line parameters: + +1. Use "-object" to create a "confidential guest support" object. The + type and parameters will vary with the specific mechanism to be + used +2. Set the "confidential-guest-support" machine parameter to the ID of + the object from (1). + +Example (for AMD SEV):: + + qemu-system-x86_64 \ + <other parameters> \ + -machine ...,confidential-guest-support=sev0 \ + -object sev-guest,id=sev0,cbitpos=47,reduced-phys-bits=1 + +Supported mechanisms +-------------------- + +Currently supported confidential guest mechanisms are: + +AMD Secure Encrypted Virtualization (SEV) + docs/amd-memory-encryption.txt + +POWER Protected Execution Facility (PEF) + docs/papr-pef.txt + +s390x Protected Virtualization (PV) + docs/system/s390x/protvirt.rst + +Other mechanisms may be supported in future. diff --git a/docs/papr-pef.txt b/docs/papr-pef.txt new file mode 100644 index 0000000000..72550e9bf8 --- /dev/null +++ b/docs/papr-pef.txt @@ -0,0 +1,30 @@ +POWER (PAPR) Protected Execution Facility (PEF) +=============================================== + +Protected Execution Facility (PEF), also known as Secure Guest support +is a feature found on IBM POWER9 and POWER10 processors. + +If a suitable firmware including an Ultravisor is installed, it adds +an extra memory protection mode to the CPU. The ultravisor manages a +pool of secure memory which cannot be accessed by the hypervisor. + +When this feature is enabled in QEMU, a guest can use ultracalls to +enter "secure mode". This transfers most of its memory to secure +memory, where it cannot be eavesdropped by a compromised hypervisor. 
+ +Launching +--------- + +To launch a guest which will be permitted to enter PEF secure mode: + +# ${QEMU} \ + -object pef-guest,id=pef0 \ + -machine confidential-guest-support=pef0 \ + ... + +Live Migration +---------------- + +Live migration is not yet implemented for PEF guests. For +consistency, we currently prevent migration if the PEF feature is +enabled, whether or not the guest has actually entered secure mode. diff --git a/docs/system/arm/versatile.rst b/docs/system/arm/versatile.rst index 51221c30a4..2ae792bac3 100644 --- a/docs/system/arm/versatile.rst +++ b/docs/system/arm/versatile.rst @@ -27,3 +27,37 @@ The Arm Versatile baseboard is emulated with the following devices: devices. - PL181 MultiMedia Card Interface with SD card. + +Booting a Linux kernel +---------------------- + +Building a current Linux kernel with ``versatile_defconfig`` should be +enough to get something running. Nowadays an out-of-tree build is +recommended (and also useful if you build a lot of different targets). +In the following example $BLD points to the build directory and $SRC +points to the root of the Linux source tree. You can drop $SRC if you +are running from there. + +.. code-block:: bash + + $ make O=$BLD -C $SRC ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- versatile_defconfig + $ make O=$BLD -C $SRC ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- + +You may want to enable some additional modules if you want to boot +something from the SCSI interface:: + + CONFIG_PCI=y + CONFIG_PCI_VERSATILE=y + CONFIG_SCSI=y + CONFIG_SCSI_SYM53C8XX_2=y + +You can then boot with a command line like: + +.. code-block:: bash + + $ qemu-system-arm -machine type=versatilepb \ + -serial mon:stdio \ + -drive if=scsi,driver=file,filename=debian-buster-armel-rootfs.ext4 \ + -kernel zImage \ + -dtb versatile-pb.dtb \ + -append "console=ttyAMA0 ro root=/dev/sda" diff --git a/docs/system/arm/vexpress.rst b/docs/system/arm/vexpress.rst index 7f1bcbef07..3e3839e923 100644 --- a/docs/system/arm/vexpress.rst +++ b/docs/system/arm/vexpress.rst @@ -58,3 +58,31 @@ Other differences between the hardware and the QEMU model: ``vexpress-a15``, and have IRQs from 40 upwards. If a dtb is provided on the command line then QEMU will edit it to include suitable entries describing these transports for the guest. + +Booting a Linux kernel +---------------------- + +Building a current Linux kernel with ``multi_v7_defconfig`` should be +enough to get something running. Nowadays an out-of-tree build is +recommended (and also useful if you build a lot of different targets). +In the following example $BLD points to the build directory and $SRC +points to the root of the Linux source tree. You can drop $SRC if you +are running from there. + +.. code-block:: bash + + $ make O=$BLD -C $SRC ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- multi_v7_defconfig + $ make O=$BLD -C $SRC ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- + +By default you will want to boot your rootfs off the sdcard interface. +Your rootfs will need to be padded to the right size. With a suitable +DTB you could also add devices to the virtio-mmio bus. + +.. 
code-block:: bash + + $ qemu-system-arm -cpu cortex-a15 -smp 4 -m 4096 \ + -machine type=vexpress-a15 -serial mon:stdio \ + -drive if=sd,driver=file,filename=armel-rootfs.ext4 \ + -kernel zImage \ + -dtb vexpress-v2p-ca15-tc1.dtb \ + -append "console=ttyAMA0 root=/dev/mmcblk0 ro" diff --git a/docs/system/s390x/protvirt.rst b/docs/system/s390x/protvirt.rst index 712974ad87..0f481043d9 100644 --- a/docs/system/s390x/protvirt.rst +++ b/docs/system/s390x/protvirt.rst @@ -22,15 +22,22 @@ If those requirements are met, the capability `KVM_CAP_S390_PROTECTED` will indicate that KVM can support PVMs on that LPAR. -QEMU Settings -------------- +Running a Protected Virtual Machine +----------------------------------- -To indicate to the VM that it can transition into protected mode, the +To run a PVM you will need to select a CPU model which includes the `Unpack facility` (stfle bit 161 represented by the feature -`unpack`/`S390_FEAT_UNPACK`) needs to be part of the cpu model of -the VM. +`unpack`/`S390_FEAT_UNPACK`), and add these options to the command line:: + + -object s390-pv-guest,id=pv0 \ + -machine confidential-guest-support=pv0 + +Adding these options will: + +* Ensure the `unpack` facility is available +* Enable the IOMMU by default for all I/O devices +* Initialize the PV mechanism -All I/O devices need to use the IOMMU. Passthrough (vfio) devices are currently not supported. Host huge page backings are not supported. However guests can use huge @@ -2245,7 +2245,6 @@ static void handle_query_xfer_auxv(GdbCmdContext *gdb_ctx, void *user_ctx) { TaskState *ts; unsigned long offset, len, saved_auxv, auxv_len; - const char *mem; if (gdb_ctx->num_params < 2) { put_packet("E22"); @@ -2257,8 +2256,8 @@ static void handle_query_xfer_auxv(GdbCmdContext *gdb_ctx, void *user_ctx) ts = gdbserver_state.c_cpu->opaque; saved_auxv = ts->info->saved_auxv; auxv_len = ts->info->auxv_len; - mem = (const char *)(saved_auxv + offset); - if (offset > auxv_len) { + + if (offset >= auxv_len) { put_packet("E00"); return; } @@ -2269,12 +2268,20 @@ static void handle_query_xfer_auxv(GdbCmdContext *gdb_ctx, void *user_ctx) if (len < auxv_len - offset) { g_string_assign(gdbserver_state.str_buf, "m"); - memtox(gdbserver_state.str_buf, mem, len); } else { g_string_assign(gdbserver_state.str_buf, "l"); - memtox(gdbserver_state.str_buf, mem, auxv_len - offset); + len = auxv_len - offset; + } + + g_byte_array_set_size(gdbserver_state.mem_buf, len); + if (target_memory_rw_debug(gdbserver_state.g_cpu, saved_auxv + offset, + gdbserver_state.mem_buf->data, len, false)) { + put_packet("E14"); + return; } + memtox(gdbserver_state.str_buf, + (const char *)gdbserver_state.mem_buf->data, len); put_packet_binary(gdbserver_state.str_buf->str, gdbserver_state.str_buf->len, true); } diff --git a/hw/core/machine.c b/hw/core/machine.c index 5d6163ab70..970046f438 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -32,6 +32,9 @@ #include "hw/mem/nvdimm.h" #include "migration/global_state.h" #include "migration/vmstate.h" +#include "exec/confidential-guest-support.h" +#include "hw/virtio/virtio.h" +#include "hw/virtio/virtio-pci.h" GlobalProperty hw_compat_5_2[] = {}; const size_t hw_compat_5_2_len = G_N_ELEMENTS(hw_compat_5_2); @@ -427,24 +430,37 @@ static char *machine_get_memory_encryption(Object *obj, Error **errp) { MachineState *ms = MACHINE(obj); - return g_strdup(ms->memory_encryption); + if (ms->cgs) { + return g_strdup(object_get_canonical_path_component(OBJECT(ms->cgs))); + } + + return NULL; } static void 
machine_set_memory_encryption(Object *obj, const char *value, Error **errp) { - MachineState *ms = MACHINE(obj); + Object *cgs = + object_resolve_path_component(object_get_objects_root(), value); + + if (!cgs) { + error_setg(errp, "No such memory encryption object '%s'", value); + return; + } - g_free(ms->memory_encryption); - ms->memory_encryption = g_strdup(value); + object_property_set_link(obj, "confidential-guest-support", cgs, errp); +} +static void machine_check_confidential_guest_support(const Object *obj, + const char *name, + Object *new_target, + Error **errp) +{ /* - * With memory encryption, the host can't see the real contents of RAM, - * so there's no point in it trying to merge areas. + * So far the only constraint is that the target has the + * TYPE_CONFIDENTIAL_GUEST_SUPPORT interface, and that's checked + * by the QOM core */ - if (value) { - machine_set_mem_merge(obj, false, errp); - } } static bool machine_get_nvdimm(Object *obj, Error **errp) @@ -844,6 +860,15 @@ static void machine_class_init(ObjectClass *oc, void *data) object_class_property_set_description(oc, "suppress-vmdesc", "Set on to disable self-describing migration"); + object_class_property_add_link(oc, "confidential-guest-support", + TYPE_CONFIDENTIAL_GUEST_SUPPORT, + offsetof(MachineState, cgs), + machine_check_confidential_guest_support, + OBJ_PROP_LINK_STRONG); + object_class_property_set_description(oc, "confidential-guest-support", + "Set confidential guest scheme to support"); + + /* For compatibility */ object_class_property_add_str(oc, "memory-encryption", machine_get_memory_encryption, machine_set_memory_encryption); object_class_property_set_description(oc, "memory-encryption", @@ -1166,6 +1191,26 @@ void machine_run_board_init(MachineState *machine) cc->deprecation_note); } + if (machine->cgs) { + /* + * With confidential guests, the host can't see the real + * contents of RAM, so there's no point in it trying to merge + * areas. + */ + machine_set_mem_merge(OBJECT(machine), false, &error_abort); + + /* + * Virtio devices can't count on directly accessing guest + * memory, so they need iommu_platform=on to use normal DMA + * mechanisms. That requires also disabling legacy virtio + * support for those virtio pci devices which allow it. 
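 * (Illustrative aside, not part of the patch; the device line is only an
 * example.)  Registering the two sugar properties just below has the same
 * effect as a user writing, say,
 *
 *   -device virtio-blk-pci,disable-legacy=on,iommu_platform=on,...
 *
 * on each virtio-pci device by hand; with a confidential-guest-support
 * object configured it simply becomes the default.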
+ */ + object_register_sugar_prop(TYPE_VIRTIO_PCI, "disable-legacy", + "on", true); + object_register_sugar_prop(TYPE_VIRTIO_DEVICE, "iommu_platform", + "on", false); + } + machine_class->init(machine); phase_advance(PHASE_MACHINE_INITIALIZED); } diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c index 92e90ff013..11172214f1 100644 --- a/hw/i386/pc_sysfw.c +++ b/hw/i386/pc_sysfw.c @@ -38,6 +38,7 @@ #include "sysemu/sysemu.h" #include "hw/block/flash.h" #include "sysemu/kvm.h" +#include "sysemu/sev.h" #define FLASH_SECTOR_SIZE 4096 @@ -147,7 +148,7 @@ static void pc_system_flash_map(PCMachineState *pcms, PFlashCFI01 *system_flash; MemoryRegion *flash_mem; void *flash_ptr; - int ret, flash_size; + int flash_size; assert(PC_MACHINE_GET_CLASS(pcms)->pci_enabled); @@ -191,16 +192,10 @@ static void pc_system_flash_map(PCMachineState *pcms, flash_mem = pflash_cfi01_get_memory(system_flash); pc_isa_bios_init(rom_memory, flash_mem, size); - /* Encrypt the pflash boot ROM */ - if (kvm_memcrypt_enabled()) { - flash_ptr = memory_region_get_ram_ptr(flash_mem); - flash_size = memory_region_size(flash_mem); - ret = kvm_memcrypt_encrypt_data(flash_ptr, flash_size); - if (ret) { - error_report("failed to encrypt pflash rom"); - exit(1); - } - } + /* Encrypt the pflash boot ROM, if necessary */ + flash_ptr = memory_region_get_ram_ptr(flash_mem); + flash_size = memory_region_size(flash_mem); + sev_encrypt_flash(flash_ptr, flash_size, &error_fatal); } } } diff --git a/hw/ppc/meson.build b/hw/ppc/meson.build index ffa2ec37fa..218631c883 100644 --- a/hw/ppc/meson.build +++ b/hw/ppc/meson.build @@ -27,6 +27,7 @@ ppc_ss.add(when: 'CONFIG_PSERIES', if_true: files( 'spapr_nvdimm.c', 'spapr_rtas_ddw.c', 'spapr_numa.c', + 'pef.c', )) ppc_ss.add(when: 'CONFIG_SPAPR_RNG', if_true: files('spapr_rng.c')) ppc_ss.add(when: ['CONFIG_PSERIES', 'CONFIG_LINUX'], if_true: files( diff --git a/hw/ppc/pef.c b/hw/ppc/pef.c new file mode 100644 index 0000000000..573be3ed79 --- /dev/null +++ b/hw/ppc/pef.c @@ -0,0 +1,140 @@ +/* + * PEF (Protected Execution Facility) for POWER support + * + * Copyright Red Hat. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" + +#include "qapi/error.h" +#include "qom/object_interfaces.h" +#include "sysemu/kvm.h" +#include "migration/blocker.h" +#include "exec/confidential-guest-support.h" +#include "hw/ppc/pef.h" + +#define TYPE_PEF_GUEST "pef-guest" +OBJECT_DECLARE_SIMPLE_TYPE(PefGuest, PEF_GUEST) + +typedef struct PefGuest PefGuest; +typedef struct PefGuestClass PefGuestClass; + +struct PefGuestClass { + ConfidentialGuestSupportClass parent_class; +}; + +/** + * PefGuest: + * + * The PefGuest object is used for creating and managing a PEF + * guest. 
+ * + * # $QEMU \ + * -object pef-guest,id=pef0 \ + * -machine ...,confidential-guest-support=pef0 + */ +struct PefGuest { + ConfidentialGuestSupport parent_obj; +}; + +static int kvmppc_svm_init(Error **errp) +{ +#ifdef CONFIG_KVM + static Error *pef_mig_blocker; + + if (!kvm_check_extension(kvm_state, KVM_CAP_PPC_SECURE_GUEST)) { + error_setg(errp, + "KVM implementation does not support Secure VMs (is an ultravisor running?)"); + return -1; + } else { + int ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_PPC_SECURE_GUEST, 0, 1); + + if (ret < 0) { + error_setg(errp, + "Error enabling PEF with KVM"); + return -1; + } + } + + /* add migration blocker */ + error_setg(&pef_mig_blocker, "PEF: Migration is not implemented"); + /* NB: This can fail if --only-migratable is used */ + migrate_add_blocker(pef_mig_blocker, &error_fatal); + + return 0; +#else + g_assert_not_reached(); +#endif +} + +/* + * Don't set error if KVM_PPC_SVM_OFF ioctl is invoked on kernels + * that don't support this ioctl. + */ +static int kvmppc_svm_off(Error **errp) +{ +#ifdef CONFIG_KVM + int rc; + + rc = kvm_vm_ioctl(KVM_STATE(current_accel()), KVM_PPC_SVM_OFF); + if (rc && rc != -ENOTTY) { + error_setg_errno(errp, -rc, "KVM_PPC_SVM_OFF ioctl failed"); + return rc; + } + return 0; +#else + g_assert_not_reached(); +#endif +} + +int pef_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) +{ + if (!object_dynamic_cast(OBJECT(cgs), TYPE_PEF_GUEST)) { + return 0; + } + + if (!kvm_enabled()) { + error_setg(errp, "PEF requires KVM"); + return -1; + } + + return kvmppc_svm_init(errp); +} + +int pef_kvm_reset(ConfidentialGuestSupport *cgs, Error **errp) +{ + if (!object_dynamic_cast(OBJECT(cgs), TYPE_PEF_GUEST)) { + return 0; + } + + /* + * If we don't have KVM we should never have been able to + * initialize PEF, so we should never get this far + */ + assert(kvm_enabled()); + + return kvmppc_svm_off(errp); +} + +OBJECT_DEFINE_TYPE_WITH_INTERFACES(PefGuest, + pef_guest, + PEF_GUEST, + CONFIDENTIAL_GUEST_SUPPORT, + { TYPE_USER_CREATABLE }, + { NULL }) + +static void pef_guest_class_init(ObjectClass *oc, void *data) +{ +} + +static void pef_guest_init(Object *obj) +{ +} + +static void pef_guest_finalize(Object *obj) +{ +} diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 6c47466fc2..612356e9ec 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -83,6 +83,7 @@ #include "hw/ppc/spapr_tpm_proxy.h" #include "hw/ppc/spapr_nvdimm.h" #include "hw/ppc/spapr_numa.h" +#include "hw/ppc/pef.h" #include "monitor/monitor.h" @@ -1574,7 +1575,7 @@ static void spapr_machine_reset(MachineState *machine) void *fdt; int rc; - kvmppc_svm_off(&error_fatal); + pef_kvm_reset(machine->cgs, &error_fatal); spapr_caps_apply(spapr); first_ppc_cpu = POWERPC_CPU(first_cpu); @@ -2658,6 +2659,11 @@ static void spapr_machine_init(MachineState *machine) char *filename; Error *resize_hpt_err = NULL; + /* + * if Secure VM (PEF) support is configured, then initialize it + */ + pef_kvm_init(machine->cgs, &error_fatal); + msi_nonbroken = true; QLIST_INIT(&spapr->phbs); diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 76d7c91e9c..1b2b940606 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -2173,6 +2173,16 @@ static int spapr_pci_pre_save(void *opaque) return 0; } +static int spapr_pci_post_save(void *opaque) +{ + SpaprPhbState *sphb = opaque; + + g_free(sphb->msi_devs); + sphb->msi_devs = NULL; + sphb->msi_devs_num = 0; + return 0; +} + static int spapr_pci_post_load(void *opaque, int version_id) { SpaprPhbState *sphb = opaque; @@ -2205,6 +2215,7 @@ 
static const VMStateDescription vmstate_spapr_pci = { .version_id = 2, .minimum_version_id = 2, .pre_save = spapr_pci_pre_save, + .post_save = spapr_pci_post_save, .post_load = spapr_pci_post_load, .fields = (VMStateField[]) { VMSTATE_UINT64_EQUAL(buid, SpaprPhbState, NULL), diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c index ab3a2482aa..93eccfc05d 100644 --- a/hw/s390x/pv.c +++ b/hw/s390x/pv.c @@ -14,8 +14,11 @@ #include <linux/kvm.h> #include "cpu.h" +#include "qapi/error.h" #include "qemu/error-report.h" #include "sysemu/kvm.h" +#include "qom/object_interfaces.h" +#include "exec/confidential-guest-support.h" #include "hw/s390x/ipl.h" #include "hw/s390x/pv.h" @@ -111,3 +114,62 @@ void s390_pv_inject_reset_error(CPUState *cs) /* Report that we are unable to enter protected mode */ env->regs[r1 + 1] = DIAG_308_RC_INVAL_FOR_PV; } + +#define TYPE_S390_PV_GUEST "s390-pv-guest" +OBJECT_DECLARE_SIMPLE_TYPE(S390PVGuest, S390_PV_GUEST) + +/** + * S390PVGuest: + * + * The S390PVGuest object is basically a dummy used to tell the + * confidential guest support system to use s390's PV mechanism. + * + * # $QEMU \ + * -object s390-pv-guest,id=pv0 \ + * -machine ...,confidential-guest-support=pv0 + */ +struct S390PVGuest { + ConfidentialGuestSupport parent_obj; +}; + +typedef struct S390PVGuestClass S390PVGuestClass; + +struct S390PVGuestClass { + ConfidentialGuestSupportClass parent_class; +}; + +int s390_pv_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) +{ + if (!object_dynamic_cast(OBJECT(cgs), TYPE_S390_PV_GUEST)) { + return 0; + } + + if (!s390_has_feat(S390_FEAT_UNPACK)) { + error_setg(errp, + "CPU model does not support Protected Virtualization"); + return -1; + } + + cgs->ready = true; + + return 0; +} + +OBJECT_DEFINE_TYPE_WITH_INTERFACES(S390PVGuest, + s390_pv_guest, + S390_PV_GUEST, + CONFIDENTIAL_GUEST_SUPPORT, + { TYPE_USER_CREATABLE }, + { NULL }) + +static void s390_pv_guest_class_init(ObjectClass *oc, void *data) +{ +} + +static void s390_pv_guest_init(Object *obj) +{ +} + +static void s390_pv_guest_finalize(Object *obj) +{ +} diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c index a2d9a79c84..2972b607f3 100644 --- a/hw/s390x/s390-virtio-ccw.c +++ b/hw/s390x/s390-virtio-ccw.c @@ -250,6 +250,9 @@ static void ccw_init(MachineState *machine) /* init CPUs (incl. CPU model) early so s390_has_feature() works */ s390_init_cpus(machine); + /* Need CPU model to be determined before we can set up PV */ + s390_pv_init(machine->cgs, &error_fatal); + s390_flic_init(); /* init the SIGP facility */ diff --git a/include/block/snapshot.h b/include/block/snapshot.h index b0fe42993d..940345692f 100644 --- a/include/block/snapshot.h +++ b/include/block/snapshot.h @@ -25,7 +25,7 @@ #ifndef SNAPSHOT_H #define SNAPSHOT_H - +#include "qapi/qapi-builtin-types.h" #define SNAPSHOT_OPT_BASE "snapshot." 
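/*
 * Illustrative usage sketch, not part of the patch, for the reworked
 * bdrv_all_* interface declared below: callers now pass an optional QAPI
 * strList of node names instead of receiving a "first bad BDS" pointer.
 * The node name "disk0" is hypothetical.
 */
    strList *devices = NULL;
    Error *local_err = NULL;

    QAPI_LIST_PREPEND(devices, g_strdup("disk0"));
    if (!bdrv_all_can_snapshot(true, devices, &local_err)) {
        error_report_err(local_err);
    }
    qapi_free_strList(devices);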
#define SNAPSHOT_OPT_ID "snapshot.id" @@ -77,17 +77,26 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs, * These functions will properly handle dataplane (take aio_context_acquire * when appropriate for appropriate block drivers */ -bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs); -int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bsd_bs, +bool bdrv_all_can_snapshot(bool has_devices, strList *devices, + Error **errp); +int bdrv_all_delete_snapshot(const char *name, + bool has_devices, strList *devices, Error **errp); -int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs, +int bdrv_all_goto_snapshot(const char *name, + bool has_devices, strList *devices, Error **errp); -int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs); +int bdrv_all_has_snapshot(const char *name, + bool has_devices, strList *devices, + Error **errp); int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn, BlockDriverState *vm_state_bs, uint64_t vm_state_size, - BlockDriverState **first_bad_bs); + bool has_devices, + strList *devices, + Error **errp); -BlockDriverState *bdrv_all_find_vmstate_bs(void); +BlockDriverState *bdrv_all_find_vmstate_bs(const char *vmstate_bs, + bool has_devices, strList *devices, + Error **errp); #endif diff --git a/include/exec/confidential-guest-support.h b/include/exec/confidential-guest-support.h new file mode 100644 index 0000000000..ba2dd4b5df --- /dev/null +++ b/include/exec/confidential-guest-support.h @@ -0,0 +1,62 @@ +/* + * QEMU Confidential Guest support + * This interface describes the common pieces between various + * schemes for protecting guest memory or other state against a + * compromised hypervisor. This includes memory encryption (AMD's + * SEV and Intel's MKTME) or special protection modes (PEF on POWER, + * or PV on s390x). + * + * Copyright Red Hat. + * + * Authors: + * David Gibson <david@gibson.dropbear.id.au> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + * + */ +#ifndef QEMU_CONFIDENTIAL_GUEST_SUPPORT_H +#define QEMU_CONFIDENTIAL_GUEST_SUPPORT_H + +#ifndef CONFIG_USER_ONLY + +#include "qom/object.h" + +#define TYPE_CONFIDENTIAL_GUEST_SUPPORT "confidential-guest-support" +OBJECT_DECLARE_SIMPLE_TYPE(ConfidentialGuestSupport, CONFIDENTIAL_GUEST_SUPPORT) + +struct ConfidentialGuestSupport { + Object parent; + + /* + * ready: flag set by CGS initialization code once it's ready to + * start executing instructions in a potentially-secure + * guest + * + * The definition here is a bit fuzzy, because this is essentially + * part of a self-sanity-check, rather than a strict mechanism. + * + * It's not feasible to have a single point in the common machine + * init path to configure confidential guest support, because + * different mechanisms have different interdependencies requiring + * initialization in different places, often in arch or machine + * type specific code. It's also usually not possible to check + * for invalid configurations until that initialization code. + * That means it would be very easy to have a bug allowing CGS + * init to be bypassed entirely in certain configurations. + * + * Silently ignoring a requested security feature would be bad, so + * to avoid that we check late in init that this 'ready' flag is + * set if CGS was requested. If the CGS init hasn't happened, and + * so 'ready' is not set, we'll abort. 
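 *
 * (Illustrative sketch, not part of the patch; TYPE_MYMECH_GUEST is a
 * hypothetical mechanism.)  A per-mechanism init hook typically follows
 * this shape, as the SEV, PEF and s390 PV hooks in this series do:
 *
 *   int mymech_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
 *   {
 *       if (!object_dynamic_cast(OBJECT(cgs), TYPE_MYMECH_GUEST)) {
 *           return 0;    // some other (or no) mechanism is in use
 *       }
 *       // ... mechanism-specific setup ...
 *       cgs->ready = true;
 *       return 0;
 *   }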
+ */ + bool ready; +}; + +typedef struct ConfidentialGuestSupportClass { + ObjectClass parent; +} ConfidentialGuestSupportClass; + +#endif /* !CONFIG_USER_ONLY */ + +#endif /* QEMU_CONFIDENTIAL_GUEST_SUPPORT_H */ diff --git a/include/exec/memory.h b/include/exec/memory.h index c6ce74fb79..e58c09f130 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -149,6 +149,14 @@ typedef struct IOMMUTLBEvent { /* RAM is a persistent kind memory */ #define RAM_PMEM (1 << 5) + +/* + * UFFDIO_WRITEPROTECT is used on this RAMBlock to + * support 'write-tracking' migration type. + * Implies ram_state->ram_wt_enabled. + */ +#define RAM_UF_WRITEPROTECT (1 << 6) + static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn, IOMMUNotifierFlag flags, hwaddr start, hwaddr end, diff --git a/include/hw/boards.h b/include/hw/boards.h index 85af4faf76..a46dfe5d1a 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -270,7 +270,7 @@ struct MachineState { bool iommu; bool suppress_vmdesc; bool enable_graphics; - char *memory_encryption; + ConfidentialGuestSupport *cgs; char *ram_memdev_id; /* * convenience alias to ram_memdev_id backend memory region diff --git a/include/hw/ppc/pef.h b/include/hw/ppc/pef.h new file mode 100644 index 0000000000..707dbe524c --- /dev/null +++ b/include/hw/ppc/pef.h @@ -0,0 +1,17 @@ +/* + * PEF (Protected Execution Facility) for POWER support + * + * Copyright Red Hat. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef HW_PPC_PEF_H +#define HW_PPC_PEF_H + +int pef_kvm_init(ConfidentialGuestSupport *cgs, Error **errp); +int pef_kvm_reset(ConfidentialGuestSupport *cgs, Error **errp); + +#endif /* HW_PPC_PEF_H */ diff --git a/include/hw/s390x/pv.h b/include/hw/s390x/pv.h index aee758bc2d..1f1f545bfc 100644 --- a/include/hw/s390x/pv.h +++ b/include/hw/s390x/pv.h @@ -12,6 +12,9 @@ #ifndef HW_S390_PV_H #define HW_S390_PV_H +#include "qapi/error.h" +#include "sysemu/kvm.h" + #ifdef CONFIG_KVM #include "cpu.h" #include "hw/s390x/s390-virtio-ccw.h" @@ -55,4 +58,18 @@ static inline void s390_pv_unshare(void) {} static inline void s390_pv_inject_reset_error(CPUState *cs) {}; #endif /* CONFIG_KVM */ +int s390_pv_kvm_init(ConfidentialGuestSupport *cgs, Error **errp); +static inline int s390_pv_init(ConfidentialGuestSupport *cgs, Error **errp) +{ + if (!cgs) { + return 0; + } + if (kvm_enabled()) { + return s390_pv_kvm_init(cgs, errp); + } + + error_setg(errp, "Protected Virtualization requires KVM"); + return -1; +} + #endif /* HW_S390_PV_H */ diff --git a/include/migration/snapshot.h b/include/migration/snapshot.h index c85b6ec75b..e72083b117 100644 --- a/include/migration/snapshot.h +++ b/include/migration/snapshot.h @@ -15,7 +15,50 @@ #ifndef QEMU_MIGRATION_SNAPSHOT_H #define QEMU_MIGRATION_SNAPSHOT_H -int save_snapshot(const char *name, Error **errp); -int load_snapshot(const char *name, Error **errp); +#include "qapi/qapi-builtin-types.h" + +/** + * save_snapshot: Save an internal snapshot. + * @name: name of internal snapshot + * @overwrite: replace existing snapshot with @name + * @vmstate: blockdev node name to store VM state in + * @has_devices: whether to use explicit device list + * @devices: explicit device list to snapshot + * @errp: pointer to error object + * On success, return %true. + * On failure, store an error through @errp and return %false. 
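 *
 * (Illustrative example, not part of the patch; assumes a local
 * "Error *err = NULL":)
 *
 *   if (!save_snapshot("checkpoint1", false, NULL, false, NULL, &err)) {
 *       error_report_err(err);
 *   }
 *
 * takes a snapshot named "checkpoint1" across all eligible block devices
 * and stores the VM state in the default node.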
+ */ +bool save_snapshot(const char *name, bool overwrite, + const char *vmstate, + bool has_devices, strList *devices, + Error **errp); + +/** + * load_snapshot: Load an internal snapshot. + * @name: name of internal snapshot + * @vmstate: blockdev node name to load VM state from + * @has_devices: whether to use explicit device list + * @devices: explicit device list to snapshot + * @errp: pointer to error object + * On success, return %true. + * On failure, store an error through @errp and return %false. + */ +bool load_snapshot(const char *name, + const char *vmstate, + bool has_devices, strList *devices, + Error **errp); + +/** + * delete_snapshot: Delete a snapshot. + * @name: path to snapshot + * @has_devices: whether to use explicit device list + * @devices: explicit device list to snapshot + * @errp: pointer to error object + * On success, return %true. + * On failure, store an error through @errp and return %false. + */ +bool delete_snapshot(const char *name, + bool has_devices, strList *devices, + Error **errp); #endif diff --git a/include/qemu/fifo8.h b/include/qemu/fifo8.h index 489c354291..28bf2cee57 100644 --- a/include/qemu/fifo8.h +++ b/include/qemu/fifo8.h @@ -148,12 +148,16 @@ uint32_t fifo8_num_used(Fifo8 *fifo); extern const VMStateDescription vmstate_fifo8; -#define VMSTATE_FIFO8(_field, _state) { \ - .name = (stringify(_field)), \ - .size = sizeof(Fifo8), \ - .vmsd = &vmstate_fifo8, \ - .flags = VMS_STRUCT, \ - .offset = vmstate_offset_value(_state, _field, Fifo8), \ +#define VMSTATE_FIFO8_TEST(_field, _state, _test) { \ + .name = (stringify(_field)), \ + .field_exists = (_test), \ + .size = sizeof(Fifo8), \ + .vmsd = &vmstate_fifo8, \ + .flags = VMS_STRUCT, \ + .offset = vmstate_offset_value(_state, _field, Fifo8), \ } +#define VMSTATE_FIFO8(_field, _state) \ + VMSTATE_FIFO8_TEST(_field, _state, NULL) + #endif /* QEMU_FIFO8_H */ diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index 68deb74ef6..dc39b05c30 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -37,6 +37,7 @@ typedef struct Chardev Chardev; typedef struct Clock Clock; typedef struct CompatProperty CompatProperty; typedef struct CoMutex CoMutex; +typedef struct ConfidentialGuestSupport ConfidentialGuestSupport; typedef struct CPUAddressSpace CPUAddressSpace; typedef struct CPUState CPUState; typedef struct DeviceListener DeviceListener; diff --git a/include/qemu/userfaultfd.h b/include/qemu/userfaultfd.h new file mode 100644 index 0000000000..6b74f92792 --- /dev/null +++ b/include/qemu/userfaultfd.h @@ -0,0 +1,35 @@ +/* + * Linux UFFD-WP support + * + * Copyright Virtuozzo GmbH, 2020 + * + * Authors: + * Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. 
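 *
 * (Illustrative sketch, not part of the patch; addr and length are
 * placeholders.)  A typical write-protect round trip with the helpers
 * declared below looks roughly like:
 *
 *   uffd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
 *   uffd_register_memory(uffd, addr, length, UFFDIO_REGISTER_MODE_WP, NULL);
 *   uffd_change_protection(uffd, addr, length, true, false);   // protect
 *   // ... save pages; on a write fault copy the page out, then ...
 *   uffd_change_protection(uffd, addr, length, false, true);   // unprotect
 *   uffd_close_fd(uffd);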
+ */ + +#ifndef USERFAULTFD_H +#define USERFAULTFD_H + +#include "qemu/osdep.h" +#include "exec/hwaddr.h" +#include <linux/userfaultfd.h> + +int uffd_query_features(uint64_t *features); +int uffd_create_fd(uint64_t features, bool non_blocking); +void uffd_close_fd(int uffd_fd); +int uffd_register_memory(int uffd_fd, void *addr, uint64_t length, + uint64_t mode, uint64_t *ioctls); +int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length); +int uffd_change_protection(int uffd_fd, void *addr, uint64_t length, + bool wp, bool dont_wake); +int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr, + uint64_t length, bool dont_wake); +int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake); +int uffd_wakeup(int uffd_fd, void *addr, uint64_t length); +int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count); +bool uffd_poll_events(int uffd_fd, int tmo); + +#endif /* USERFAULTFD_H */ diff --git a/include/qom/object.h b/include/qom/object.h index d378f13a11..6721cd312e 100644 --- a/include/qom/object.h +++ b/include/qom/object.h @@ -638,7 +638,8 @@ bool object_apply_global_props(Object *obj, const GPtrArray *props, Error **errp); void object_set_machine_compat_props(GPtrArray *compat_props); void object_set_accelerator_compat_props(GPtrArray *compat_props); -void object_register_sugar_prop(const char *driver, const char *prop, const char *value); +void object_register_sugar_prop(const char *driver, const char *prop, + const char *value, bool optional); void object_apply_compat_props(Object *obj); /** diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index 739682f3c3..c5546bdecc 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -233,22 +233,6 @@ int kvm_has_intx_set_mask(void); */ bool kvm_arm_supports_user_irq(void); -/** - * kvm_memcrypt_enabled - return boolean indicating whether memory encryption - * is enabled - * Returns: 1 memory encryption is enabled - * 0 memory encryption is disabled - */ -bool kvm_memcrypt_enabled(void); - -/** - * kvm_memcrypt_encrypt_data: encrypt the memory range - * - * Return: 1 failed to encrypt the range - * 0 succesfully encrypted memory region - */ -int kvm_memcrypt_encrypt_data(uint8_t *ptr, uint64_t len); - #ifdef NEED_CPU_H #include "cpu.h" diff --git a/include/sysemu/sev.h b/include/sysemu/sev.h index 7ab6e3e31d..5c5a13c6ca 100644 --- a/include/sysemu/sev.h +++ b/include/sysemu/sev.h @@ -16,8 +16,8 @@ #include "sysemu/kvm.h" -void *sev_guest_init(const char *id); -int sev_encrypt_data(void *handle, uint8_t *ptr, uint64_t len); +int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp); +int sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp); int sev_inject_launch_secret(const char *hdr, const char *secret, uint64_t gpa, Error **errp); #endif diff --git a/migration/migration.c b/migration/migration.c index 1986cb8573..a5ddf43559 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -58,6 +58,7 @@ #include "qemu/queue.h" #include "multifd.h" #include "qemu/yank.h" +#include "sysemu/cpus.h" #ifdef CONFIG_VFIO #include "hw/vfio/vfio-common.h" @@ -134,6 +135,38 @@ enum mig_rp_message_type { MIG_RP_MSG_MAX }; +/* Migration capabilities set */ +struct MigrateCapsSet { + int size; /* Capability set size */ + MigrationCapability caps[]; /* Variadic array of capabilities */ +}; +typedef struct MigrateCapsSet MigrateCapsSet; + +/* Define and initialize MigrateCapsSet */ +#define INITIALIZE_MIGRATE_CAPS_SET(_name, ...) 
\ + MigrateCapsSet _name = { \ + .size = sizeof((int []) { __VA_ARGS__ }) / sizeof(int), \ + .caps = { __VA_ARGS__ } \ + } + +/* Background-snapshot compatibility check list */ +static const +INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot, + MIGRATION_CAPABILITY_POSTCOPY_RAM, + MIGRATION_CAPABILITY_DIRTY_BITMAPS, + MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME, + MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE, + MIGRATION_CAPABILITY_RETURN_PATH, + MIGRATION_CAPABILITY_MULTIFD, + MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER, + MIGRATION_CAPABILITY_AUTO_CONVERGE, + MIGRATION_CAPABILITY_RELEASE_RAM, + MIGRATION_CAPABILITY_RDMA_PIN_ALL, + MIGRATION_CAPABILITY_COMPRESS, + MIGRATION_CAPABILITY_XBZRLE, + MIGRATION_CAPABILITY_X_COLO, + MIGRATION_CAPABILITY_VALIDATE_UUID); + /* When we add fault tolerance, we could have several migrations at once. For now we don't need to add dynamic creation of migration */ @@ -141,6 +174,8 @@ enum mig_rp_message_type { static MigrationState *current_migration; static MigrationIncomingState *current_incoming; +static GSList *migration_blockers; + static bool migration_object_check(MigrationState *ms, Error **errp); static int migration_maybe_pause(MigrationState *s, int *current_active_state, @@ -1041,6 +1076,27 @@ static void fill_source_migration_info(MigrationInfo *info) { MigrationState *s = migrate_get_current(); + info->blocked = migration_is_blocked(NULL); + info->has_blocked_reasons = info->blocked; + info->blocked_reasons = NULL; + if (info->blocked) { + GSList *cur_blocker = migration_blockers; + + /* + * There are two types of reasons a migration might be blocked; + * a) devices marked in VMState as non-migratable, and + * b) Explicit migration blockers + * We need to add both of them here. + */ + qemu_savevm_non_migratable_list(&info->blocked_reasons); + + while (cur_blocker) { + QAPI_LIST_PREPEND(info->blocked_reasons, + g_strdup(error_get_pretty(cur_blocker->data))); + cur_blocker = g_slist_next(cur_blocker); + } + } + switch (s->state) { case MIGRATION_STATUS_NONE: /* no migration has happened ever */ @@ -1089,6 +1145,31 @@ static void fill_source_migration_info(MigrationInfo *info) info->status = s->state; } +typedef enum WriteTrackingSupport { + WT_SUPPORT_UNKNOWN = 0, + WT_SUPPORT_ABSENT, + WT_SUPPORT_AVAILABLE, + WT_SUPPORT_COMPATIBLE +} WriteTrackingSupport; + +static +WriteTrackingSupport migrate_query_write_tracking(void) +{ + /* Check if kernel supports required UFFD features */ + if (!ram_write_tracking_available()) { + return WT_SUPPORT_ABSENT; + } + /* + * Check if current memory configuration is + * compatible with required UFFD features. + */ + if (!ram_write_tracking_compatible()) { + return WT_SUPPORT_AVAILABLE; + } + + return WT_SUPPORT_COMPATIBLE; +} + /** * @migration_caps_check - check capability validity * @@ -1150,6 +1231,39 @@ static bool migrate_caps_check(bool *cap_list, } } + if (cap_list[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) { + WriteTrackingSupport wt_support; + int idx; + /* + * Check if 'background-snapshot' capability is supported by + * host kernel and compatible with guest memory configuration. 
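 *
 * (Illustrative note, not part of the patch.)  For example, enabling both
 * "background-snapshot" and "postcopy-ram" is rejected by the loop below
 * with:
 *
 *   Background-snapshot is not compatible with postcopy-ram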
+ */ + wt_support = migrate_query_write_tracking(); + if (wt_support < WT_SUPPORT_AVAILABLE) { + error_setg(errp, "Background-snapshot is not supported by host kernel"); + return false; + } + if (wt_support < WT_SUPPORT_COMPATIBLE) { + error_setg(errp, "Background-snapshot is not compatible " + "with guest memory configuration"); + return false; + } + + /* + * Check if there are any migration capabilities + * incompatible with 'background-snapshot'. + */ + for (idx = 0; idx < check_caps_background_snapshot.size; idx++) { + int incomp_cap = check_caps_background_snapshot.caps[idx]; + if (cap_list[incomp_cap]) { + error_setg(errp, + "Background-snapshot is not compatible with %s", + MigrationCapability_str(incomp_cap)); + return false; + } + } + } + return true; } @@ -1226,21 +1340,21 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp) if (params->has_compress_level && (params->compress_level > 9)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level", - "is invalid, it should be in the range of 0 to 9"); + "a value between 0 and 9"); return false; } if (params->has_compress_threads && (params->compress_threads < 1)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_threads", - "is invalid, it should be in the range of 1 to 255"); + "a value between 1 and 255"); return false; } if (params->has_decompress_threads && (params->decompress_threads < 1)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "decompress_threads", - "is invalid, it should be in the range of 1 to 255"); + "a value between 1 and 255"); return false; } @@ -1293,21 +1407,21 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp) if (params->has_multifd_channels && (params->multifd_channels < 1)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_channels", - "is invalid, it should be in the range of 1 to 255"); + "a value between 1 and 255"); return false; } if (params->has_multifd_zlib_level && (params->multifd_zlib_level > 9)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zlib_level", - "is invalid, it should be in the range of 0 to 9"); + "a value between 0 and 9"); return false; } if (params->has_multifd_zstd_level && (params->multifd_zstd_level > 20)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zstd_level", - "is invalid, it should be in the range of 0 to 20"); + "a value between 0 and 20"); return false; } @@ -1316,8 +1430,7 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp) !is_power_of_2(params->xbzrle_cache_size))) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "xbzrle_cache_size", - "is invalid, it should be bigger than target page size" - " and a power of 2"); + "a power of two no less than the target page size"); return false; } @@ -1334,21 +1447,21 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp) params->announce_initial > 100000) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "announce_initial", - "is invalid, it must be less than 100000 ms"); + "a value between 0 and 100000"); return false; } if (params->has_announce_max && params->announce_max > 100000) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "announce_max", - "is invalid, it must be less than 100000 ms"); + "a value between 0 and 100000"); return false; } if (params->has_announce_rounds && params->announce_rounds > 1000) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "announce_rounds", - "is invalid, it must be in the range of 0 to 1000"); + "a value between 0 and 1000"); return 
false; } if (params->has_announce_step && @@ -1356,7 +1469,7 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp) params->announce_step > 10000)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "announce_step", - "is invalid, it must be in the range of 1 to 10000 ms"); + "a value between 0 and 10000"); return false; } @@ -1909,6 +2022,7 @@ void migrate_init(MigrationState *s) * locks. */ s->cleanup_bh = 0; + s->vm_start_bh = 0; s->to_dst_file = NULL; s->state = MIGRATION_STATUS_NONE; s->rp_state.from_dst_file = NULL; @@ -1934,8 +2048,6 @@ void migrate_init(MigrationState *s) s->threshold_size = 0; } -static GSList *migration_blockers; - int migrate_add_blocker(Error *reason, Error **errp) { if (only_migratable) { @@ -2216,7 +2328,7 @@ void qmp_migrate_set_cache_size(int64_t value, Error **errp) qmp_migrate_set_parameters(&p, errp); } -int64_t qmp_query_migrate_cache_size(Error **errp) +uint64_t qmp_query_migrate_cache_size(Error **errp) { return migrate_xbzrle_cache_size(); } @@ -2446,7 +2558,7 @@ int migrate_use_xbzrle(void) return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE]; } -int64_t migrate_xbzrle_cache_size(void) +uint64_t migrate_xbzrle_cache_size(void) { MigrationState *s; @@ -2491,6 +2603,15 @@ bool migrate_use_block_incremental(void) return s->parameters.block_incremental; } +bool migrate_background_snapshot(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->enabled_capabilities[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]; +} + /* migration thread support */ /* * Something bad happened to the RP stream, mark an error @@ -3117,6 +3238,50 @@ fail: MIGRATION_STATUS_FAILED); } +/** + * bg_migration_completion: Used by bg_migration_thread when after all the + * RAM has been saved. The caller 'breaks' the loop when this returns. + * + * @s: Current migration state + */ +static void bg_migration_completion(MigrationState *s) +{ + int current_active_state = s->state; + + /* + * Stop tracking RAM writes - un-protect memory, un-register UFFD + * memory ranges, flush kernel wait queues and wake up threads + * waiting for write fault to be resolved. + */ + ram_write_tracking_stop(); + + if (s->state == MIGRATION_STATUS_ACTIVE) { + /* + * By this moment we have RAM content saved into the migration stream. + * The next step is to flush the non-RAM content (device state) + * right after the ram content. The device state has been stored into + * the temporary buffer before RAM saving started. 
+ */ + qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage); + qemu_fflush(s->to_dst_file); + } else if (s->state == MIGRATION_STATUS_CANCELLING) { + goto fail; + } + + if (qemu_file_get_error(s->to_dst_file)) { + trace_migration_completion_file_err(); + goto fail; + } + + migrate_set_state(&s->state, current_active_state, + MIGRATION_STATUS_COMPLETED); + return; + +fail: + migrate_set_state(&s->state, current_active_state, + MIGRATION_STATUS_FAILED); +} + bool migrate_colo_enabled(void) { MigrationState *s = migrate_get_current(); @@ -3457,6 +3622,47 @@ static void migration_iteration_finish(MigrationState *s) qemu_mutex_unlock_iothread(); } +static void bg_migration_iteration_finish(MigrationState *s) +{ + qemu_mutex_lock_iothread(); + switch (s->state) { + case MIGRATION_STATUS_COMPLETED: + migration_calculate_complete(s); + break; + + case MIGRATION_STATUS_ACTIVE: + case MIGRATION_STATUS_FAILED: + case MIGRATION_STATUS_CANCELLED: + case MIGRATION_STATUS_CANCELLING: + break; + + default: + /* Should not reach here, but if so, forgive the VM. */ + error_report("%s: Unknown ending state %d", __func__, s->state); + break; + } + + migrate_fd_cleanup_schedule(s); + qemu_mutex_unlock_iothread(); +} + +/* + * Return true if continue to the next iteration directly, false + * otherwise. + */ +static MigIterateState bg_migration_iteration_run(MigrationState *s) +{ + int res; + + res = qemu_savevm_state_iterate(s->to_dst_file, false); + if (res > 0) { + bg_migration_completion(s); + return MIG_ITERATE_BREAK; + } + + return MIG_ITERATE_RESUME; +} + void migration_make_urgent_request(void) { qemu_sem_post(&migrate_get_current()->rate_limit_sem); @@ -3604,6 +3810,165 @@ static void *migration_thread(void *opaque) return NULL; } +static void bg_migration_vm_start_bh(void *opaque) +{ + MigrationState *s = opaque; + + qemu_bh_delete(s->vm_start_bh); + s->vm_start_bh = NULL; + + vm_start(); + s->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - s->downtime_start; +} + +/** + * Background snapshot thread, based on live migration code. + * This is an alternative implementation of live migration mechanism + * introduced specifically to support background snapshots. + * + * It takes advantage of userfault_fd write protection mechanism introduced + * in v5.7 kernel. Compared to existing dirty page logging migration much + * lesser stream traffic is produced resulting in smaller snapshot images, + * simply cause of no page duplicates can get into the stream. + * + * Another key point is that generated vmstate stream reflects machine state + * 'frozen' at the beginning of snapshot creation compared to dirty page logging + * mechanism, which effectively results in that saved snapshot is the state of VM + * at the end of the process. + */ +static void *bg_migration_thread(void *opaque) +{ + MigrationState *s = opaque; + int64_t setup_start; + MigThrError thr_error; + QEMUFile *fb; + bool early_fail = true; + + rcu_register_thread(); + object_ref(OBJECT(s)); + + qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX); + + setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST); + /* + * We want to save vmstate for the moment when migration has been + * initiated but also we want to save RAM content while VM is running. + * The RAM content should appear first in the vmstate. 
So, we first + * stash the non-RAM part of the vmstate to the temporary buffer, + * then write RAM part of the vmstate to the migration stream + * with vCPUs running and, finally, write stashed non-RAM part of + * the vmstate from the buffer to the migration stream. + */ + s->bioc = qio_channel_buffer_new(128 * 1024); + qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer"); + fb = qemu_fopen_channel_output(QIO_CHANNEL(s->bioc)); + object_unref(OBJECT(s->bioc)); + + update_iteration_initial_status(s); + + qemu_savevm_state_header(s->to_dst_file); + qemu_savevm_state_setup(s->to_dst_file); + + if (qemu_savevm_state_guest_unplug_pending()) { + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_WAIT_UNPLUG); + + while (s->state == MIGRATION_STATUS_WAIT_UNPLUG && + qemu_savevm_state_guest_unplug_pending()) { + qemu_sem_timedwait(&s->wait_unplug_sem, 250); + } + + migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, + MIGRATION_STATUS_ACTIVE); + } else { + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_ACTIVE); + } + s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; + + trace_migration_thread_setup_complete(); + s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + + qemu_mutex_lock_iothread(); + + /* + * If VM is currently in suspended state, then, to make a valid runstate + * transition in vm_stop_force_state() we need to wakeup it up. + */ + qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL); + s->vm_was_running = runstate_is_running(); + + if (global_state_store()) { + goto fail; + } + /* Forcibly stop VM before saving state of vCPUs and devices */ + if (vm_stop_force_state(RUN_STATE_PAUSED)) { + goto fail; + } + /* + * Put vCPUs in sync with shadow context structures, then + * save their state to channel-buffer along with devices. + */ + cpu_synchronize_all_states(); + if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) { + goto fail; + } + /* Now initialize UFFD context and start tracking RAM writes */ + if (ram_write_tracking_start()) { + goto fail; + } + early_fail = false; + + /* + * Start VM from BH handler to avoid write-fault lock here. + * UFFD-WP protection for the whole RAM is already enabled so + * calling VM state change notifiers from vm_start() would initiate + * writes to virtio VQs memory which is in write-protected region. + */ + s->vm_start_bh = qemu_bh_new(bg_migration_vm_start_bh, s); + qemu_bh_schedule(s->vm_start_bh); + + qemu_mutex_unlock_iothread(); + + while (migration_is_active(s)) { + MigIterateState iter_state = bg_migration_iteration_run(s); + if (iter_state == MIG_ITERATE_SKIP) { + continue; + } else if (iter_state == MIG_ITERATE_BREAK) { + break; + } + + /* + * Try to detect any kind of failures, and see whether we + * should stop the migration now. 
+ */ + thr_error = migration_detect_error(s); + if (thr_error == MIG_THR_ERR_FATAL) { + /* Stop migration */ + break; + } + + migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME)); + } + + trace_migration_thread_after_loop(); + +fail: + if (early_fail) { + migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, + MIGRATION_STATUS_FAILED); + qemu_mutex_unlock_iothread(); + } + + bg_migration_iteration_finish(s); + + qemu_fclose(fb); + object_unref(OBJECT(s)); + rcu_unregister_thread(); + + return NULL; +} + void migrate_fd_connect(MigrationState *s, Error *error_in) { Error *local_err = NULL; @@ -3667,8 +4032,14 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) migrate_fd_cleanup(s); return; } - qemu_thread_create(&s->thread, "live_migration", migration_thread, s, - QEMU_THREAD_JOINABLE); + + if (migrate_background_snapshot()) { + qemu_thread_create(&s->thread, "bg_snapshot", + bg_migration_thread, s, QEMU_THREAD_JOINABLE); + } else { + qemu_thread_create(&s->thread, "live_migration", + migration_thread, s, QEMU_THREAD_JOINABLE); + } s->migration_thread_running = true; } @@ -3784,6 +4155,8 @@ static Property migration_properties[] = { DEFINE_PROP_MIG_CAP("x-block", MIGRATION_CAPABILITY_BLOCK), DEFINE_PROP_MIG_CAP("x-return-path", MIGRATION_CAPABILITY_RETURN_PATH), DEFINE_PROP_MIG_CAP("x-multifd", MIGRATION_CAPABILITY_MULTIFD), + DEFINE_PROP_MIG_CAP("x-background-snapshot", + MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT), DEFINE_PROP_END_OF_LIST(), }; diff --git a/migration/migration.h b/migration/migration.h index d096b77f74..db6708326b 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -20,6 +20,7 @@ #include "qemu/thread.h" #include "qemu/coroutine_int.h" #include "io/channel.h" +#include "io/channel-buffer.h" #include "net/announce.h" #include "qom/object.h" @@ -147,8 +148,10 @@ struct MigrationState { /*< public >*/ QemuThread thread; + QEMUBH *vm_start_bh; QEMUBH *cleanup_bh; QEMUFile *to_dst_file; + QIOChannelBuffer *bioc; /* * Protects to_dst_file pointer. 
We need to make sure we won't * yield or hang during the critical section, since this lock will @@ -324,7 +327,7 @@ int migrate_multifd_zlib_level(void); int migrate_multifd_zstd_level(void); int migrate_use_xbzrle(void); -int64_t migrate_xbzrle_cache_size(void); +uint64_t migrate_xbzrle_cache_size(void); bool migrate_colo_enabled(void); bool migrate_use_block(void); @@ -341,6 +344,7 @@ int migrate_compress_wait_thread(void); int migrate_decompress_threads(void); bool migrate_use_events(void); bool migrate_postcopy_blocktime(void); +bool migrate_background_snapshot(void); /* Sending on the return path - generic and then for each message type */ void migrate_send_rp_shut(MigrationIncomingState *mis, diff --git a/migration/page_cache.c b/migration/page_cache.c index 098b436223..6d4f7a9bbc 100644 --- a/migration/page_cache.c +++ b/migration/page_cache.c @@ -38,7 +38,7 @@ struct PageCache { size_t num_items; }; -PageCache *cache_init(int64_t new_size, size_t page_size, Error **errp) +PageCache *cache_init(uint64_t new_size, size_t page_size, Error **errp) { int64_t i; size_t num_pages = new_size / page_size; @@ -60,8 +60,7 @@ PageCache *cache_init(int64_t new_size, size_t page_size, Error **errp) /* We prefer not to abort if there is no memory */ cache = g_try_malloc(sizeof(*cache)); if (!cache) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", - "Failed to allocate cache"); + error_setg(errp, "Failed to allocate cache"); return NULL; } cache->page_size = page_size; @@ -74,8 +73,7 @@ PageCache *cache_init(int64_t new_size, size_t page_size, Error **errp) cache->page_cache = g_try_malloc((cache->max_num_items) * sizeof(*cache->page_cache)); if (!cache->page_cache) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", - "Failed to allocate page cache"); + error_setg(errp, "Failed to allocate page cache"); g_free(cache); return NULL; } diff --git a/migration/page_cache.h b/migration/page_cache.h index 0cb94498a0..8733b4df6e 100644 --- a/migration/page_cache.h +++ b/migration/page_cache.h @@ -28,7 +28,7 @@ typedef struct PageCache PageCache; * @page_size: cache page size * @errp: set *errp if the check failed, with reason */ -PageCache *cache_init(int64_t cache_size, size_t page_size, Error **errp); +PageCache *cache_init(uint64_t cache_size, size_t page_size, Error **errp); /** * cache_fini: free all cache resources * @cache pointer to the PageCache struct diff --git a/migration/qemu-file.c b/migration/qemu-file.c index be21518c57..d6e03dbc0e 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -595,7 +595,7 @@ size_t qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size) { if (size < IO_BUF_SIZE) { size_t res; - uint8_t *src; + uint8_t *src = NULL; res = qemu_peek_buffer(f, &src, size, 0); diff --git a/migration/ram.c b/migration/ram.c index 7811cde643..72143da0ac 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -56,6 +56,11 @@ #include "savevm.h" #include "qemu/iov.h" #include "multifd.h" +#include "sysemu/runstate.h" + +#if defined(__linux__) +#include "qemu/userfaultfd.h" +#endif /* defined(__linux__) */ /***********************************************************/ /* ram save/restore */ @@ -126,7 +131,7 @@ static void XBZRLE_cache_unlock(void) * @new_size: new cache size * @errp: set *errp if the check failed, with reason */ -int xbzrle_cache_resize(int64_t new_size, Error **errp) +int xbzrle_cache_resize(uint64_t new_size, Error **errp) { PageCache *new_cache; int64_t ret = 0; @@ -298,6 +303,8 @@ struct RAMSrcPageRequest { struct 
RAMState { /* QEMUFile used for this migration */ QEMUFile *f; + /* UFFD file descriptor, used in 'write-tracking' migration */ + int uffdio_fd; /* Last block that we have visited searching for dirty pages */ RAMBlock *last_seen_block; /* Last block from where we have sent data */ @@ -1434,6 +1441,269 @@ static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) return block; } +#if defined(__linux__) +/** + * poll_fault_page: try to get next UFFD write fault page and, if pending fault + * is found, return RAM block pointer and page offset + * + * Returns pointer to the RAMBlock containing faulting page, + * NULL if no write faults are pending + * + * @rs: current RAM state + * @offset: page offset from the beginning of the block + */ +static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) +{ + struct uffd_msg uffd_msg; + void *page_address; + RAMBlock *bs; + int res; + + if (!migrate_background_snapshot()) { + return NULL; + } + + res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); + if (res <= 0) { + return NULL; + } + + page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address; + bs = qemu_ram_block_from_host(page_address, false, offset); + assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0); + return bs; +} + +/** + * ram_save_release_protection: release UFFD write protection after + * a range of pages has been saved + * + * @rs: current RAM state + * @pss: page-search-status structure + * @start_page: index of the first page in the range relative to pss->block + * + * Returns 0 on success, negative value in case of an error +*/ +static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, + unsigned long start_page) +{ + int res = 0; + + /* Check if page is from UFFD-managed region. */ + if (pss->block->flags & RAM_UF_WRITEPROTECT) { + void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS); + uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS; + + /* Flush async buffers before un-protect. */ + qemu_fflush(rs->f); + /* Un-protect memory range. 
*/ + res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, + false, false); + } + + return res; +} + +/* ram_write_tracking_available: check if kernel supports required UFFD features + * + * Returns true if supports, false otherwise + */ +bool ram_write_tracking_available(void) +{ + uint64_t uffd_features; + int res; + + res = uffd_query_features(&uffd_features); + return (res == 0 && + (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); +} + +/* ram_write_tracking_compatible: check if guest configuration is + * compatible with 'write-tracking' + * + * Returns true if compatible, false otherwise + */ +bool ram_write_tracking_compatible(void) +{ + const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); + int uffd_fd; + RAMBlock *bs; + bool ret = false; + + /* Open UFFD file descriptor */ + uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); + if (uffd_fd < 0) { + return false; + } + + RCU_READ_LOCK_GUARD(); + + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { + uint64_t uffd_ioctls; + + /* Nothing to do with read-only and MMIO-writable regions */ + if (bs->mr->readonly || bs->mr->rom_device) { + continue; + } + /* Try to register block memory via UFFD-IO to track writes */ + if (uffd_register_memory(uffd_fd, bs->host, bs->max_length, + UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { + goto out; + } + if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { + goto out; + } + } + ret = true; + +out: + uffd_close_fd(uffd_fd); + return ret; +} + +/* + * ram_write_tracking_start: start UFFD-WP memory tracking + * + * Returns 0 for success or negative value in case of error + */ +int ram_write_tracking_start(void) +{ + int uffd_fd; + RAMState *rs = ram_state; + RAMBlock *bs; + + /* Open UFFD file descriptor */ + uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); + if (uffd_fd < 0) { + return uffd_fd; + } + rs->uffdio_fd = uffd_fd; + + RCU_READ_LOCK_GUARD(); + + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { + /* Nothing to do with read-only and MMIO-writable regions */ + if (bs->mr->readonly || bs->mr->rom_device) { + continue; + } + + /* Register block memory with UFFD to track writes */ + if (uffd_register_memory(rs->uffdio_fd, bs->host, + bs->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { + goto fail; + } + /* Apply UFFD write protection to the block memory range */ + if (uffd_change_protection(rs->uffdio_fd, bs->host, + bs->max_length, true, false)) { + goto fail; + } + bs->flags |= RAM_UF_WRITEPROTECT; + memory_region_ref(bs->mr); + + trace_ram_write_tracking_ramblock_start(bs->idstr, bs->page_size, + bs->host, bs->max_length); + } + + return 0; + +fail: + error_report("ram_write_tracking_start() failed: restoring initial memory state"); + + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { + if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) { + continue; + } + /* + * In case some memory block failed to be write-protected + * remove protection and unregister all succeeded RAM blocks + */ + uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false); + uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length); + /* Cleanup flags and remove reference */ + bs->flags &= ~RAM_UF_WRITEPROTECT; + memory_region_unref(bs->mr); + } + + uffd_close_fd(uffd_fd); + rs->uffdio_fd = -1; + return -1; +} + +/** + * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection + */ +void ram_write_tracking_stop(void) +{ + RAMState *rs = ram_state; + RAMBlock *bs; + + RCU_READ_LOCK_GUARD(); + + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { + if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) 
{ + continue; + } + /* Remove protection and unregister all affected RAM blocks */ + uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false); + uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length); + + trace_ram_write_tracking_ramblock_stop(bs->idstr, bs->page_size, + bs->host, bs->max_length); + + /* Cleanup flags and remove reference */ + bs->flags &= ~RAM_UF_WRITEPROTECT; + memory_region_unref(bs->mr); + } + + /* Finally close UFFD file descriptor */ + uffd_close_fd(rs->uffdio_fd); + rs->uffdio_fd = -1; +} + +#else +/* No target OS support, stubs just fail or ignore */ + +static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) +{ + (void) rs; + (void) offset; + + return NULL; +} + +static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, + unsigned long start_page) +{ + (void) rs; + (void) pss; + (void) start_page; + + return 0; +} + +bool ram_write_tracking_available(void) +{ + return false; +} + +bool ram_write_tracking_compatible(void) +{ + assert(0); + return false; +} + +int ram_write_tracking_start(void) +{ + assert(0); + return -1; +} + +void ram_write_tracking_stop(void) +{ + assert(0); +} +#endif /* defined(__linux__) */ + /** * get_queued_page: unqueue a page from the postcopy requests * @@ -1473,6 +1743,14 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) } while (block && !dirty); + if (!block) { + /* + * Poll write faults too if background snapshot is enabled; that's + * when we have vcpus got blocked by the write protected pages. + */ + block = poll_fault_page(rs, &offset); + } + if (block) { /* * As soon as we start servicing pages out of order, then we have @@ -1715,6 +1993,8 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, int tmppages, pages = 0; size_t pagesize_bits = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; + unsigned long start_page = pss->page; + int res; if (ramblock_is_ignored(pss->block)) { error_report("block %s should not be migrated !", pss->block->idstr); @@ -1740,10 +2020,11 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, } while ((pss->page & (pagesize_bits - 1)) && offset_in_ramblock(pss->block, ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)); - /* The offset we leave with is the last one we looked at */ pss->page--; - return pages; + + res = ram_save_release_protection(rs, pss, start_page); + return (res < 0 ? 
res : pages); } /** @@ -1880,10 +2161,13 @@ static void ram_save_cleanup(void *opaque) RAMState **rsp = opaque; RAMBlock *block; - /* caller have hold iothread lock or is in a bh, so there is - * no writing race against the migration bitmap - */ - memory_global_dirty_log_stop(); + /* We don't use dirty log with background snapshots */ + if (!migrate_background_snapshot()) { + /* caller have hold iothread lock or is in a bh, so there is + * no writing race against the migration bitmap + */ + memory_global_dirty_log_stop(); + } RAMBLOCK_FOREACH_NOT_IGNORED(block) { g_free(block->clear_bmap); @@ -2343,8 +2627,11 @@ static void ram_init_bitmaps(RAMState *rs) WITH_RCU_READ_LOCK_GUARD() { ram_list_init_bitmaps(); - memory_global_dirty_log_start(); - migration_bitmap_sync_precopy(rs); + /* We don't use dirty log with background snapshots */ + if (!migrate_background_snapshot()) { + memory_global_dirty_log_start(); + migration_bitmap_sync_precopy(rs); + } } qemu_mutex_unlock_ramlist(); qemu_mutex_unlock_iothread(); @@ -3521,7 +3808,7 @@ static int ram_load_precopy(QEMUFile *f) } } /* For postcopy we need to check hugepage sizes match */ - if (postcopy_advised && + if (postcopy_advised && migrate_postcopy_ram() && block->page_size != qemu_host_page_size) { uint64_t remote_page_size = qemu_get_be64(f); if (remote_page_size != block->page_size) { diff --git a/migration/ram.h b/migration/ram.h index 011e85414e..6378bb3ebc 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -47,7 +47,7 @@ bool ramblock_is_ignored(RAMBlock *block); INTERNAL_RAMBLOCK_FOREACH(block) \ if (!qemu_ram_is_migratable(block)) {} else -int xbzrle_cache_resize(int64_t new_size, Error **errp); +int xbzrle_cache_resize(uint64_t new_size, Error **errp); uint64_t ram_bytes_remaining(void); uint64_t ram_bytes_total(void); @@ -79,4 +79,10 @@ void colo_flush_ram_cache(void); void colo_release_ram_cache(void); void colo_incoming_start_dirty_log(void); +/* Background snapshot */ +bool ram_write_tracking_available(void); +bool ram_write_tracking_compatible(void); +int ram_write_tracking_start(void); +void ram_write_tracking_stop(void); + #endif diff --git a/migration/savevm.c b/migration/savevm.c index 4f3b69ecfc..52e2d72e4b 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -43,6 +43,8 @@ #include "qapi/error.h" #include "qapi/qapi-commands-migration.h" #include "qapi/qmp/json-writer.h" +#include "qapi/clone-visitor.h" +#include "qapi/qapi-builtin-visit.h" #include "qapi/qmp/qerror.h" #include "qemu/error-report.h" #include "sysemu/cpus.h" @@ -315,6 +317,16 @@ static int configuration_pre_save(void *opaque) return 0; } +static int configuration_post_save(void *opaque) +{ + SaveState *state = opaque; + + g_free(state->capabilities); + state->capabilities = NULL; + state->caps_count = 0; + return 0; +} + static int configuration_pre_load(void *opaque) { SaveState *state = opaque; @@ -365,24 +377,36 @@ static int configuration_post_load(void *opaque, int version_id) { SaveState *state = opaque; const char *current_name = MACHINE_GET_CLASS(current_machine)->name; + int ret = 0; if (strncmp(state->name, current_name, state->len) != 0) { error_report("Machine type received is '%.*s' and local is '%s'", (int) state->len, state->name, current_name); - return -EINVAL; + ret = -EINVAL; + goto out; } if (state->target_page_bits != qemu_target_page_bits()) { error_report("Received TARGET_PAGE_BITS is %d but local is %d", state->target_page_bits, qemu_target_page_bits()); - return -EINVAL; + ret = -EINVAL; + goto out; } if 
(!configuration_validate_capabilities(state)) { - return -EINVAL; + ret = -EINVAL; + goto out; } - return 0; +out: + g_free((void *)state->name); + state->name = NULL; + state->len = 0; + g_free(state->capabilities); + state->capabilities = NULL; + state->caps_count = 0; + + return ret; } static int get_capability(QEMUFile *f, void *pv, size_t size, @@ -516,6 +540,7 @@ static const VMStateDescription vmstate_configuration = { .pre_load = configuration_pre_load, .post_load = configuration_post_load, .pre_save = configuration_pre_save, + .post_save = configuration_post_save, .fields = (VMStateField[]) { VMSTATE_UINT32(len, SaveState), VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len), @@ -1131,6 +1156,19 @@ bool qemu_savevm_state_blocked(Error **errp) return false; } +void qemu_savevm_non_migratable_list(strList **reasons) +{ + SaveStateEntry *se; + + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + if (se->vmsd && se->vmsd->unmigratable) { + QAPI_LIST_PREPEND(*reasons, + g_strdup_printf("non-migratable device: %s", + se->idstr)); + } + } +} + void qemu_savevm_state_header(QEMUFile *f) { trace_savevm_state_header(); @@ -1355,7 +1393,6 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) return 0; } -static int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, bool in_postcopy, bool inactivate_disks) @@ -2729,9 +2766,10 @@ int qemu_load_device_state(QEMUFile *f) return 0; } -int save_snapshot(const char *name, Error **errp) +bool save_snapshot(const char *name, bool overwrite, const char *vmstate, + bool has_devices, strList *devices, Error **errp) { - BlockDriverState *bs, *bs1; + BlockDriverState *bs; QEMUSnapshotInfo sn1, *sn = &sn1; int ret = -1, ret2; QEMUFile *f; @@ -2742,35 +2780,43 @@ int save_snapshot(const char *name, Error **errp) AioContext *aio_context; if (migration_is_blocked(errp)) { - return ret; + return false; } if (!replay_can_snapshot()) { error_setg(errp, "Record/replay does not allow making snapshot " "right now. 
Try once more later."); - return ret; + return false; } - if (!bdrv_all_can_snapshot(&bs)) { - error_setg(errp, "Device '%s' is writable but does not support " - "snapshots", bdrv_get_device_or_node_name(bs)); - return ret; + if (!bdrv_all_can_snapshot(has_devices, devices, errp)) { + return false; } /* Delete old snapshots of the same name */ if (name) { - ret = bdrv_all_delete_snapshot(name, &bs1, errp); - if (ret < 0) { - error_prepend(errp, "Error while deleting snapshot on device " - "'%s': ", bdrv_get_device_or_node_name(bs1)); - return ret; + if (overwrite) { + if (bdrv_all_delete_snapshot(name, has_devices, + devices, errp) < 0) { + return false; + } + } else { + ret2 = bdrv_all_has_snapshot(name, has_devices, devices, errp); + if (ret2 < 0) { + return false; + } + if (ret2 == 1) { + error_setg(errp, + "Snapshot '%s' already exists in one or more devices", + name); + return false; + } } } - bs = bdrv_all_find_vmstate_bs(); + bs = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp); if (bs == NULL) { - error_setg(errp, "No block device can accept snapshots"); - return ret; + return false; } aio_context = bdrv_get_aio_context(bs); @@ -2779,7 +2825,7 @@ int save_snapshot(const char *name, Error **errp) ret = global_state_store(); if (ret) { error_setg(errp, "Error saving global state"); - return ret; + return false; } vm_stop(RUN_STATE_SAVE_VM); @@ -2833,11 +2879,10 @@ int save_snapshot(const char *name, Error **errp) aio_context_release(aio_context); aio_context = NULL; - ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, &bs); + ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, + has_devices, devices, errp); if (ret < 0) { - error_setg(errp, "Error while creating snapshot on '%s'", - bdrv_get_device_or_node_name(bs)); - bdrv_all_delete_snapshot(sn->name, &bs, NULL); + bdrv_all_delete_snapshot(sn->name, has_devices, devices, NULL); goto the_end; } @@ -2853,7 +2898,7 @@ int save_snapshot(const char *name, Error **errp) if (saved_vm_running) { vm_start(); } - return ret; + return ret == 0; } void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live, @@ -2938,33 +2983,32 @@ void qmp_xen_load_devices_state(const char *filename, Error **errp) migration_incoming_state_destroy(); } -int load_snapshot(const char *name, Error **errp) +bool load_snapshot(const char *name, const char *vmstate, + bool has_devices, strList *devices, Error **errp) { - BlockDriverState *bs, *bs_vm_state; + BlockDriverState *bs_vm_state; QEMUSnapshotInfo sn; QEMUFile *f; int ret; AioContext *aio_context; MigrationIncomingState *mis = migration_incoming_get_current(); - if (!bdrv_all_can_snapshot(&bs)) { - error_setg(errp, - "Device '%s' is writable but does not support snapshots", - bdrv_get_device_or_node_name(bs)); - return -ENOTSUP; + if (!bdrv_all_can_snapshot(has_devices, devices, errp)) { + return false; } - ret = bdrv_all_find_snapshot(name, &bs); + ret = bdrv_all_has_snapshot(name, has_devices, devices, errp); if (ret < 0) { - error_setg(errp, - "Device '%s' does not have the requested snapshot '%s'", - bdrv_get_device_or_node_name(bs), name); - return ret; + return false; + } + if (ret == 0) { + error_setg(errp, "Snapshot '%s' does not exist in one or more devices", + name); + return false; } - bs_vm_state = bdrv_all_find_vmstate_bs(); + bs_vm_state = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp); if (!bs_vm_state) { - error_setg(errp, "No block device supports snapshots"); - return -ENOTSUP; + return false; } aio_context = 
bdrv_get_aio_context(bs_vm_state); @@ -2973,11 +3017,11 @@ int load_snapshot(const char *name, Error **errp) ret = bdrv_snapshot_find(bs_vm_state, &sn, name); aio_context_release(aio_context); if (ret < 0) { - return ret; + return false; } else if (sn.vm_state_size == 0) { error_setg(errp, "This is a disk-only snapshot. Revert to it " " offline using qemu-img"); - return -EINVAL; + return false; } /* @@ -2989,10 +3033,8 @@ int load_snapshot(const char *name, Error **errp) /* Flush all IO requests so they don't interfere with the new state. */ bdrv_drain_all_begin(); - ret = bdrv_all_goto_snapshot(name, &bs, errp); + ret = bdrv_all_goto_snapshot(name, has_devices, devices, errp); if (ret < 0) { - error_prepend(errp, "Could not load snapshot '%s' on '%s': ", - name, bdrv_get_device_or_node_name(bs)); goto err_drain; } @@ -3000,7 +3042,6 @@ int load_snapshot(const char *name, Error **errp) f = qemu_fopen_bdrv(bs_vm_state, 0); if (!f) { error_setg(errp, "Could not open VM state file"); - ret = -EINVAL; goto err_drain; } @@ -3020,14 +3061,28 @@ int load_snapshot(const char *name, Error **errp) if (ret < 0) { error_setg(errp, "Error %d while loading VM state", ret); - return ret; + return false; } - return 0; + return true; err_drain: bdrv_drain_all_end(); - return ret; + return false; +} + +bool delete_snapshot(const char *name, bool has_devices, + strList *devices, Error **errp) +{ + if (!bdrv_all_can_snapshot(has_devices, devices, errp)) { + return false; + } + + if (bdrv_all_delete_snapshot(name, has_devices, devices, errp) < 0) { + return false; + } + + return true; } void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev) @@ -3057,3 +3112,187 @@ bool vmstate_check_only_migratable(const VMStateDescription *vmsd) return !(vmsd && vmsd->unmigratable); } + +typedef struct SnapshotJob { + Job common; + char *tag; + char *vmstate; + strList *devices; + Coroutine *co; + Error **errp; + bool ret; +} SnapshotJob; + +static void qmp_snapshot_job_free(SnapshotJob *s) +{ + g_free(s->tag); + g_free(s->vmstate); + qapi_free_strList(s->devices); +} + + +static void snapshot_load_job_bh(void *opaque) +{ + Job *job = opaque; + SnapshotJob *s = container_of(job, SnapshotJob, common); + int orig_vm_running; + + job_progress_set_remaining(&s->common, 1); + + orig_vm_running = runstate_is_running(); + vm_stop(RUN_STATE_RESTORE_VM); + + s->ret = load_snapshot(s->tag, s->vmstate, true, s->devices, s->errp); + if (s->ret && orig_vm_running) { + vm_start(); + } + + job_progress_update(&s->common, 1); + + qmp_snapshot_job_free(s); + aio_co_wake(s->co); +} + +static void snapshot_save_job_bh(void *opaque) +{ + Job *job = opaque; + SnapshotJob *s = container_of(job, SnapshotJob, common); + + job_progress_set_remaining(&s->common, 1); + s->ret = save_snapshot(s->tag, false, s->vmstate, + true, s->devices, s->errp); + job_progress_update(&s->common, 1); + + qmp_snapshot_job_free(s); + aio_co_wake(s->co); +} + +static void snapshot_delete_job_bh(void *opaque) +{ + Job *job = opaque; + SnapshotJob *s = container_of(job, SnapshotJob, common); + + job_progress_set_remaining(&s->common, 1); + s->ret = delete_snapshot(s->tag, true, s->devices, s->errp); + job_progress_update(&s->common, 1); + + qmp_snapshot_job_free(s); + aio_co_wake(s->co); +} + +static int coroutine_fn snapshot_save_job_run(Job *job, Error **errp) +{ + SnapshotJob *s = container_of(job, SnapshotJob, common); + s->errp = errp; + s->co = qemu_coroutine_self(); + aio_bh_schedule_oneshot(qemu_get_aio_context(), + snapshot_save_job_bh, job); + 
qemu_coroutine_yield(); + return s->ret ? 0 : -1; +} + +static int coroutine_fn snapshot_load_job_run(Job *job, Error **errp) +{ + SnapshotJob *s = container_of(job, SnapshotJob, common); + s->errp = errp; + s->co = qemu_coroutine_self(); + aio_bh_schedule_oneshot(qemu_get_aio_context(), + snapshot_load_job_bh, job); + qemu_coroutine_yield(); + return s->ret ? 0 : -1; +} + +static int coroutine_fn snapshot_delete_job_run(Job *job, Error **errp) +{ + SnapshotJob *s = container_of(job, SnapshotJob, common); + s->errp = errp; + s->co = qemu_coroutine_self(); + aio_bh_schedule_oneshot(qemu_get_aio_context(), + snapshot_delete_job_bh, job); + qemu_coroutine_yield(); + return s->ret ? 0 : -1; +} + + +static const JobDriver snapshot_load_job_driver = { + .instance_size = sizeof(SnapshotJob), + .job_type = JOB_TYPE_SNAPSHOT_LOAD, + .run = snapshot_load_job_run, +}; + +static const JobDriver snapshot_save_job_driver = { + .instance_size = sizeof(SnapshotJob), + .job_type = JOB_TYPE_SNAPSHOT_SAVE, + .run = snapshot_save_job_run, +}; + +static const JobDriver snapshot_delete_job_driver = { + .instance_size = sizeof(SnapshotJob), + .job_type = JOB_TYPE_SNAPSHOT_DELETE, + .run = snapshot_delete_job_run, +}; + + +void qmp_snapshot_save(const char *job_id, + const char *tag, + const char *vmstate, + strList *devices, + Error **errp) +{ + SnapshotJob *s; + + s = job_create(job_id, &snapshot_save_job_driver, NULL, + qemu_get_aio_context(), JOB_MANUAL_DISMISS, + NULL, NULL, errp); + if (!s) { + return; + } + + s->tag = g_strdup(tag); + s->vmstate = g_strdup(vmstate); + s->devices = QAPI_CLONE(strList, devices); + + job_start(&s->common); +} + +void qmp_snapshot_load(const char *job_id, + const char *tag, + const char *vmstate, + strList *devices, + Error **errp) +{ + SnapshotJob *s; + + s = job_create(job_id, &snapshot_load_job_driver, NULL, + qemu_get_aio_context(), JOB_MANUAL_DISMISS, + NULL, NULL, errp); + if (!s) { + return; + } + + s->tag = g_strdup(tag); + s->vmstate = g_strdup(vmstate); + s->devices = QAPI_CLONE(strList, devices); + + job_start(&s->common); +} + +void qmp_snapshot_delete(const char *job_id, + const char *tag, + strList *devices, + Error **errp) +{ + SnapshotJob *s; + + s = job_create(job_id, &snapshot_delete_job_driver, NULL, + qemu_get_aio_context(), JOB_MANUAL_DISMISS, + NULL, NULL, errp); + if (!s) { + return; + } + + s->tag = g_strdup(tag); + s->devices = QAPI_CLONE(strList, devices); + + job_start(&s->common); +} diff --git a/migration/savevm.h b/migration/savevm.h index ba64a7e271..6461342cb4 100644 --- a/migration/savevm.h +++ b/migration/savevm.h @@ -30,6 +30,7 @@ #define QEMU_VM_SECTION_FOOTER 0x7e bool qemu_savevm_state_blocked(Error **errp); +void qemu_savevm_non_migratable_list(strList **reasons); void qemu_savevm_state_setup(QEMUFile *f); bool qemu_savevm_state_guest_unplug_pending(void); int qemu_savevm_state_resume_prepare(MigrationState *s); @@ -64,5 +65,7 @@ int qemu_loadvm_state(QEMUFile *f); void qemu_loadvm_state_cleanup(void); int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); int qemu_load_device_state(QEMUFile *f); +int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, + bool in_postcopy, bool inactivate_disks); #endif diff --git a/migration/trace-events b/migration/trace-events index 75de5004ac..668c562fed 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -111,6 +111,8 @@ save_xbzrle_page_skipping(void) "" save_xbzrle_page_overflow(void) "" ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: 
%" PRIu64 " milliseconds, %d iterations" ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64 +ram_write_tracking_ramblock_start(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu" +ram_write_tracking_ramblock_stop(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu" # multifd.c multifd_new_send_channel_async(uint8_t id) "channel %d" diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c index a48bc1e904..3c88a4faef 100644 --- a/monitor/hmp-cmds.c +++ b/monitor/hmp-cmds.c @@ -224,6 +224,15 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict) migration_global_dump(mon); + if (info->blocked) { + strList *reasons = info->blocked_reasons; + monitor_printf(mon, "Outgoing migration blocked:\n"); + while (reasons) { + monitor_printf(mon, " %s\n", reasons->value); + reasons = reasons->next; + } + } + if (info->has_status) { monitor_printf(mon, "Migration status: %s", MigrationStatus_str(info->status)); @@ -1130,7 +1139,7 @@ void hmp_loadvm(Monitor *mon, const QDict *qdict) vm_stop(RUN_STATE_RESTORE_VM); - if (load_snapshot(name, &err) == 0 && saved_vm_running) { + if (!load_snapshot(name, NULL, false, NULL, &err) && saved_vm_running) { vm_start(); } hmp_handle_error(mon, err); @@ -1140,21 +1149,17 @@ void hmp_savevm(Monitor *mon, const QDict *qdict) { Error *err = NULL; - save_snapshot(qdict_get_try_str(qdict, "name"), &err); + save_snapshot(qdict_get_try_str(qdict, "name"), + true, NULL, false, NULL, &err); hmp_handle_error(mon, err); } void hmp_delvm(Monitor *mon, const QDict *qdict) { - BlockDriverState *bs; Error *err = NULL; const char *name = qdict_get_str(qdict, "name"); - if (bdrv_all_delete_snapshot(name, &bs, &err) < 0) { - error_prepend(&err, - "deleting snapshot on device '%s': ", - bdrv_get_device_name(bs)); - } + delete_snapshot(name, false, NULL, &err); hmp_handle_error(mon, err); } @@ -1294,11 +1299,11 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) switch (val) { case MIGRATION_PARAMETER_COMPRESS_LEVEL: p->has_compress_level = true; - visit_type_int(v, param, &p->compress_level, &err); + visit_type_uint8(v, param, &p->compress_level, &err); break; case MIGRATION_PARAMETER_COMPRESS_THREADS: p->has_compress_threads = true; - visit_type_int(v, param, &p->compress_threads, &err); + visit_type_uint8(v, param, &p->compress_threads, &err); break; case MIGRATION_PARAMETER_COMPRESS_WAIT_THREAD: p->has_compress_wait_thread = true; @@ -1306,19 +1311,19 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) break; case MIGRATION_PARAMETER_DECOMPRESS_THREADS: p->has_decompress_threads = true; - visit_type_int(v, param, &p->decompress_threads, &err); + visit_type_uint8(v, param, &p->decompress_threads, &err); break; case MIGRATION_PARAMETER_THROTTLE_TRIGGER_THRESHOLD: p->has_throttle_trigger_threshold = true; - visit_type_int(v, param, &p->throttle_trigger_threshold, &err); + visit_type_uint8(v, param, &p->throttle_trigger_threshold, &err); break; case MIGRATION_PARAMETER_CPU_THROTTLE_INITIAL: p->has_cpu_throttle_initial = true; - visit_type_int(v, param, &p->cpu_throttle_initial, &err); + visit_type_uint8(v, param, &p->cpu_throttle_initial, &err); break; case MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT: p->has_cpu_throttle_increment = true; - visit_type_int(v, param, &p->cpu_throttle_increment, &err); + visit_type_uint8(v, param, &p->cpu_throttle_increment, &err); break; case 
MIGRATION_PARAMETER_CPU_THROTTLE_TAILSLOW: p->has_cpu_throttle_tailslow = true; @@ -1326,7 +1331,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) break; case MIGRATION_PARAMETER_MAX_CPU_THROTTLE: p->has_max_cpu_throttle = true; - visit_type_int(v, param, &p->max_cpu_throttle, &err); + visit_type_uint8(v, param, &p->max_cpu_throttle, &err); break; case MIGRATION_PARAMETER_TLS_CREDS: p->has_tls_creds = true; @@ -1362,11 +1367,11 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) break; case MIGRATION_PARAMETER_DOWNTIME_LIMIT: p->has_downtime_limit = true; - visit_type_int(v, param, &p->downtime_limit, &err); + visit_type_size(v, param, &p->downtime_limit, &err); break; case MIGRATION_PARAMETER_X_CHECKPOINT_DELAY: p->has_x_checkpoint_delay = true; - visit_type_int(v, param, &p->x_checkpoint_delay, &err); + visit_type_uint32(v, param, &p->x_checkpoint_delay, &err); break; case MIGRATION_PARAMETER_BLOCK_INCREMENTAL: p->has_block_incremental = true; @@ -1374,7 +1379,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) break; case MIGRATION_PARAMETER_MULTIFD_CHANNELS: p->has_multifd_channels = true; - visit_type_int(v, param, &p->multifd_channels, &err); + visit_type_uint8(v, param, &p->multifd_channels, &err); break; case MIGRATION_PARAMETER_MULTIFD_COMPRESSION: p->has_multifd_compression = true; @@ -1383,11 +1388,11 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) break; case MIGRATION_PARAMETER_MULTIFD_ZLIB_LEVEL: p->has_multifd_zlib_level = true; - visit_type_int(v, param, &p->multifd_zlib_level, &err); + visit_type_uint8(v, param, &p->multifd_zlib_level, &err); break; case MIGRATION_PARAMETER_MULTIFD_ZSTD_LEVEL: p->has_multifd_zstd_level = true; - visit_type_int(v, param, &p->multifd_zstd_level, &err); + visit_type_uint8(v, param, &p->multifd_zstd_level, &err); break; case MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE: p->has_xbzrle_cache_size = true; diff --git a/qapi/job.json b/qapi/job.json index 280c2f76f1..1a6ef03451 100644 --- a/qapi/job.json +++ b/qapi/job.json @@ -22,10 +22,17 @@ # # @amend: image options amend job type, see "x-blockdev-amend" (since 5.1) # +# @snapshot-load: snapshot load job type, see "snapshot-load" (since 6.0) +# +# @snapshot-save: snapshot save job type, see "snapshot-save" (since 6.0) +# +# @snapshot-delete: snapshot delete job type, see "snapshot-delete" (since 6.0) +# # Since: 1.7 ## { 'enum': 'JobType', - 'data': ['commit', 'stream', 'mirror', 'backup', 'create', 'amend'] } + 'data': ['commit', 'stream', 'mirror', 'backup', 'create', 'amend', + 'snapshot-load', 'snapshot-save', 'snapshot-delete'] } ## # @JobStatus: diff --git a/qapi/migration.json b/qapi/migration.json index d1d9632c2a..ce14d78071 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -78,7 +78,7 @@ # Since: 1.2 ## { 'struct': 'XBZRLECacheStats', - 'data': {'cache-size': 'int', 'bytes': 'int', 'pages': 'int', + 'data': {'cache-size': 'size', 'bytes': 'int', 'pages': 'int', 'cache-miss': 'int', 'cache-miss-rate': 'number', 'encoding-rate': 'number', 'overflow': 'int' } } @@ -224,6 +224,10 @@ # only returned if VFIO device is present, migration is supported by all # VFIO devices and status is 'active' or 'completed' (since 5.2) # +# @blocked: True if outgoing migration is blocked (since 6.0) +# +# @blocked-reasons: A list of reasons an outgoing migration is blocked (since 6.0) +# # Since: 0.14 ## { 'struct': 'MigrationInfo', @@ -237,6 +241,8 @@ '*setup-time': 'int', '*cpu-throttle-percentage': 'int', '*error-desc': 'str', 
+ 'blocked': 'bool', + '*blocked-reasons': ['str'], '*postcopy-blocktime' : 'uint32', '*postcopy-vcpu-blocktime': ['uint32'], '*compression': 'CompressionStats', @@ -442,6 +448,11 @@ # @validate-uuid: Send the UUID of the source to allow the destination # to ensure it is the same. (since 4.2) # +# @background-snapshot: If enabled, the migration stream will be a snapshot +# of the VM exactly at the point when the migration +# procedure starts. The VM RAM is saved with running VM. +# (since 6.0) +# # Since: 1.2 ## { 'enum': 'MigrationCapability', @@ -449,7 +460,7 @@ 'compress', 'events', 'postcopy-ram', 'x-colo', 'release-ram', 'block', 'return-path', 'pause-before-switchover', 'multifd', 'dirty-bitmaps', 'postcopy-blocktime', 'late-block-activate', - 'x-ignore-shared', 'validate-uuid' ] } + 'x-ignore-shared', 'validate-uuid', 'background-snapshot'] } ## # @MigrationCapabilityStatus: @@ -885,28 +896,28 @@ '*announce-max': 'size', '*announce-rounds': 'size', '*announce-step': 'size', - '*compress-level': 'int', - '*compress-threads': 'int', + '*compress-level': 'uint8', + '*compress-threads': 'uint8', '*compress-wait-thread': 'bool', - '*decompress-threads': 'int', - '*throttle-trigger-threshold': 'int', - '*cpu-throttle-initial': 'int', - '*cpu-throttle-increment': 'int', + '*decompress-threads': 'uint8', + '*throttle-trigger-threshold': 'uint8', + '*cpu-throttle-initial': 'uint8', + '*cpu-throttle-increment': 'uint8', '*cpu-throttle-tailslow': 'bool', '*tls-creds': 'StrOrNull', '*tls-hostname': 'StrOrNull', '*tls-authz': 'StrOrNull', - '*max-bandwidth': 'int', - '*downtime-limit': 'int', - '*x-checkpoint-delay': 'int', + '*max-bandwidth': 'size', + '*downtime-limit': 'uint64', + '*x-checkpoint-delay': 'uint32', '*block-incremental': 'bool', - '*multifd-channels': 'int', + '*multifd-channels': 'uint8', '*xbzrle-cache-size': 'size', '*max-postcopy-bandwidth': 'size', - '*max-cpu-throttle': 'int', + '*max-cpu-throttle': 'uint8', '*multifd-compression': 'MultiFDCompression', - '*multifd-zlib-level': 'int', - '*multifd-zstd-level': 'int', + '*multifd-zlib-level': 'uint8', + '*multifd-zstd-level': 'uint8', '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ] } } ## @@ -1093,7 +1104,7 @@ '*max-bandwidth': 'size', '*downtime-limit': 'uint64', '*x-checkpoint-delay': 'uint32', - '*block-incremental': 'bool' , + '*block-incremental': 'bool', '*multifd-channels': 'uint8', '*xbzrle-cache-size': 'size', '*max-postcopy-bandwidth': 'size', @@ -1465,7 +1476,7 @@ # <- { "return": 67108864 } # ## -{ 'command': 'query-migrate-cache-size', 'returns': 'int', +{ 'command': 'query-migrate-cache-size', 'returns': 'size', 'features': [ 'deprecated' ] } ## @@ -1843,3 +1854,176 @@ # Since: 5.2 ## { 'command': 'query-dirty-rate', 'returns': 'DirtyRateInfo' } + +## +# @snapshot-save: +# +# Save a VM snapshot +# +# @job-id: identifier for the newly created job +# @tag: name of the snapshot to create +# @vmstate: block device node name to save vmstate to +# @devices: list of block device node names to save a snapshot to +# +# Applications should not assume that the snapshot save is complete +# when this command returns. The job commands / events must be used +# to determine completion and to fetch details of any errors that arise. +# +# Note that execution of the guest CPUs may be stopped during the +# time it takes to save the snapshot. A future version of QEMU +# may ensure CPUs are executing continuously. 
+# +# It is strongly recommended that @devices contain all writable +# block device nodes if a consistent snapshot is required. +# +# If @tag already exists, an error will be reported +# +# Returns: nothing +# +# Example: +# +# -> { "execute": "snapshot-save", +# "data": { +# "job-id": "snapsave0", +# "tag": "my-snap", +# "vmstate": "disk0", +# "devices": ["disk0", "disk1"] +# } +# } +# <- { "return": { } } +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "created", "id": "snapsave0"}} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "running", "id": "snapsave0"}} +# <- {"event": "STOP"} +# <- {"event": "RESUME"} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "waiting", "id": "snapsave0"}} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "pending", "id": "snapsave0"}} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "concluded", "id": "snapsave0"}} +# -> {"execute": "query-jobs"} +# <- {"return": [{"current-progress": 1, +# "status": "concluded", +# "total-progress": 1, +# "type": "snapshot-save", +# "id": "snapsave0"}]} +# +# Since: 6.0 +## +{ 'command': 'snapshot-save', + 'data': { 'job-id': 'str', + 'tag': 'str', + 'vmstate': 'str', + 'devices': ['str'] } } + +## +# @snapshot-load: +# +# Load a VM snapshot +# +# @job-id: identifier for the newly created job +# @tag: name of the snapshot to load. +# @vmstate: block device node name to load vmstate from +# @devices: list of block device node names to load a snapshot from +# +# Applications should not assume that the snapshot load is complete +# when this command returns. The job commands / events must be used +# to determine completion and to fetch details of any errors that arise. +# +# Note that execution of the guest CPUs will be stopped during the +# time it takes to load the snapshot. +# +# It is strongly recommended that @devices contain all writable +# block device nodes that can have changed since the original +# @snapshot-save command execution. +# +# Returns: nothing +# +# Example: +# +# -> { "execute": "snapshot-load", +# "data": { +# "job-id": "snapload0", +# "tag": "my-snap", +# "vmstate": "disk0", +# "devices": ["disk0", "disk1"] +# } +# } +# <- { "return": { } } +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "created", "id": "snapload0"}} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "running", "id": "snapload0"}} +# <- {"event": "STOP"} +# <- {"event": "RESUME"} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "waiting", "id": "snapload0"}} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "pending", "id": "snapload0"}} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "concluded", "id": "snapload0"}} +# -> {"execute": "query-jobs"} +# <- {"return": [{"current-progress": 1, +# "status": "concluded", +# "total-progress": 1, +# "type": "snapshot-load", +# "id": "snapload0"}]} +# +# Since: 6.0 +## +{ 'command': 'snapshot-load', + 'data': { 'job-id': 'str', + 'tag': 'str', + 'vmstate': 'str', + 'devices': ['str'] } } + +## +# @snapshot-delete: +# +# Delete a VM snapshot +# +# @job-id: identifier for the newly created job +# @tag: name of the snapshot to delete. +# @devices: list of block device node names to delete a snapshot from +# +# Applications should not assume that the snapshot delete is complete +# when this command returns. The job commands / events must be used +# to determine completion and to fetch details of any errors that arise. 
+# +# Returns: nothing +# +# Example: +# +# -> { "execute": "snapshot-delete", +# "data": { +# "job-id": "snapdelete0", +# "tag": "my-snap", +# "devices": ["disk0", "disk1"] +# } +# } +# <- { "return": { } } +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "created", "id": "snapdelete0"}} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "running", "id": "snapdelete0"}} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "waiting", "id": "snapdelete0"}} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "pending", "id": "snapdelete0"}} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "concluded", "id": "snapdelete0"}} +# -> {"execute": "query-jobs"} +# <- {"return": [{"current-progress": 1, +# "status": "concluded", +# "total-progress": 1, +# "type": "snapshot-delete", +# "id": "snapdelete0"}]} +# +# Since: 6.0 +## +{ 'command': 'snapshot-delete', + 'data': { 'job-id': 'str', + 'tag': 'str', + 'devices': ['str'] } } diff --git a/qom/object.c b/qom/object.c index 2fa0119647..491823db4a 100644 --- a/qom/object.c +++ b/qom/object.c @@ -442,7 +442,8 @@ static GPtrArray *object_compat_props[3]; * other than "-global". These are generally used for syntactic * sugar and legacy command line options. */ -void object_register_sugar_prop(const char *driver, const char *prop, const char *value) +void object_register_sugar_prop(const char *driver, const char *prop, + const char *value, bool optional) { GlobalProperty *g; if (!object_compat_props[2]) { @@ -452,6 +453,7 @@ void object_register_sugar_prop(const char *driver, const char *prop, const char g->driver = g_strdup(driver); g->property = g_strdup(prop); g->value = g_strdup(value); + g->optional = optional; g_ptr_array_add(object_compat_props[2], g); } diff --git a/replay/replay-debugging.c b/replay/replay-debugging.c index 5ec574724a..1cde50e9f3 100644 --- a/replay/replay-debugging.c +++ b/replay/replay-debugging.c @@ -143,12 +143,13 @@ static char *replay_find_nearest_snapshot(int64_t icount, QEMUSnapshotInfo *sn_tab; QEMUSnapshotInfo *nearest = NULL; char *ret = NULL; + int rv; int nb_sns, i; AioContext *aio_context; *snapshot_icount = -1; - bs = bdrv_all_find_vmstate_bs(); + bs = bdrv_all_find_vmstate_bs(NULL, false, NULL, NULL); if (!bs) { goto fail; } @@ -159,7 +160,10 @@ static char *replay_find_nearest_snapshot(int64_t icount, aio_context_release(aio_context); for (i = 0; i < nb_sns; i++) { - if (bdrv_all_find_snapshot(sn_tab[i].name, &bs) == 0) { + rv = bdrv_all_has_snapshot(sn_tab[i].name, false, NULL, NULL); + if (rv < 0) + goto fail; + if (rv == 1) { if (sn_tab[i].icount != -1ULL && sn_tab[i].icount <= icount && (!nearest || nearest->icount < sn_tab[i].icount)) { @@ -192,7 +196,7 @@ static void replay_seek(int64_t icount, QEMUTimerCB callback, Error **errp) if (icount < replay_get_current_icount() || replay_get_current_icount() < snapshot_icount) { vm_stop(RUN_STATE_RESTORE_VM); - load_snapshot(snapshot, errp); + load_snapshot(snapshot, NULL, false, NULL, errp); } g_free(snapshot); } @@ -323,7 +327,7 @@ void replay_gdb_attached(void) */ if (replay_mode == REPLAY_MODE_PLAY && !replay_snapshot) { - if (save_snapshot("start_debugging", NULL) != 0) { + if (!save_snapshot("start_debugging", true, NULL, false, NULL, NULL)) { /* Can't create the snapshot. Continue conventional debugging. 
*/ } } diff --git a/replay/replay-snapshot.c b/replay/replay-snapshot.c index e26fa4c892..e8767a1937 100644 --- a/replay/replay-snapshot.c +++ b/replay/replay-snapshot.c @@ -77,13 +77,14 @@ void replay_vmstate_init(void) if (replay_snapshot) { if (replay_mode == REPLAY_MODE_RECORD) { - if (save_snapshot(replay_snapshot, &err) != 0) { + if (!save_snapshot(replay_snapshot, + true, NULL, false, NULL, &err)) { error_report_err(err); error_report("Could not create snapshot for icount record"); exit(1); } } else if (replay_mode == REPLAY_MODE_PLAY) { - if (load_snapshot(replay_snapshot, &err) != 0) { + if (!load_snapshot(replay_snapshot, NULL, false, NULL, &err)) { error_report_err(err); error_report("Could not load snapshot for icount replay"); exit(1); diff --git a/scripts/mtest2make.py b/scripts/mtest2make.py index 25ee6887cf..cbbcba100d 100644 --- a/scripts/mtest2make.py +++ b/scripts/mtest2make.py @@ -110,6 +110,7 @@ def emit_suite(name, suite, prefix): print('ifneq ($(filter %s %s, $(MAKECMDGOALS)),)' % (target, prefix)) print('.tests += $(.test.$(SPEED).%s)' % (target, )) print('endif') + print('all-%s-targets += %s' % (prefix, target)) targets = {t['id']: [os.path.relpath(f) for f in t['filename']] for t in introspect['targets']} diff --git a/scripts/qapi/commands.py b/scripts/qapi/commands.py index 50978090b4..54af519f44 100644 --- a/scripts/qapi/commands.py +++ b/scripts/qapi/commands.py @@ -23,7 +23,6 @@ from typing import ( from .common import c_name, mcgen from .gen import ( QAPIGenC, - QAPIGenCCode, QAPISchemaModularCVisitor, build_params, ifcontext, @@ -126,6 +125,9 @@ def gen_marshal(name: str, boxed: bool, ret_type: Optional[QAPISchemaType]) -> str: have_args = boxed or (arg_type and not arg_type.is_empty()) + if have_args: + assert arg_type is not None + arg_type_c_name = arg_type.c_name() ret = mcgen(''' @@ -147,7 +149,7 @@ def gen_marshal(name: str, ret += mcgen(''' %(c_name)s arg = {0}; ''', - c_name=arg_type.c_name()) + c_name=arg_type_c_name) ret += mcgen(''' @@ -163,7 +165,7 @@ def gen_marshal(name: str, ok = visit_check_struct(v, errp); } ''', - c_arg_type=arg_type.c_name()) + c_arg_type=arg_type_c_name) else: ret += mcgen(''' ok = visit_check_struct(v, errp); @@ -193,7 +195,7 @@ out: ret += mcgen(''' visit_type_%(c_arg_type)s_members(v, &arg, NULL); ''', - c_arg_type=arg_type.c_name()) + c_arg_type=arg_type_c_name) ret += mcgen(''' visit_end_struct(v, NULL); @@ -234,28 +236,11 @@ def gen_register_command(name: str, return ret -def gen_registry(registry: str, prefix: str) -> str: - ret = mcgen(''' - -void %(c_prefix)sqmp_init_marshal(QmpCommandList *cmds) -{ - QTAILQ_INIT(cmds); - -''', - c_prefix=c_name(prefix, protect=False)) - ret += registry - ret += mcgen(''' -} -''') - return ret - - class QAPISchemaGenCommandVisitor(QAPISchemaModularCVisitor): def __init__(self, prefix: str): super().__init__( prefix, 'qapi-commands', ' * Schema-defined QAPI/QMP commands', None, __doc__) - self._regy = QAPIGenCCode(None) self._visited_ret_types: Dict[QAPIGenC, Set[QAPISchemaType]] = {} def _begin_user_module(self, name: str) -> None: @@ -282,25 +267,36 @@ class QAPISchemaGenCommandVisitor(QAPISchemaModularCVisitor): ''', types=types)) - def visit_end(self) -> None: - self._add_system_module('init', ' * QAPI Commands initialization') + def visit_begin(self, schema: QAPISchema) -> None: + self._add_module('./init', ' * QAPI Commands initialization') self._genh.add(mcgen(''' #include "qapi/qmp/dispatch.h" void %(c_prefix)sqmp_init_marshal(QmpCommandList *cmds); ''', 
c_prefix=c_name(self._prefix, protect=False))) - self._genc.preamble_add(mcgen(''' + self._genc.add(mcgen(''' #include "qemu/osdep.h" #include "%(prefix)sqapi-commands.h" #include "%(prefix)sqapi-init-commands.h" + +void %(c_prefix)sqmp_init_marshal(QmpCommandList *cmds) +{ + QTAILQ_INIT(cmds); + ''', - prefix=self._prefix)) - self._genc.add(gen_registry(self._regy.get_content(), self._prefix)) + prefix=self._prefix, + c_prefix=c_name(self._prefix, protect=False))) + + def visit_end(self) -> None: + with self._temp_module('./init'): + self._genc.add(mcgen(''' +} +''')) def visit_command(self, name: str, - info: QAPISourceInfo, + info: Optional[QAPISourceInfo], ifcond: List[str], features: List[QAPISchemaFeature], arg_type: Optional[QAPISchemaObjectType], @@ -321,15 +317,17 @@ void %(c_prefix)sqmp_init_marshal(QmpCommandList *cmds); if ret_type and ret_type not in self._visited_ret_types[self._genc]: self._visited_ret_types[self._genc].add(ret_type) with ifcontext(ret_type.ifcond, - self._genh, self._genc, self._regy): + self._genh, self._genc): self._genc.add(gen_marshal_output(ret_type)) - with ifcontext(ifcond, self._genh, self._genc, self._regy): + with ifcontext(ifcond, self._genh, self._genc): self._genh.add(gen_command_decl(name, arg_type, boxed, ret_type)) self._genh.add(gen_marshal_decl(name)) self._genc.add(gen_marshal(name, arg_type, boxed, ret_type)) - self._regy.add(gen_register_command(name, success_response, - allow_oob, allow_preconfig, - coroutine)) + with self._temp_module('./init'): + with ifcontext(ifcond, self._genh, self._genc): + self._genc.add(gen_register_command(name, success_response, + allow_oob, allow_preconfig, + coroutine)) def gen_commands(schema: QAPISchema, diff --git a/scripts/qapi/events.py b/scripts/qapi/events.py index 599f3d1f56..8c57deb2b8 100644 --- a/scripts/qapi/events.py +++ b/scripts/qapi/events.py @@ -12,7 +12,7 @@ This work is licensed under the terms of the GNU GPL, version 2. See the COPYING file in the top-level directory. 
""" -from typing import List +from typing import List, Optional from .common import c_enum_const, c_name, mcgen from .gen import QAPISchemaModularCVisitor, build_params, ifcontext @@ -27,7 +27,7 @@ from .types import gen_enum, gen_enum_lookup def build_event_send_proto(name: str, - arg_type: QAPISchemaObjectType, + arg_type: Optional[QAPISchemaObjectType], boxed: bool) -> str: return 'void qapi_event_send_%(c_name)s(%(param)s)' % { 'c_name': c_name(name.lower()), @@ -35,7 +35,7 @@ def build_event_send_proto(name: str, def gen_event_send_decl(name: str, - arg_type: QAPISchemaObjectType, + arg_type: Optional[QAPISchemaObjectType], boxed: bool) -> str: return mcgen(''' @@ -78,7 +78,7 @@ def gen_param_var(typ: QAPISchemaObjectType) -> str: def gen_event_send(name: str, - arg_type: QAPISchemaObjectType, + arg_type: Optional[QAPISchemaObjectType], boxed: bool, event_enum_name: str, event_emit: str) -> str: @@ -99,6 +99,7 @@ def gen_event_send(name: str, proto=build_event_send_proto(name, arg_type, boxed)) if have_args: + assert arg_type is not None ret += mcgen(''' QObject *obj; Visitor *v; @@ -114,6 +115,7 @@ def gen_event_send(name: str, name=name) if have_args: + assert arg_type is not None ret += mcgen(''' v = qobject_output_visitor_new(&obj); ''') @@ -189,7 +191,7 @@ class QAPISchemaGenEventVisitor(QAPISchemaModularCVisitor): types=types)) def visit_end(self) -> None: - self._add_system_module('emit', ' * QAPI Events emission') + self._add_module('./emit', ' * QAPI Events emission') self._genc.preamble_add(mcgen(''' #include "qemu/osdep.h" #include "%(prefix)sqapi-emit-events.h" @@ -211,10 +213,10 @@ void %(event_emit)s(%(event_enum)s event, QDict *qdict); def visit_event(self, name: str, - info: QAPISourceInfo, + info: Optional[QAPISourceInfo], ifcond: List[str], features: List[QAPISchemaFeature], - arg_type: QAPISchemaObjectType, + arg_type: Optional[QAPISchemaObjectType], boxed: bool) -> None: with ifcontext(ifcond, self._genh, self._genc): self._genh.add(gen_event_send_decl(name, arg_type, boxed)) diff --git a/scripts/qapi/gen.py b/scripts/qapi/gen.py index b40f18eee3..63549cc8d4 100644 --- a/scripts/qapi/gen.py +++ b/scripts/qapi/gen.py @@ -31,12 +31,16 @@ from .common import ( guardstart, mcgen, ) -from .schema import QAPISchemaObjectType, QAPISchemaVisitor +from .schema import ( + QAPISchemaModule, + QAPISchemaObjectType, + QAPISchemaVisitor, +) from .source import QAPISourceInfo class QAPIGen: - def __init__(self, fname: Optional[str]): + def __init__(self, fname: str): self.fname = fname self._preamble = '' self._body = '' @@ -121,7 +125,7 @@ def build_params(arg_type: Optional[QAPISchemaObjectType], class QAPIGenCCode(QAPIGen): - def __init__(self, fname: Optional[str]): + def __init__(self, fname: str): super().__init__(fname) self._start_if: Optional[Tuple[List[str], str, str]] = None @@ -130,15 +134,12 @@ class QAPIGenCCode(QAPIGen): self._start_if = (ifcond, self._body, self._preamble) def end_if(self) -> None: - assert self._start_if - self._wrap_ifcond() - self._start_if = None - - def _wrap_ifcond(self) -> None: + assert self._start_if is not None self._body = _wrap_ifcond(self._start_if[0], self._start_if[1], self._body) self._preamble = _wrap_ifcond(self._start_if[0], self._start_if[2], self._preamble) + self._start_if = None def get_content(self) -> str: assert self._start_if is None @@ -243,85 +244,88 @@ class QAPISchemaModularCVisitor(QAPISchemaVisitor): self._user_blurb = user_blurb self._builtin_blurb = builtin_blurb self._pydoc = pydoc - self._genc: 
Optional[QAPIGenC] = None - self._genh: Optional[QAPIGenH] = None - self._module: Dict[Optional[str], Tuple[QAPIGenC, QAPIGenH]] = {} + self._current_module: Optional[str] = None + self._module: Dict[str, Tuple[QAPIGenC, QAPIGenH]] = {} self._main_module: Optional[str] = None - @staticmethod - def _is_user_module(name: Optional[str]) -> bool: - return bool(name and not name.startswith('./')) + @property + def _genc(self) -> QAPIGenC: + assert self._current_module is not None + return self._module[self._current_module][0] - @staticmethod - def _is_builtin_module(name: Optional[str]) -> bool: - return not name + @property + def _genh(self) -> QAPIGenH: + assert self._current_module is not None + return self._module[self._current_module][1] - def _module_dirname(self, name: Optional[str]) -> str: - if self._is_user_module(name): + @staticmethod + def _module_dirname(name: str) -> str: + if QAPISchemaModule.is_user_module(name): return os.path.dirname(name) return '' - def _module_basename(self, what: str, name: Optional[str]) -> str: - ret = '' if self._is_builtin_module(name) else self._prefix - if self._is_user_module(name): + def _module_basename(self, what: str, name: str) -> str: + ret = '' if QAPISchemaModule.is_builtin_module(name) else self._prefix + if QAPISchemaModule.is_user_module(name): basename = os.path.basename(name) ret += what if name != self._main_module: ret += '-' + os.path.splitext(basename)[0] else: - name = name[2:] if name else 'builtin' - ret += re.sub(r'-', '-' + name + '-', what) + assert QAPISchemaModule.is_system_module(name) + ret += re.sub(r'-', '-' + name[2:] + '-', what) return ret - def _module_filename(self, what: str, name: Optional[str]) -> str: + def _module_filename(self, what: str, name: str) -> str: return os.path.join(self._module_dirname(name), self._module_basename(what, name)) - def _add_module(self, name: Optional[str], blurb: str) -> None: + def _add_module(self, name: str, blurb: str) -> None: + if QAPISchemaModule.is_user_module(name): + if self._main_module is None: + self._main_module = name basename = self._module_filename(self._what, name) genc = QAPIGenC(basename + '.c', blurb, self._pydoc) genh = QAPIGenH(basename + '.h', blurb, self._pydoc) self._module[name] = (genc, genh) - self._genc, self._genh = self._module[name] - - def _add_user_module(self, name: str, blurb: str) -> None: - assert self._is_user_module(name) - if self._main_module is None: - self._main_module = name - self._add_module(name, blurb) + self._current_module = name - def _add_system_module(self, name: Optional[str], blurb: str) -> None: - self._add_module(name and './' + name, blurb) + @contextmanager + def _temp_module(self, name: str) -> Iterator[None]: + old_module = self._current_module + self._current_module = name + yield + self._current_module = old_module def write(self, output_dir: str, opt_builtins: bool = False) -> None: for name in self._module: - if self._is_builtin_module(name) and not opt_builtins: + if QAPISchemaModule.is_builtin_module(name) and not opt_builtins: continue (genc, genh) = self._module[name] genc.write(output_dir) genh.write(output_dir) - def _begin_system_module(self, name: None) -> None: + def _begin_builtin_module(self) -> None: pass def _begin_user_module(self, name: str) -> None: pass - def visit_module(self, name: Optional[str]) -> None: - if name is None: + def visit_module(self, name: str) -> None: + if QAPISchemaModule.is_builtin_module(name): if self._builtin_blurb: - self._add_system_module(None, self._builtin_blurb) - 
self._begin_system_module(name) + self._add_module(name, self._builtin_blurb) + self._begin_builtin_module() else: # The built-in module has not been created. No code may # be generated. - self._genc = None - self._genh = None + self._current_module = None else: - self._add_user_module(name, self._user_blurb) + assert QAPISchemaModule.is_user_module(name) + self._add_module(name, self._user_blurb) self._begin_user_module(name) - def visit_include(self, name: str, info: QAPISourceInfo) -> None: + def visit_include(self, name: str, info: Optional[QAPISourceInfo]) -> None: relname = os.path.relpath(self._module_filename(self._what, name), os.path.dirname(self._genh.fname)) self._genh.preamble_add(mcgen(''' diff --git a/scripts/qapi/main.py b/scripts/qapi/main.py index 42517210b8..703e7ed1ed 100644 --- a/scripts/qapi/main.py +++ b/scripts/qapi/main.py @@ -23,6 +23,8 @@ from .visit import gen_visit def invalid_prefix_char(prefix: str) -> Optional[str]: match = re.match(r'([A-Za-z_.-][A-Za-z0-9_.-]*)?', prefix) + # match cannot be None, but mypy cannot infer that. + assert match is not None if match.end() != len(prefix): return prefix[match.end()] return None diff --git a/scripts/qapi/mypy.ini b/scripts/qapi/mypy.ini index 74fc6c8215..04bd5db527 100644 --- a/scripts/qapi/mypy.ini +++ b/scripts/qapi/mypy.ini @@ -1,6 +1,5 @@ [mypy] strict = True -strict_optional = False disallow_untyped_calls = False python_version = 3.6 diff --git a/scripts/qapi/schema.py b/scripts/qapi/schema.py index 720449feee..353e8020a2 100644 --- a/scripts/qapi/schema.py +++ b/scripts/qapi/schema.py @@ -68,7 +68,8 @@ class QAPISchemaEntity: def _set_module(self, schema, info): assert self._checked - self._module = schema.module_by_fname(info and info.fname) + fname = info.fname if info else QAPISchemaModule.BUILTIN_MODULE_NAME + self._module = schema.module_by_fname(fname) self._module.add_entity(self) def set_module(self, schema): @@ -137,10 +138,40 @@ class QAPISchemaVisitor: class QAPISchemaModule: + + BUILTIN_MODULE_NAME = './builtin' + def __init__(self, name): self.name = name self._entity_list = [] + @staticmethod + def is_system_module(name: str) -> bool: + """ + System modules are internally defined modules. + + Their names start with the "./" prefix. + """ + return name.startswith('./') + + @classmethod + def is_user_module(cls, name: str) -> bool: + """ + User modules are those defined by the user in qapi JSON files. + + They do not start with the "./" prefix. + """ + return not cls.is_system_module(name) + + @classmethod + def is_builtin_module(cls, name: str) -> bool: + """ + The built-in module is a single System module for the built-in types. + + It is always "./builtin". 
+ """ + return name == cls.BUILTIN_MODULE_NAME + def add_entity(self, ent): self._entity_list.append(ent) @@ -825,7 +856,7 @@ class QAPISchema: self._entity_dict = {} self._module_dict = OrderedDict() self._schema_dir = os.path.dirname(fname) - self._make_module(None) # built-ins + self._make_module(QAPISchemaModule.BUILTIN_MODULE_NAME) self._make_module(fname) self._predefining = True self._def_predefineds() @@ -870,9 +901,9 @@ class QAPISchema: info, "%s uses unknown type '%s'" % (what, name)) return typ - def _module_name(self, fname): - if fname is None: - return None + def _module_name(self, fname: str) -> str: + if QAPISchemaModule.is_system_module(fname): + return fname return os.path.relpath(fname, self._schema_dir) def _make_module(self, fname): @@ -883,7 +914,6 @@ class QAPISchema: def module_by_fname(self, fname): name = self._module_name(fname) - assert name in self._module_dict return self._module_dict[name] def _def_include(self, expr, info, doc): diff --git a/scripts/qapi/types.py b/scripts/qapi/types.py index 2b4916cdaa..2bdd626847 100644 --- a/scripts/qapi/types.py +++ b/scripts/qapi/types.py @@ -271,7 +271,7 @@ class QAPISchemaGenTypeVisitor(QAPISchemaModularCVisitor): prefix, 'qapi-types', ' * Schema-defined QAPI types', ' * Built-in QAPI types', __doc__) - def _begin_system_module(self, name: None) -> None: + def _begin_builtin_module(self) -> None: self._genc.preamble_add(mcgen(''' #include "qemu/osdep.h" #include "qapi/dealloc-visitor.h" @@ -350,7 +350,7 @@ class QAPISchemaGenTypeVisitor(QAPISchemaModularCVisitor): def visit_alternate_type(self, name: str, - info: QAPISourceInfo, + info: Optional[QAPISourceInfo], ifcond: List[str], features: List[QAPISchemaFeature], variants: QAPISchemaVariants) -> None: diff --git a/scripts/qapi/visit.py b/scripts/qapi/visit.py index 339f152152..22e62df901 100644 --- a/scripts/qapi/visit.py +++ b/scripts/qapi/visit.py @@ -305,7 +305,7 @@ class QAPISchemaGenVisitVisitor(QAPISchemaModularCVisitor): prefix, 'qapi-visit', ' * Schema-defined QAPI visitors', ' * Built-in QAPI visitors', __doc__) - def _begin_system_module(self, name: None) -> None: + def _begin_builtin_module(self) -> None: self._genc.preamble_add(mcgen(''' #include "qemu/osdep.h" #include "qapi/error.h" @@ -336,7 +336,7 @@ class QAPISchemaGenVisitVisitor(QAPISchemaModularCVisitor): def visit_enum_type(self, name: str, - info: QAPISourceInfo, + info: Optional[QAPISourceInfo], ifcond: List[str], features: List[QAPISchemaFeature], members: List[QAPISchemaEnumMember], @@ -378,7 +378,7 @@ class QAPISchemaGenVisitVisitor(QAPISchemaModularCVisitor): def visit_alternate_type(self, name: str, - info: QAPISourceInfo, + info: Optional[QAPISourceInfo], ifcond: List[str], features: List[QAPISchemaFeature], variants: QAPISchemaVariants) -> None: diff --git a/scripts/userfaultfd-wrlat.py b/scripts/userfaultfd-wrlat.py new file mode 100755 index 0000000000..0684be4e04 --- /dev/null +++ b/scripts/userfaultfd-wrlat.py @@ -0,0 +1,122 @@ +#!/usr/bin/python3 +# +# userfaultfd-wrlat Summarize userfaultfd write fault latencies. +# Events are continuously accumulated for the +# run, while latency distribution histogram is +# dumped each 'interval' seconds. +# +# For Linux, uses BCC, eBPF. +# +# USAGE: userfaultfd-lat [interval [count]] +# +# Copyright Virtuozzo GmbH, 2020 +# +# Authors: +# Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> +# +# This work is licensed under the terms of the GNU GPL, version 2 or +# later. See the COPYING file in the top-level directory. 
+ +from __future__ import print_function +from bcc import BPF +from ctypes import c_ushort, c_int, c_ulonglong +from time import sleep +from sys import argv + +def usage(): + print("USAGE: %s [interval [count]]" % argv[0]) + exit() + +# define BPF program +bpf_text = """ +#include <uapi/linux/ptrace.h> +#include <linux/mm.h> + +BPF_HASH(ev_start, u32, u64); +BPF_HISTOGRAM(ev_delta_hist, u64); + +/* Trace UFFD page fault start event. */ +static void do_event_start() +{ + /* Using "(u32)" to drop group ID which is upper 32 bits */ + u32 tid = (u32) bpf_get_current_pid_tgid(); + u64 ts = bpf_ktime_get_ns(); + + ev_start.update(&tid, &ts); +} + +/* Trace UFFD page fault end event. */ +static void do_event_end() +{ + /* Using "(u32)" to drop group ID which is upper 32 bits */ + u32 tid = (u32) bpf_get_current_pid_tgid(); + u64 ts = bpf_ktime_get_ns(); + u64 *tsp; + + tsp = ev_start.lookup(&tid); + if (tsp) { + u64 delta = ts - (*tsp); + /* Transform time delta to milliseconds */ + ev_delta_hist.increment(bpf_log2l(delta / 1000000)); + ev_start.delete(&tid); + } +} + +/* KPROBE for handle_userfault(). */ +int probe_handle_userfault(struct pt_regs *ctx, struct vm_fault *vmf, + unsigned long reason) +{ + /* Trace only UFFD write faults. */ + if (reason & VM_UFFD_WP) { + do_event_start(); + } + return 0; +} + +/* KRETPROBE for handle_userfault(). */ +int retprobe_handle_userfault(struct pt_regs *ctx) +{ + do_event_end(); + return 0; +} +""" + +# arguments +interval = 10 +count = -1 +if len(argv) > 1: + try: + interval = int(argv[1]) + if interval == 0: + raise + if len(argv) > 2: + count = int(argv[2]) + except: # also catches -h, --help + usage() + +# load BPF program +b = BPF(text=bpf_text) +# attach KRPOBEs +b.attach_kprobe(event="handle_userfault", fn_name="probe_handle_userfault") +b.attach_kretprobe(event="handle_userfault", fn_name="retprobe_handle_userfault") + +# header +print("Tracing UFFD-WP write fault latency... 
Hit Ctrl-C to end.") + +# output +loop = 0 +do_exit = 0 +while (1): + if count > 0: + loop += 1 + if loop > count: + exit() + try: + sleep(interval) + except KeyboardInterrupt: + pass; do_exit = 1 + + print() + b["ev_delta_hist"].print_log2_hist("msecs") + if do_exit: + exit() diff --git a/softmmu/rtc.c b/softmmu/rtc.c index e1e15ef613..5632684fc9 100644 --- a/softmmu/rtc.c +++ b/softmmu/rtc.c @@ -179,7 +179,8 @@ void configure_rtc(QemuOpts *opts) if (!strcmp(value, "slew")) { object_register_sugar_prop("mc146818rtc", "lost_tick_policy", - "slew"); + "slew", + false); } else if (!strcmp(value, "none")) { /* discard is default */ } else { diff --git a/softmmu/vl.c b/softmmu/vl.c index 2bf94ece9c..b219ce1f35 100644 --- a/softmmu/vl.c +++ b/softmmu/vl.c @@ -101,6 +101,7 @@ #include "qemu/plugin.h" #include "qemu/queue.h" #include "sysemu/arch_init.h" +#include "exec/confidential-guest-support.h" #include "ui/qemu-spice.h" #include "qapi/string-input-visitor.h" @@ -1663,16 +1664,20 @@ static int machine_set_property(void *opaque, return 0; } if (g_str_equal(qom_name, "igd-passthru")) { - object_register_sugar_prop(ACCEL_CLASS_NAME("xen"), qom_name, value); + object_register_sugar_prop(ACCEL_CLASS_NAME("xen"), qom_name, value, + false); return 0; } if (g_str_equal(qom_name, "kvm-shadow-mem")) { - object_register_sugar_prop(ACCEL_CLASS_NAME("kvm"), qom_name, value); + object_register_sugar_prop(ACCEL_CLASS_NAME("kvm"), qom_name, value, + false); return 0; } if (g_str_equal(qom_name, "kernel-irqchip")) { - object_register_sugar_prop(ACCEL_CLASS_NAME("kvm"), qom_name, value); - object_register_sugar_prop(ACCEL_CLASS_NAME("whpx"), qom_name, value); + object_register_sugar_prop(ACCEL_CLASS_NAME("kvm"), qom_name, value, + false); + object_register_sugar_prop(ACCEL_CLASS_NAME("whpx"), qom_name, value, + false); return 0; } @@ -2298,9 +2303,10 @@ static void qemu_process_sugar_options(void) val = g_strdup_printf("%d", (uint32_t) qemu_opt_get_number(qemu_find_opts_singleton("smp-opts"), "cpus", 1)); - object_register_sugar_prop("memory-backend", "prealloc-threads", val); + object_register_sugar_prop("memory-backend", "prealloc-threads", val, + false); g_free(val); - object_register_sugar_prop("memory-backend", "prealloc", "on"); + object_register_sugar_prop("memory-backend", "prealloc", "on", false); } if (watchdog) { @@ -2493,6 +2499,8 @@ static void qemu_create_cli_devices(void) static void qemu_machine_creation_done(void) { + MachineState *machine = MACHINE(qdev_get_machine()); + /* Did we create any drives that we failed to create a device for? 
*/ drive_check_orphaned(); @@ -2512,6 +2520,13 @@ static void qemu_machine_creation_done(void) qdev_machine_creation_done(); + if (machine->cgs) { + /* + * Verify that Confidential Guest Support has actually been initialized + */ + assert(machine->cgs->ready); + } + if (foreach_device_config(DEV_GDB, gdbserver_start) < 0) { exit(1); } @@ -2530,7 +2545,7 @@ void qmp_x_exit_preconfig(Error **errp) if (loadvm) { Error *local_err = NULL; - if (load_snapshot(loadvm, &local_err) < 0) { + if (!load_snapshot(loadvm, NULL, false, NULL, &local_err)) { error_report_err(local_err); autostart = 0; exit(1); diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 6dc1ee052d..4788139128 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -42,6 +42,7 @@ #include "hw/i386/intel_iommu.h" #include "hw/i386/x86-iommu.h" #include "hw/i386/e820_memory_layout.h" +#include "sysemu/sev.h" #include "hw/pci/pci.h" #include "hw/pci/msi.h" @@ -2135,6 +2136,25 @@ int kvm_arch_init(MachineState *ms, KVMState *s) uint64_t shadow_mem; int ret; struct utsname utsname; + Error *local_err = NULL; + + /* + * Initialize SEV context, if required + * + * If no memory encryption is requested (ms->cgs == NULL) this is + * a no-op. + * + * It's also a no-op if a non-SEV confidential guest support + * mechanism is selected. SEV is the only mechanism available to + * select on x86 at present, so this doesn't arise, but if new + * mechanisms are supported in future (e.g. TDX), they'll need + * their own initialization either here or elsewhere. + */ + ret = sev_kvm_init(ms->cgs, &local_err); + if (ret < 0) { + error_report_err(local_err); + return ret; + } if (!kvm_check_extension(s, KVM_CAP_IRQ_ROUTING)) { error_report("kvm: KVM_CAP_IRQ_ROUTING not supported by KVM"); diff --git a/target/i386/sev-stub.c b/target/i386/sev-stub.c index c1fecc2101..1ac1fd5b94 100644 --- a/target/i386/sev-stub.c +++ b/target/i386/sev-stub.c @@ -54,3 +54,8 @@ int sev_inject_launch_secret(const char *hdr, const char *secret, { return 1; } + +int sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp) +{ + return 0; +} diff --git a/target/i386/sev.c b/target/i386/sev.c index 1546606811..11c9a3cc21 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -31,6 +31,7 @@ #include "qom/object.h" #include "exec/address-spaces.h" #include "monitor/monitor.h" +#include "exec/confidential-guest-support.h" #define TYPE_SEV_GUEST "sev-guest" OBJECT_DECLARE_SIMPLE_TYPE(SevGuestState, SEV_GUEST) @@ -47,7 +48,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(SevGuestState, SEV_GUEST) * -machine ...,memory-encryption=sev0 */ struct SevGuestState { - Object parent_obj; + ConfidentialGuestSupport parent_obj; /* configuration parameters */ char *sev_device; @@ -322,7 +323,7 @@ sev_guest_instance_init(Object *obj) /* sev guest info */ static const TypeInfo sev_guest_info = { - .parent = TYPE_OBJECT, + .parent = TYPE_CONFIDENTIAL_GUEST_SUPPORT, .name = TYPE_SEV_GUEST, .instance_size = sizeof(SevGuestState), .instance_finalize = sev_guest_finalize, @@ -334,26 +335,6 @@ static const TypeInfo sev_guest_info = { } }; -static SevGuestState * -lookup_sev_guest_info(const char *id) -{ - Object *obj; - SevGuestState *info; - - obj = object_resolve_path_component(object_get_objects_root(), id); - if (!obj) { - return NULL; - } - - info = (SevGuestState *) - object_dynamic_cast(obj, TYPE_SEV_GUEST); - if (!info) { - return NULL; - } - - return info; -} - bool sev_enabled(void) { @@ -681,27 +662,24 @@ sev_vm_state_change(void *opaque, int running, RunState state) } } -void * 
-sev_guest_init(const char *id) +int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) { - SevGuestState *sev; + SevGuestState *sev + = (SevGuestState *)object_dynamic_cast(OBJECT(cgs), TYPE_SEV_GUEST); char *devname; int ret, fw_error; uint32_t ebx; uint32_t host_cbitpos; struct sev_user_data_status status = {}; + if (!sev) { + return 0; + } + ret = ram_block_discard_disable(true); if (ret) { error_report("%s: cannot disable RAM discard", __func__); - return NULL; - } - - sev = lookup_sev_guest_info(id); - if (!sev) { - error_report("%s: '%s' is not a valid '%s' object", - __func__, id, TYPE_SEV_GUEST); - goto err; + return -1; } sev_guest = sev; @@ -711,14 +689,14 @@ sev_guest_init(const char *id) host_cbitpos = ebx & 0x3f; if (host_cbitpos != sev->cbitpos) { - error_report("%s: cbitpos check failed, host '%d' requested '%d'", - __func__, host_cbitpos, sev->cbitpos); + error_setg(errp, "%s: cbitpos check failed, host '%d' requested '%d'", + __func__, host_cbitpos, sev->cbitpos); goto err; } if (sev->reduced_phys_bits < 1) { - error_report("%s: reduced_phys_bits check failed, it should be >=1," - " requested '%d'", __func__, sev->reduced_phys_bits); + error_setg(errp, "%s: reduced_phys_bits check failed, it should be >=1," + " requested '%d'", __func__, sev->reduced_phys_bits); goto err; } @@ -727,20 +705,19 @@ sev_guest_init(const char *id) devname = object_property_get_str(OBJECT(sev), "sev-device", NULL); sev->sev_fd = open(devname, O_RDWR); if (sev->sev_fd < 0) { - error_report("%s: Failed to open %s '%s'", __func__, - devname, strerror(errno)); - } - g_free(devname); - if (sev->sev_fd < 0) { + error_setg(errp, "%s: Failed to open %s '%s'", __func__, + devname, strerror(errno)); + g_free(devname); goto err; } + g_free(devname); ret = sev_platform_ioctl(sev->sev_fd, SEV_PLATFORM_STATUS, &status, &fw_error); if (ret) { - error_report("%s: failed to get platform status ret=%d " - "fw_error='%d: %s'", __func__, ret, fw_error, - fw_error_to_str(fw_error)); + error_setg(errp, "%s: failed to get platform status ret=%d " + "fw_error='%d: %s'", __func__, ret, fw_error, + fw_error_to_str(fw_error)); goto err; } sev->build_id = status.build; @@ -750,14 +727,14 @@ sev_guest_init(const char *id) trace_kvm_sev_init(); ret = sev_ioctl(sev->sev_fd, KVM_SEV_INIT, NULL, &fw_error); if (ret) { - error_report("%s: failed to initialize ret=%d fw_error=%d '%s'", - __func__, ret, fw_error, fw_error_to_str(fw_error)); + error_setg(errp, "%s: failed to initialize ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); goto err; } ret = sev_launch_start(sev); if (ret) { - error_report("%s: failed to create encryption context", __func__); + error_setg(errp, "%s: failed to create encryption context", __func__); goto err; } @@ -765,23 +742,29 @@ sev_guest_init(const char *id) qemu_add_machine_init_done_notifier(&sev_machine_done_notify); qemu_add_vm_change_state_handler(sev_vm_state_change, sev); - return sev; + cgs->ready = true; + + return 0; err: sev_guest = NULL; ram_block_discard_disable(false); - return NULL; + return -1; } int -sev_encrypt_data(void *handle, uint8_t *ptr, uint64_t len) +sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp) { - SevGuestState *sev = handle; - - assert(sev); + if (!sev_guest) { + return 0; + } /* if SEV is in update state then encrypt the data else do nothing */ - if (sev_check_state(sev, SEV_STATE_LAUNCH_UPDATE)) { - return sev_launch_update_data(sev, ptr, len); + if (sev_check_state(sev_guest, SEV_STATE_LAUNCH_UPDATE)) { + int ret = 
sev_launch_update_data(sev_guest, ptr, len); + if (ret < 0) { + error_setg(errp, "failed to encrypt pflash rom"); + return ret; + } } return 0; diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c index daf690a678..0c5056dd5b 100644 --- a/target/ppc/kvm.c +++ b/target/ppc/kvm.c @@ -2929,21 +2929,3 @@ void kvmppc_set_reg_tb_offset(PowerPCCPU *cpu, int64_t tb_offset) kvm_set_one_reg(cs, KVM_REG_PPC_TB_OFFSET, &tb_offset); } } - -/* - * Don't set error if KVM_PPC_SVM_OFF ioctl is invoked on kernels - * that don't support this ioctl. - */ -void kvmppc_svm_off(Error **errp) -{ - int rc; - - if (!kvm_enabled()) { - return; - } - - rc = kvm_vm_ioctl(KVM_STATE(current_accel()), KVM_PPC_SVM_OFF); - if (rc && rc != -ENOTTY) { - error_setg_errno(errp, -rc, "KVM_PPC_SVM_OFF ioctl failed"); - } -} diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h index 73ce2bc951..989f61ace0 100644 --- a/target/ppc/kvm_ppc.h +++ b/target/ppc/kvm_ppc.h @@ -39,7 +39,6 @@ int kvmppc_booke_watchdog_enable(PowerPCCPU *cpu); target_ulong kvmppc_configure_v3_mmu(PowerPCCPU *cpu, bool radix, bool gtse, uint64_t proc_tbl); -void kvmppc_svm_off(Error **errp); #ifndef CONFIG_USER_ONLY bool kvmppc_spapr_use_multitce(void); int kvmppc_spapr_enable_inkernel_multitce(void); @@ -216,11 +215,6 @@ static inline target_ulong kvmppc_configure_v3_mmu(PowerPCCPU *cpu, return 0; } -static inline void kvmppc_svm_off(Error **errp) -{ - return; -} - static inline void kvmppc_set_reg_ppc_online(PowerPCCPU *cpu, unsigned int online) { diff --git a/tests/Makefile.include b/tests/Makefile.include index ceaf3f0d6e..d34254fb29 100644 --- a/tests/Makefile.include +++ b/tests/Makefile.include @@ -12,7 +12,7 @@ check-help: @echo " $(MAKE) check-speed Run qobject speed tests" @echo " $(MAKE) check-qapi-schema Run QAPI schema tests" @echo " $(MAKE) check-block Run block tests" -ifeq ($(CONFIG_TCG),y) +ifneq ($(filter $(all-check-targets), check-softfloat),) @echo " $(MAKE) check-tcg Run TCG tests" @echo " $(MAKE) check-softfloat Run FPU emulation tests" endif @@ -40,11 +40,13 @@ SYSEMU_TARGET_LIST := $(subst -softmmu.mak,,$(notdir \ SPEED = quick -# Per guest TCG tests +# Build up our target list from the filtered list of ninja targets +TARGETS=$(patsubst libqemu-%.fa, %, $(filter libqemu-%.fa, $(ninja-targets))) -BUILD_TCG_TARGET_RULES=$(patsubst %,build-tcg-tests-%, $(TARGET_DIRS)) -CLEAN_TCG_TARGET_RULES=$(patsubst %,clean-tcg-tests-%, $(TARGET_DIRS)) -RUN_TCG_TARGET_RULES=$(patsubst %,run-tcg-tests-%, $(TARGET_DIRS)) +# Per guest TCG tests +BUILD_TCG_TARGET_RULES=$(patsubst %,build-tcg-tests-%, $(TARGETS)) +CLEAN_TCG_TARGET_RULES=$(patsubst %,clean-tcg-tests-%, $(TARGETS)) +RUN_TCG_TARGET_RULES=$(patsubst %,run-tcg-tests-%, $(TARGETS)) # Probe for the Docker Builds needed for each build $(foreach PROBE_TARGET,$(TARGET_DIRS), \ diff --git a/tests/acceptance/replay_kernel.py b/tests/acceptance/replay_kernel.py index 772633b01d..c1cb862468 100644 --- a/tests/acceptance/replay_kernel.py +++ b/tests/acceptance/replay_kernel.py @@ -31,7 +31,7 @@ class ReplayKernelBase(LinuxKernelTest): terminates. 
""" - timeout = 90 + timeout = 120 KERNEL_COMMON_COMMAND_LINE = 'printk.time=1 panic=-1 ' def run_vm(self, kernel_path, kernel_command_line, console_pattern, diff --git a/tests/docker/Makefile.include b/tests/docker/Makefile.include index 0779dab5b9..93b29ad823 100644 --- a/tests/docker/Makefile.include +++ b/tests/docker/Makefile.include @@ -1,6 +1,6 @@ # Makefile for Docker tests -.PHONY: docker docker-test docker-clean docker-image docker-qemu-src +.PHONY: docker docker-help docker-test docker-clean docker-image docker-qemu-src NULL := SPACE := $(NULL) # @@ -11,7 +11,7 @@ HOST_ARCH = $(if $(ARCH),$(ARCH),$(shell uname -m)) DOCKER_SUFFIX := .docker DOCKER_FILES_DIR := $(SRC_PATH)/tests/docker/dockerfiles # we don't run tests on intermediate images (used as base by another image) -DOCKER_PARTIAL_IMAGES := debian10 debian11 debian-bootstrap +DOCKER_PARTIAL_IMAGES := debian10 debian11 debian-bootstrap empty DOCKER_IMAGES := $(sort $(notdir $(basename $(wildcard $(DOCKER_FILES_DIR)/*.docker)))) DOCKER_TARGETS := $(patsubst %,docker-image-%,$(DOCKER_IMAGES)) # Use a global constant ccache directory to speed up repetitive builds @@ -92,6 +92,24 @@ docker-binfmt-image-debian-%: $(DOCKER_FILES_DIR)/debian-bootstrap.docker { echo "You will need to build $(EXECUTABLE)"; exit 1;},\ "CHECK", "debian-$* exists")) +# These are test targets +USER_TCG_TARGETS=$(patsubst %-linux-user,qemu-%,$(filter %-linux-user,$(TARGET_DIRS))) +EXEC_COPY_TESTS=$(patsubst %,docker-exec-copy-test-%, $(USER_TCG_TARGETS)) + +$(EXEC_COPY_TESTS): docker-exec-copy-test-%: $(DOCKER_FILES_DIR)/empty.docker + $(call quiet-command, \ + $(DOCKER_SCRIPT) build -t qemu/exec-copy-test-$* -f $< \ + $(if $V,,--quiet) --no-cache \ + --include-executable=$* \ + --skip-binfmt, \ + "TEST","copy $* to container") + $(call quiet-command, \ + $(DOCKER_SCRIPT) run qemu/exec-copy-test-$* \ + /$* -version > tests/docker-exec-copy-test-$*.out, \ + "TEST","check $* works in container") + +docker-exec-copy-test: $(EXEC_COPY_TESTS) + endif # Enforce dependencies for composite images @@ -209,7 +227,7 @@ endif @echo ' before running the command.' @echo ' NETWORK=1 Enable virtual network interface with default backend.' @echo ' NETWORK=$$BACKEND Enable virtual network interface with $$BACKEND.' - @echo ' NOUSER Define to disable adding current user to containers passwd.' + @echo ' NOUSER=1 Define to disable adding current user to containers passwd.' @echo ' NOCACHE=1 Ignore cache when build images.' @echo ' EXECUTABLE=<path> Include executable in image.' @echo ' EXTRA_FILES="<path> [... <path>]"' @@ -218,6 +236,8 @@ endif @echo ' Specify which container engine to run.' @echo ' REGISTRY=url Cache builds from registry (default:$(DOCKER_REGISTRY))' +docker-help: docker + # This rule if for directly running against an arbitrary docker target. # It is called by the expanded docker targets (e.g. make # docker-test-foo@bar) which will do additional verification. 
diff --git a/tests/docker/docker.py b/tests/docker/docker.py index 884dfeb29c..d28df4c140 100755 --- a/tests/docker/docker.py +++ b/tests/docker/docker.py @@ -93,7 +93,7 @@ def _guess_engine_command(): commands_txt) -def _copy_with_mkdir(src, root_dir, sub_path='.'): +def _copy_with_mkdir(src, root_dir, sub_path='.', name=None): """Copy src into root_dir, creating sub_path as needed.""" dest_dir = os.path.normpath("%s/%s" % (root_dir, sub_path)) try: @@ -102,8 +102,13 @@ def _copy_with_mkdir(src, root_dir, sub_path='.'): # we can safely ignore already created directories pass - dest_file = "%s/%s" % (dest_dir, os.path.basename(src)) - copy(src, dest_file) + dest_file = "%s/%s" % (dest_dir, name if name else os.path.basename(src)) + + try: + copy(src, dest_file) + except FileNotFoundError: + print("Couldn't copy %s to %s" % (src, dest_file)) + pass def _get_so_libs(executable): @@ -120,7 +125,7 @@ def _get_so_libs(executable): search = ldd_re.search(line) if search: try: - libs.append(s.group(1)) + libs.append(search.group(1)) except IndexError: pass except subprocess.CalledProcessError: @@ -150,8 +155,9 @@ def _copy_binary_with_libs(src, bin_dest, dest_dir): if libs: for l in libs: so_path = os.path.dirname(l) + name = os.path.basename(l) real_l = os.path.realpath(l) - _copy_with_mkdir(real_l, dest_dir, so_path) + _copy_with_mkdir(real_l, dest_dir, so_path, name) def _check_binfmt_misc(executable): @@ -432,6 +438,9 @@ class BuildCommand(SubCommand): help="""Specify a binary that will be copied to the container together with all its dependent libraries""") + parser.add_argument("--skip-binfmt", + action="store_true", + help="""Skip binfmt entry check (used for testing)""") parser.add_argument("--extra-files", nargs='*', help="""Specify files that will be copied in the Docker image, fulfilling the ADD directive from the @@ -460,7 +469,9 @@ class BuildCommand(SubCommand): docker_dir = tempfile.mkdtemp(prefix="docker_build") # Validate binfmt_misc will work - if args.include_executable: + if args.skip_binfmt: + qpath = args.include_executable + elif args.include_executable: qpath, enabled = _check_binfmt_misc(args.include_executable) if not enabled: return 1 diff --git a/tests/docker/dockerfiles/empty.docker b/tests/docker/dockerfiles/empty.docker new file mode 100644 index 0000000000..9ba980f1a8 --- /dev/null +++ b/tests/docker/dockerfiles/empty.docker @@ -0,0 +1,8 @@ +# +# Empty Dockerfile +# + +FROM scratch + +# Add everything from the context into the container +ADD . 
/ diff --git a/tests/qapi-schema/comments.out b/tests/qapi-schema/comments.out index 273f0f54e1..ce4f6a4f0f 100644 --- a/tests/qapi-schema/comments.out +++ b/tests/qapi-schema/comments.out @@ -1,4 +1,4 @@ -module None +module ./builtin object q_empty enum QType prefix QTYPE diff --git a/tests/qapi-schema/doc-good.out b/tests/qapi-schema/doc-good.out index 419284dae2..715b0bbc1a 100644 --- a/tests/qapi-schema/doc-good.out +++ b/tests/qapi-schema/doc-good.out @@ -1,4 +1,4 @@ -module None +module ./builtin object q_empty enum QType prefix QTYPE diff --git a/tests/qapi-schema/empty.out b/tests/qapi-schema/empty.out index 69666c39ad..3feb3f69d3 100644 --- a/tests/qapi-schema/empty.out +++ b/tests/qapi-schema/empty.out @@ -1,4 +1,4 @@ -module None +module ./builtin object q_empty enum QType prefix QTYPE diff --git a/tests/qapi-schema/event-case.out b/tests/qapi-schema/event-case.out index 42ae519656..9ae44052ac 100644 --- a/tests/qapi-schema/event-case.out +++ b/tests/qapi-schema/event-case.out @@ -1,4 +1,4 @@ -module None +module ./builtin object q_empty enum QType prefix QTYPE diff --git a/tests/qapi-schema/include-repetition.out b/tests/qapi-schema/include-repetition.out index 0b654ddebb..16dbd9b819 100644 --- a/tests/qapi-schema/include-repetition.out +++ b/tests/qapi-schema/include-repetition.out @@ -1,4 +1,4 @@ -module None +module ./builtin object q_empty enum QType prefix QTYPE diff --git a/tests/qapi-schema/include-simple.out b/tests/qapi-schema/include-simple.out index 061f81e509..48e923bfbc 100644 --- a/tests/qapi-schema/include-simple.out +++ b/tests/qapi-schema/include-simple.out @@ -1,4 +1,4 @@ -module None +module ./builtin object q_empty enum QType prefix QTYPE diff --git a/tests/qapi-schema/indented-expr.out b/tests/qapi-schema/indented-expr.out index 04356775cd..6a30ded3fa 100644 --- a/tests/qapi-schema/indented-expr.out +++ b/tests/qapi-schema/indented-expr.out @@ -1,4 +1,4 @@ -module None +module ./builtin object q_empty enum QType prefix QTYPE diff --git a/tests/qapi-schema/qapi-schema-test.out b/tests/qapi-schema/qapi-schema-test.out index 8868ca0dca..3b1387d9f1 100644 --- a/tests/qapi-schema/qapi-schema-test.out +++ b/tests/qapi-schema/qapi-schema-test.out @@ -1,4 +1,4 @@ -module None +module ./builtin object q_empty enum QType prefix QTYPE diff --git a/tests/qemu-iotests/267.out b/tests/qemu-iotests/267.out index 27471ffae8..7176e376e1 100644 --- a/tests/qemu-iotests/267.out +++ b/tests/qemu-iotests/267.out @@ -6,11 +6,11 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 Testing: QEMU X.Y.Z monitor - type 'help' for more information (qemu) savevm snap0 -Error: No block device can accept snapshots +Error: no block device can store vmstate for snapshot (qemu) info snapshots -No available block device supports snapshots +no block device can store vmstate for snapshot (qemu) loadvm snap0 -Error: No block device supports snapshots +Error: no block device can store vmstate for snapshot (qemu) quit @@ -22,7 +22,7 @@ QEMU X.Y.Z monitor - type 'help' for more information (qemu) savevm snap0 Error: Device 'none0' is writable but does not support snapshots (qemu) info snapshots -No available block device supports snapshots +no block device can store vmstate for snapshot (qemu) loadvm snap0 Error: Device 'none0' is writable but does not support snapshots (qemu) quit @@ -58,7 +58,7 @@ QEMU X.Y.Z monitor - type 'help' for more information (qemu) savevm snap0 Error: Device 'virtio0' is writable but does not support snapshots (qemu) info snapshots -No available block device 
supports snapshots +no block device can store vmstate for snapshot (qemu) loadvm snap0 Error: Device 'virtio0' is writable but does not support snapshots (qemu) quit @@ -83,7 +83,7 @@ QEMU X.Y.Z monitor - type 'help' for more information (qemu) savevm snap0 Error: Device 'file' is writable but does not support snapshots (qemu) info snapshots -No available block device supports snapshots +no block device can store vmstate for snapshot (qemu) loadvm snap0 Error: Device 'file' is writable but does not support snapshots (qemu) quit diff --git a/tests/qemu-iotests/common.qemu b/tests/qemu-iotests/common.qemu index ef105dfc39..0fc52d20d7 100644 --- a/tests/qemu-iotests/common.qemu +++ b/tests/qemu-iotests/common.qemu @@ -53,6 +53,15 @@ _in_fd=4 # If $mismatch_only is set, only non-matching responses will # be echoed. # +# If $capture_events is non-empty, then any QMP event names it lists +# will not be echoed out, but instead collected in the $QEMU_EVENTS +# variable. The _wait_event function can later be used to receive +# the cached events. +# +# If $only_capture_events is set to anything but an empty string, +# then an error will be raised if a QMP message is seen which is +# not an event listed in $capture_events. +# # If $success_or_failure is set, the meaning of the arguments is # changed as follows: # $2: A string to search for in the response; if found, this indicates @@ -78,6 +87,31 @@ _timed_wait_for() QEMU_STATUS[$h]=0 while IFS= read -t ${QEMU_COMM_TIMEOUT} resp <&${QEMU_OUT[$h]} do + if [ -n "$capture_events" ]; then + capture=0 + local evname + for evname in $capture_events + do + case ${resp} in + *\"event\":\ \"${evname}\"* ) capture=1 ;; + esac + done + if [ $capture = 1 ]; + then + ev=$(echo "${resp}" | tr -d '\r' | tr % .) + QEMU_EVENTS="${QEMU_EVENTS:+${QEMU_EVENTS}%}${ev}" + if [ -n "$only_capture_events" ]; then + return + else + continue + fi + fi + fi + if [ -n "$only_capture_events" ]; then + echo "Only expected $capture_events but got ${resp}" + exit 1 + fi + if [ -z "${silent}" ] && [ -z "${mismatch_only}" ]; then echo "${resp}" | _filter_testdir | _filter_qemu \ | _filter_qemu_io | _filter_qmp | _filter_hmp @@ -172,12 +206,82 @@ _send_qemu_cmd() let count--; done if [ ${QEMU_STATUS[$h]} -ne 0 ] && [ -z "${qemu_error_no_exit}" ]; then - echo "Timeout waiting for ${1} on handle ${h}" + echo "Timeout waiting for command ${1} response on handle ${h}" exit 1 #Timeout means the test failed fi } +# Check event cache for a named QMP event +# +# Input parameters: +# $1: Name of the QMP event to check for +# +# Checks if the named QMP event that was previously captured +# into $QEMU_EVENTS. When matched, the QMP event will be echoed +# and the $matched variable set to 1. +# +# _wait_event is more suitable for test usage in most cases +_check_cached_events() +{ + local evname=${1} + + local match="\"event\": \"$evname\"" + + matched=0 + if [ -n "$QEMU_EVENTS" ]; then + CURRENT_QEMU_EVENTS=$QEMU_EVENTS + QEMU_EVENTS= + old_IFS=$IFS + IFS="%" + for ev in $CURRENT_QEMU_EVENTS + do + grep -q "$match" < <(echo "${ev}") + if [ $? -eq 0 ] && [ $matched = 0 ]; then + echo "${ev}" | _filter_testdir | _filter_qemu \ + | _filter_qemu_io | _filter_qmp | _filter_hmp + matched=1 + else + QEMU_EVENTS="${QEMU_EVENTS:+${QEMU_EVENTS}%}${ev}" + fi + done + IFS=$old_IFS + fi +} + +# Wait for a named QMP event +# +# Input parameters: +# $1: QEMU handle to use +# $2: Name of the QMP event to wait for +# +# Checks if the named QMP even was previously captured +# into $QEMU_EVENTS. 
If none are present, then waits for the +# event to arrive on the QMP channel. When matched, the QMP +# event will be echoed +_wait_event() +{ + local h=${1} + local evname=${2} + + while true + do + _check_cached_events $evname + + if [ $matched = 1 ]; + then + return + fi + + only_capture_events=1 qemu_error_no_exit=1 _timed_wait_for ${h} + + if [ ${QEMU_STATUS[$h]} -ne 0 ] ; then + echo "Timeout waiting for event ${evname} on handle ${h}" + exit 1 #Timeout means the test failed + fi + done +} + # Launch a QEMU process. # # Input parameters: diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc index 297acf9b6a..77c37e8312 100644 --- a/tests/qemu-iotests/common.rc +++ b/tests/qemu-iotests/common.rc @@ -109,8 +109,14 @@ peek_file_raw() dd if="$1" bs=1 skip="$2" count="$3" status=none } - -if ! . ./common.config +config=common.config +test -f $config || config=../common.config +if ! test -f $config +then + echo "$0: failed to find common.config" + exit 1 +fi +if ! . $config then echo "$0: failed to source common.config" exit 1 diff --git a/tests/tcg/Makefile.qemu b/tests/tcg/Makefile.qemu index c096c611a2..a56564660c 100644 --- a/tests/tcg/Makefile.qemu +++ b/tests/tcg/Makefile.qemu @@ -90,11 +90,11 @@ run-guest-tests: guest-tests else guest-tests: - $(call quiet-command, /bin/true, "BUILD", \ + $(call quiet-command, true, "BUILD", \ "$(TARGET) guest-tests SKIPPED") run-guest-tests: - $(call quiet-command, /bin/true, "RUN", \ + $(call quiet-command, true, "RUN", \ "tests for $(TARGET) SKIPPED") endif diff --git a/tests/tcg/multiarch/Makefile.target b/tests/tcg/multiarch/Makefile.target index 1dd0f64d23..abbdb2e126 100644 --- a/tests/tcg/multiarch/Makefile.target +++ b/tests/tcg/multiarch/Makefile.target @@ -63,8 +63,11 @@ run-gdbstub-qxfer-auxv-read: sha1 --bin $< --test $(MULTIARCH_SRC)/gdbstub/test-qxfer-auxv-read.py, \ "basic gdbstub qXfer:auxv:read support") -EXTRA_RUNS += run-gdbstub-sha1 run-gdbstub-qxfer-auxv-read +else +run-gdbstub-%: + $(call skip-test, "gdbstub test $*", "need working gdb") endif +EXTRA_RUNS += run-gdbstub-sha1 run-gdbstub-qxfer-auxv-read # Update TESTS diff --git a/util/fifo8.c b/util/fifo8.c index a5dd789ce5..d4d1c135e0 100644 --- a/util/fifo8.c +++ b/util/fifo8.c @@ -31,9 +31,7 @@ void fifo8_destroy(Fifo8 *fifo) void fifo8_push(Fifo8 *fifo, uint8_t data) { - if (fifo->num == fifo->capacity) { - abort(); - } + assert(fifo->num < fifo->capacity); fifo->data[(fifo->head + fifo->num) % fifo->capacity] = data; fifo->num++; } @@ -42,9 +40,7 @@ void fifo8_push_all(Fifo8 *fifo, const uint8_t *data, uint32_t num) { uint32_t start, avail; - if (fifo->num + num > fifo->capacity) { - abort(); - } + assert(fifo->num + num <= fifo->capacity); start = (fifo->head + fifo->num) % fifo->capacity; @@ -63,9 +59,7 @@ uint8_t fifo8_pop(Fifo8 *fifo) { uint8_t ret; - if (fifo->num == 0) { - abort(); - } + assert(fifo->num > 0); ret = fifo->data[fifo->head++]; fifo->head %= fifo->capacity; fifo->num--; @@ -76,9 +70,7 @@ const uint8_t *fifo8_pop_buf(Fifo8 *fifo, uint32_t max, uint32_t *num) { uint8_t *ret; - if (max == 0 || max > fifo->num) { - abort(); - } + assert(max > 0 && max <= fifo->num); *num = MIN(fifo->capacity - fifo->head, max); ret = &fifo->data[fifo->head]; fifo->head += *num; diff --git a/util/meson.build b/util/meson.build index 3eccdbe596..984fba965f 100644 --- a/util/meson.build +++ b/util/meson.build @@ -52,6 +52,7 @@ if have_system util_ss.add(files('crc-ccitt.c')) util_ss.add(when: 'CONFIG_GIO', if_true: [files('dbus.c'), gio]) 
util_ss.add(files('yank.c')) + util_ss.add(when: 'CONFIG_LINUX', if_true: files('userfaultfd.c')) endif if have_block diff --git a/util/trace-events b/util/trace-events index 61e0d4bcdf..bac0924899 100644 --- a/util/trace-events +++ b/util/trace-events @@ -91,3 +91,12 @@ qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uin qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")" qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) "region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32 qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p" + +#userfaultfd.c +uffd_query_features_nosys(int err) "errno: %i" +uffd_query_features_api_failed(int err) "errno: %i" +uffd_create_fd_nosys(int err) "errno: %i" +uffd_create_fd_api_failed(int err) "errno: %i" +uffd_create_fd_api_noioctl(uint64_t ioctl_req, uint64_t ioctl_supp) "ioctl_req: 0x%" PRIx64 "ioctl_supp: 0x%" PRIx64 +uffd_register_memory_failed(void *addr, uint64_t length, uint64_t mode, int err) "addr: %p length: %" PRIu64 " mode: 0x%" PRIx64 " errno: %i" +uffd_unregister_memory_failed(void *addr, uint64_t length, int err) "addr: %p length: %" PRIu64 " errno: %i" diff --git a/util/userfaultfd.c b/util/userfaultfd.c new file mode 100644 index 0000000000..f1cd6af2b1 --- /dev/null +++ b/util/userfaultfd.c @@ -0,0 +1,345 @@ +/* + * Linux UFFD-WP support + * + * Copyright Virtuozzo GmbH, 2020 + * + * Authors: + * Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/bitops.h" +#include "qemu/error-report.h" +#include "qemu/userfaultfd.h" +#include "trace.h" +#include <poll.h> +#include <sys/syscall.h> +#include <sys/ioctl.h> + +/** + * uffd_query_features: query UFFD features + * + * Returns: 0 on success, negative value in case of an error + * + * @features: parameter to receive 'uffdio_api.features' + */ +int uffd_query_features(uint64_t *features) +{ + int uffd_fd; + struct uffdio_api api_struct = { 0 }; + int ret = -1; + + uffd_fd = syscall(__NR_userfaultfd, O_CLOEXEC); + if (uffd_fd < 0) { + trace_uffd_query_features_nosys(errno); + return -1; + } + + api_struct.api = UFFD_API; + api_struct.features = 0; + + if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) { + trace_uffd_query_features_api_failed(errno); + goto out; + } + *features = api_struct.features; + ret = 0; + +out: + close(uffd_fd); + return ret; +} + +/** + * uffd_create_fd: create UFFD file descriptor + * + * Returns non-negative file descriptor or negative value in case of an error + * + * @features: UFFD features to request + * @non_blocking: create UFFD file descriptor for non-blocking operation + */ +int uffd_create_fd(uint64_t features, bool non_blocking) +{ + int uffd_fd; + int flags; + struct uffdio_api api_struct = { 0 }; + uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER); + + flags = O_CLOEXEC | (non_blocking ? 
O_NONBLOCK : 0); + uffd_fd = syscall(__NR_userfaultfd, flags); + if (uffd_fd < 0) { + trace_uffd_create_fd_nosys(errno); + return -1; + } + + api_struct.api = UFFD_API; + api_struct.features = features; + if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) { + trace_uffd_create_fd_api_failed(errno); + goto fail; + } + if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) { + trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls); + goto fail; + } + + return uffd_fd; + +fail: + close(uffd_fd); + return -1; +} + +/** + * uffd_close_fd: close UFFD file descriptor + * + * @uffd_fd: UFFD file descriptor + */ +void uffd_close_fd(int uffd_fd) +{ + assert(uffd_fd >= 0); + close(uffd_fd); +} + +/** + * uffd_register_memory: register memory range via UFFD-IO + * + * Returns 0 in case of success, negative value in case of an error + * + * @uffd_fd: UFFD file descriptor + * @addr: base address of memory range + * @length: length of memory range + * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...) + * @ioctls: optional pointer to receive supported IOCTL mask + */ +int uffd_register_memory(int uffd_fd, void *addr, uint64_t length, + uint64_t mode, uint64_t *ioctls) +{ + struct uffdio_register uffd_register; + + uffd_register.range.start = (uintptr_t) addr; + uffd_register.range.len = length; + uffd_register.mode = mode; + + if (ioctl(uffd_fd, UFFDIO_REGISTER, &uffd_register)) { + trace_uffd_register_memory_failed(addr, length, mode, errno); + return -1; + } + if (ioctls) { + *ioctls = uffd_register.ioctls; + } + + return 0; +} + +/** + * uffd_unregister_memory: un-register memory range with UFFD-IO + * + * Returns 0 in case of success, negative value in case of an error + * + * @uffd_fd: UFFD file descriptor + * @addr: base address of memory range + * @length: length of memory range + */ +int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length) +{ + struct uffdio_range uffd_range; + + uffd_range.start = (uintptr_t) addr; + uffd_range.len = length; + + if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &uffd_range)) { + trace_uffd_unregister_memory_failed(addr, length, errno); + return -1; + } + + return 0; +} + +/** + * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO + * + * Returns 0 on success, negative value in case of error + * + * @uffd_fd: UFFD file descriptor + * @addr: base address of memory range + * @length: length of memory range + * @wp: write-protect/unprotect + * @dont_wake: do not wake threads waiting on wr-protected page + */ +int uffd_change_protection(int uffd_fd, void *addr, uint64_t length, + bool wp, bool dont_wake) +{ + struct uffdio_writeprotect uffd_writeprotect; + + uffd_writeprotect.range.start = (uintptr_t) addr; + uffd_writeprotect.range.len = length; + if (!wp && dont_wake) { + /* DONTWAKE is meaningful only on protection release */ + uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE; + } else { + uffd_writeprotect.mode = (wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0); + } + + if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) { + error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64 + " mode=%" PRIx64 " errno=%i", addr, length, + (uint64_t) uffd_writeprotect.mode, errno); + return -1; + } + + return 0; +} + +/** + * uffd_copy_page: copy range of pages to destination via UFFD-IO + * + * Copy range of source pages to the destination to resolve + * missing page fault somewhere in the destination range. 
+ * + * Returns 0 on success, negative value in case of an error + * + * @uffd_fd: UFFD file descriptor + * @dst_addr: destination base address + * @src_addr: source base address + * @length: length of the range to copy + * @dont_wake: do not wake threads waiting on missing page + */ +int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr, + uint64_t length, bool dont_wake) +{ + struct uffdio_copy uffd_copy; + + uffd_copy.dst = (uintptr_t) dst_addr; + uffd_copy.src = (uintptr_t) src_addr; + uffd_copy.len = length; + uffd_copy.mode = dont_wake ? UFFDIO_COPY_MODE_DONTWAKE : 0; + + if (ioctl(uffd_fd, UFFDIO_COPY, &uffd_copy)) { + error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64 + " mode=%" PRIx64 " errno=%i", dst_addr, src_addr, + length, (uint64_t) uffd_copy.mode, errno); + return -1; + } + + return 0; +} + +/** + * uffd_zero_page: fill range of pages with zeroes via UFFD-IO + * + * Fill range pages with zeroes to resolve missing page fault within the range. + * + * Returns 0 on success, negative value in case of an error + * + * @uffd_fd: UFFD file descriptor + * @addr: base address + * @length: length of the range to fill with zeroes + * @dont_wake: do not wake threads waiting on missing page + */ +int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake) +{ + struct uffdio_zeropage uffd_zeropage; + + uffd_zeropage.range.start = (uintptr_t) addr; + uffd_zeropage.range.len = length; + uffd_zeropage.mode = dont_wake ? UFFDIO_ZEROPAGE_MODE_DONTWAKE : 0; + + if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &uffd_zeropage)) { + error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64 + " mode=%" PRIx64 " errno=%i", addr, length, + (uint64_t) uffd_zeropage.mode, errno); + return -1; + } + + return 0; +} + +/** + * uffd_wakeup: wake up threads waiting on page UFFD-managed page fault resolution + * + * Wake up threads waiting on any page/pages from the designated range. + * The main use case is when during some period, page faults are resolved + * via UFFD-IO IOCTLs with MODE_DONTWAKE flag set, then after that all waits + * for the whole memory range are satisfied in a single call to uffd_wakeup(). 
+ * + * Returns 0 on success, negative value in case of an error + * + * @uffd_fd: UFFD file descriptor + * @addr: base address + * @length: length of the range + */ +int uffd_wakeup(int uffd_fd, void *addr, uint64_t length) +{ + struct uffdio_range uffd_range; + + uffd_range.start = (uintptr_t) addr; + uffd_range.len = length; + + if (ioctl(uffd_fd, UFFDIO_WAKE, &uffd_range)) { + error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i", + addr, length, errno); + return -1; + } + + return 0; +} + +/** + * uffd_read_events: read pending UFFD events + * + * Returns number of fetched messages, 0 if non is available or + * negative value in case of an error + * + * @uffd_fd: UFFD file descriptor + * @msgs: pointer to message buffer + * @count: number of messages that can fit in the buffer + */ +int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count) +{ + ssize_t res; + do { + res = read(uffd_fd, msgs, count * sizeof(struct uffd_msg)); + } while (res < 0 && errno == EINTR); + + if ((res < 0 && errno == EAGAIN)) { + return 0; + } + if (res < 0) { + error_report("uffd_read_events() failed: errno=%i", errno); + return -1; + } + + return (int) (res / sizeof(struct uffd_msg)); +} + +/** + * uffd_poll_events: poll UFFD file descriptor for read + * + * Returns true if events are available for read, false otherwise + * + * @uffd_fd: UFFD file descriptor + * @tmo: timeout value + */ +bool uffd_poll_events(int uffd_fd, int tmo) +{ + int res; + struct pollfd poll_fd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 }; + + do { + res = poll(&poll_fd, 1, tmo); + } while (res < 0 && errno == EINTR); + + if (res == 0) { + return false; + } + if (res < 0) { + error_report("uffd_poll_events() failed: errno=%i", errno); + return false; + } + + return (poll_fd.revents & POLLIN) != 0; +} |
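
To make the intended call sequence of these wrappers concrete, the following is a hedged usage sketch built only from the functions defined in util/userfaultfd.c above together with the standard linux/userfaultfd.h UAPI constants. It is not code from the patch; track_writes(), save_page() and the page_size parameter are hypothetical names introduced purely for illustration.

/*
 * Hedged usage sketch only -- not part of the patch.  It shows how a
 * caller might drive the helpers above to trap and handle guest writes.
 */
#include "qemu/osdep.h"
#include "qemu/userfaultfd.h"
#include <linux/userfaultfd.h>

static int track_writes(void *region, uint64_t size, uint64_t page_size,
                        void (*save_page)(void *hostaddr))
{
    uint64_t features;
    int uffd;

    /* Bail out early if the kernel cannot deliver write-protect faults */
    if (uffd_query_features(&features) < 0 ||
        !(features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
        return -1;
    }

    uffd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd < 0) {
        return -1;
    }

    /* Register the region for wr-protect faults and arm the protection */
    if (uffd_register_memory(uffd, region, size,
                             UFFDIO_REGISTER_MODE_WP, NULL) < 0 ||
        uffd_change_protection(uffd, region, size, true, false) < 0) {
        uffd_close_fd(uffd);
        return -1;
    }

    /*
     * Service faults: copy the faulting page away, then drop protection
     * on it, which also wakes the blocked writer (dont_wake is false).
     */
    while (uffd_poll_events(uffd, -1)) {
        struct uffd_msg msgs[32];
        int n = uffd_read_events(uffd, msgs, 32);

        for (int i = 0; i < n; i++) {
            if (msgs[i].event != UFFD_EVENT_PAGEFAULT ||
                !(msgs[i].arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP)) {
                continue;
            }
            uintptr_t addr = msgs[i].arg.pagefault.address;
            void *page = (void *)(addr & ~(uintptr_t)(page_size - 1));

            save_page(page);
            uffd_change_protection(uffd, page, page_size, false, false);
        }
    }

    uffd_unregister_memory(uffd, region, size);
    uffd_close_fd(uffd);
    return 0;
}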