diff options
30 files changed, 2145 insertions, 226 deletions
diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index afd75ab628..75d7fa9510 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -900,10 +900,11 @@ void hmp_info_snapshots(Monitor *mon, const QDict *qdict) ImageEntry *image_entry, *next_ie; SnapshotEntry *snapshot_entry; + Error *err = NULL; - bs = bdrv_all_find_vmstate_bs(); + bs = bdrv_all_find_vmstate_bs(NULL, false, NULL, &err); if (!bs) { - monitor_printf(mon, "No available block device supports snapshots\n"); + error_report_err(err); return; } aio_context = bdrv_get_aio_context(bs); @@ -953,7 +954,7 @@ void hmp_info_snapshots(Monitor *mon, const QDict *qdict) total = 0; for (i = 0; i < nb_sns; i++) { SnapshotEntry *next_sn; - if (bdrv_all_find_snapshot(sn_tab[i].name, &bs1) == 0) { + if (bdrv_all_has_snapshot(sn_tab[i].name, false, NULL, NULL) == 1) { global_snapshots[total] = i; total++; QTAILQ_FOREACH(image_entry, &image_list, next) { diff --git a/block/snapshot.c b/block/snapshot.c index a2bf3a54eb..e8ae9a28c1 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -447,6 +447,41 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs, return ret; } + +static int bdrv_all_get_snapshot_devices(bool has_devices, strList *devices, + GList **all_bdrvs, + Error **errp) +{ + g_autoptr(GList) bdrvs = NULL; + + if (has_devices) { + if (!devices) { + error_setg(errp, "At least one device is required for snapshot"); + return -1; + } + + while (devices) { + BlockDriverState *bs = bdrv_find_node(devices->value); + if (!bs) { + error_setg(errp, "No block device node '%s'", devices->value); + return -1; + } + bdrvs = g_list_append(bdrvs, bs); + devices = devices->next; + } + } else { + BlockDriverState *bs; + BdrvNextIterator it; + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + bdrvs = g_list_append(bdrvs, bs); + } + } + + *all_bdrvs = g_steal_pointer(&bdrvs); + return 0; +} + + static bool bdrv_all_snapshots_includes_bs(BlockDriverState *bs) { if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { @@ -462,44 +497,59 @@ static bool bdrv_all_snapshots_includes_bs(BlockDriverState *bs) * These functions will properly handle dataplane (take aio_context_acquire * when appropriate for appropriate block drivers) */ -bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs) +bool bdrv_all_can_snapshot(bool has_devices, strList *devices, + Error **errp) { - bool ok = true; - BlockDriverState *bs; - BdrvNextIterator it; + g_autoptr(GList) bdrvs = NULL; + GList *iterbdrvs; + + if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) { + return false; + } - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + iterbdrvs = bdrvs; + while (iterbdrvs) { + BlockDriverState *bs = iterbdrvs->data; AioContext *ctx = bdrv_get_aio_context(bs); + bool ok = true; aio_context_acquire(ctx); - if (bdrv_all_snapshots_includes_bs(bs)) { + if (devices || bdrv_all_snapshots_includes_bs(bs)) { ok = bdrv_can_snapshot(bs); } aio_context_release(ctx); if (!ok) { - bdrv_next_cleanup(&it); - goto fail; + error_setg(errp, "Device '%s' is writable but does not support " + "snapshots", bdrv_get_device_or_node_name(bs)); + return false; } + + iterbdrvs = iterbdrvs->next; } -fail: - *first_bad_bs = bs; - return ok; + return true; } -int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs, +int bdrv_all_delete_snapshot(const char *name, + bool has_devices, strList *devices, Error **errp) { - int ret = 0; - BlockDriverState *bs; - BdrvNextIterator it; - QEMUSnapshotInfo sn1, *snapshot = &sn1; + g_autoptr(GList) bdrvs = NULL; + GList *iterbdrvs; + + if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) { + return -1; + } - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + iterbdrvs = bdrvs; + while (iterbdrvs) { + BlockDriverState *bs = iterbdrvs->data; AioContext *ctx = bdrv_get_aio_context(bs); + QEMUSnapshotInfo sn1, *snapshot = &sn1; + int ret = 0; aio_context_acquire(ctx); - if (bdrv_all_snapshots_includes_bs(bs) && + if ((devices || bdrv_all_snapshots_includes_bs(bs)) && bdrv_snapshot_find(bs, snapshot, name) >= 0) { ret = bdrv_snapshot_delete(bs, snapshot->id_str, @@ -507,118 +557,180 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs, } aio_context_release(ctx); if (ret < 0) { - bdrv_next_cleanup(&it); - goto fail; + error_prepend(errp, "Could not delete snapshot '%s' on '%s': ", + name, bdrv_get_device_or_node_name(bs)); + return -1; } + + iterbdrvs = iterbdrvs->next; } -fail: - *first_bad_bs = bs; - return ret; + return 0; } -int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs, +int bdrv_all_goto_snapshot(const char *name, + bool has_devices, strList *devices, Error **errp) { - int ret = 0; - BlockDriverState *bs; - BdrvNextIterator it; + g_autoptr(GList) bdrvs = NULL; + GList *iterbdrvs; - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) { + return -1; + } + + iterbdrvs = bdrvs; + while (iterbdrvs) { + BlockDriverState *bs = iterbdrvs->data; AioContext *ctx = bdrv_get_aio_context(bs); + int ret = 0; aio_context_acquire(ctx); - if (bdrv_all_snapshots_includes_bs(bs)) { + if (devices || bdrv_all_snapshots_includes_bs(bs)) { ret = bdrv_snapshot_goto(bs, name, errp); } aio_context_release(ctx); if (ret < 0) { - bdrv_next_cleanup(&it); - goto fail; + error_prepend(errp, "Could not load snapshot '%s' on '%s': ", + name, bdrv_get_device_or_node_name(bs)); + return -1; } + + iterbdrvs = iterbdrvs->next; } -fail: - *first_bad_bs = bs; - return ret; + return 0; } -int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs) +int bdrv_all_has_snapshot(const char *name, + bool has_devices, strList *devices, + Error **errp) { - QEMUSnapshotInfo sn; - int err = 0; - BlockDriverState *bs; - BdrvNextIterator it; + g_autoptr(GList) bdrvs = NULL; + GList *iterbdrvs; - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) { + return -1; + } + + iterbdrvs = bdrvs; + while (iterbdrvs) { + BlockDriverState *bs = iterbdrvs->data; AioContext *ctx = bdrv_get_aio_context(bs); + QEMUSnapshotInfo sn; + int ret = 0; aio_context_acquire(ctx); - if (bdrv_all_snapshots_includes_bs(bs)) { - err = bdrv_snapshot_find(bs, &sn, name); + if (devices || bdrv_all_snapshots_includes_bs(bs)) { + ret = bdrv_snapshot_find(bs, &sn, name); } aio_context_release(ctx); - if (err < 0) { - bdrv_next_cleanup(&it); - goto fail; + if (ret < 0) { + if (ret == -ENOENT) { + return 0; + } else { + error_setg_errno(errp, errno, + "Could not check snapshot '%s' on '%s'", + name, bdrv_get_device_or_node_name(bs)); + return -1; + } } + + iterbdrvs = iterbdrvs->next; } -fail: - *first_bad_bs = bs; - return err; + return 1; } int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn, BlockDriverState *vm_state_bs, uint64_t vm_state_size, - BlockDriverState **first_bad_bs) + bool has_devices, strList *devices, + Error **errp) { - int err = 0; - BlockDriverState *bs; - BdrvNextIterator it; + g_autoptr(GList) bdrvs = NULL; + GList *iterbdrvs; + + if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) { + return -1; + } - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + iterbdrvs = bdrvs; + while (iterbdrvs) { + BlockDriverState *bs = iterbdrvs->data; AioContext *ctx = bdrv_get_aio_context(bs); + int ret = 0; aio_context_acquire(ctx); if (bs == vm_state_bs) { sn->vm_state_size = vm_state_size; - err = bdrv_snapshot_create(bs, sn); - } else if (bdrv_all_snapshots_includes_bs(bs)) { + ret = bdrv_snapshot_create(bs, sn); + } else if (devices || bdrv_all_snapshots_includes_bs(bs)) { sn->vm_state_size = 0; - err = bdrv_snapshot_create(bs, sn); + ret = bdrv_snapshot_create(bs, sn); } aio_context_release(ctx); - if (err < 0) { - bdrv_next_cleanup(&it); - goto fail; + if (ret < 0) { + error_setg(errp, "Could not create snapshot '%s' on '%s'", + sn->name, bdrv_get_device_or_node_name(bs)); + return -1; } + + iterbdrvs = iterbdrvs->next; } -fail: - *first_bad_bs = bs; - return err; + return 0; } -BlockDriverState *bdrv_all_find_vmstate_bs(void) + +BlockDriverState *bdrv_all_find_vmstate_bs(const char *vmstate_bs, + bool has_devices, strList *devices, + Error **errp) { - BlockDriverState *bs; - BdrvNextIterator it; + g_autoptr(GList) bdrvs = NULL; + GList *iterbdrvs; + + if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) { + return NULL; + } - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + iterbdrvs = bdrvs; + while (iterbdrvs) { + BlockDriverState *bs = iterbdrvs->data; AioContext *ctx = bdrv_get_aio_context(bs); - bool found; + bool found = false; aio_context_acquire(ctx); - found = bdrv_all_snapshots_includes_bs(bs) && bdrv_can_snapshot(bs); + found = (devices || bdrv_all_snapshots_includes_bs(bs)) && + bdrv_can_snapshot(bs); aio_context_release(ctx); - if (found) { - bdrv_next_cleanup(&it); - break; + if (vmstate_bs) { + if (g_str_equal(vmstate_bs, + bdrv_get_node_name(bs))) { + if (found) { + return bs; + } else { + error_setg(errp, + "vmstate block device '%s' does not support snapshots", + vmstate_bs); + return NULL; + } + } + } else if (found) { + return bs; } + + iterbdrvs = iterbdrvs->next; + } + + if (vmstate_bs) { + error_setg(errp, + "vmstate block device '%s' does not exist", vmstate_bs); + } else { + error_setg(errp, + "no block device can store vmstate for snapshot"); } - return bs; + return NULL; } diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 76d7c91e9c..1b2b940606 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -2173,6 +2173,16 @@ static int spapr_pci_pre_save(void *opaque) return 0; } +static int spapr_pci_post_save(void *opaque) +{ + SpaprPhbState *sphb = opaque; + + g_free(sphb->msi_devs); + sphb->msi_devs = NULL; + sphb->msi_devs_num = 0; + return 0; +} + static int spapr_pci_post_load(void *opaque, int version_id) { SpaprPhbState *sphb = opaque; @@ -2205,6 +2215,7 @@ static const VMStateDescription vmstate_spapr_pci = { .version_id = 2, .minimum_version_id = 2, .pre_save = spapr_pci_pre_save, + .post_save = spapr_pci_post_save, .post_load = spapr_pci_post_load, .fields = (VMStateField[]) { VMSTATE_UINT64_EQUAL(buid, SpaprPhbState, NULL), diff --git a/include/block/snapshot.h b/include/block/snapshot.h index b0fe42993d..940345692f 100644 --- a/include/block/snapshot.h +++ b/include/block/snapshot.h @@ -25,7 +25,7 @@ #ifndef SNAPSHOT_H #define SNAPSHOT_H - +#include "qapi/qapi-builtin-types.h" #define SNAPSHOT_OPT_BASE "snapshot." #define SNAPSHOT_OPT_ID "snapshot.id" @@ -77,17 +77,26 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs, * These functions will properly handle dataplane (take aio_context_acquire * when appropriate for appropriate block drivers */ -bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs); -int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bsd_bs, +bool bdrv_all_can_snapshot(bool has_devices, strList *devices, + Error **errp); +int bdrv_all_delete_snapshot(const char *name, + bool has_devices, strList *devices, Error **errp); -int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs, +int bdrv_all_goto_snapshot(const char *name, + bool has_devices, strList *devices, Error **errp); -int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs); +int bdrv_all_has_snapshot(const char *name, + bool has_devices, strList *devices, + Error **errp); int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn, BlockDriverState *vm_state_bs, uint64_t vm_state_size, - BlockDriverState **first_bad_bs); + bool has_devices, + strList *devices, + Error **errp); -BlockDriverState *bdrv_all_find_vmstate_bs(void); +BlockDriverState *bdrv_all_find_vmstate_bs(const char *vmstate_bs, + bool has_devices, strList *devices, + Error **errp); #endif diff --git a/include/exec/memory.h b/include/exec/memory.h index c6ce74fb79..e58c09f130 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -149,6 +149,14 @@ typedef struct IOMMUTLBEvent { /* RAM is a persistent kind memory */ #define RAM_PMEM (1 << 5) + +/* + * UFFDIO_WRITEPROTECT is used on this RAMBlock to + * support 'write-tracking' migration type. + * Implies ram_state->ram_wt_enabled. + */ +#define RAM_UF_WRITEPROTECT (1 << 6) + static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn, IOMMUNotifierFlag flags, hwaddr start, hwaddr end, diff --git a/include/migration/snapshot.h b/include/migration/snapshot.h index c85b6ec75b..e72083b117 100644 --- a/include/migration/snapshot.h +++ b/include/migration/snapshot.h @@ -15,7 +15,50 @@ #ifndef QEMU_MIGRATION_SNAPSHOT_H #define QEMU_MIGRATION_SNAPSHOT_H -int save_snapshot(const char *name, Error **errp); -int load_snapshot(const char *name, Error **errp); +#include "qapi/qapi-builtin-types.h" + +/** + * save_snapshot: Save an internal snapshot. + * @name: name of internal snapshot + * @overwrite: replace existing snapshot with @name + * @vmstate: blockdev node name to store VM state in + * @has_devices: whether to use explicit device list + * @devices: explicit device list to snapshot + * @errp: pointer to error object + * On success, return %true. + * On failure, store an error through @errp and return %false. + */ +bool save_snapshot(const char *name, bool overwrite, + const char *vmstate, + bool has_devices, strList *devices, + Error **errp); + +/** + * load_snapshot: Load an internal snapshot. + * @name: name of internal snapshot + * @vmstate: blockdev node name to load VM state from + * @has_devices: whether to use explicit device list + * @devices: explicit device list to snapshot + * @errp: pointer to error object + * On success, return %true. + * On failure, store an error through @errp and return %false. + */ +bool load_snapshot(const char *name, + const char *vmstate, + bool has_devices, strList *devices, + Error **errp); + +/** + * delete_snapshot: Delete a snapshot. + * @name: path to snapshot + * @has_devices: whether to use explicit device list + * @devices: explicit device list to snapshot + * @errp: pointer to error object + * On success, return %true. + * On failure, store an error through @errp and return %false. + */ +bool delete_snapshot(const char *name, + bool has_devices, strList *devices, + Error **errp); #endif diff --git a/include/qemu/userfaultfd.h b/include/qemu/userfaultfd.h new file mode 100644 index 0000000000..6b74f92792 --- /dev/null +++ b/include/qemu/userfaultfd.h @@ -0,0 +1,35 @@ +/* + * Linux UFFD-WP support + * + * Copyright Virtuozzo GmbH, 2020 + * + * Authors: + * Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#ifndef USERFAULTFD_H +#define USERFAULTFD_H + +#include "qemu/osdep.h" +#include "exec/hwaddr.h" +#include <linux/userfaultfd.h> + +int uffd_query_features(uint64_t *features); +int uffd_create_fd(uint64_t features, bool non_blocking); +void uffd_close_fd(int uffd_fd); +int uffd_register_memory(int uffd_fd, void *addr, uint64_t length, + uint64_t mode, uint64_t *ioctls); +int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length); +int uffd_change_protection(int uffd_fd, void *addr, uint64_t length, + bool wp, bool dont_wake); +int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr, + uint64_t length, bool dont_wake); +int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake); +int uffd_wakeup(int uffd_fd, void *addr, uint64_t length); +int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count); +bool uffd_poll_events(int uffd_fd, int tmo); + +#endif /* USERFAULTFD_H */ diff --git a/migration/migration.c b/migration/migration.c index 1986cb8573..a5ddf43559 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -58,6 +58,7 @@ #include "qemu/queue.h" #include "multifd.h" #include "qemu/yank.h" +#include "sysemu/cpus.h" #ifdef CONFIG_VFIO #include "hw/vfio/vfio-common.h" @@ -134,6 +135,38 @@ enum mig_rp_message_type { MIG_RP_MSG_MAX }; +/* Migration capabilities set */ +struct MigrateCapsSet { + int size; /* Capability set size */ + MigrationCapability caps[]; /* Variadic array of capabilities */ +}; +typedef struct MigrateCapsSet MigrateCapsSet; + +/* Define and initialize MigrateCapsSet */ +#define INITIALIZE_MIGRATE_CAPS_SET(_name, ...) \ + MigrateCapsSet _name = { \ + .size = sizeof((int []) { __VA_ARGS__ }) / sizeof(int), \ + .caps = { __VA_ARGS__ } \ + } + +/* Background-snapshot compatibility check list */ +static const +INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot, + MIGRATION_CAPABILITY_POSTCOPY_RAM, + MIGRATION_CAPABILITY_DIRTY_BITMAPS, + MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME, + MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE, + MIGRATION_CAPABILITY_RETURN_PATH, + MIGRATION_CAPABILITY_MULTIFD, + MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER, + MIGRATION_CAPABILITY_AUTO_CONVERGE, + MIGRATION_CAPABILITY_RELEASE_RAM, + MIGRATION_CAPABILITY_RDMA_PIN_ALL, + MIGRATION_CAPABILITY_COMPRESS, + MIGRATION_CAPABILITY_XBZRLE, + MIGRATION_CAPABILITY_X_COLO, + MIGRATION_CAPABILITY_VALIDATE_UUID); + /* When we add fault tolerance, we could have several migrations at once. For now we don't need to add dynamic creation of migration */ @@ -141,6 +174,8 @@ enum mig_rp_message_type { static MigrationState *current_migration; static MigrationIncomingState *current_incoming; +static GSList *migration_blockers; + static bool migration_object_check(MigrationState *ms, Error **errp); static int migration_maybe_pause(MigrationState *s, int *current_active_state, @@ -1041,6 +1076,27 @@ static void fill_source_migration_info(MigrationInfo *info) { MigrationState *s = migrate_get_current(); + info->blocked = migration_is_blocked(NULL); + info->has_blocked_reasons = info->blocked; + info->blocked_reasons = NULL; + if (info->blocked) { + GSList *cur_blocker = migration_blockers; + + /* + * There are two types of reasons a migration might be blocked; + * a) devices marked in VMState as non-migratable, and + * b) Explicit migration blockers + * We need to add both of them here. + */ + qemu_savevm_non_migratable_list(&info->blocked_reasons); + + while (cur_blocker) { + QAPI_LIST_PREPEND(info->blocked_reasons, + g_strdup(error_get_pretty(cur_blocker->data))); + cur_blocker = g_slist_next(cur_blocker); + } + } + switch (s->state) { case MIGRATION_STATUS_NONE: /* no migration has happened ever */ @@ -1089,6 +1145,31 @@ static void fill_source_migration_info(MigrationInfo *info) info->status = s->state; } +typedef enum WriteTrackingSupport { + WT_SUPPORT_UNKNOWN = 0, + WT_SUPPORT_ABSENT, + WT_SUPPORT_AVAILABLE, + WT_SUPPORT_COMPATIBLE +} WriteTrackingSupport; + +static +WriteTrackingSupport migrate_query_write_tracking(void) +{ + /* Check if kernel supports required UFFD features */ + if (!ram_write_tracking_available()) { + return WT_SUPPORT_ABSENT; + } + /* + * Check if current memory configuration is + * compatible with required UFFD features. + */ + if (!ram_write_tracking_compatible()) { + return WT_SUPPORT_AVAILABLE; + } + + return WT_SUPPORT_COMPATIBLE; +} + /** * @migration_caps_check - check capability validity * @@ -1150,6 +1231,39 @@ static bool migrate_caps_check(bool *cap_list, } } + if (cap_list[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) { + WriteTrackingSupport wt_support; + int idx; + /* + * Check if 'background-snapshot' capability is supported by + * host kernel and compatible with guest memory configuration. + */ + wt_support = migrate_query_write_tracking(); + if (wt_support < WT_SUPPORT_AVAILABLE) { + error_setg(errp, "Background-snapshot is not supported by host kernel"); + return false; + } + if (wt_support < WT_SUPPORT_COMPATIBLE) { + error_setg(errp, "Background-snapshot is not compatible " + "with guest memory configuration"); + return false; + } + + /* + * Check if there are any migration capabilities + * incompatible with 'background-snapshot'. + */ + for (idx = 0; idx < check_caps_background_snapshot.size; idx++) { + int incomp_cap = check_caps_background_snapshot.caps[idx]; + if (cap_list[incomp_cap]) { + error_setg(errp, + "Background-snapshot is not compatible with %s", + MigrationCapability_str(incomp_cap)); + return false; + } + } + } + return true; } @@ -1226,21 +1340,21 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp) if (params->has_compress_level && (params->compress_level > 9)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level", - "is invalid, it should be in the range of 0 to 9"); + "a value between 0 and 9"); return false; } if (params->has_compress_threads && (params->compress_threads < 1)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_threads", - "is invalid, it should be in the range of 1 to 255"); + "a value between 1 and 255"); return false; } if (params->has_decompress_threads && (params->decompress_threads < 1)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "decompress_threads", - "is invalid, it should be in the range of 1 to 255"); + "a value between 1 and 255"); return false; } @@ -1293,21 +1407,21 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp) if (params->has_multifd_channels && (params->multifd_channels < 1)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_channels", - "is invalid, it should be in the range of 1 to 255"); + "a value between 1 and 255"); return false; } if (params->has_multifd_zlib_level && (params->multifd_zlib_level > 9)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zlib_level", - "is invalid, it should be in the range of 0 to 9"); + "a value between 0 and 9"); return false; } if (params->has_multifd_zstd_level && (params->multifd_zstd_level > 20)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zstd_level", - "is invalid, it should be in the range of 0 to 20"); + "a value between 0 and 20"); return false; } @@ -1316,8 +1430,7 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp) !is_power_of_2(params->xbzrle_cache_size))) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "xbzrle_cache_size", - "is invalid, it should be bigger than target page size" - " and a power of 2"); + "a power of two no less than the target page size"); return false; } @@ -1334,21 +1447,21 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp) params->announce_initial > 100000) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "announce_initial", - "is invalid, it must be less than 100000 ms"); + "a value between 0 and 100000"); return false; } if (params->has_announce_max && params->announce_max > 100000) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "announce_max", - "is invalid, it must be less than 100000 ms"); + "a value between 0 and 100000"); return false; } if (params->has_announce_rounds && params->announce_rounds > 1000) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "announce_rounds", - "is invalid, it must be in the range of 0 to 1000"); + "a value between 0 and 1000"); return false; } if (params->has_announce_step && @@ -1356,7 +1469,7 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp) params->announce_step > 10000)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "announce_step", - "is invalid, it must be in the range of 1 to 10000 ms"); + "a value between 0 and 10000"); return false; } @@ -1909,6 +2022,7 @@ void migrate_init(MigrationState *s) * locks. */ s->cleanup_bh = 0; + s->vm_start_bh = 0; s->to_dst_file = NULL; s->state = MIGRATION_STATUS_NONE; s->rp_state.from_dst_file = NULL; @@ -1934,8 +2048,6 @@ void migrate_init(MigrationState *s) s->threshold_size = 0; } -static GSList *migration_blockers; - int migrate_add_blocker(Error *reason, Error **errp) { if (only_migratable) { @@ -2216,7 +2328,7 @@ void qmp_migrate_set_cache_size(int64_t value, Error **errp) qmp_migrate_set_parameters(&p, errp); } -int64_t qmp_query_migrate_cache_size(Error **errp) +uint64_t qmp_query_migrate_cache_size(Error **errp) { return migrate_xbzrle_cache_size(); } @@ -2446,7 +2558,7 @@ int migrate_use_xbzrle(void) return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE]; } -int64_t migrate_xbzrle_cache_size(void) +uint64_t migrate_xbzrle_cache_size(void) { MigrationState *s; @@ -2491,6 +2603,15 @@ bool migrate_use_block_incremental(void) return s->parameters.block_incremental; } +bool migrate_background_snapshot(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->enabled_capabilities[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]; +} + /* migration thread support */ /* * Something bad happened to the RP stream, mark an error @@ -3117,6 +3238,50 @@ fail: MIGRATION_STATUS_FAILED); } +/** + * bg_migration_completion: Used by bg_migration_thread when after all the + * RAM has been saved. The caller 'breaks' the loop when this returns. + * + * @s: Current migration state + */ +static void bg_migration_completion(MigrationState *s) +{ + int current_active_state = s->state; + + /* + * Stop tracking RAM writes - un-protect memory, un-register UFFD + * memory ranges, flush kernel wait queues and wake up threads + * waiting for write fault to be resolved. + */ + ram_write_tracking_stop(); + + if (s->state == MIGRATION_STATUS_ACTIVE) { + /* + * By this moment we have RAM content saved into the migration stream. + * The next step is to flush the non-RAM content (device state) + * right after the ram content. The device state has been stored into + * the temporary buffer before RAM saving started. + */ + qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage); + qemu_fflush(s->to_dst_file); + } else if (s->state == MIGRATION_STATUS_CANCELLING) { + goto fail; + } + + if (qemu_file_get_error(s->to_dst_file)) { + trace_migration_completion_file_err(); + goto fail; + } + + migrate_set_state(&s->state, current_active_state, + MIGRATION_STATUS_COMPLETED); + return; + +fail: + migrate_set_state(&s->state, current_active_state, + MIGRATION_STATUS_FAILED); +} + bool migrate_colo_enabled(void) { MigrationState *s = migrate_get_current(); @@ -3457,6 +3622,47 @@ static void migration_iteration_finish(MigrationState *s) qemu_mutex_unlock_iothread(); } +static void bg_migration_iteration_finish(MigrationState *s) +{ + qemu_mutex_lock_iothread(); + switch (s->state) { + case MIGRATION_STATUS_COMPLETED: + migration_calculate_complete(s); + break; + + case MIGRATION_STATUS_ACTIVE: + case MIGRATION_STATUS_FAILED: + case MIGRATION_STATUS_CANCELLED: + case MIGRATION_STATUS_CANCELLING: + break; + + default: + /* Should not reach here, but if so, forgive the VM. */ + error_report("%s: Unknown ending state %d", __func__, s->state); + break; + } + + migrate_fd_cleanup_schedule(s); + qemu_mutex_unlock_iothread(); +} + +/* + * Return true if continue to the next iteration directly, false + * otherwise. + */ +static MigIterateState bg_migration_iteration_run(MigrationState *s) +{ + int res; + + res = qemu_savevm_state_iterate(s->to_dst_file, false); + if (res > 0) { + bg_migration_completion(s); + return MIG_ITERATE_BREAK; + } + + return MIG_ITERATE_RESUME; +} + void migration_make_urgent_request(void) { qemu_sem_post(&migrate_get_current()->rate_limit_sem); @@ -3604,6 +3810,165 @@ static void *migration_thread(void *opaque) return NULL; } +static void bg_migration_vm_start_bh(void *opaque) +{ + MigrationState *s = opaque; + + qemu_bh_delete(s->vm_start_bh); + s->vm_start_bh = NULL; + + vm_start(); + s->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - s->downtime_start; +} + +/** + * Background snapshot thread, based on live migration code. + * This is an alternative implementation of live migration mechanism + * introduced specifically to support background snapshots. + * + * It takes advantage of userfault_fd write protection mechanism introduced + * in v5.7 kernel. Compared to existing dirty page logging migration much + * lesser stream traffic is produced resulting in smaller snapshot images, + * simply cause of no page duplicates can get into the stream. + * + * Another key point is that generated vmstate stream reflects machine state + * 'frozen' at the beginning of snapshot creation compared to dirty page logging + * mechanism, which effectively results in that saved snapshot is the state of VM + * at the end of the process. + */ +static void *bg_migration_thread(void *opaque) +{ + MigrationState *s = opaque; + int64_t setup_start; + MigThrError thr_error; + QEMUFile *fb; + bool early_fail = true; + + rcu_register_thread(); + object_ref(OBJECT(s)); + + qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX); + + setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST); + /* + * We want to save vmstate for the moment when migration has been + * initiated but also we want to save RAM content while VM is running. + * The RAM content should appear first in the vmstate. So, we first + * stash the non-RAM part of the vmstate to the temporary buffer, + * then write RAM part of the vmstate to the migration stream + * with vCPUs running and, finally, write stashed non-RAM part of + * the vmstate from the buffer to the migration stream. + */ + s->bioc = qio_channel_buffer_new(128 * 1024); + qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer"); + fb = qemu_fopen_channel_output(QIO_CHANNEL(s->bioc)); + object_unref(OBJECT(s->bioc)); + + update_iteration_initial_status(s); + + qemu_savevm_state_header(s->to_dst_file); + qemu_savevm_state_setup(s->to_dst_file); + + if (qemu_savevm_state_guest_unplug_pending()) { + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_WAIT_UNPLUG); + + while (s->state == MIGRATION_STATUS_WAIT_UNPLUG && + qemu_savevm_state_guest_unplug_pending()) { + qemu_sem_timedwait(&s->wait_unplug_sem, 250); + } + + migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, + MIGRATION_STATUS_ACTIVE); + } else { + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_ACTIVE); + } + s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; + + trace_migration_thread_setup_complete(); + s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + + qemu_mutex_lock_iothread(); + + /* + * If VM is currently in suspended state, then, to make a valid runstate + * transition in vm_stop_force_state() we need to wakeup it up. + */ + qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL); + s->vm_was_running = runstate_is_running(); + + if (global_state_store()) { + goto fail; + } + /* Forcibly stop VM before saving state of vCPUs and devices */ + if (vm_stop_force_state(RUN_STATE_PAUSED)) { + goto fail; + } + /* + * Put vCPUs in sync with shadow context structures, then + * save their state to channel-buffer along with devices. + */ + cpu_synchronize_all_states(); + if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) { + goto fail; + } + /* Now initialize UFFD context and start tracking RAM writes */ + if (ram_write_tracking_start()) { + goto fail; + } + early_fail = false; + + /* + * Start VM from BH handler to avoid write-fault lock here. + * UFFD-WP protection for the whole RAM is already enabled so + * calling VM state change notifiers from vm_start() would initiate + * writes to virtio VQs memory which is in write-protected region. + */ + s->vm_start_bh = qemu_bh_new(bg_migration_vm_start_bh, s); + qemu_bh_schedule(s->vm_start_bh); + + qemu_mutex_unlock_iothread(); + + while (migration_is_active(s)) { + MigIterateState iter_state = bg_migration_iteration_run(s); + if (iter_state == MIG_ITERATE_SKIP) { + continue; + } else if (iter_state == MIG_ITERATE_BREAK) { + break; + } + + /* + * Try to detect any kind of failures, and see whether we + * should stop the migration now. + */ + thr_error = migration_detect_error(s); + if (thr_error == MIG_THR_ERR_FATAL) { + /* Stop migration */ + break; + } + + migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME)); + } + + trace_migration_thread_after_loop(); + +fail: + if (early_fail) { + migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, + MIGRATION_STATUS_FAILED); + qemu_mutex_unlock_iothread(); + } + + bg_migration_iteration_finish(s); + + qemu_fclose(fb); + object_unref(OBJECT(s)); + rcu_unregister_thread(); + + return NULL; +} + void migrate_fd_connect(MigrationState *s, Error *error_in) { Error *local_err = NULL; @@ -3667,8 +4032,14 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) migrate_fd_cleanup(s); return; } - qemu_thread_create(&s->thread, "live_migration", migration_thread, s, - QEMU_THREAD_JOINABLE); + + if (migrate_background_snapshot()) { + qemu_thread_create(&s->thread, "bg_snapshot", + bg_migration_thread, s, QEMU_THREAD_JOINABLE); + } else { + qemu_thread_create(&s->thread, "live_migration", + migration_thread, s, QEMU_THREAD_JOINABLE); + } s->migration_thread_running = true; } @@ -3784,6 +4155,8 @@ static Property migration_properties[] = { DEFINE_PROP_MIG_CAP("x-block", MIGRATION_CAPABILITY_BLOCK), DEFINE_PROP_MIG_CAP("x-return-path", MIGRATION_CAPABILITY_RETURN_PATH), DEFINE_PROP_MIG_CAP("x-multifd", MIGRATION_CAPABILITY_MULTIFD), + DEFINE_PROP_MIG_CAP("x-background-snapshot", + MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT), DEFINE_PROP_END_OF_LIST(), }; diff --git a/migration/migration.h b/migration/migration.h index d096b77f74..db6708326b 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -20,6 +20,7 @@ #include "qemu/thread.h" #include "qemu/coroutine_int.h" #include "io/channel.h" +#include "io/channel-buffer.h" #include "net/announce.h" #include "qom/object.h" @@ -147,8 +148,10 @@ struct MigrationState { /*< public >*/ QemuThread thread; + QEMUBH *vm_start_bh; QEMUBH *cleanup_bh; QEMUFile *to_dst_file; + QIOChannelBuffer *bioc; /* * Protects to_dst_file pointer. We need to make sure we won't * yield or hang during the critical section, since this lock will @@ -324,7 +327,7 @@ int migrate_multifd_zlib_level(void); int migrate_multifd_zstd_level(void); int migrate_use_xbzrle(void); -int64_t migrate_xbzrle_cache_size(void); +uint64_t migrate_xbzrle_cache_size(void); bool migrate_colo_enabled(void); bool migrate_use_block(void); @@ -341,6 +344,7 @@ int migrate_compress_wait_thread(void); int migrate_decompress_threads(void); bool migrate_use_events(void); bool migrate_postcopy_blocktime(void); +bool migrate_background_snapshot(void); /* Sending on the return path - generic and then for each message type */ void migrate_send_rp_shut(MigrationIncomingState *mis, diff --git a/migration/page_cache.c b/migration/page_cache.c index 098b436223..6d4f7a9bbc 100644 --- a/migration/page_cache.c +++ b/migration/page_cache.c @@ -38,7 +38,7 @@ struct PageCache { size_t num_items; }; -PageCache *cache_init(int64_t new_size, size_t page_size, Error **errp) +PageCache *cache_init(uint64_t new_size, size_t page_size, Error **errp) { int64_t i; size_t num_pages = new_size / page_size; @@ -60,8 +60,7 @@ PageCache *cache_init(int64_t new_size, size_t page_size, Error **errp) /* We prefer not to abort if there is no memory */ cache = g_try_malloc(sizeof(*cache)); if (!cache) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", - "Failed to allocate cache"); + error_setg(errp, "Failed to allocate cache"); return NULL; } cache->page_size = page_size; @@ -74,8 +73,7 @@ PageCache *cache_init(int64_t new_size, size_t page_size, Error **errp) cache->page_cache = g_try_malloc((cache->max_num_items) * sizeof(*cache->page_cache)); if (!cache->page_cache) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", - "Failed to allocate page cache"); + error_setg(errp, "Failed to allocate page cache"); g_free(cache); return NULL; } diff --git a/migration/page_cache.h b/migration/page_cache.h index 0cb94498a0..8733b4df6e 100644 --- a/migration/page_cache.h +++ b/migration/page_cache.h @@ -28,7 +28,7 @@ typedef struct PageCache PageCache; * @page_size: cache page size * @errp: set *errp if the check failed, with reason */ -PageCache *cache_init(int64_t cache_size, size_t page_size, Error **errp); +PageCache *cache_init(uint64_t cache_size, size_t page_size, Error **errp); /** * cache_fini: free all cache resources * @cache pointer to the PageCache struct diff --git a/migration/qemu-file.c b/migration/qemu-file.c index be21518c57..d6e03dbc0e 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -595,7 +595,7 @@ size_t qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size) { if (size < IO_BUF_SIZE) { size_t res; - uint8_t *src; + uint8_t *src = NULL; res = qemu_peek_buffer(f, &src, size, 0); diff --git a/migration/ram.c b/migration/ram.c index 7811cde643..72143da0ac 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -56,6 +56,11 @@ #include "savevm.h" #include "qemu/iov.h" #include "multifd.h" +#include "sysemu/runstate.h" + +#if defined(__linux__) +#include "qemu/userfaultfd.h" +#endif /* defined(__linux__) */ /***********************************************************/ /* ram save/restore */ @@ -126,7 +131,7 @@ static void XBZRLE_cache_unlock(void) * @new_size: new cache size * @errp: set *errp if the check failed, with reason */ -int xbzrle_cache_resize(int64_t new_size, Error **errp) +int xbzrle_cache_resize(uint64_t new_size, Error **errp) { PageCache *new_cache; int64_t ret = 0; @@ -298,6 +303,8 @@ struct RAMSrcPageRequest { struct RAMState { /* QEMUFile used for this migration */ QEMUFile *f; + /* UFFD file descriptor, used in 'write-tracking' migration */ + int uffdio_fd; /* Last block that we have visited searching for dirty pages */ RAMBlock *last_seen_block; /* Last block from where we have sent data */ @@ -1434,6 +1441,269 @@ static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) return block; } +#if defined(__linux__) +/** + * poll_fault_page: try to get next UFFD write fault page and, if pending fault + * is found, return RAM block pointer and page offset + * + * Returns pointer to the RAMBlock containing faulting page, + * NULL if no write faults are pending + * + * @rs: current RAM state + * @offset: page offset from the beginning of the block + */ +static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) +{ + struct uffd_msg uffd_msg; + void *page_address; + RAMBlock *bs; + int res; + + if (!migrate_background_snapshot()) { + return NULL; + } + + res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); + if (res <= 0) { + return NULL; + } + + page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address; + bs = qemu_ram_block_from_host(page_address, false, offset); + assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0); + return bs; +} + +/** + * ram_save_release_protection: release UFFD write protection after + * a range of pages has been saved + * + * @rs: current RAM state + * @pss: page-search-status structure + * @start_page: index of the first page in the range relative to pss->block + * + * Returns 0 on success, negative value in case of an error +*/ +static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, + unsigned long start_page) +{ + int res = 0; + + /* Check if page is from UFFD-managed region. */ + if (pss->block->flags & RAM_UF_WRITEPROTECT) { + void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS); + uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS; + + /* Flush async buffers before un-protect. */ + qemu_fflush(rs->f); + /* Un-protect memory range. */ + res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, + false, false); + } + + return res; +} + +/* ram_write_tracking_available: check if kernel supports required UFFD features + * + * Returns true if supports, false otherwise + */ +bool ram_write_tracking_available(void) +{ + uint64_t uffd_features; + int res; + + res = uffd_query_features(&uffd_features); + return (res == 0 && + (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); +} + +/* ram_write_tracking_compatible: check if guest configuration is + * compatible with 'write-tracking' + * + * Returns true if compatible, false otherwise + */ +bool ram_write_tracking_compatible(void) +{ + const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); + int uffd_fd; + RAMBlock *bs; + bool ret = false; + + /* Open UFFD file descriptor */ + uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); + if (uffd_fd < 0) { + return false; + } + + RCU_READ_LOCK_GUARD(); + + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { + uint64_t uffd_ioctls; + + /* Nothing to do with read-only and MMIO-writable regions */ + if (bs->mr->readonly || bs->mr->rom_device) { + continue; + } + /* Try to register block memory via UFFD-IO to track writes */ + if (uffd_register_memory(uffd_fd, bs->host, bs->max_length, + UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { + goto out; + } + if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { + goto out; + } + } + ret = true; + +out: + uffd_close_fd(uffd_fd); + return ret; +} + +/* + * ram_write_tracking_start: start UFFD-WP memory tracking + * + * Returns 0 for success or negative value in case of error + */ +int ram_write_tracking_start(void) +{ + int uffd_fd; + RAMState *rs = ram_state; + RAMBlock *bs; + + /* Open UFFD file descriptor */ + uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); + if (uffd_fd < 0) { + return uffd_fd; + } + rs->uffdio_fd = uffd_fd; + + RCU_READ_LOCK_GUARD(); + + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { + /* Nothing to do with read-only and MMIO-writable regions */ + if (bs->mr->readonly || bs->mr->rom_device) { + continue; + } + + /* Register block memory with UFFD to track writes */ + if (uffd_register_memory(rs->uffdio_fd, bs->host, + bs->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { + goto fail; + } + /* Apply UFFD write protection to the block memory range */ + if (uffd_change_protection(rs->uffdio_fd, bs->host, + bs->max_length, true, false)) { + goto fail; + } + bs->flags |= RAM_UF_WRITEPROTECT; + memory_region_ref(bs->mr); + + trace_ram_write_tracking_ramblock_start(bs->idstr, bs->page_size, + bs->host, bs->max_length); + } + + return 0; + +fail: + error_report("ram_write_tracking_start() failed: restoring initial memory state"); + + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { + if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) { + continue; + } + /* + * In case some memory block failed to be write-protected + * remove protection and unregister all succeeded RAM blocks + */ + uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false); + uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length); + /* Cleanup flags and remove reference */ + bs->flags &= ~RAM_UF_WRITEPROTECT; + memory_region_unref(bs->mr); + } + + uffd_close_fd(uffd_fd); + rs->uffdio_fd = -1; + return -1; +} + +/** + * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection + */ +void ram_write_tracking_stop(void) +{ + RAMState *rs = ram_state; + RAMBlock *bs; + + RCU_READ_LOCK_GUARD(); + + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { + if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) { + continue; + } + /* Remove protection and unregister all affected RAM blocks */ + uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false); + uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length); + + trace_ram_write_tracking_ramblock_stop(bs->idstr, bs->page_size, + bs->host, bs->max_length); + + /* Cleanup flags and remove reference */ + bs->flags &= ~RAM_UF_WRITEPROTECT; + memory_region_unref(bs->mr); + } + + /* Finally close UFFD file descriptor */ + uffd_close_fd(rs->uffdio_fd); + rs->uffdio_fd = -1; +} + +#else +/* No target OS support, stubs just fail or ignore */ + +static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) +{ + (void) rs; + (void) offset; + + return NULL; +} + +static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, + unsigned long start_page) +{ + (void) rs; + (void) pss; + (void) start_page; + + return 0; +} + +bool ram_write_tracking_available(void) +{ + return false; +} + +bool ram_write_tracking_compatible(void) +{ + assert(0); + return false; +} + +int ram_write_tracking_start(void) +{ + assert(0); + return -1; +} + +void ram_write_tracking_stop(void) +{ + assert(0); +} +#endif /* defined(__linux__) */ + /** * get_queued_page: unqueue a page from the postcopy requests * @@ -1473,6 +1743,14 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) } while (block && !dirty); + if (!block) { + /* + * Poll write faults too if background snapshot is enabled; that's + * when we have vcpus got blocked by the write protected pages. + */ + block = poll_fault_page(rs, &offset); + } + if (block) { /* * As soon as we start servicing pages out of order, then we have @@ -1715,6 +1993,8 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, int tmppages, pages = 0; size_t pagesize_bits = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; + unsigned long start_page = pss->page; + int res; if (ramblock_is_ignored(pss->block)) { error_report("block %s should not be migrated !", pss->block->idstr); @@ -1740,10 +2020,11 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, } while ((pss->page & (pagesize_bits - 1)) && offset_in_ramblock(pss->block, ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)); - /* The offset we leave with is the last one we looked at */ pss->page--; - return pages; + + res = ram_save_release_protection(rs, pss, start_page); + return (res < 0 ? res : pages); } /** @@ -1880,10 +2161,13 @@ static void ram_save_cleanup(void *opaque) RAMState **rsp = opaque; RAMBlock *block; - /* caller have hold iothread lock or is in a bh, so there is - * no writing race against the migration bitmap - */ - memory_global_dirty_log_stop(); + /* We don't use dirty log with background snapshots */ + if (!migrate_background_snapshot()) { + /* caller have hold iothread lock or is in a bh, so there is + * no writing race against the migration bitmap + */ + memory_global_dirty_log_stop(); + } RAMBLOCK_FOREACH_NOT_IGNORED(block) { g_free(block->clear_bmap); @@ -2343,8 +2627,11 @@ static void ram_init_bitmaps(RAMState *rs) WITH_RCU_READ_LOCK_GUARD() { ram_list_init_bitmaps(); - memory_global_dirty_log_start(); - migration_bitmap_sync_precopy(rs); + /* We don't use dirty log with background snapshots */ + if (!migrate_background_snapshot()) { + memory_global_dirty_log_start(); + migration_bitmap_sync_precopy(rs); + } } qemu_mutex_unlock_ramlist(); qemu_mutex_unlock_iothread(); @@ -3521,7 +3808,7 @@ static int ram_load_precopy(QEMUFile *f) } } /* For postcopy we need to check hugepage sizes match */ - if (postcopy_advised && + if (postcopy_advised && migrate_postcopy_ram() && block->page_size != qemu_host_page_size) { uint64_t remote_page_size = qemu_get_be64(f); if (remote_page_size != block->page_size) { diff --git a/migration/ram.h b/migration/ram.h index 011e85414e..6378bb3ebc 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -47,7 +47,7 @@ bool ramblock_is_ignored(RAMBlock *block); INTERNAL_RAMBLOCK_FOREACH(block) \ if (!qemu_ram_is_migratable(block)) {} else -int xbzrle_cache_resize(int64_t new_size, Error **errp); +int xbzrle_cache_resize(uint64_t new_size, Error **errp); uint64_t ram_bytes_remaining(void); uint64_t ram_bytes_total(void); @@ -79,4 +79,10 @@ void colo_flush_ram_cache(void); void colo_release_ram_cache(void); void colo_incoming_start_dirty_log(void); +/* Background snapshot */ +bool ram_write_tracking_available(void); +bool ram_write_tracking_compatible(void); +int ram_write_tracking_start(void); +void ram_write_tracking_stop(void); + #endif diff --git a/migration/savevm.c b/migration/savevm.c index 4f3b69ecfc..52e2d72e4b 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -43,6 +43,8 @@ #include "qapi/error.h" #include "qapi/qapi-commands-migration.h" #include "qapi/qmp/json-writer.h" +#include "qapi/clone-visitor.h" +#include "qapi/qapi-builtin-visit.h" #include "qapi/qmp/qerror.h" #include "qemu/error-report.h" #include "sysemu/cpus.h" @@ -315,6 +317,16 @@ static int configuration_pre_save(void *opaque) return 0; } +static int configuration_post_save(void *opaque) +{ + SaveState *state = opaque; + + g_free(state->capabilities); + state->capabilities = NULL; + state->caps_count = 0; + return 0; +} + static int configuration_pre_load(void *opaque) { SaveState *state = opaque; @@ -365,24 +377,36 @@ static int configuration_post_load(void *opaque, int version_id) { SaveState *state = opaque; const char *current_name = MACHINE_GET_CLASS(current_machine)->name; + int ret = 0; if (strncmp(state->name, current_name, state->len) != 0) { error_report("Machine type received is '%.*s' and local is '%s'", (int) state->len, state->name, current_name); - return -EINVAL; + ret = -EINVAL; + goto out; } if (state->target_page_bits != qemu_target_page_bits()) { error_report("Received TARGET_PAGE_BITS is %d but local is %d", state->target_page_bits, qemu_target_page_bits()); - return -EINVAL; + ret = -EINVAL; + goto out; } if (!configuration_validate_capabilities(state)) { - return -EINVAL; + ret = -EINVAL; + goto out; } - return 0; +out: + g_free((void *)state->name); + state->name = NULL; + state->len = 0; + g_free(state->capabilities); + state->capabilities = NULL; + state->caps_count = 0; + + return ret; } static int get_capability(QEMUFile *f, void *pv, size_t size, @@ -516,6 +540,7 @@ static const VMStateDescription vmstate_configuration = { .pre_load = configuration_pre_load, .post_load = configuration_post_load, .pre_save = configuration_pre_save, + .post_save = configuration_post_save, .fields = (VMStateField[]) { VMSTATE_UINT32(len, SaveState), VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len), @@ -1131,6 +1156,19 @@ bool qemu_savevm_state_blocked(Error **errp) return false; } +void qemu_savevm_non_migratable_list(strList **reasons) +{ + SaveStateEntry *se; + + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + if (se->vmsd && se->vmsd->unmigratable) { + QAPI_LIST_PREPEND(*reasons, + g_strdup_printf("non-migratable device: %s", + se->idstr)); + } + } +} + void qemu_savevm_state_header(QEMUFile *f) { trace_savevm_state_header(); @@ -1355,7 +1393,6 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) return 0; } -static int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, bool in_postcopy, bool inactivate_disks) @@ -2729,9 +2766,10 @@ int qemu_load_device_state(QEMUFile *f) return 0; } -int save_snapshot(const char *name, Error **errp) +bool save_snapshot(const char *name, bool overwrite, const char *vmstate, + bool has_devices, strList *devices, Error **errp) { - BlockDriverState *bs, *bs1; + BlockDriverState *bs; QEMUSnapshotInfo sn1, *sn = &sn1; int ret = -1, ret2; QEMUFile *f; @@ -2742,35 +2780,43 @@ int save_snapshot(const char *name, Error **errp) AioContext *aio_context; if (migration_is_blocked(errp)) { - return ret; + return false; } if (!replay_can_snapshot()) { error_setg(errp, "Record/replay does not allow making snapshot " "right now. Try once more later."); - return ret; + return false; } - if (!bdrv_all_can_snapshot(&bs)) { - error_setg(errp, "Device '%s' is writable but does not support " - "snapshots", bdrv_get_device_or_node_name(bs)); - return ret; + if (!bdrv_all_can_snapshot(has_devices, devices, errp)) { + return false; } /* Delete old snapshots of the same name */ if (name) { - ret = bdrv_all_delete_snapshot(name, &bs1, errp); - if (ret < 0) { - error_prepend(errp, "Error while deleting snapshot on device " - "'%s': ", bdrv_get_device_or_node_name(bs1)); - return ret; + if (overwrite) { + if (bdrv_all_delete_snapshot(name, has_devices, + devices, errp) < 0) { + return false; + } + } else { + ret2 = bdrv_all_has_snapshot(name, has_devices, devices, errp); + if (ret2 < 0) { + return false; + } + if (ret2 == 1) { + error_setg(errp, + "Snapshot '%s' already exists in one or more devices", + name); + return false; + } } } - bs = bdrv_all_find_vmstate_bs(); + bs = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp); if (bs == NULL) { - error_setg(errp, "No block device can accept snapshots"); - return ret; + return false; } aio_context = bdrv_get_aio_context(bs); @@ -2779,7 +2825,7 @@ int save_snapshot(const char *name, Error **errp) ret = global_state_store(); if (ret) { error_setg(errp, "Error saving global state"); - return ret; + return false; } vm_stop(RUN_STATE_SAVE_VM); @@ -2833,11 +2879,10 @@ int save_snapshot(const char *name, Error **errp) aio_context_release(aio_context); aio_context = NULL; - ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, &bs); + ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, + has_devices, devices, errp); if (ret < 0) { - error_setg(errp, "Error while creating snapshot on '%s'", - bdrv_get_device_or_node_name(bs)); - bdrv_all_delete_snapshot(sn->name, &bs, NULL); + bdrv_all_delete_snapshot(sn->name, has_devices, devices, NULL); goto the_end; } @@ -2853,7 +2898,7 @@ int save_snapshot(const char *name, Error **errp) if (saved_vm_running) { vm_start(); } - return ret; + return ret == 0; } void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live, @@ -2938,33 +2983,32 @@ void qmp_xen_load_devices_state(const char *filename, Error **errp) migration_incoming_state_destroy(); } -int load_snapshot(const char *name, Error **errp) +bool load_snapshot(const char *name, const char *vmstate, + bool has_devices, strList *devices, Error **errp) { - BlockDriverState *bs, *bs_vm_state; + BlockDriverState *bs_vm_state; QEMUSnapshotInfo sn; QEMUFile *f; int ret; AioContext *aio_context; MigrationIncomingState *mis = migration_incoming_get_current(); - if (!bdrv_all_can_snapshot(&bs)) { - error_setg(errp, - "Device '%s' is writable but does not support snapshots", - bdrv_get_device_or_node_name(bs)); - return -ENOTSUP; + if (!bdrv_all_can_snapshot(has_devices, devices, errp)) { + return false; } - ret = bdrv_all_find_snapshot(name, &bs); + ret = bdrv_all_has_snapshot(name, has_devices, devices, errp); if (ret < 0) { - error_setg(errp, - "Device '%s' does not have the requested snapshot '%s'", - bdrv_get_device_or_node_name(bs), name); - return ret; + return false; + } + if (ret == 0) { + error_setg(errp, "Snapshot '%s' does not exist in one or more devices", + name); + return false; } - bs_vm_state = bdrv_all_find_vmstate_bs(); + bs_vm_state = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp); if (!bs_vm_state) { - error_setg(errp, "No block device supports snapshots"); - return -ENOTSUP; + return false; } aio_context = bdrv_get_aio_context(bs_vm_state); @@ -2973,11 +3017,11 @@ int load_snapshot(const char *name, Error **errp) ret = bdrv_snapshot_find(bs_vm_state, &sn, name); aio_context_release(aio_context); if (ret < 0) { - return ret; + return false; } else if (sn.vm_state_size == 0) { error_setg(errp, "This is a disk-only snapshot. Revert to it " " offline using qemu-img"); - return -EINVAL; + return false; } /* @@ -2989,10 +3033,8 @@ int load_snapshot(const char *name, Error **errp) /* Flush all IO requests so they don't interfere with the new state. */ bdrv_drain_all_begin(); - ret = bdrv_all_goto_snapshot(name, &bs, errp); + ret = bdrv_all_goto_snapshot(name, has_devices, devices, errp); if (ret < 0) { - error_prepend(errp, "Could not load snapshot '%s' on '%s': ", - name, bdrv_get_device_or_node_name(bs)); goto err_drain; } @@ -3000,7 +3042,6 @@ int load_snapshot(const char *name, Error **errp) f = qemu_fopen_bdrv(bs_vm_state, 0); if (!f) { error_setg(errp, "Could not open VM state file"); - ret = -EINVAL; goto err_drain; } @@ -3020,14 +3061,28 @@ int load_snapshot(const char *name, Error **errp) if (ret < 0) { error_setg(errp, "Error %d while loading VM state", ret); - return ret; + return false; } - return 0; + return true; err_drain: bdrv_drain_all_end(); - return ret; + return false; +} + +bool delete_snapshot(const char *name, bool has_devices, + strList *devices, Error **errp) +{ + if (!bdrv_all_can_snapshot(has_devices, devices, errp)) { + return false; + } + + if (bdrv_all_delete_snapshot(name, has_devices, devices, errp) < 0) { + return false; + } + + return true; } void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev) @@ -3057,3 +3112,187 @@ bool vmstate_check_only_migratable(const VMStateDescription *vmsd) return !(vmsd && vmsd->unmigratable); } + +typedef struct SnapshotJob { + Job common; + char *tag; + char *vmstate; + strList *devices; + Coroutine *co; + Error **errp; + bool ret; +} SnapshotJob; + +static void qmp_snapshot_job_free(SnapshotJob *s) +{ + g_free(s->tag); + g_free(s->vmstate); + qapi_free_strList(s->devices); +} + + +static void snapshot_load_job_bh(void *opaque) +{ + Job *job = opaque; + SnapshotJob *s = container_of(job, SnapshotJob, common); + int orig_vm_running; + + job_progress_set_remaining(&s->common, 1); + + orig_vm_running = runstate_is_running(); + vm_stop(RUN_STATE_RESTORE_VM); + + s->ret = load_snapshot(s->tag, s->vmstate, true, s->devices, s->errp); + if (s->ret && orig_vm_running) { + vm_start(); + } + + job_progress_update(&s->common, 1); + + qmp_snapshot_job_free(s); + aio_co_wake(s->co); +} + +static void snapshot_save_job_bh(void *opaque) +{ + Job *job = opaque; + SnapshotJob *s = container_of(job, SnapshotJob, common); + + job_progress_set_remaining(&s->common, 1); + s->ret = save_snapshot(s->tag, false, s->vmstate, + true, s->devices, s->errp); + job_progress_update(&s->common, 1); + + qmp_snapshot_job_free(s); + aio_co_wake(s->co); +} + +static void snapshot_delete_job_bh(void *opaque) +{ + Job *job = opaque; + SnapshotJob *s = container_of(job, SnapshotJob, common); + + job_progress_set_remaining(&s->common, 1); + s->ret = delete_snapshot(s->tag, true, s->devices, s->errp); + job_progress_update(&s->common, 1); + + qmp_snapshot_job_free(s); + aio_co_wake(s->co); +} + +static int coroutine_fn snapshot_save_job_run(Job *job, Error **errp) +{ + SnapshotJob *s = container_of(job, SnapshotJob, common); + s->errp = errp; + s->co = qemu_coroutine_self(); + aio_bh_schedule_oneshot(qemu_get_aio_context(), + snapshot_save_job_bh, job); + qemu_coroutine_yield(); + return s->ret ? 0 : -1; +} + +static int coroutine_fn snapshot_load_job_run(Job *job, Error **errp) +{ + SnapshotJob *s = container_of(job, SnapshotJob, common); + s->errp = errp; + s->co = qemu_coroutine_self(); + aio_bh_schedule_oneshot(qemu_get_aio_context(), + snapshot_load_job_bh, job); + qemu_coroutine_yield(); + return s->ret ? 0 : -1; +} + +static int coroutine_fn snapshot_delete_job_run(Job *job, Error **errp) +{ + SnapshotJob *s = container_of(job, SnapshotJob, common); + s->errp = errp; + s->co = qemu_coroutine_self(); + aio_bh_schedule_oneshot(qemu_get_aio_context(), + snapshot_delete_job_bh, job); + qemu_coroutine_yield(); + return s->ret ? 0 : -1; +} + + +static const JobDriver snapshot_load_job_driver = { + .instance_size = sizeof(SnapshotJob), + .job_type = JOB_TYPE_SNAPSHOT_LOAD, + .run = snapshot_load_job_run, +}; + +static const JobDriver snapshot_save_job_driver = { + .instance_size = sizeof(SnapshotJob), + .job_type = JOB_TYPE_SNAPSHOT_SAVE, + .run = snapshot_save_job_run, +}; + +static const JobDriver snapshot_delete_job_driver = { + .instance_size = sizeof(SnapshotJob), + .job_type = JOB_TYPE_SNAPSHOT_DELETE, + .run = snapshot_delete_job_run, +}; + + +void qmp_snapshot_save(const char *job_id, + const char *tag, + const char *vmstate, + strList *devices, + Error **errp) +{ + SnapshotJob *s; + + s = job_create(job_id, &snapshot_save_job_driver, NULL, + qemu_get_aio_context(), JOB_MANUAL_DISMISS, + NULL, NULL, errp); + if (!s) { + return; + } + + s->tag = g_strdup(tag); + s->vmstate = g_strdup(vmstate); + s->devices = QAPI_CLONE(strList, devices); + + job_start(&s->common); +} + +void qmp_snapshot_load(const char *job_id, + const char *tag, + const char *vmstate, + strList *devices, + Error **errp) +{ + SnapshotJob *s; + + s = job_create(job_id, &snapshot_load_job_driver, NULL, + qemu_get_aio_context(), JOB_MANUAL_DISMISS, + NULL, NULL, errp); + if (!s) { + return; + } + + s->tag = g_strdup(tag); + s->vmstate = g_strdup(vmstate); + s->devices = QAPI_CLONE(strList, devices); + + job_start(&s->common); +} + +void qmp_snapshot_delete(const char *job_id, + const char *tag, + strList *devices, + Error **errp) +{ + SnapshotJob *s; + + s = job_create(job_id, &snapshot_delete_job_driver, NULL, + qemu_get_aio_context(), JOB_MANUAL_DISMISS, + NULL, NULL, errp); + if (!s) { + return; + } + + s->tag = g_strdup(tag); + s->devices = QAPI_CLONE(strList, devices); + + job_start(&s->common); +} diff --git a/migration/savevm.h b/migration/savevm.h index ba64a7e271..6461342cb4 100644 --- a/migration/savevm.h +++ b/migration/savevm.h @@ -30,6 +30,7 @@ #define QEMU_VM_SECTION_FOOTER 0x7e bool qemu_savevm_state_blocked(Error **errp); +void qemu_savevm_non_migratable_list(strList **reasons); void qemu_savevm_state_setup(QEMUFile *f); bool qemu_savevm_state_guest_unplug_pending(void); int qemu_savevm_state_resume_prepare(MigrationState *s); @@ -64,5 +65,7 @@ int qemu_loadvm_state(QEMUFile *f); void qemu_loadvm_state_cleanup(void); int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); int qemu_load_device_state(QEMUFile *f); +int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, + bool in_postcopy, bool inactivate_disks); #endif diff --git a/migration/trace-events b/migration/trace-events index 75de5004ac..668c562fed 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -111,6 +111,8 @@ save_xbzrle_page_skipping(void) "" save_xbzrle_page_overflow(void) "" ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations" ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64 +ram_write_tracking_ramblock_start(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu" +ram_write_tracking_ramblock_stop(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu" # multifd.c multifd_new_send_channel_async(uint8_t id) "channel %d" diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c index a48bc1e904..3c88a4faef 100644 --- a/monitor/hmp-cmds.c +++ b/monitor/hmp-cmds.c @@ -224,6 +224,15 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict) migration_global_dump(mon); + if (info->blocked) { + strList *reasons = info->blocked_reasons; + monitor_printf(mon, "Outgoing migration blocked:\n"); + while (reasons) { + monitor_printf(mon, " %s\n", reasons->value); + reasons = reasons->next; + } + } + if (info->has_status) { monitor_printf(mon, "Migration status: %s", MigrationStatus_str(info->status)); @@ -1130,7 +1139,7 @@ void hmp_loadvm(Monitor *mon, const QDict *qdict) vm_stop(RUN_STATE_RESTORE_VM); - if (load_snapshot(name, &err) == 0 && saved_vm_running) { + if (!load_snapshot(name, NULL, false, NULL, &err) && saved_vm_running) { vm_start(); } hmp_handle_error(mon, err); @@ -1140,21 +1149,17 @@ void hmp_savevm(Monitor *mon, const QDict *qdict) { Error *err = NULL; - save_snapshot(qdict_get_try_str(qdict, "name"), &err); + save_snapshot(qdict_get_try_str(qdict, "name"), + true, NULL, false, NULL, &err); hmp_handle_error(mon, err); } void hmp_delvm(Monitor *mon, const QDict *qdict) { - BlockDriverState *bs; Error *err = NULL; const char *name = qdict_get_str(qdict, "name"); - if (bdrv_all_delete_snapshot(name, &bs, &err) < 0) { - error_prepend(&err, - "deleting snapshot on device '%s': ", - bdrv_get_device_name(bs)); - } + delete_snapshot(name, false, NULL, &err); hmp_handle_error(mon, err); } @@ -1294,11 +1299,11 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) switch (val) { case MIGRATION_PARAMETER_COMPRESS_LEVEL: p->has_compress_level = true; - visit_type_int(v, param, &p->compress_level, &err); + visit_type_uint8(v, param, &p->compress_level, &err); break; case MIGRATION_PARAMETER_COMPRESS_THREADS: p->has_compress_threads = true; - visit_type_int(v, param, &p->compress_threads, &err); + visit_type_uint8(v, param, &p->compress_threads, &err); break; case MIGRATION_PARAMETER_COMPRESS_WAIT_THREAD: p->has_compress_wait_thread = true; @@ -1306,19 +1311,19 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) break; case MIGRATION_PARAMETER_DECOMPRESS_THREADS: p->has_decompress_threads = true; - visit_type_int(v, param, &p->decompress_threads, &err); + visit_type_uint8(v, param, &p->decompress_threads, &err); break; case MIGRATION_PARAMETER_THROTTLE_TRIGGER_THRESHOLD: p->has_throttle_trigger_threshold = true; - visit_type_int(v, param, &p->throttle_trigger_threshold, &err); + visit_type_uint8(v, param, &p->throttle_trigger_threshold, &err); break; case MIGRATION_PARAMETER_CPU_THROTTLE_INITIAL: p->has_cpu_throttle_initial = true; - visit_type_int(v, param, &p->cpu_throttle_initial, &err); + visit_type_uint8(v, param, &p->cpu_throttle_initial, &err); break; case MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT: p->has_cpu_throttle_increment = true; - visit_type_int(v, param, &p->cpu_throttle_increment, &err); + visit_type_uint8(v, param, &p->cpu_throttle_increment, &err); break; case MIGRATION_PARAMETER_CPU_THROTTLE_TAILSLOW: p->has_cpu_throttle_tailslow = true; @@ -1326,7 +1331,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) break; case MIGRATION_PARAMETER_MAX_CPU_THROTTLE: p->has_max_cpu_throttle = true; - visit_type_int(v, param, &p->max_cpu_throttle, &err); + visit_type_uint8(v, param, &p->max_cpu_throttle, &err); break; case MIGRATION_PARAMETER_TLS_CREDS: p->has_tls_creds = true; @@ -1362,11 +1367,11 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) break; case MIGRATION_PARAMETER_DOWNTIME_LIMIT: p->has_downtime_limit = true; - visit_type_int(v, param, &p->downtime_limit, &err); + visit_type_size(v, param, &p->downtime_limit, &err); break; case MIGRATION_PARAMETER_X_CHECKPOINT_DELAY: p->has_x_checkpoint_delay = true; - visit_type_int(v, param, &p->x_checkpoint_delay, &err); + visit_type_uint32(v, param, &p->x_checkpoint_delay, &err); break; case MIGRATION_PARAMETER_BLOCK_INCREMENTAL: p->has_block_incremental = true; @@ -1374,7 +1379,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) break; case MIGRATION_PARAMETER_MULTIFD_CHANNELS: p->has_multifd_channels = true; - visit_type_int(v, param, &p->multifd_channels, &err); + visit_type_uint8(v, param, &p->multifd_channels, &err); break; case MIGRATION_PARAMETER_MULTIFD_COMPRESSION: p->has_multifd_compression = true; @@ -1383,11 +1388,11 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) break; case MIGRATION_PARAMETER_MULTIFD_ZLIB_LEVEL: p->has_multifd_zlib_level = true; - visit_type_int(v, param, &p->multifd_zlib_level, &err); + visit_type_uint8(v, param, &p->multifd_zlib_level, &err); break; case MIGRATION_PARAMETER_MULTIFD_ZSTD_LEVEL: p->has_multifd_zstd_level = true; - visit_type_int(v, param, &p->multifd_zstd_level, &err); + visit_type_uint8(v, param, &p->multifd_zstd_level, &err); break; case MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE: p->has_xbzrle_cache_size = true; diff --git a/qapi/job.json b/qapi/job.json index 280c2f76f1..1a6ef03451 100644 --- a/qapi/job.json +++ b/qapi/job.json @@ -22,10 +22,17 @@ # # @amend: image options amend job type, see "x-blockdev-amend" (since 5.1) # +# @snapshot-load: snapshot load job type, see "snapshot-load" (since 6.0) +# +# @snapshot-save: snapshot save job type, see "snapshot-save" (since 6.0) +# +# @snapshot-delete: snapshot delete job type, see "snapshot-delete" (since 6.0) +# # Since: 1.7 ## { 'enum': 'JobType', - 'data': ['commit', 'stream', 'mirror', 'backup', 'create', 'amend'] } + 'data': ['commit', 'stream', 'mirror', 'backup', 'create', 'amend', + 'snapshot-load', 'snapshot-save', 'snapshot-delete'] } ## # @JobStatus: diff --git a/qapi/migration.json b/qapi/migration.json index d1d9632c2a..ce14d78071 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -78,7 +78,7 @@ # Since: 1.2 ## { 'struct': 'XBZRLECacheStats', - 'data': {'cache-size': 'int', 'bytes': 'int', 'pages': 'int', + 'data': {'cache-size': 'size', 'bytes': 'int', 'pages': 'int', 'cache-miss': 'int', 'cache-miss-rate': 'number', 'encoding-rate': 'number', 'overflow': 'int' } } @@ -224,6 +224,10 @@ # only returned if VFIO device is present, migration is supported by all # VFIO devices and status is 'active' or 'completed' (since 5.2) # +# @blocked: True if outgoing migration is blocked (since 6.0) +# +# @blocked-reasons: A list of reasons an outgoing migration is blocked (since 6.0) +# # Since: 0.14 ## { 'struct': 'MigrationInfo', @@ -237,6 +241,8 @@ '*setup-time': 'int', '*cpu-throttle-percentage': 'int', '*error-desc': 'str', + 'blocked': 'bool', + '*blocked-reasons': ['str'], '*postcopy-blocktime' : 'uint32', '*postcopy-vcpu-blocktime': ['uint32'], '*compression': 'CompressionStats', @@ -442,6 +448,11 @@ # @validate-uuid: Send the UUID of the source to allow the destination # to ensure it is the same. (since 4.2) # +# @background-snapshot: If enabled, the migration stream will be a snapshot +# of the VM exactly at the point when the migration +# procedure starts. The VM RAM is saved with running VM. +# (since 6.0) +# # Since: 1.2 ## { 'enum': 'MigrationCapability', @@ -449,7 +460,7 @@ 'compress', 'events', 'postcopy-ram', 'x-colo', 'release-ram', 'block', 'return-path', 'pause-before-switchover', 'multifd', 'dirty-bitmaps', 'postcopy-blocktime', 'late-block-activate', - 'x-ignore-shared', 'validate-uuid' ] } + 'x-ignore-shared', 'validate-uuid', 'background-snapshot'] } ## # @MigrationCapabilityStatus: @@ -885,28 +896,28 @@ '*announce-max': 'size', '*announce-rounds': 'size', '*announce-step': 'size', - '*compress-level': 'int', - '*compress-threads': 'int', + '*compress-level': 'uint8', + '*compress-threads': 'uint8', '*compress-wait-thread': 'bool', - '*decompress-threads': 'int', - '*throttle-trigger-threshold': 'int', - '*cpu-throttle-initial': 'int', - '*cpu-throttle-increment': 'int', + '*decompress-threads': 'uint8', + '*throttle-trigger-threshold': 'uint8', + '*cpu-throttle-initial': 'uint8', + '*cpu-throttle-increment': 'uint8', '*cpu-throttle-tailslow': 'bool', '*tls-creds': 'StrOrNull', '*tls-hostname': 'StrOrNull', '*tls-authz': 'StrOrNull', - '*max-bandwidth': 'int', - '*downtime-limit': 'int', - '*x-checkpoint-delay': 'int', + '*max-bandwidth': 'size', + '*downtime-limit': 'uint64', + '*x-checkpoint-delay': 'uint32', '*block-incremental': 'bool', - '*multifd-channels': 'int', + '*multifd-channels': 'uint8', '*xbzrle-cache-size': 'size', '*max-postcopy-bandwidth': 'size', - '*max-cpu-throttle': 'int', + '*max-cpu-throttle': 'uint8', '*multifd-compression': 'MultiFDCompression', - '*multifd-zlib-level': 'int', - '*multifd-zstd-level': 'int', + '*multifd-zlib-level': 'uint8', + '*multifd-zstd-level': 'uint8', '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ] } } ## @@ -1093,7 +1104,7 @@ '*max-bandwidth': 'size', '*downtime-limit': 'uint64', '*x-checkpoint-delay': 'uint32', - '*block-incremental': 'bool' , + '*block-incremental': 'bool', '*multifd-channels': 'uint8', '*xbzrle-cache-size': 'size', '*max-postcopy-bandwidth': 'size', @@ -1465,7 +1476,7 @@ # <- { "return": 67108864 } # ## -{ 'command': 'query-migrate-cache-size', 'returns': 'int', +{ 'command': 'query-migrate-cache-size', 'returns': 'size', 'features': [ 'deprecated' ] } ## @@ -1843,3 +1854,176 @@ # Since: 5.2 ## { 'command': 'query-dirty-rate', 'returns': 'DirtyRateInfo' } + +## +# @snapshot-save: +# +# Save a VM snapshot +# +# @job-id: identifier for the newly created job +# @tag: name of the snapshot to create +# @vmstate: block device node name to save vmstate to +# @devices: list of block device node names to save a snapshot to +# +# Applications should not assume that the snapshot save is complete +# when this command returns. The job commands / events must be used +# to determine completion and to fetch details of any errors that arise. +# +# Note that execution of the guest CPUs may be stopped during the +# time it takes to save the snapshot. A future version of QEMU +# may ensure CPUs are executing continuously. +# +# It is strongly recommended that @devices contain all writable +# block device nodes if a consistent snapshot is required. +# +# If @tag already exists, an error will be reported +# +# Returns: nothing +# +# Example: +# +# -> { "execute": "snapshot-save", +# "data": { +# "job-id": "snapsave0", +# "tag": "my-snap", +# "vmstate": "disk0", +# "devices": ["disk0", "disk1"] +# } +# } +# <- { "return": { } } +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "created", "id": "snapsave0"}} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "running", "id": "snapsave0"}} +# <- {"event": "STOP"} +# <- {"event": "RESUME"} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "waiting", "id": "snapsave0"}} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "pending", "id": "snapsave0"}} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "concluded", "id": "snapsave0"}} +# -> {"execute": "query-jobs"} +# <- {"return": [{"current-progress": 1, +# "status": "concluded", +# "total-progress": 1, +# "type": "snapshot-save", +# "id": "snapsave0"}]} +# +# Since: 6.0 +## +{ 'command': 'snapshot-save', + 'data': { 'job-id': 'str', + 'tag': 'str', + 'vmstate': 'str', + 'devices': ['str'] } } + +## +# @snapshot-load: +# +# Load a VM snapshot +# +# @job-id: identifier for the newly created job +# @tag: name of the snapshot to load. +# @vmstate: block device node name to load vmstate from +# @devices: list of block device node names to load a snapshot from +# +# Applications should not assume that the snapshot load is complete +# when this command returns. The job commands / events must be used +# to determine completion and to fetch details of any errors that arise. +# +# Note that execution of the guest CPUs will be stopped during the +# time it takes to load the snapshot. +# +# It is strongly recommended that @devices contain all writable +# block device nodes that can have changed since the original +# @snapshot-save command execution. +# +# Returns: nothing +# +# Example: +# +# -> { "execute": "snapshot-load", +# "data": { +# "job-id": "snapload0", +# "tag": "my-snap", +# "vmstate": "disk0", +# "devices": ["disk0", "disk1"] +# } +# } +# <- { "return": { } } +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "created", "id": "snapload0"}} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "running", "id": "snapload0"}} +# <- {"event": "STOP"} +# <- {"event": "RESUME"} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "waiting", "id": "snapload0"}} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "pending", "id": "snapload0"}} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "concluded", "id": "snapload0"}} +# -> {"execute": "query-jobs"} +# <- {"return": [{"current-progress": 1, +# "status": "concluded", +# "total-progress": 1, +# "type": "snapshot-load", +# "id": "snapload0"}]} +# +# Since: 6.0 +## +{ 'command': 'snapshot-load', + 'data': { 'job-id': 'str', + 'tag': 'str', + 'vmstate': 'str', + 'devices': ['str'] } } + +## +# @snapshot-delete: +# +# Delete a VM snapshot +# +# @job-id: identifier for the newly created job +# @tag: name of the snapshot to delete. +# @devices: list of block device node names to delete a snapshot from +# +# Applications should not assume that the snapshot delete is complete +# when this command returns. The job commands / events must be used +# to determine completion and to fetch details of any errors that arise. +# +# Returns: nothing +# +# Example: +# +# -> { "execute": "snapshot-delete", +# "data": { +# "job-id": "snapdelete0", +# "tag": "my-snap", +# "devices": ["disk0", "disk1"] +# } +# } +# <- { "return": { } } +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "created", "id": "snapdelete0"}} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "running", "id": "snapdelete0"}} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "waiting", "id": "snapdelete0"}} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "pending", "id": "snapdelete0"}} +# <- {"event": "JOB_STATUS_CHANGE", +# "data": {"status": "concluded", "id": "snapdelete0"}} +# -> {"execute": "query-jobs"} +# <- {"return": [{"current-progress": 1, +# "status": "concluded", +# "total-progress": 1, +# "type": "snapshot-delete", +# "id": "snapdelete0"}]} +# +# Since: 6.0 +## +{ 'command': 'snapshot-delete', + 'data': { 'job-id': 'str', + 'tag': 'str', + 'devices': ['str'] } } diff --git a/replay/replay-debugging.c b/replay/replay-debugging.c index 5ec574724a..1cde50e9f3 100644 --- a/replay/replay-debugging.c +++ b/replay/replay-debugging.c @@ -143,12 +143,13 @@ static char *replay_find_nearest_snapshot(int64_t icount, QEMUSnapshotInfo *sn_tab; QEMUSnapshotInfo *nearest = NULL; char *ret = NULL; + int rv; int nb_sns, i; AioContext *aio_context; *snapshot_icount = -1; - bs = bdrv_all_find_vmstate_bs(); + bs = bdrv_all_find_vmstate_bs(NULL, false, NULL, NULL); if (!bs) { goto fail; } @@ -159,7 +160,10 @@ static char *replay_find_nearest_snapshot(int64_t icount, aio_context_release(aio_context); for (i = 0; i < nb_sns; i++) { - if (bdrv_all_find_snapshot(sn_tab[i].name, &bs) == 0) { + rv = bdrv_all_has_snapshot(sn_tab[i].name, false, NULL, NULL); + if (rv < 0) + goto fail; + if (rv == 1) { if (sn_tab[i].icount != -1ULL && sn_tab[i].icount <= icount && (!nearest || nearest->icount < sn_tab[i].icount)) { @@ -192,7 +196,7 @@ static void replay_seek(int64_t icount, QEMUTimerCB callback, Error **errp) if (icount < replay_get_current_icount() || replay_get_current_icount() < snapshot_icount) { vm_stop(RUN_STATE_RESTORE_VM); - load_snapshot(snapshot, errp); + load_snapshot(snapshot, NULL, false, NULL, errp); } g_free(snapshot); } @@ -323,7 +327,7 @@ void replay_gdb_attached(void) */ if (replay_mode == REPLAY_MODE_PLAY && !replay_snapshot) { - if (save_snapshot("start_debugging", NULL) != 0) { + if (!save_snapshot("start_debugging", true, NULL, false, NULL, NULL)) { /* Can't create the snapshot. Continue conventional debugging. */ } } diff --git a/replay/replay-snapshot.c b/replay/replay-snapshot.c index e26fa4c892..e8767a1937 100644 --- a/replay/replay-snapshot.c +++ b/replay/replay-snapshot.c @@ -77,13 +77,14 @@ void replay_vmstate_init(void) if (replay_snapshot) { if (replay_mode == REPLAY_MODE_RECORD) { - if (save_snapshot(replay_snapshot, &err) != 0) { + if (!save_snapshot(replay_snapshot, + true, NULL, false, NULL, &err)) { error_report_err(err); error_report("Could not create snapshot for icount record"); exit(1); } } else if (replay_mode == REPLAY_MODE_PLAY) { - if (load_snapshot(replay_snapshot, &err) != 0) { + if (!load_snapshot(replay_snapshot, NULL, false, NULL, &err)) { error_report_err(err); error_report("Could not load snapshot for icount replay"); exit(1); diff --git a/scripts/userfaultfd-wrlat.py b/scripts/userfaultfd-wrlat.py new file mode 100755 index 0000000000..0684be4e04 --- /dev/null +++ b/scripts/userfaultfd-wrlat.py @@ -0,0 +1,122 @@ +#!/usr/bin/python3 +# +# userfaultfd-wrlat Summarize userfaultfd write fault latencies. +# Events are continuously accumulated for the +# run, while latency distribution histogram is +# dumped each 'interval' seconds. +# +# For Linux, uses BCC, eBPF. +# +# USAGE: userfaultfd-lat [interval [count]] +# +# Copyright Virtuozzo GmbH, 2020 +# +# Authors: +# Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> +# +# This work is licensed under the terms of the GNU GPL, version 2 or +# later. See the COPYING file in the top-level directory. + +from __future__ import print_function +from bcc import BPF +from ctypes import c_ushort, c_int, c_ulonglong +from time import sleep +from sys import argv + +def usage(): + print("USAGE: %s [interval [count]]" % argv[0]) + exit() + +# define BPF program +bpf_text = """ +#include <uapi/linux/ptrace.h> +#include <linux/mm.h> + +BPF_HASH(ev_start, u32, u64); +BPF_HISTOGRAM(ev_delta_hist, u64); + +/* Trace UFFD page fault start event. */ +static void do_event_start() +{ + /* Using "(u32)" to drop group ID which is upper 32 bits */ + u32 tid = (u32) bpf_get_current_pid_tgid(); + u64 ts = bpf_ktime_get_ns(); + + ev_start.update(&tid, &ts); +} + +/* Trace UFFD page fault end event. */ +static void do_event_end() +{ + /* Using "(u32)" to drop group ID which is upper 32 bits */ + u32 tid = (u32) bpf_get_current_pid_tgid(); + u64 ts = bpf_ktime_get_ns(); + u64 *tsp; + + tsp = ev_start.lookup(&tid); + if (tsp) { + u64 delta = ts - (*tsp); + /* Transform time delta to milliseconds */ + ev_delta_hist.increment(bpf_log2l(delta / 1000000)); + ev_start.delete(&tid); + } +} + +/* KPROBE for handle_userfault(). */ +int probe_handle_userfault(struct pt_regs *ctx, struct vm_fault *vmf, + unsigned long reason) +{ + /* Trace only UFFD write faults. */ + if (reason & VM_UFFD_WP) { + do_event_start(); + } + return 0; +} + +/* KRETPROBE for handle_userfault(). */ +int retprobe_handle_userfault(struct pt_regs *ctx) +{ + do_event_end(); + return 0; +} +""" + +# arguments +interval = 10 +count = -1 +if len(argv) > 1: + try: + interval = int(argv[1]) + if interval == 0: + raise + if len(argv) > 2: + count = int(argv[2]) + except: # also catches -h, --help + usage() + +# load BPF program +b = BPF(text=bpf_text) +# attach KRPOBEs +b.attach_kprobe(event="handle_userfault", fn_name="probe_handle_userfault") +b.attach_kretprobe(event="handle_userfault", fn_name="retprobe_handle_userfault") + +# header +print("Tracing UFFD-WP write fault latency... Hit Ctrl-C to end.") + +# output +loop = 0 +do_exit = 0 +while (1): + if count > 0: + loop += 1 + if loop > count: + exit() + try: + sleep(interval) + except KeyboardInterrupt: + pass; do_exit = 1 + + print() + b["ev_delta_hist"].print_log2_hist("msecs") + if do_exit: + exit() diff --git a/softmmu/vl.c b/softmmu/vl.c index 9eb9dab1fc..b219ce1f35 100644 --- a/softmmu/vl.c +++ b/softmmu/vl.c @@ -2545,7 +2545,7 @@ void qmp_x_exit_preconfig(Error **errp) if (loadvm) { Error *local_err = NULL; - if (load_snapshot(loadvm, &local_err) < 0) { + if (!load_snapshot(loadvm, NULL, false, NULL, &local_err)) { error_report_err(local_err); autostart = 0; exit(1); diff --git a/tests/qemu-iotests/267.out b/tests/qemu-iotests/267.out index 27471ffae8..7176e376e1 100644 --- a/tests/qemu-iotests/267.out +++ b/tests/qemu-iotests/267.out @@ -6,11 +6,11 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 Testing: QEMU X.Y.Z monitor - type 'help' for more information (qemu) savevm snap0 -Error: No block device can accept snapshots +Error: no block device can store vmstate for snapshot (qemu) info snapshots -No available block device supports snapshots +no block device can store vmstate for snapshot (qemu) loadvm snap0 -Error: No block device supports snapshots +Error: no block device can store vmstate for snapshot (qemu) quit @@ -22,7 +22,7 @@ QEMU X.Y.Z monitor - type 'help' for more information (qemu) savevm snap0 Error: Device 'none0' is writable but does not support snapshots (qemu) info snapshots -No available block device supports snapshots +no block device can store vmstate for snapshot (qemu) loadvm snap0 Error: Device 'none0' is writable but does not support snapshots (qemu) quit @@ -58,7 +58,7 @@ QEMU X.Y.Z monitor - type 'help' for more information (qemu) savevm snap0 Error: Device 'virtio0' is writable but does not support snapshots (qemu) info snapshots -No available block device supports snapshots +no block device can store vmstate for snapshot (qemu) loadvm snap0 Error: Device 'virtio0' is writable but does not support snapshots (qemu) quit @@ -83,7 +83,7 @@ QEMU X.Y.Z monitor - type 'help' for more information (qemu) savevm snap0 Error: Device 'file' is writable but does not support snapshots (qemu) info snapshots -No available block device supports snapshots +no block device can store vmstate for snapshot (qemu) loadvm snap0 Error: Device 'file' is writable but does not support snapshots (qemu) quit diff --git a/tests/qemu-iotests/common.qemu b/tests/qemu-iotests/common.qemu index ef105dfc39..0fc52d20d7 100644 --- a/tests/qemu-iotests/common.qemu +++ b/tests/qemu-iotests/common.qemu @@ -53,6 +53,15 @@ _in_fd=4 # If $mismatch_only is set, only non-matching responses will # be echoed. # +# If $capture_events is non-empty, then any QMP event names it lists +# will not be echoed out, but instead collected in the $QEMU_EVENTS +# variable. The _wait_event function can later be used to receive +# the cached events. +# +# If $only_capture_events is set to anything but an empty string, +# then an error will be raised if a QMP message is seen which is +# not an event listed in $capture_events. +# # If $success_or_failure is set, the meaning of the arguments is # changed as follows: # $2: A string to search for in the response; if found, this indicates @@ -78,6 +87,31 @@ _timed_wait_for() QEMU_STATUS[$h]=0 while IFS= read -t ${QEMU_COMM_TIMEOUT} resp <&${QEMU_OUT[$h]} do + if [ -n "$capture_events" ]; then + capture=0 + local evname + for evname in $capture_events + do + case ${resp} in + *\"event\":\ \"${evname}\"* ) capture=1 ;; + esac + done + if [ $capture = 1 ]; + then + ev=$(echo "${resp}" | tr -d '\r' | tr % .) + QEMU_EVENTS="${QEMU_EVENTS:+${QEMU_EVENTS}%}${ev}" + if [ -n "$only_capture_events" ]; then + return + else + continue + fi + fi + fi + if [ -n "$only_capture_events" ]; then + echo "Only expected $capture_events but got ${resp}" + exit 1 + fi + if [ -z "${silent}" ] && [ -z "${mismatch_only}" ]; then echo "${resp}" | _filter_testdir | _filter_qemu \ | _filter_qemu_io | _filter_qmp | _filter_hmp @@ -172,12 +206,82 @@ _send_qemu_cmd() let count--; done if [ ${QEMU_STATUS[$h]} -ne 0 ] && [ -z "${qemu_error_no_exit}" ]; then - echo "Timeout waiting for ${1} on handle ${h}" + echo "Timeout waiting for command ${1} response on handle ${h}" exit 1 #Timeout means the test failed fi } +# Check event cache for a named QMP event +# +# Input parameters: +# $1: Name of the QMP event to check for +# +# Checks if the named QMP event that was previously captured +# into $QEMU_EVENTS. When matched, the QMP event will be echoed +# and the $matched variable set to 1. +# +# _wait_event is more suitable for test usage in most cases +_check_cached_events() +{ + local evname=${1} + + local match="\"event\": \"$evname\"" + + matched=0 + if [ -n "$QEMU_EVENTS" ]; then + CURRENT_QEMU_EVENTS=$QEMU_EVENTS + QEMU_EVENTS= + old_IFS=$IFS + IFS="%" + for ev in $CURRENT_QEMU_EVENTS + do + grep -q "$match" < <(echo "${ev}") + if [ $? -eq 0 ] && [ $matched = 0 ]; then + echo "${ev}" | _filter_testdir | _filter_qemu \ + | _filter_qemu_io | _filter_qmp | _filter_hmp + matched=1 + else + QEMU_EVENTS="${QEMU_EVENTS:+${QEMU_EVENTS}%}${ev}" + fi + done + IFS=$old_IFS + fi +} + +# Wait for a named QMP event +# +# Input parameters: +# $1: QEMU handle to use +# $2: Name of the QMP event to wait for +# +# Checks if the named QMP even was previously captured +# into $QEMU_EVENTS. If none are present, then waits for the +# event to arrive on the QMP channel. When matched, the QMP +# event will be echoed +_wait_event() +{ + local h=${1} + local evname=${2} + + while true + do + _check_cached_events $evname + + if [ $matched = 1 ]; + then + return + fi + + only_capture_events=1 qemu_error_no_exit=1 _timed_wait_for ${h} + + if [ ${QEMU_STATUS[$h]} -ne 0 ] ; then + echo "Timeout waiting for event ${evname} on handle ${h}" + exit 1 #Timeout means the test failed + fi + done +} + # Launch a QEMU process. # # Input parameters: diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc index 297acf9b6a..77c37e8312 100644 --- a/tests/qemu-iotests/common.rc +++ b/tests/qemu-iotests/common.rc @@ -109,8 +109,14 @@ peek_file_raw() dd if="$1" bs=1 skip="$2" count="$3" status=none } - -if ! . ./common.config +config=common.config +test -f $config || config=../common.config +if ! test -f $config +then + echo "$0: failed to find common.config" + exit 1 +fi +if ! . $config then echo "$0: failed to source common.config" exit 1 diff --git a/util/meson.build b/util/meson.build index 3eccdbe596..984fba965f 100644 --- a/util/meson.build +++ b/util/meson.build @@ -52,6 +52,7 @@ if have_system util_ss.add(files('crc-ccitt.c')) util_ss.add(when: 'CONFIG_GIO', if_true: [files('dbus.c'), gio]) util_ss.add(files('yank.c')) + util_ss.add(when: 'CONFIG_LINUX', if_true: files('userfaultfd.c')) endif if have_block diff --git a/util/trace-events b/util/trace-events index 61e0d4bcdf..bac0924899 100644 --- a/util/trace-events +++ b/util/trace-events @@ -91,3 +91,12 @@ qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uin qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")" qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) "region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32 qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p" + +#userfaultfd.c +uffd_query_features_nosys(int err) "errno: %i" +uffd_query_features_api_failed(int err) "errno: %i" +uffd_create_fd_nosys(int err) "errno: %i" +uffd_create_fd_api_failed(int err) "errno: %i" +uffd_create_fd_api_noioctl(uint64_t ioctl_req, uint64_t ioctl_supp) "ioctl_req: 0x%" PRIx64 "ioctl_supp: 0x%" PRIx64 +uffd_register_memory_failed(void *addr, uint64_t length, uint64_t mode, int err) "addr: %p length: %" PRIu64 " mode: 0x%" PRIx64 " errno: %i" +uffd_unregister_memory_failed(void *addr, uint64_t length, int err) "addr: %p length: %" PRIu64 " errno: %i" diff --git a/util/userfaultfd.c b/util/userfaultfd.c new file mode 100644 index 0000000000..f1cd6af2b1 --- /dev/null +++ b/util/userfaultfd.c @@ -0,0 +1,345 @@ +/* + * Linux UFFD-WP support + * + * Copyright Virtuozzo GmbH, 2020 + * + * Authors: + * Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/bitops.h" +#include "qemu/error-report.h" +#include "qemu/userfaultfd.h" +#include "trace.h" +#include <poll.h> +#include <sys/syscall.h> +#include <sys/ioctl.h> + +/** + * uffd_query_features: query UFFD features + * + * Returns: 0 on success, negative value in case of an error + * + * @features: parameter to receive 'uffdio_api.features' + */ +int uffd_query_features(uint64_t *features) +{ + int uffd_fd; + struct uffdio_api api_struct = { 0 }; + int ret = -1; + + uffd_fd = syscall(__NR_userfaultfd, O_CLOEXEC); + if (uffd_fd < 0) { + trace_uffd_query_features_nosys(errno); + return -1; + } + + api_struct.api = UFFD_API; + api_struct.features = 0; + + if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) { + trace_uffd_query_features_api_failed(errno); + goto out; + } + *features = api_struct.features; + ret = 0; + +out: + close(uffd_fd); + return ret; +} + +/** + * uffd_create_fd: create UFFD file descriptor + * + * Returns non-negative file descriptor or negative value in case of an error + * + * @features: UFFD features to request + * @non_blocking: create UFFD file descriptor for non-blocking operation + */ +int uffd_create_fd(uint64_t features, bool non_blocking) +{ + int uffd_fd; + int flags; + struct uffdio_api api_struct = { 0 }; + uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER); + + flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0); + uffd_fd = syscall(__NR_userfaultfd, flags); + if (uffd_fd < 0) { + trace_uffd_create_fd_nosys(errno); + return -1; + } + + api_struct.api = UFFD_API; + api_struct.features = features; + if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) { + trace_uffd_create_fd_api_failed(errno); + goto fail; + } + if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) { + trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls); + goto fail; + } + + return uffd_fd; + +fail: + close(uffd_fd); + return -1; +} + +/** + * uffd_close_fd: close UFFD file descriptor + * + * @uffd_fd: UFFD file descriptor + */ +void uffd_close_fd(int uffd_fd) +{ + assert(uffd_fd >= 0); + close(uffd_fd); +} + +/** + * uffd_register_memory: register memory range via UFFD-IO + * + * Returns 0 in case of success, negative value in case of an error + * + * @uffd_fd: UFFD file descriptor + * @addr: base address of memory range + * @length: length of memory range + * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...) + * @ioctls: optional pointer to receive supported IOCTL mask + */ +int uffd_register_memory(int uffd_fd, void *addr, uint64_t length, + uint64_t mode, uint64_t *ioctls) +{ + struct uffdio_register uffd_register; + + uffd_register.range.start = (uintptr_t) addr; + uffd_register.range.len = length; + uffd_register.mode = mode; + + if (ioctl(uffd_fd, UFFDIO_REGISTER, &uffd_register)) { + trace_uffd_register_memory_failed(addr, length, mode, errno); + return -1; + } + if (ioctls) { + *ioctls = uffd_register.ioctls; + } + + return 0; +} + +/** + * uffd_unregister_memory: un-register memory range with UFFD-IO + * + * Returns 0 in case of success, negative value in case of an error + * + * @uffd_fd: UFFD file descriptor + * @addr: base address of memory range + * @length: length of memory range + */ +int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length) +{ + struct uffdio_range uffd_range; + + uffd_range.start = (uintptr_t) addr; + uffd_range.len = length; + + if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &uffd_range)) { + trace_uffd_unregister_memory_failed(addr, length, errno); + return -1; + } + + return 0; +} + +/** + * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO + * + * Returns 0 on success, negative value in case of error + * + * @uffd_fd: UFFD file descriptor + * @addr: base address of memory range + * @length: length of memory range + * @wp: write-protect/unprotect + * @dont_wake: do not wake threads waiting on wr-protected page + */ +int uffd_change_protection(int uffd_fd, void *addr, uint64_t length, + bool wp, bool dont_wake) +{ + struct uffdio_writeprotect uffd_writeprotect; + + uffd_writeprotect.range.start = (uintptr_t) addr; + uffd_writeprotect.range.len = length; + if (!wp && dont_wake) { + /* DONTWAKE is meaningful only on protection release */ + uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE; + } else { + uffd_writeprotect.mode = (wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0); + } + + if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) { + error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64 + " mode=%" PRIx64 " errno=%i", addr, length, + (uint64_t) uffd_writeprotect.mode, errno); + return -1; + } + + return 0; +} + +/** + * uffd_copy_page: copy range of pages to destination via UFFD-IO + * + * Copy range of source pages to the destination to resolve + * missing page fault somewhere in the destination range. + * + * Returns 0 on success, negative value in case of an error + * + * @uffd_fd: UFFD file descriptor + * @dst_addr: destination base address + * @src_addr: source base address + * @length: length of the range to copy + * @dont_wake: do not wake threads waiting on missing page + */ +int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr, + uint64_t length, bool dont_wake) +{ + struct uffdio_copy uffd_copy; + + uffd_copy.dst = (uintptr_t) dst_addr; + uffd_copy.src = (uintptr_t) src_addr; + uffd_copy.len = length; + uffd_copy.mode = dont_wake ? UFFDIO_COPY_MODE_DONTWAKE : 0; + + if (ioctl(uffd_fd, UFFDIO_COPY, &uffd_copy)) { + error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64 + " mode=%" PRIx64 " errno=%i", dst_addr, src_addr, + length, (uint64_t) uffd_copy.mode, errno); + return -1; + } + + return 0; +} + +/** + * uffd_zero_page: fill range of pages with zeroes via UFFD-IO + * + * Fill range pages with zeroes to resolve missing page fault within the range. + * + * Returns 0 on success, negative value in case of an error + * + * @uffd_fd: UFFD file descriptor + * @addr: base address + * @length: length of the range to fill with zeroes + * @dont_wake: do not wake threads waiting on missing page + */ +int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake) +{ + struct uffdio_zeropage uffd_zeropage; + + uffd_zeropage.range.start = (uintptr_t) addr; + uffd_zeropage.range.len = length; + uffd_zeropage.mode = dont_wake ? UFFDIO_ZEROPAGE_MODE_DONTWAKE : 0; + + if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &uffd_zeropage)) { + error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64 + " mode=%" PRIx64 " errno=%i", addr, length, + (uint64_t) uffd_zeropage.mode, errno); + return -1; + } + + return 0; +} + +/** + * uffd_wakeup: wake up threads waiting on page UFFD-managed page fault resolution + * + * Wake up threads waiting on any page/pages from the designated range. + * The main use case is when during some period, page faults are resolved + * via UFFD-IO IOCTLs with MODE_DONTWAKE flag set, then after that all waits + * for the whole memory range are satisfied in a single call to uffd_wakeup(). + * + * Returns 0 on success, negative value in case of an error + * + * @uffd_fd: UFFD file descriptor + * @addr: base address + * @length: length of the range + */ +int uffd_wakeup(int uffd_fd, void *addr, uint64_t length) +{ + struct uffdio_range uffd_range; + + uffd_range.start = (uintptr_t) addr; + uffd_range.len = length; + + if (ioctl(uffd_fd, UFFDIO_WAKE, &uffd_range)) { + error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i", + addr, length, errno); + return -1; + } + + return 0; +} + +/** + * uffd_read_events: read pending UFFD events + * + * Returns number of fetched messages, 0 if non is available or + * negative value in case of an error + * + * @uffd_fd: UFFD file descriptor + * @msgs: pointer to message buffer + * @count: number of messages that can fit in the buffer + */ +int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count) +{ + ssize_t res; + do { + res = read(uffd_fd, msgs, count * sizeof(struct uffd_msg)); + } while (res < 0 && errno == EINTR); + + if ((res < 0 && errno == EAGAIN)) { + return 0; + } + if (res < 0) { + error_report("uffd_read_events() failed: errno=%i", errno); + return -1; + } + + return (int) (res / sizeof(struct uffd_msg)); +} + +/** + * uffd_poll_events: poll UFFD file descriptor for read + * + * Returns true if events are available for read, false otherwise + * + * @uffd_fd: UFFD file descriptor + * @tmo: timeout value + */ +bool uffd_poll_events(int uffd_fd, int tmo) +{ + int res; + struct pollfd poll_fd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 }; + + do { + res = poll(&poll_fd, 1, tmo); + } while (res < 0 && errno == EINTR); + + if (res == 0) { + return false; + } + if (res < 0) { + error_report("uffd_poll_events() failed: errno=%i", errno); + return false; + } + + return (poll_fd.revents & POLLIN) != 0; +} |