aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRichard Henderson <richard.henderson@linaro.org>2021-11-02 10:07:27 -0400
committerRichard Henderson <richard.henderson@linaro.org>2021-11-02 10:07:27 -0400
commit91e8394415f9bc9cd81c02bfafe02012855d4f98 (patch)
tree6d91ada2942583821944194f6e3f4ef778964d50
parentf79bb385c0fb9756393bde2a13ebbc70ae6c8043 (diff)
parent826b8bc80cb191557a4ce7cf0e155b436d2d1afa (diff)
Merge remote-tracking branch 'remotes/juanquintela/tags/migration-20211031-pull-request' into staging
Migration Pull request Hi this includes pending bits of migration patches. - virtio-mem support by David Hildenbrand - dirtyrate improvements by Hyman Huang - fix rdma wrid by Li Zhijian - dump-guest-memory fixes by Peter Xu Pleas apply. Thanks, Juan. # gpg: Signature made Mon 01 Nov 2021 06:03:44 PM EDT # gpg: using RSA key 1899FF8EDEBF58CCEE034B82F487EF185872D723 # gpg: Good signature from "Juan Quintela <quintela@redhat.com>" [full] # gpg: aka "Juan Quintela <quintela@trasno.org>" [full] * remotes/juanquintela/tags/migration-20211031-pull-request: migration/dirtyrate: implement dirty-bitmap dirtyrate calculation memory: introduce total_dirty_pages to stat dirty pages migration/ram: Handle RAMBlocks with a RamDiscardManager on background snapshots migration/ram: Factor out populating pages readable in ram_block_populate_pages() migration: Simplify alignment and alignment checks migration/postcopy: Handle RAMBlocks with a RamDiscardManager on the destination virtio-mem: Drop precopy notifier migration/ram: Handle RAMBlocks with a RamDiscardManager on the migration source virtio-mem: Implement replay_discarded RamDiscardManager callback memory: Introduce replay_discarded callback for RamDiscardManager dump-guest-memory: Block live migration migration: Add migrate_add_blocker_internal() migration: Make migration blocker work for snapshots too migration/dirtyrate: implement dirty-ring dirtyrate calculation migration/dirtyrate: move init step of calculation to main thread migration/dirtyrate: adjust order of registering thread migration/dirtyrate: introduce struct and adjust DirtyRateStat memory: make global_dirty_tracking a bitmask KVM: introduce dirty_pages and kvm_dirty_ring_enabled migration/rdma: Fix out of order wrid Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
-rw-r--r--accel/kvm/kvm-all.c7
-rw-r--r--accel/stubs/kvm-stub.c5
-rw-r--r--dump/dump.c19
-rw-r--r--hmp-commands.hx8
-rw-r--r--hw/i386/xen/xen-hvm.c4
-rw-r--r--hw/virtio/virtio-mem.c92
-rw-r--r--include/exec/memory.h41
-rw-r--r--include/exec/ram_addr.h13
-rw-r--r--include/hw/core/cpu.h1
-rw-r--r--include/hw/virtio/virtio-mem.h3
-rw-r--r--include/migration/blocker.h16
-rw-r--r--include/sysemu/kvm.h1
-rw-r--r--migration/dirtyrate.c367
-rw-r--r--migration/dirtyrate.h19
-rw-r--r--migration/migration.c30
-rw-r--r--migration/postcopy-ram.c40
-rw-r--r--migration/ram.c182
-rw-r--r--migration/ram.h1
-rw-r--r--migration/rdma.c138
-rw-r--r--migration/trace-events2
-rw-r--r--qapi/migration.json48
-rw-r--r--softmmu/memory.c43
-rw-r--r--softmmu/trace-events1
23 files changed, 909 insertions, 172 deletions
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index db8d83b137..eecd8031cf 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -469,6 +469,7 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
cpu->kvm_fd = ret;
cpu->kvm_state = s;
cpu->vcpu_dirty = true;
+ cpu->dirty_pages = 0;
mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
if (mmap_size < 0) {
@@ -743,6 +744,7 @@ static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
count++;
}
cpu->kvm_fetch_index = fetch;
+ cpu->dirty_pages += count;
return count;
}
@@ -2296,6 +2298,11 @@ bool kvm_vcpu_id_is_valid(int vcpu_id)
return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
}
+bool kvm_dirty_ring_enabled(void)
+{
+ return kvm_state->kvm_dirty_ring_size ? true : false;
+}
+
static int kvm_init(MachineState *ms)
{
MachineClass *mc = MACHINE_GET_CLASS(ms);
diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
index 5b1d00a222..5319573e00 100644
--- a/accel/stubs/kvm-stub.c
+++ b/accel/stubs/kvm-stub.c
@@ -147,4 +147,9 @@ bool kvm_arm_supports_user_irq(void)
{
return false;
}
+
+bool kvm_dirty_ring_enabled(void)
+{
+ return false;
+}
#endif
diff --git a/dump/dump.c b/dump/dump.c
index ab625909f3..662d0a62cd 100644
--- a/dump/dump.c
+++ b/dump/dump.c
@@ -29,6 +29,7 @@
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "hw/misc/vmcoreinfo.h"
+#include "migration/blocker.h"
#ifdef TARGET_X86_64
#include "win_dump.h"
@@ -47,6 +48,8 @@
#define MAX_GUEST_NOTE_SIZE (1 << 20) /* 1MB should be enough */
+static Error *dump_migration_blocker;
+
#define ELF_NOTE_SIZE(hdr_size, name_size, desc_size) \
((DIV_ROUND_UP((hdr_size), 4) + \
DIV_ROUND_UP((name_size), 4) + \
@@ -101,6 +104,7 @@ static int dump_cleanup(DumpState *s)
qemu_mutex_unlock_iothread();
}
}
+ migrate_del_blocker(dump_migration_blocker);
return 0;
}
@@ -2005,6 +2009,21 @@ void qmp_dump_guest_memory(bool paging, const char *file,
return;
}
+ if (!dump_migration_blocker) {
+ error_setg(&dump_migration_blocker,
+ "Live migration disabled: dump-guest-memory in progress");
+ }
+
+ /*
+ * Allows even for -only-migratable, but forbid migration during the
+ * process of dump guest memory.
+ */
+ if (migrate_add_blocker_internal(dump_migration_blocker, errp)) {
+ /* Remember to release the fd before passing it over to dump state */
+ close(fd);
+ return;
+ }
+
s = &dump_state_global;
dump_state_prepare(s);
diff --git a/hmp-commands.hx b/hmp-commands.hx
index cf723c69ac..3a5aeba3fe 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1737,8 +1737,10 @@ ERST
{
.name = "calc_dirty_rate",
- .args_type = "second:l,sample_pages_per_GB:l?",
- .params = "second [sample_pages_per_GB]",
- .help = "start a round of guest dirty rate measurement",
+ .args_type = "dirty_ring:-r,dirty_bitmap:-b,second:l,sample_pages_per_GB:l?",
+ .params = "[-r] [-b] second [sample_pages_per_GB]",
+ .help = "start a round of guest dirty rate measurement (using -r to"
+ "\n\t\t\t specify dirty ring as the method of calculation and"
+ "\n\t\t\t -b to specify dirty bitmap as method of calculation)",
.cmd = hmp_calc_dirty_rate,
},
diff --git a/hw/i386/xen/xen-hvm.c b/hw/i386/xen/xen-hvm.c
index e3d3d5cf89..482be95415 100644
--- a/hw/i386/xen/xen-hvm.c
+++ b/hw/i386/xen/xen-hvm.c
@@ -1613,8 +1613,8 @@ void xen_hvm_modified_memory(ram_addr_t start, ram_addr_t length)
void qmp_xen_set_global_dirty_log(bool enable, Error **errp)
{
if (enable) {
- memory_global_dirty_log_start();
+ memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
} else {
- memory_global_dirty_log_stop();
+ memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
}
}
diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
index df91e454b2..d5a578142b 100644
--- a/hw/virtio/virtio-mem.c
+++ b/hw/virtio/virtio-mem.c
@@ -228,6 +228,38 @@ static int virtio_mem_for_each_plugged_section(const VirtIOMEM *vmem,
return ret;
}
+static int virtio_mem_for_each_unplugged_section(const VirtIOMEM *vmem,
+ MemoryRegionSection *s,
+ void *arg,
+ virtio_mem_section_cb cb)
+{
+ unsigned long first_bit, last_bit;
+ uint64_t offset, size;
+ int ret = 0;
+
+ first_bit = s->offset_within_region / vmem->bitmap_size;
+ first_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, first_bit);
+ while (first_bit < vmem->bitmap_size) {
+ MemoryRegionSection tmp = *s;
+
+ offset = first_bit * vmem->block_size;
+ last_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
+ first_bit + 1) - 1;
+ size = (last_bit - first_bit + 1) * vmem->block_size;
+
+ if (!virito_mem_intersect_memory_section(&tmp, offset, size)) {
+ break;
+ }
+ ret = cb(&tmp, arg);
+ if (ret) {
+ break;
+ }
+ first_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
+ last_bit + 2);
+ }
+ return ret;
+}
+
static int virtio_mem_notify_populate_cb(MemoryRegionSection *s, void *arg)
{
RamDiscardListener *rdl = arg;
@@ -744,7 +776,6 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
host_memory_backend_set_mapped(vmem->memdev, true);
vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem));
qemu_register_reset(virtio_mem_system_reset, vmem);
- precopy_add_notifier(&vmem->precopy_notifier);
/*
* Set ourselves as RamDiscardManager before the plug handler maps the
@@ -764,7 +795,6 @@ static void virtio_mem_device_unrealize(DeviceState *dev)
* found via an address space anymore. Unset ourselves.
*/
memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL);
- precopy_remove_notifier(&vmem->precopy_notifier);
qemu_unregister_reset(virtio_mem_system_reset, vmem);
vmstate_unregister_ram(&vmem->memdev->mr, DEVICE(vmem));
host_memory_backend_set_mapped(vmem->memdev, false);
@@ -1057,43 +1087,11 @@ static void virtio_mem_set_block_size(Object *obj, Visitor *v, const char *name,
vmem->block_size = value;
}
-static int virtio_mem_precopy_exclude_range_cb(const VirtIOMEM *vmem, void *arg,
- uint64_t offset, uint64_t size)
-{
- void * const host = qemu_ram_get_host_addr(vmem->memdev->mr.ram_block);
-
- qemu_guest_free_page_hint(host + offset, size);
- return 0;
-}
-
-static void virtio_mem_precopy_exclude_unplugged(VirtIOMEM *vmem)
-{
- virtio_mem_for_each_unplugged_range(vmem, NULL,
- virtio_mem_precopy_exclude_range_cb);
-}
-
-static int virtio_mem_precopy_notify(NotifierWithReturn *n, void *data)
-{
- VirtIOMEM *vmem = container_of(n, VirtIOMEM, precopy_notifier);
- PrecopyNotifyData *pnd = data;
-
- switch (pnd->reason) {
- case PRECOPY_NOTIFY_AFTER_BITMAP_SYNC:
- virtio_mem_precopy_exclude_unplugged(vmem);
- break;
- default:
- break;
- }
-
- return 0;
-}
-
static void virtio_mem_instance_init(Object *obj)
{
VirtIOMEM *vmem = VIRTIO_MEM(obj);
notifier_list_init(&vmem->size_change_notifiers);
- vmem->precopy_notifier.notify = virtio_mem_precopy_notify;
QLIST_INIT(&vmem->rdl_list);
object_property_add(obj, VIRTIO_MEM_SIZE_PROP, "size", virtio_mem_get_size,
@@ -1170,6 +1168,31 @@ static int virtio_mem_rdm_replay_populated(const RamDiscardManager *rdm,
virtio_mem_rdm_replay_populated_cb);
}
+static int virtio_mem_rdm_replay_discarded_cb(MemoryRegionSection *s,
+ void *arg)
+{
+ struct VirtIOMEMReplayData *data = arg;
+
+ ((ReplayRamDiscard)data->fn)(s, data->opaque);
+ return 0;
+}
+
+static void virtio_mem_rdm_replay_discarded(const RamDiscardManager *rdm,
+ MemoryRegionSection *s,
+ ReplayRamDiscard replay_fn,
+ void *opaque)
+{
+ const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
+ struct VirtIOMEMReplayData data = {
+ .fn = replay_fn,
+ .opaque = opaque,
+ };
+
+ g_assert(s->mr == &vmem->memdev->mr);
+ virtio_mem_for_each_unplugged_section(vmem, s, &data,
+ virtio_mem_rdm_replay_discarded_cb);
+}
+
static void virtio_mem_rdm_register_listener(RamDiscardManager *rdm,
RamDiscardListener *rdl,
MemoryRegionSection *s)
@@ -1234,6 +1257,7 @@ static void virtio_mem_class_init(ObjectClass *klass, void *data)
rdmc->get_min_granularity = virtio_mem_rdm_get_min_granularity;
rdmc->is_populated = virtio_mem_rdm_is_populated;
rdmc->replay_populated = virtio_mem_rdm_replay_populated;
+ rdmc->replay_discarded = virtio_mem_rdm_replay_discarded;
rdmc->register_listener = virtio_mem_rdm_register_listener;
rdmc->unregister_listener = virtio_mem_rdm_unregister_listener;
}
diff --git a/include/exec/memory.h b/include/exec/memory.h
index a185b6dcb8..20f1b27377 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -61,7 +61,17 @@ static inline void fuzz_dma_read_cb(size_t addr,
}
#endif
-extern bool global_dirty_log;
+/* Possible bits for global_dirty_log_{start|stop} */
+
+/* Dirty tracking enabled because migration is running */
+#define GLOBAL_DIRTY_MIGRATION (1U << 0)
+
+/* Dirty tracking enabled because measuring dirty rate */
+#define GLOBAL_DIRTY_DIRTY_RATE (1U << 1)
+
+#define GLOBAL_DIRTY_MASK (0x3)
+
+extern unsigned int global_dirty_tracking;
typedef struct MemoryRegionOps MemoryRegionOps;
@@ -540,6 +550,7 @@ static inline void ram_discard_listener_init(RamDiscardListener *rdl,
}
typedef int (*ReplayRamPopulate)(MemoryRegionSection *section, void *opaque);
+typedef void (*ReplayRamDiscard)(MemoryRegionSection *section, void *opaque);
/*
* RamDiscardManagerClass:
@@ -629,6 +640,21 @@ struct RamDiscardManagerClass {
ReplayRamPopulate replay_fn, void *opaque);
/**
+ * @replay_discarded:
+ *
+ * Call the #ReplayRamDiscard callback for all discarded parts within the
+ * #MemoryRegionSection via the #RamDiscardManager.
+ *
+ * @rdm: the #RamDiscardManager
+ * @section: the #MemoryRegionSection
+ * @replay_fn: the #ReplayRamDiscard callback
+ * @opaque: pointer to forward to the callback
+ */
+ void (*replay_discarded)(const RamDiscardManager *rdm,
+ MemoryRegionSection *section,
+ ReplayRamDiscard replay_fn, void *opaque);
+
+ /**
* @register_listener:
*
* Register a #RamDiscardListener for the given #MemoryRegionSection and
@@ -672,6 +698,11 @@ int ram_discard_manager_replay_populated(const RamDiscardManager *rdm,
ReplayRamPopulate replay_fn,
void *opaque);
+void ram_discard_manager_replay_discarded(const RamDiscardManager *rdm,
+ MemoryRegionSection *section,
+ ReplayRamDiscard replay_fn,
+ void *opaque);
+
void ram_discard_manager_register_listener(RamDiscardManager *rdm,
RamDiscardListener *rdl,
MemoryRegionSection *section);
@@ -2388,13 +2419,17 @@ void memory_listener_unregister(MemoryListener *listener);
/**
* memory_global_dirty_log_start: begin dirty logging for all regions
+ *
+ * @flags: purpose of starting dirty log, migration or dirty rate
*/
-void memory_global_dirty_log_start(void);
+void memory_global_dirty_log_start(unsigned int flags);
/**
* memory_global_dirty_log_stop: end dirty logging for all regions
+ *
+ * @flags: purpose of stopping dirty log, migration or dirty rate
*/
-void memory_global_dirty_log_stop(void);
+void memory_global_dirty_log_stop(unsigned int flags);
void mtree_info(bool flatview, bool dispatch_tree, bool owner, bool disabled);
diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 551876bed0..64fb936c7c 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -26,6 +26,8 @@
#include "exec/ramlist.h"
#include "exec/ramblock.h"
+extern uint64_t total_dirty_pages;
+
/**
* clear_bmap_size: calculate clear bitmap size
*
@@ -369,10 +371,14 @@ static inline void cpu_physical_memory_set_dirty_lebitmap(unsigned long *bitmap,
qatomic_or(&blocks[DIRTY_MEMORY_VGA][idx][offset], temp);
- if (global_dirty_log) {
+ if (global_dirty_tracking) {
qatomic_or(
&blocks[DIRTY_MEMORY_MIGRATION][idx][offset],
temp);
+ if (unlikely(
+ global_dirty_tracking & GLOBAL_DIRTY_DIRTY_RATE)) {
+ total_dirty_pages += ctpopl(temp);
+ }
}
if (tcg_enabled()) {
@@ -392,7 +398,7 @@ static inline void cpu_physical_memory_set_dirty_lebitmap(unsigned long *bitmap,
} else {
uint8_t clients = tcg_enabled() ? DIRTY_CLIENTS_ALL : DIRTY_CLIENTS_NOCODE;
- if (!global_dirty_log) {
+ if (!global_dirty_tracking) {
clients &= ~(1 << DIRTY_MEMORY_MIGRATION);
}
@@ -403,6 +409,9 @@ static inline void cpu_physical_memory_set_dirty_lebitmap(unsigned long *bitmap,
for (i = 0; i < len; i++) {
if (bitmap[i] != 0) {
c = leul_to_cpu(bitmap[i]);
+ if (unlikely(global_dirty_tracking & GLOBAL_DIRTY_DIRTY_RATE)) {
+ total_dirty_pages += ctpopl(c);
+ }
do {
j = ctzl(c);
c &= ~(1ul << j);
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index 1a10497af3..e948e81f1a 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -381,6 +381,7 @@ struct CPUState {
struct kvm_run *kvm_run;
struct kvm_dirty_gfn *kvm_dirty_gfns;
uint32_t kvm_fetch_index;
+ uint64_t dirty_pages;
/* Used for events with 'vcpu' and *without* the 'disabled' properties */
DECLARE_BITMAP(trace_dstate_delayed, CPU_TRACE_DSTATE_MAX_EVENTS);
diff --git a/include/hw/virtio/virtio-mem.h b/include/hw/virtio/virtio-mem.h
index 9a6e348fa2..a5dd6a493b 100644
--- a/include/hw/virtio/virtio-mem.h
+++ b/include/hw/virtio/virtio-mem.h
@@ -65,9 +65,6 @@ struct VirtIOMEM {
/* notifiers to notify when "size" changes */
NotifierList size_change_notifiers;
- /* don't migrate unplugged memory */
- NotifierWithReturn precopy_notifier;
-
/* listeners to notify on plug/unplug activity. */
QLIST_HEAD(, RamDiscardListener) rdl_list;
};
diff --git a/include/migration/blocker.h b/include/migration/blocker.h
index acd27018e9..9cebe2ba06 100644
--- a/include/migration/blocker.h
+++ b/include/migration/blocker.h
@@ -26,6 +26,22 @@
int migrate_add_blocker(Error *reason, Error **errp);
/**
+ * @migrate_add_blocker_internal - prevent migration from proceeding without
+ * only-migrate implications
+ *
+ * @reason - an error to be returned whenever migration is attempted
+ *
+ * @errp - [out] The reason (if any) we cannot block migration right now.
+ *
+ * @returns - 0 on success, -EBUSY on failure, with errp set.
+ *
+ * Some of the migration blockers can be temporary (e.g., for a few seconds),
+ * so it shouldn't need to conflict with "-only-migratable". For those cases,
+ * we can call this function rather than @migrate_add_blocker().
+ */
+int migrate_add_blocker_internal(Error *reason, Error **errp);
+
+/**
* @migrate_del_blocker - remove a blocking error from migration
*
* @reason - the error blocking migration
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index a1ab1ee12d..7b22aeb6ae 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -547,4 +547,5 @@ bool kvm_cpu_check_are_resettable(void);
bool kvm_arch_cpu_check_are_resettable(void);
+bool kvm_dirty_ring_enabled(void);
#endif
diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c
index 320c56ba2c..d65e744af9 100644
--- a/migration/dirtyrate.c
+++ b/migration/dirtyrate.c
@@ -15,7 +15,9 @@
#include "qapi/error.h"
#include "cpu.h"
#include "exec/ramblock.h"
+#include "exec/ram_addr.h"
#include "qemu/rcu_queue.h"
+#include "qemu/main-loop.h"
#include "qapi/qapi-commands-migration.h"
#include "ram.h"
#include "trace.h"
@@ -23,9 +25,26 @@
#include "monitor/hmp.h"
#include "monitor/monitor.h"
#include "qapi/qmp/qdict.h"
+#include "sysemu/kvm.h"
+#include "sysemu/runstate.h"
+#include "exec/memory.h"
+
+/*
+ * total_dirty_pages is procted by BQL and is used
+ * to stat dirty pages during the period of two
+ * memory_global_dirty_log_sync
+ */
+uint64_t total_dirty_pages;
+
+typedef struct DirtyPageRecord {
+ uint64_t start_pages;
+ uint64_t end_pages;
+} DirtyPageRecord;
static int CalculatingState = DIRTY_RATE_STATUS_UNSTARTED;
static struct DirtyRateStat DirtyStat;
+static DirtyRateMeasureMode dirtyrate_mode =
+ DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING;
static int64_t set_sample_page_period(int64_t msec, int64_t initial_time)
{
@@ -70,51 +89,94 @@ static int dirtyrate_set_state(int *state, int old_state, int new_state)
static struct DirtyRateInfo *query_dirty_rate_info(void)
{
+ int i;
int64_t dirty_rate = DirtyStat.dirty_rate;
struct DirtyRateInfo *info = g_malloc0(sizeof(DirtyRateInfo));
-
- if (qatomic_read(&CalculatingState) == DIRTY_RATE_STATUS_MEASURED) {
- info->has_dirty_rate = true;
- info->dirty_rate = dirty_rate;
- }
+ DirtyRateVcpuList *head = NULL, **tail = &head;
info->status = CalculatingState;
info->start_time = DirtyStat.start_time;
info->calc_time = DirtyStat.calc_time;
info->sample_pages = DirtyStat.sample_pages;
+ info->mode = dirtyrate_mode;
+
+ if (qatomic_read(&CalculatingState) == DIRTY_RATE_STATUS_MEASURED) {
+ info->has_dirty_rate = true;
+ info->dirty_rate = dirty_rate;
+
+ if (dirtyrate_mode == DIRTY_RATE_MEASURE_MODE_DIRTY_RING) {
+ /*
+ * set sample_pages with 0 to indicate page sampling
+ * isn't enabled
+ **/
+ info->sample_pages = 0;
+ info->has_vcpu_dirty_rate = true;
+ for (i = 0; i < DirtyStat.dirty_ring.nvcpu; i++) {
+ DirtyRateVcpu *rate = g_malloc0(sizeof(DirtyRateVcpu));
+ rate->id = DirtyStat.dirty_ring.rates[i].id;
+ rate->dirty_rate = DirtyStat.dirty_ring.rates[i].dirty_rate;
+ QAPI_LIST_APPEND(tail, rate);
+ }
+ info->vcpu_dirty_rate = head;
+ }
+
+ if (dirtyrate_mode == DIRTY_RATE_MEASURE_MODE_DIRTY_BITMAP) {
+ info->sample_pages = 0;
+ }
+ }
trace_query_dirty_rate_info(DirtyRateStatus_str(CalculatingState));
return info;
}
-static void init_dirtyrate_stat(int64_t start_time, int64_t calc_time,
- uint64_t sample_pages)
+static void init_dirtyrate_stat(int64_t start_time,
+ struct DirtyRateConfig config)
{
- DirtyStat.total_dirty_samples = 0;
- DirtyStat.total_sample_count = 0;
- DirtyStat.total_block_mem_MB = 0;
DirtyStat.dirty_rate = -1;
DirtyStat.start_time = start_time;
- DirtyStat.calc_time = calc_time;
- DirtyStat.sample_pages = sample_pages;
+ DirtyStat.calc_time = config.sample_period_seconds;
+ DirtyStat.sample_pages = config.sample_pages_per_gigabytes;
+
+ switch (config.mode) {
+ case DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING:
+ DirtyStat.page_sampling.total_dirty_samples = 0;
+ DirtyStat.page_sampling.total_sample_count = 0;
+ DirtyStat.page_sampling.total_block_mem_MB = 0;
+ break;
+ case DIRTY_RATE_MEASURE_MODE_DIRTY_RING:
+ DirtyStat.dirty_ring.nvcpu = -1;
+ DirtyStat.dirty_ring.rates = NULL;
+ break;
+ default:
+ break;
+ }
+}
+
+static void cleanup_dirtyrate_stat(struct DirtyRateConfig config)
+{
+ /* last calc-dirty-rate qmp use dirty ring mode */
+ if (dirtyrate_mode == DIRTY_RATE_MEASURE_MODE_DIRTY_RING) {
+ free(DirtyStat.dirty_ring.rates);
+ DirtyStat.dirty_ring.rates = NULL;
+ }
}
static void update_dirtyrate_stat(struct RamblockDirtyInfo *info)
{
- DirtyStat.total_dirty_samples += info->sample_dirty_count;
- DirtyStat.total_sample_count += info->sample_pages_count;
+ DirtyStat.page_sampling.total_dirty_samples += info->sample_dirty_count;
+ DirtyStat.page_sampling.total_sample_count += info->sample_pages_count;
/* size of total pages in MB */
- DirtyStat.total_block_mem_MB += (info->ramblock_pages *
- TARGET_PAGE_SIZE) >> 20;
+ DirtyStat.page_sampling.total_block_mem_MB += (info->ramblock_pages *
+ TARGET_PAGE_SIZE) >> 20;
}
static void update_dirtyrate(uint64_t msec)
{
uint64_t dirtyrate;
- uint64_t total_dirty_samples = DirtyStat.total_dirty_samples;
- uint64_t total_sample_count = DirtyStat.total_sample_count;
- uint64_t total_block_mem_MB = DirtyStat.total_block_mem_MB;
+ uint64_t total_dirty_samples = DirtyStat.page_sampling.total_dirty_samples;
+ uint64_t total_sample_count = DirtyStat.page_sampling.total_sample_count;
+ uint64_t total_block_mem_MB = DirtyStat.page_sampling.total_block_mem_MB;
dirtyrate = total_dirty_samples * total_block_mem_MB *
1000 / (total_sample_count * msec);
@@ -327,21 +389,183 @@ static bool compare_page_hash_info(struct RamblockDirtyInfo *info,
update_dirtyrate_stat(block_dinfo);
}
- if (DirtyStat.total_sample_count == 0) {
+ if (DirtyStat.page_sampling.total_sample_count == 0) {
return false;
}
return true;
}
-static void calculate_dirtyrate(struct DirtyRateConfig config)
+static inline void record_dirtypages(DirtyPageRecord *dirty_pages,
+ CPUState *cpu, bool start)
+{
+ if (start) {
+ dirty_pages[cpu->cpu_index].start_pages = cpu->dirty_pages;
+ } else {
+ dirty_pages[cpu->cpu_index].end_pages = cpu->dirty_pages;
+ }
+}
+
+static void dirtyrate_global_dirty_log_start(void)
+{
+ qemu_mutex_lock_iothread();
+ memory_global_dirty_log_start(GLOBAL_DIRTY_DIRTY_RATE);
+ qemu_mutex_unlock_iothread();
+}
+
+static void dirtyrate_global_dirty_log_stop(void)
+{
+ qemu_mutex_lock_iothread();
+ memory_global_dirty_log_sync();
+ memory_global_dirty_log_stop(GLOBAL_DIRTY_DIRTY_RATE);
+ qemu_mutex_unlock_iothread();
+}
+
+static int64_t do_calculate_dirtyrate_vcpu(DirtyPageRecord dirty_pages)
+{
+ uint64_t memory_size_MB;
+ int64_t time_s;
+ uint64_t increased_dirty_pages =
+ dirty_pages.end_pages - dirty_pages.start_pages;
+
+ memory_size_MB = (increased_dirty_pages * TARGET_PAGE_SIZE) >> 20;
+ time_s = DirtyStat.calc_time;
+
+ return memory_size_MB / time_s;
+}
+
+static inline void record_dirtypages_bitmap(DirtyPageRecord *dirty_pages,
+ bool start)
+{
+ if (start) {
+ dirty_pages->start_pages = total_dirty_pages;
+ } else {
+ dirty_pages->end_pages = total_dirty_pages;
+ }
+}
+
+static void do_calculate_dirtyrate_bitmap(DirtyPageRecord dirty_pages)
+{
+ DirtyStat.dirty_rate = do_calculate_dirtyrate_vcpu(dirty_pages);
+}
+
+static inline void dirtyrate_manual_reset_protect(void)
+{
+ RAMBlock *block = NULL;
+
+ WITH_RCU_READ_LOCK_GUARD() {
+ RAMBLOCK_FOREACH_MIGRATABLE(block) {
+ memory_region_clear_dirty_bitmap(block->mr, 0,
+ block->used_length);
+ }
+ }
+}
+
+static void calculate_dirtyrate_dirty_bitmap(struct DirtyRateConfig config)
+{
+ int64_t msec = 0;
+ int64_t start_time;
+ DirtyPageRecord dirty_pages;
+
+ qemu_mutex_lock_iothread();
+ memory_global_dirty_log_start(GLOBAL_DIRTY_DIRTY_RATE);
+
+ /*
+ * 1'round of log sync may return all 1 bits with
+ * KVM_DIRTY_LOG_INITIALLY_SET enable
+ * skip it unconditionally and start dirty tracking
+ * from 2'round of log sync
+ */
+ memory_global_dirty_log_sync();
+
+ /*
+ * reset page protect manually and unconditionally.
+ * this make sure kvm dirty log be cleared if
+ * KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE cap is enabled.
+ */
+ dirtyrate_manual_reset_protect();
+ qemu_mutex_unlock_iothread();
+
+ record_dirtypages_bitmap(&dirty_pages, true);
+
+ start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+ DirtyStat.start_time = start_time / 1000;
+
+ msec = config.sample_period_seconds * 1000;
+ msec = set_sample_page_period(msec, start_time);
+ DirtyStat.calc_time = msec / 1000;
+
+ /*
+ * dirtyrate_global_dirty_log_stop do two things.
+ * 1. fetch dirty bitmap from kvm
+ * 2. stop dirty tracking
+ */
+ dirtyrate_global_dirty_log_stop();
+
+ record_dirtypages_bitmap(&dirty_pages, false);
+
+ do_calculate_dirtyrate_bitmap(dirty_pages);
+}
+
+static void calculate_dirtyrate_dirty_ring(struct DirtyRateConfig config)
+{
+ CPUState *cpu;
+ int64_t msec = 0;
+ int64_t start_time;
+ uint64_t dirtyrate = 0;
+ uint64_t dirtyrate_sum = 0;
+ DirtyPageRecord *dirty_pages;
+ int nvcpu = 0;
+ int i = 0;
+
+ CPU_FOREACH(cpu) {
+ nvcpu++;
+ }
+
+ dirty_pages = malloc(sizeof(*dirty_pages) * nvcpu);
+
+ DirtyStat.dirty_ring.nvcpu = nvcpu;
+ DirtyStat.dirty_ring.rates = malloc(sizeof(DirtyRateVcpu) * nvcpu);
+
+ dirtyrate_global_dirty_log_start();
+
+ CPU_FOREACH(cpu) {
+ record_dirtypages(dirty_pages, cpu, true);
+ }
+
+ start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+ DirtyStat.start_time = start_time / 1000;
+
+ msec = config.sample_period_seconds * 1000;
+ msec = set_sample_page_period(msec, start_time);
+ DirtyStat.calc_time = msec / 1000;
+
+ dirtyrate_global_dirty_log_stop();
+
+ CPU_FOREACH(cpu) {
+ record_dirtypages(dirty_pages, cpu, false);
+ }
+
+ for (i = 0; i < DirtyStat.dirty_ring.nvcpu; i++) {
+ dirtyrate = do_calculate_dirtyrate_vcpu(dirty_pages[i]);
+ trace_dirtyrate_do_calculate_vcpu(i, dirtyrate);
+
+ DirtyStat.dirty_ring.rates[i].id = i;
+ DirtyStat.dirty_ring.rates[i].dirty_rate = dirtyrate;
+ dirtyrate_sum += dirtyrate;
+ }
+
+ DirtyStat.dirty_rate = dirtyrate_sum;
+ free(dirty_pages);
+}
+
+static void calculate_dirtyrate_sample_vm(struct DirtyRateConfig config)
{
struct RamblockDirtyInfo *block_dinfo = NULL;
int block_count = 0;
int64_t msec = 0;
int64_t initial_time;
- rcu_register_thread();
rcu_read_lock();
initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
if (!record_ramblock_hash_info(&block_dinfo, config, &block_count)) {
@@ -364,16 +588,26 @@ static void calculate_dirtyrate(struct DirtyRateConfig config)
out:
rcu_read_unlock();
free_ramblock_dirty_info(block_dinfo, block_count);
- rcu_unregister_thread();
+}
+
+static void calculate_dirtyrate(struct DirtyRateConfig config)
+{
+ if (config.mode == DIRTY_RATE_MEASURE_MODE_DIRTY_BITMAP) {
+ calculate_dirtyrate_dirty_bitmap(config);
+ } else if (config.mode == DIRTY_RATE_MEASURE_MODE_DIRTY_RING) {
+ calculate_dirtyrate_dirty_ring(config);
+ } else {
+ calculate_dirtyrate_sample_vm(config);
+ }
+
+ trace_dirtyrate_calculate(DirtyStat.dirty_rate);
}
void *get_dirtyrate_thread(void *arg)
{
struct DirtyRateConfig config = *(struct DirtyRateConfig *)arg;
int ret;
- int64_t start_time;
- int64_t calc_time;
- uint64_t sample_pages;
+ rcu_register_thread();
ret = dirtyrate_set_state(&CalculatingState, DIRTY_RATE_STATUS_UNSTARTED,
DIRTY_RATE_STATUS_MEASURING);
@@ -382,11 +616,6 @@ void *get_dirtyrate_thread(void *arg)
return NULL;
}
- start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) / 1000;
- calc_time = config.sample_period_seconds;
- sample_pages = config.sample_pages_per_gigabytes;
- init_dirtyrate_stat(start_time, calc_time, sample_pages);
-
calculate_dirtyrate(config);
ret = dirtyrate_set_state(&CalculatingState, DIRTY_RATE_STATUS_MEASURING,
@@ -394,15 +623,22 @@ void *get_dirtyrate_thread(void *arg)
if (ret == -1) {
error_report("change dirtyrate state failed.");
}
+
+ rcu_unregister_thread();
return NULL;
}
-void qmp_calc_dirty_rate(int64_t calc_time, bool has_sample_pages,
- int64_t sample_pages, Error **errp)
+void qmp_calc_dirty_rate(int64_t calc_time,
+ bool has_sample_pages,
+ int64_t sample_pages,
+ bool has_mode,
+ DirtyRateMeasureMode mode,
+ Error **errp)
{
static struct DirtyRateConfig config;
QemuThread thread;
int ret;
+ int64_t start_time;
/*
* If the dirty rate is already being measured, don't attempt to start.
@@ -419,6 +655,15 @@ void qmp_calc_dirty_rate(int64_t calc_time, bool has_sample_pages,
return;
}
+ if (!has_mode) {
+ mode = DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING;
+ }
+
+ if (has_sample_pages && mode == DIRTY_RATE_MEASURE_MODE_DIRTY_RING) {
+ error_setg(errp, "either sample-pages or dirty-ring can be specified.");
+ return;
+ }
+
if (has_sample_pages) {
if (!is_sample_pages_valid(sample_pages)) {
error_setg(errp, "sample-pages is out of range[%d, %d].",
@@ -431,6 +676,19 @@ void qmp_calc_dirty_rate(int64_t calc_time, bool has_sample_pages,
}
/*
+ * dirty ring mode only works when kvm dirty ring is enabled.
+ * on the contrary, dirty bitmap mode is not.
+ */
+ if (((mode == DIRTY_RATE_MEASURE_MODE_DIRTY_RING) &&
+ !kvm_dirty_ring_enabled()) ||
+ ((mode == DIRTY_RATE_MEASURE_MODE_DIRTY_BITMAP) &&
+ kvm_dirty_ring_enabled())) {
+ error_setg(errp, "mode %s is not enabled, use other method instead.",
+ DirtyRateMeasureMode_str(mode));
+ return;
+ }
+
+ /*
* Init calculation state as unstarted.
*/
ret = dirtyrate_set_state(&CalculatingState, CalculatingState,
@@ -442,6 +700,19 @@ void qmp_calc_dirty_rate(int64_t calc_time, bool has_sample_pages,
config.sample_period_seconds = calc_time;
config.sample_pages_per_gigabytes = sample_pages;
+ config.mode = mode;
+
+ cleanup_dirtyrate_stat(config);
+
+ /*
+ * update dirty rate mode so that we can figure out what mode has
+ * been used in last calculation
+ **/
+ dirtyrate_mode = mode;
+
+ start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) / 1000;
+ init_dirtyrate_stat(start_time, config);
+
qemu_thread_create(&thread, "get_dirtyrate", get_dirtyrate_thread,
(void *)&config, QEMU_THREAD_DETACHED);
}
@@ -463,12 +734,24 @@ void hmp_info_dirty_rate(Monitor *mon, const QDict *qdict)
info->sample_pages);
monitor_printf(mon, "Period: %"PRIi64" (sec)\n",
info->calc_time);
+ monitor_printf(mon, "Mode: %s\n",
+ DirtyRateMeasureMode_str(info->mode));
monitor_printf(mon, "Dirty rate: ");
if (info->has_dirty_rate) {
monitor_printf(mon, "%"PRIi64" (MB/s)\n", info->dirty_rate);
+ if (info->has_vcpu_dirty_rate) {
+ DirtyRateVcpuList *rate, *head = info->vcpu_dirty_rate;
+ for (rate = head; rate != NULL; rate = rate->next) {
+ monitor_printf(mon, "vcpu[%"PRIi64"], Dirty rate: %"PRIi64
+ " (MB/s)\n", rate->value->id,
+ rate->value->dirty_rate);
+ }
+ }
} else {
monitor_printf(mon, "(not ready)\n");
}
+
+ qapi_free_DirtyRateVcpuList(info->vcpu_dirty_rate);
g_free(info);
}
@@ -477,6 +760,9 @@ void hmp_calc_dirty_rate(Monitor *mon, const QDict *qdict)
int64_t sec = qdict_get_try_int(qdict, "second", 0);
int64_t sample_pages = qdict_get_try_int(qdict, "sample_pages_per_GB", -1);
bool has_sample_pages = (sample_pages != -1);
+ bool dirty_ring = qdict_get_try_bool(qdict, "dirty_ring", false);
+ bool dirty_bitmap = qdict_get_try_bool(qdict, "dirty_bitmap", false);
+ DirtyRateMeasureMode mode = DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING;
Error *err = NULL;
if (!sec) {
@@ -484,7 +770,20 @@ void hmp_calc_dirty_rate(Monitor *mon, const QDict *qdict)
return;
}
- qmp_calc_dirty_rate(sec, has_sample_pages, sample_pages, &err);
+ if (dirty_ring && dirty_bitmap) {
+ monitor_printf(mon, "Either dirty ring or dirty bitmap "
+ "can be specified!\n");
+ return;
+ }
+
+ if (dirty_bitmap) {
+ mode = DIRTY_RATE_MEASURE_MODE_DIRTY_BITMAP;
+ } else if (dirty_ring) {
+ mode = DIRTY_RATE_MEASURE_MODE_DIRTY_RING;
+ }
+
+ qmp_calc_dirty_rate(sec, has_sample_pages, sample_pages, true,
+ mode, &err);
if (err) {
hmp_handle_error(mon, err);
return;
diff --git a/migration/dirtyrate.h b/migration/dirtyrate.h
index e1fd29089e..69d4c5b865 100644
--- a/migration/dirtyrate.h
+++ b/migration/dirtyrate.h
@@ -43,6 +43,7 @@
struct DirtyRateConfig {
uint64_t sample_pages_per_gigabytes; /* sample pages per GB */
int64_t sample_period_seconds; /* time duration between two sampling */
+ DirtyRateMeasureMode mode; /* mode of dirtyrate measurement */
};
/*
@@ -58,17 +59,29 @@ struct RamblockDirtyInfo {
uint32_t *hash_result; /* array of hash result for sampled pages */
};
+typedef struct SampleVMStat {
+ uint64_t total_dirty_samples; /* total dirty sampled page */
+ uint64_t total_sample_count; /* total sampled pages */
+ uint64_t total_block_mem_MB; /* size of total sampled pages in MB */
+} SampleVMStat;
+
+typedef struct VcpuStat {
+ int nvcpu; /* number of vcpu */
+ DirtyRateVcpu *rates; /* array of dirty rate for each vcpu */
+} VcpuStat;
+
/*
* Store calculation statistics for each measure.
*/
struct DirtyRateStat {
- uint64_t total_dirty_samples; /* total dirty sampled page */
- uint64_t total_sample_count; /* total sampled pages */
- uint64_t total_block_mem_MB; /* size of total sampled pages in MB */
int64_t dirty_rate; /* dirty rate in MB/s */
int64_t start_time; /* calculation start time in units of second */
int64_t calc_time; /* time duration of two sampling in units of second */
uint64_t sample_pages; /* sample pages per GB */
+ union {
+ SampleVMStat page_sampling;
+ VcpuStat dirty_ring;
+ };
};
void *get_dirtyrate_thread(void *arg);
diff --git a/migration/migration.c b/migration/migration.c
index 9172686b89..53b9a8af96 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -391,7 +391,7 @@ int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
int migrate_send_rp_req_pages(MigrationIncomingState *mis,
RAMBlock *rb, ram_addr_t start, uint64_t haddr)
{
- void *aligned = (void *)(uintptr_t)(haddr & (-qemu_ram_pagesize(rb)));
+ void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));
bool received = false;
WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
@@ -2049,6 +2049,20 @@ void migrate_init(MigrationState *s)
s->threshold_size = 0;
}
+int migrate_add_blocker_internal(Error *reason, Error **errp)
+{
+ /* Snapshots are similar to migrations, so check RUN_STATE_SAVE_VM too. */
+ if (runstate_check(RUN_STATE_SAVE_VM) || !migration_is_idle()) {
+ error_propagate_prepend(errp, error_copy(reason),
+ "disallowing migration blocker "
+ "(migration/snapshot in progress) for: ");
+ return -EBUSY;
+ }
+
+ migration_blockers = g_slist_prepend(migration_blockers, reason);
+ return 0;
+}
+
int migrate_add_blocker(Error *reason, Error **errp)
{
if (only_migratable) {
@@ -2058,15 +2072,7 @@ int migrate_add_blocker(Error *reason, Error **errp)
return -EACCES;
}
- if (migration_is_idle()) {
- migration_blockers = g_slist_prepend(migration_blockers, reason);
- return 0;
- }
-
- error_propagate_prepend(errp, error_copy(reason),
- "disallowing migration blocker "
- "(migration in progress) for: ");
- return -EBUSY;
+ return migrate_add_blocker_internal(reason, errp);
}
void migrate_del_blocker(Error *reason)
@@ -2631,8 +2637,8 @@ static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
* Since we currently insist on matching page sizes, just sanity check
* we're being asked for whole host pages.
*/
- if (start & (our_host_ps - 1) ||
- (len & (our_host_ps - 1))) {
+ if (!QEMU_IS_ALIGNED(start, our_host_ps) ||
+ !QEMU_IS_ALIGNED(len, our_host_ps)) {
error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
" len: %zd", __func__, start, len);
mark_source_rp_bad(ms);
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index 2e9697bdd2..e721f69d0f 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -402,7 +402,7 @@ bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
strerror(errno));
goto out;
}
- g_assert(((size_t)testarea & (pagesize - 1)) == 0);
+ g_assert(QEMU_PTR_IS_ALIGNED(testarea, pagesize));
reg_struct.range.start = (uintptr_t)testarea;
reg_struct.range.len = pagesize;
@@ -660,7 +660,7 @@ int postcopy_wake_shared(struct PostCopyFD *pcfd,
struct uffdio_range range;
int ret;
trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
- range.start = client_addr & ~(pagesize - 1);
+ range.start = ROUND_DOWN(client_addr, pagesize);
range.len = pagesize;
ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
if (ret) {
@@ -671,6 +671,29 @@ int postcopy_wake_shared(struct PostCopyFD *pcfd,
return ret;
}
+static int postcopy_request_page(MigrationIncomingState *mis, RAMBlock *rb,
+ ram_addr_t start, uint64_t haddr)
+{
+ void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));
+
+ /*
+ * Discarded pages (via RamDiscardManager) are never migrated. On unlikely
+ * access, place a zeropage, which will also set the relevant bits in the
+ * recv_bitmap accordingly, so we won't try placing a zeropage twice.
+ *
+ * Checking a single bit is sufficient to handle pagesize > TPS as either
+ * all relevant bits are set or not.
+ */
+ assert(QEMU_IS_ALIGNED(start, qemu_ram_pagesize(rb)));
+ if (ramblock_page_is_discarded(rb, start)) {
+ bool received = ramblock_recv_bitmap_test_byte_offset(rb, start);
+
+ return received ? 0 : postcopy_place_page_zero(mis, aligned, rb);
+ }
+
+ return migrate_send_rp_req_pages(mis, rb, start, haddr);
+}
+
/*
* Callback from shared fault handlers to ask for a page,
* the page must be specified by a RAMBlock and an offset in that rb
@@ -679,8 +702,7 @@ int postcopy_wake_shared(struct PostCopyFD *pcfd,
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
uint64_t client_addr, uint64_t rb_offset)
{
- size_t pagesize = qemu_ram_pagesize(rb);
- uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
+ uint64_t aligned_rbo = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
MigrationIncomingState *mis = migration_incoming_get_current();
trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
@@ -690,7 +712,7 @@ int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
qemu_ram_get_idstr(rb), rb_offset);
return postcopy_wake_shared(pcfd, client_addr, rb);
}
- migrate_send_rp_req_pages(mis, rb, aligned_rbo, client_addr);
+ postcopy_request_page(mis, rb, aligned_rbo, client_addr);
return 0;
}
@@ -970,7 +992,7 @@ static void *postcopy_ram_fault_thread(void *opaque)
break;
}
- rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
+ rb_offset = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
qemu_ram_get_idstr(rb),
rb_offset,
@@ -984,8 +1006,8 @@ retry:
* Send the request to the source - we want to request one
* of our host page sizes (which is >= TPS)
*/
- ret = migrate_send_rp_req_pages(mis, rb, rb_offset,
- msg.arg.pagefault.address);
+ ret = postcopy_request_page(mis, rb, rb_offset,
+ msg.arg.pagefault.address);
if (ret) {
/* May be network failure, try to wait for recovery */
if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
@@ -993,7 +1015,7 @@ retry:
goto retry;
} else {
/* This is a unavoidable fault */
- error_report("%s: migrate_send_rp_req_pages() get %d",
+ error_report("%s: postcopy_request_page() get %d",
__func__, ret);
break;
}
diff --git a/migration/ram.c b/migration/ram.c
index bb908822d5..680a5158aa 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -811,7 +811,7 @@ static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
assert(shift >= 6);
size = 1ULL << (TARGET_PAGE_BITS + shift);
- start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
+ start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
memory_region_clear_dirty_bitmap(rb->mr, start, size);
}
@@ -858,6 +858,81 @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs,
return ret;
}
+static void dirty_bitmap_clear_section(MemoryRegionSection *section,
+ void *opaque)
+{
+ const hwaddr offset = section->offset_within_region;
+ const hwaddr size = int128_get64(section->size);
+ const unsigned long start = offset >> TARGET_PAGE_BITS;
+ const unsigned long npages = size >> TARGET_PAGE_BITS;
+ RAMBlock *rb = section->mr->ram_block;
+ uint64_t *cleared_bits = opaque;
+
+ /*
+ * We don't grab ram_state->bitmap_mutex because we expect to run
+ * only when starting migration or during postcopy recovery where
+ * we don't have concurrent access.
+ */
+ if (!migration_in_postcopy() && !migrate_background_snapshot()) {
+ migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
+ }
+ *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
+ bitmap_clear(rb->bmap, start, npages);
+}
+
+/*
+ * Exclude all dirty pages from migration that fall into a discarded range as
+ * managed by a RamDiscardManager responsible for the mapped memory region of
+ * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
+ *
+ * Discarded pages ("logically unplugged") have undefined content and must
+ * not get migrated, because even reading these pages for migration might
+ * result in undesired behavior.
+ *
+ * Returns the number of cleared bits in the RAMBlock dirty bitmap.
+ *
+ * Note: The result is only stable while migrating (precopy/postcopy).
+ */
+static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
+{
+ uint64_t cleared_bits = 0;
+
+ if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
+ RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
+ MemoryRegionSection section = {
+ .mr = rb->mr,
+ .offset_within_region = 0,
+ .size = int128_make64(qemu_ram_get_used_length(rb)),
+ };
+
+ ram_discard_manager_replay_discarded(rdm, &section,
+ dirty_bitmap_clear_section,
+ &cleared_bits);
+ }
+ return cleared_bits;
+}
+
+/*
+ * Check if a host-page aligned page falls into a discarded range as managed by
+ * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
+ *
+ * Note: The result is only stable while migrating (precopy/postcopy).
+ */
+bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
+{
+ if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
+ RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
+ MemoryRegionSection section = {
+ .mr = rb->mr,
+ .offset_within_region = start,
+ .size = int128_make64(qemu_ram_pagesize(rb)),
+ };
+
+ return !ram_discard_manager_is_populated(rdm, &section);
+ }
+ return false;
+}
+
/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
@@ -1564,25 +1639,68 @@ out:
return ret;
}
+static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
+ ram_addr_t size)
+{
+ /*
+ * We read one byte of each page; this will preallocate page tables if
+ * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
+ * where no page was populated yet. This might require adaption when
+ * supporting other mappings, like shmem.
+ */
+ for (; offset < size; offset += block->page_size) {
+ char tmp = *((char *)block->host + offset);
+
+ /* Don't optimize the read out */
+ asm volatile("" : "+r" (tmp));
+ }
+}
+
+static inline int populate_read_section(MemoryRegionSection *section,
+ void *opaque)
+{
+ const hwaddr size = int128_get64(section->size);
+ hwaddr offset = section->offset_within_region;
+ RAMBlock *block = section->mr->ram_block;
+
+ populate_read_range(block, offset, size);
+ return 0;
+}
+
/*
- * ram_block_populate_pages: populate memory in the RAM block by reading
- * an integer from the beginning of each page.
+ * ram_block_populate_read: preallocate page tables and populate pages in the
+ * RAM block by reading a byte of each page.
*
* Since it's solely used for userfault_fd WP feature, here we just
* hardcode page size to qemu_real_host_page_size.
*
* @block: RAM block to populate
*/
-static void ram_block_populate_pages(RAMBlock *block)
+static void ram_block_populate_read(RAMBlock *rb)
{
- char *ptr = (char *) block->host;
-
- for (ram_addr_t offset = 0; offset < block->used_length;
- offset += qemu_real_host_page_size) {
- char tmp = *(ptr + offset);
-
- /* Don't optimize the read out */
- asm volatile("" : "+r" (tmp));
+ /*
+ * Skip populating all pages that fall into a discarded range as managed by
+ * a RamDiscardManager responsible for the mapped memory region of the
+ * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
+ * must not get populated automatically. We don't have to track
+ * modifications via userfaultfd WP reliably, because these pages will
+ * not be part of the migration stream either way -- see
+ * ramblock_dirty_bitmap_exclude_discarded_pages().
+ *
+ * Note: The result is only stable while migrating (precopy/postcopy).
+ */
+ if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
+ RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
+ MemoryRegionSection section = {
+ .mr = rb->mr,
+ .offset_within_region = 0,
+ .size = rb->mr->size,
+ };
+
+ ram_discard_manager_replay_populated(rdm, &section,
+ populate_read_section, NULL);
+ } else {
+ populate_read_range(rb, 0, rb->used_length);
}
}
@@ -1609,7 +1727,7 @@ void ram_write_tracking_prepare(void)
* UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
* pages with pte_none() entries in page table.
*/
- ram_block_populate_pages(block);
+ ram_block_populate_read(block);
}
}
@@ -2216,7 +2334,14 @@ static void ram_save_cleanup(void *opaque)
/* caller have hold iothread lock or is in a bh, so there is
* no writing race against the migration bitmap
*/
- memory_global_dirty_log_stop();
+ if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
+ /*
+ * do not stop dirty log without starting it, since
+ * memory_global_dirty_log_stop will assert that
+ * memory_global_dirty_log_start/stop used in pairs
+ */
+ memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
+ }
}
RAMBLOCK_FOREACH_NOT_IGNORED(block) {
@@ -2668,6 +2793,19 @@ static void ram_list_init_bitmaps(void)
}
}
+static void migration_bitmap_clear_discarded_pages(RAMState *rs)
+{
+ unsigned long pages;
+ RAMBlock *rb;
+
+ RCU_READ_LOCK_GUARD();
+
+ RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
+ pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
+ rs->migration_dirty_pages -= pages;
+ }
+}
+
static void ram_init_bitmaps(RAMState *rs)
{
/* For memory_global_dirty_log_start below. */
@@ -2678,12 +2816,18 @@ static void ram_init_bitmaps(RAMState *rs)
ram_list_init_bitmaps();
/* We don't use dirty log with background snapshots */
if (!migrate_background_snapshot()) {
- memory_global_dirty_log_start();
+ memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
migration_bitmap_sync_precopy(rs);
}
}
qemu_mutex_unlock_ramlist();
qemu_mutex_unlock_iothread();
+
+ /*
+ * After an eventual first bitmap sync, fixup the initial bitmap
+ * containing all 1s to exclude any discarded pages from migration.
+ */
+ migration_bitmap_clear_discarded_pages(rs);
}
static int ram_init_all(RAMState **rsp)
@@ -3434,7 +3578,7 @@ void colo_incoming_start_dirty_log(void)
/* Discard this dirty bitmap record */
bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
}
- memory_global_dirty_log_start();
+ memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
}
ram_state->migration_dirty_pages = 0;
qemu_mutex_unlock_ramlist();
@@ -3446,7 +3590,7 @@ void colo_release_ram_cache(void)
{
RAMBlock *block;
- memory_global_dirty_log_stop();
+ memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
RAMBLOCK_FOREACH_NOT_IGNORED(block) {
g_free(block->bmap);
block->bmap = NULL;
@@ -4112,6 +4256,10 @@ int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
*/
bitmap_complement(block->bmap, block->bmap, nbits);
+ /* Clear dirty bits of discarded ranges that we don't want to migrate. */
+ ramblock_dirty_bitmap_clear_discarded_pages(block);
+
+ /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
trace_ram_dirty_bitmap_reload_complete(block->idstr);
/*
diff --git a/migration/ram.h b/migration/ram.h
index 4833e9fd5b..dda1988f3d 100644
--- a/migration/ram.h
+++ b/migration/ram.h
@@ -72,6 +72,7 @@ void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, size_t nr);
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
const char *block_name);
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb);
+bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start);
/* ram cache */
int colo_init_ram_cache(void);
diff --git a/migration/rdma.c b/migration/rdma.c
index 2a3c7889b9..f5d3bbe7e9 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -358,9 +358,11 @@ typedef struct RDMAContext {
struct ibv_context *verbs;
struct rdma_event_channel *channel;
struct ibv_qp *qp; /* queue pair */
- struct ibv_comp_channel *comp_channel; /* completion channel */
+ struct ibv_comp_channel *recv_comp_channel; /* recv completion channel */
+ struct ibv_comp_channel *send_comp_channel; /* send completion channel */
struct ibv_pd *pd; /* protection domain */
- struct ibv_cq *cq; /* completion queue */
+ struct ibv_cq *recv_cq; /* recvieve completion queue */
+ struct ibv_cq *send_cq; /* send completion queue */
/*
* If a previous write failed (perhaps because of a failed
@@ -1059,21 +1061,34 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
return -1;
}
- /* create completion channel */
- rdma->comp_channel = ibv_create_comp_channel(rdma->verbs);
- if (!rdma->comp_channel) {
- error_report("failed to allocate completion channel");
+ /* create receive completion channel */
+ rdma->recv_comp_channel = ibv_create_comp_channel(rdma->verbs);
+ if (!rdma->recv_comp_channel) {
+ error_report("failed to allocate receive completion channel");
goto err_alloc_pd_cq;
}
/*
- * Completion queue can be filled by both read and write work requests,
- * so must reflect the sum of both possible queue sizes.
+ * Completion queue can be filled by read work requests.
*/
- rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
- NULL, rdma->comp_channel, 0);
- if (!rdma->cq) {
- error_report("failed to allocate completion queue");
+ rdma->recv_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
+ NULL, rdma->recv_comp_channel, 0);
+ if (!rdma->recv_cq) {
+ error_report("failed to allocate receive completion queue");
+ goto err_alloc_pd_cq;
+ }
+
+ /* create send completion channel */
+ rdma->send_comp_channel = ibv_create_comp_channel(rdma->verbs);
+ if (!rdma->send_comp_channel) {
+ error_report("failed to allocate send completion channel");
+ goto err_alloc_pd_cq;
+ }
+
+ rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
+ NULL, rdma->send_comp_channel, 0);
+ if (!rdma->send_cq) {
+ error_report("failed to allocate send completion queue");
goto err_alloc_pd_cq;
}
@@ -1083,11 +1098,19 @@ err_alloc_pd_cq:
if (rdma->pd) {
ibv_dealloc_pd(rdma->pd);
}
- if (rdma->comp_channel) {
- ibv_destroy_comp_channel(rdma->comp_channel);
+ if (rdma->recv_comp_channel) {
+ ibv_destroy_comp_channel(rdma->recv_comp_channel);
+ }
+ if (rdma->send_comp_channel) {
+ ibv_destroy_comp_channel(rdma->send_comp_channel);
+ }
+ if (rdma->recv_cq) {
+ ibv_destroy_cq(rdma->recv_cq);
+ rdma->recv_cq = NULL;
}
rdma->pd = NULL;
- rdma->comp_channel = NULL;
+ rdma->recv_comp_channel = NULL;
+ rdma->send_comp_channel = NULL;
return -1;
}
@@ -1104,8 +1127,8 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
attr.cap.max_recv_wr = 3;
attr.cap.max_send_sge = 1;
attr.cap.max_recv_sge = 1;
- attr.send_cq = rdma->cq;
- attr.recv_cq = rdma->cq;
+ attr.send_cq = rdma->send_cq;
+ attr.recv_cq = rdma->recv_cq;
attr.qp_type = IBV_QPT_RC;
ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
@@ -1496,14 +1519,14 @@ static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
* (of any kind) has completed.
* Return the work request ID that completed.
*/
-static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
- uint32_t *byte_len)
+static uint64_t qemu_rdma_poll(RDMAContext *rdma, struct ibv_cq *cq,
+ uint64_t *wr_id_out, uint32_t *byte_len)
{
int ret;
struct ibv_wc wc;
uint64_t wr_id;
- ret = ibv_poll_cq(rdma->cq, 1, &wc);
+ ret = ibv_poll_cq(cq, 1, &wc);
if (!ret) {
*wr_id_out = RDMA_WRID_NONE;
@@ -1575,7 +1598,8 @@ static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
/* Wait for activity on the completion channel.
* Returns 0 on success, none-0 on error.
*/
-static int qemu_rdma_wait_comp_channel(RDMAContext *rdma)
+static int qemu_rdma_wait_comp_channel(RDMAContext *rdma,
+ struct ibv_comp_channel *comp_channel)
{
struct rdma_cm_event *cm_event;
int ret = -1;
@@ -1586,7 +1610,7 @@ static int qemu_rdma_wait_comp_channel(RDMAContext *rdma)
*/
if (rdma->migration_started_on_destination &&
migration_incoming_get_current()->state == MIGRATION_STATUS_ACTIVE) {
- yield_until_fd_readable(rdma->comp_channel->fd);
+ yield_until_fd_readable(comp_channel->fd);
} else {
/* This is the source side, we're in a separate thread
* or destination prior to migration_fd_process_incoming()
@@ -1597,7 +1621,7 @@ static int qemu_rdma_wait_comp_channel(RDMAContext *rdma)
*/
while (!rdma->error_state && !rdma->received_error) {
GPollFD pfds[2];
- pfds[0].fd = rdma->comp_channel->fd;
+ pfds[0].fd = comp_channel->fd;
pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
pfds[0].revents = 0;
@@ -1655,6 +1679,17 @@ static int qemu_rdma_wait_comp_channel(RDMAContext *rdma)
return rdma->error_state;
}
+static struct ibv_comp_channel *to_channel(RDMAContext *rdma, int wrid)
+{
+ return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_comp_channel :
+ rdma->recv_comp_channel;
+}
+
+static struct ibv_cq *to_cq(RDMAContext *rdma, int wrid)
+{
+ return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_cq : rdma->recv_cq;
+}
+
/*
* Block until the next work request has completed.
*
@@ -1675,13 +1710,15 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
struct ibv_cq *cq;
void *cq_ctx;
uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
+ struct ibv_comp_channel *ch = to_channel(rdma, wrid_requested);
+ struct ibv_cq *poll_cq = to_cq(rdma, wrid_requested);
- if (ibv_req_notify_cq(rdma->cq, 0)) {
+ if (ibv_req_notify_cq(poll_cq, 0)) {
return -1;
}
/* poll cq first */
while (wr_id != wrid_requested) {
- ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
+ ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
if (ret < 0) {
return ret;
}
@@ -1702,12 +1739,12 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
}
while (1) {
- ret = qemu_rdma_wait_comp_channel(rdma);
+ ret = qemu_rdma_wait_comp_channel(rdma, ch);
if (ret) {
goto err_block_for_wrid;
}
- ret = ibv_get_cq_event(rdma->comp_channel, &cq, &cq_ctx);
+ ret = ibv_get_cq_event(ch, &cq, &cq_ctx);
if (ret) {
perror("ibv_get_cq_event");
goto err_block_for_wrid;
@@ -1721,7 +1758,7 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
}
while (wr_id != wrid_requested) {
- ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
+ ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
if (ret < 0) {
goto err_block_for_wrid;
}
@@ -2437,13 +2474,21 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
rdma_destroy_qp(rdma->cm_id);
rdma->qp = NULL;
}
- if (rdma->cq) {
- ibv_destroy_cq(rdma->cq);
- rdma->cq = NULL;
+ if (rdma->recv_cq) {
+ ibv_destroy_cq(rdma->recv_cq);
+ rdma->recv_cq = NULL;
+ }
+ if (rdma->send_cq) {
+ ibv_destroy_cq(rdma->send_cq);
+ rdma->send_cq = NULL;
+ }
+ if (rdma->recv_comp_channel) {
+ ibv_destroy_comp_channel(rdma->recv_comp_channel);
+ rdma->recv_comp_channel = NULL;
}
- if (rdma->comp_channel) {
- ibv_destroy_comp_channel(rdma->comp_channel);
- rdma->comp_channel = NULL;
+ if (rdma->send_comp_channel) {
+ ibv_destroy_comp_channel(rdma->send_comp_channel);
+ rdma->send_comp_channel = NULL;
}
if (rdma->pd) {
ibv_dealloc_pd(rdma->pd);
@@ -3115,10 +3160,14 @@ static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
{
QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
if (io_read) {
- aio_set_fd_handler(ctx, rioc->rdmain->comp_channel->fd,
+ aio_set_fd_handler(ctx, rioc->rdmain->recv_comp_channel->fd,
+ false, io_read, io_write, NULL, opaque);
+ aio_set_fd_handler(ctx, rioc->rdmain->send_comp_channel->fd,
false, io_read, io_write, NULL, opaque);
} else {
- aio_set_fd_handler(ctx, rioc->rdmaout->comp_channel->fd,
+ aio_set_fd_handler(ctx, rioc->rdmaout->recv_comp_channel->fd,
+ false, io_read, io_write, NULL, opaque);
+ aio_set_fd_handler(ctx, rioc->rdmaout->send_comp_channel->fd,
false, io_read, io_write, NULL, opaque);
}
}
@@ -3332,7 +3381,22 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
*/
while (1) {
uint64_t wr_id, wr_id_in;
- int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
+ int ret = qemu_rdma_poll(rdma, rdma->recv_cq, &wr_id_in, NULL);
+ if (ret < 0) {
+ error_report("rdma migration: polling error! %d", ret);
+ goto err;
+ }
+
+ wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
+
+ if (wr_id == RDMA_WRID_NONE) {
+ break;
+ }
+ }
+
+ while (1) {
+ uint64_t wr_id, wr_id_in;
+ int ret = qemu_rdma_poll(rdma, rdma->send_cq, &wr_id_in, NULL);
if (ret < 0) {
error_report("rdma migration: polling error! %d", ret);
goto err;
diff --git a/migration/trace-events b/migration/trace-events
index a8ae163707..b48d873b8a 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -333,6 +333,8 @@ get_ramblock_vfn_hash(const char *idstr, uint64_t vfn, uint32_t crc) "ramblock n
calc_page_dirty_rate(const char *idstr, uint32_t new_crc, uint32_t old_crc) "ramblock name: %s, new crc: %" PRIu32 ", old crc: %" PRIu32
skip_sample_ramblock(const char *idstr, uint64_t ramblock_size) "ramblock name: %s, ramblock size: %" PRIu64
find_page_matched(const char *idstr) "ramblock %s addr or size changed"
+dirtyrate_calculate(int64_t dirtyrate) "dirty rate: %" PRIi64 " MB/s"
+dirtyrate_do_calculate_vcpu(int idx, uint64_t rate) "vcpu[%d]: %"PRIu64 " MB/s"
# block.c
migration_block_init_shared(const char *blk_device_name) "Start migration for %s with shared base image"
diff --git a/qapi/migration.json b/qapi/migration.json
index 9aa8bc5759..87146ceea2 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -1732,6 +1732,21 @@
'data': { 'device-id': 'str' } }
##
+# @DirtyRateVcpu:
+#
+# Dirty rate of vcpu.
+#
+# @id: vcpu index.
+#
+# @dirty-rate: dirty rate.
+#
+# Since: 6.1
+#
+##
+{ 'struct': 'DirtyRateVcpu',
+ 'data': { 'id': 'int', 'dirty-rate': 'int64' } }
+
+##
# @DirtyRateStatus:
#
# An enumeration of dirtyrate status.
@@ -1749,6 +1764,23 @@
'data': [ 'unstarted', 'measuring', 'measured'] }
##
+# @DirtyRateMeasureMode:
+#
+# An enumeration of mode of measuring dirtyrate.
+#
+# @page-sampling: calculate dirtyrate by sampling pages.
+#
+# @dirty-ring: calculate dirtyrate by dirty ring.
+#
+# @dirty-bitmap: calculate dirtyrate by dirty bitmap.
+#
+# Since: 6.1
+#
+##
+{ 'enum': 'DirtyRateMeasureMode',
+ 'data': ['page-sampling', 'dirty-ring', 'dirty-bitmap'] }
+
+##
# @DirtyRateInfo:
#
# Information about current dirty page rate of vm.
@@ -1766,6 +1798,12 @@
# @sample-pages: page count per GB for sample dirty pages
# the default value is 512 (since 6.1)
#
+# @mode: mode containing method of calculate dirtyrate includes
+# 'page-sampling' and 'dirty-ring' (Since 6.1)
+#
+# @vcpu-dirty-rate: dirtyrate for each vcpu if dirty-ring
+# mode specified (Since 6.1)
+#
# Since: 5.2
#
##
@@ -1774,7 +1812,9 @@
'status': 'DirtyRateStatus',
'start-time': 'int64',
'calc-time': 'int64',
- 'sample-pages': 'uint64'} }
+ 'sample-pages': 'uint64',
+ 'mode': 'DirtyRateMeasureMode',
+ '*vcpu-dirty-rate': [ 'DirtyRateVcpu' ] } }
##
# @calc-dirty-rate:
@@ -1786,6 +1826,9 @@
# @sample-pages: page count per GB for sample dirty pages
# the default value is 512 (since 6.1)
#
+# @mode: mechanism of calculating dirtyrate includes
+# 'page-sampling' and 'dirty-ring' (Since 6.1)
+#
# Since: 5.2
#
# Example:
@@ -1794,7 +1837,8 @@
#
##
{ 'command': 'calc-dirty-rate', 'data': {'calc-time': 'int64',
- '*sample-pages': 'int'} }
+ '*sample-pages': 'int',
+ '*mode': 'DirtyRateMeasureMode'} }
##
# @query-dirty-rate:
diff --git a/softmmu/memory.c b/softmmu/memory.c
index e5826faa0c..7340e19ff5 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -39,7 +39,7 @@
static unsigned memory_region_transaction_depth;
static bool memory_region_update_pending;
static bool ioeventfd_update_pending;
-bool global_dirty_log;
+unsigned int global_dirty_tracking;
static QTAILQ_HEAD(, MemoryListener) memory_listeners
= QTAILQ_HEAD_INITIALIZER(memory_listeners);
@@ -1821,7 +1821,7 @@ uint8_t memory_region_get_dirty_log_mask(MemoryRegion *mr)
uint8_t mask = mr->dirty_log_mask;
RAMBlock *rb = mr->ram_block;
- if (global_dirty_log && ((rb && qemu_ram_is_migratable(rb)) ||
+ if (global_dirty_tracking && ((rb && qemu_ram_is_migratable(rb)) ||
memory_region_is_iommu(mr))) {
mask |= (1 << DIRTY_MEMORY_MIGRATION);
}
@@ -2081,6 +2081,17 @@ int ram_discard_manager_replay_populated(const RamDiscardManager *rdm,
return rdmc->replay_populated(rdm, section, replay_fn, opaque);
}
+void ram_discard_manager_replay_discarded(const RamDiscardManager *rdm,
+ MemoryRegionSection *section,
+ ReplayRamDiscard replay_fn,
+ void *opaque)
+{
+ RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm);
+
+ g_assert(rdmc->replay_discarded);
+ rdmc->replay_discarded(rdm, section, replay_fn, opaque);
+}
+
void ram_discard_manager_register_listener(RamDiscardManager *rdm,
RamDiscardListener *rdl,
MemoryRegionSection *section)
@@ -2760,14 +2771,18 @@ void memory_global_after_dirty_log_sync(void)
static VMChangeStateEntry *vmstate_change;
-void memory_global_dirty_log_start(void)
+void memory_global_dirty_log_start(unsigned int flags)
{
if (vmstate_change) {
qemu_del_vm_change_state_handler(vmstate_change);
vmstate_change = NULL;
}
- global_dirty_log = true;
+ assert(flags && !(flags & (~GLOBAL_DIRTY_MASK)));
+ assert(!(global_dirty_tracking & flags));
+ global_dirty_tracking |= flags;
+
+ trace_global_dirty_changed(global_dirty_tracking);
MEMORY_LISTENER_CALL_GLOBAL(log_global_start, Forward);
@@ -2777,9 +2792,13 @@ void memory_global_dirty_log_start(void)
memory_region_transaction_commit();
}
-static void memory_global_dirty_log_do_stop(void)
+static void memory_global_dirty_log_do_stop(unsigned int flags)
{
- global_dirty_log = false;
+ assert(flags && !(flags & (~GLOBAL_DIRTY_MASK)));
+ assert((global_dirty_tracking & flags) == flags);
+ global_dirty_tracking &= ~flags;
+
+ trace_global_dirty_changed(global_dirty_tracking);
/* Refresh DIRTY_MEMORY_MIGRATION bit. */
memory_region_transaction_begin();
@@ -2792,8 +2811,9 @@ static void memory_global_dirty_log_do_stop(void)
static void memory_vm_change_state_handler(void *opaque, bool running,
RunState state)
{
+ unsigned int flags = (unsigned int)(uintptr_t)opaque;
if (running) {
- memory_global_dirty_log_do_stop();
+ memory_global_dirty_log_do_stop(flags);
if (vmstate_change) {
qemu_del_vm_change_state_handler(vmstate_change);
@@ -2802,18 +2822,19 @@ static void memory_vm_change_state_handler(void *opaque, bool running,
}
}
-void memory_global_dirty_log_stop(void)
+void memory_global_dirty_log_stop(unsigned int flags)
{
if (!runstate_is_running()) {
if (vmstate_change) {
return;
}
vmstate_change = qemu_add_vm_change_state_handler(
- memory_vm_change_state_handler, NULL);
+ memory_vm_change_state_handler,
+ (void *)(uintptr_t)flags);
return;
}
- memory_global_dirty_log_do_stop();
+ memory_global_dirty_log_do_stop(flags);
}
static void listener_add_address_space(MemoryListener *listener,
@@ -2825,7 +2846,7 @@ static void listener_add_address_space(MemoryListener *listener,
if (listener->begin) {
listener->begin(listener);
}
- if (global_dirty_log) {
+ if (global_dirty_tracking) {
if (listener->log_global_start) {
listener->log_global_start(listener);
}
diff --git a/softmmu/trace-events b/softmmu/trace-events
index bf1469990e..9c88887b3c 100644
--- a/softmmu/trace-events
+++ b/softmmu/trace-events
@@ -19,6 +19,7 @@ memory_region_sync_dirty(const char *mr, const char *listener, int global) "mr '
flatview_new(void *view, void *root) "%p (root %p)"
flatview_destroy(void *view, void *root) "%p (root %p)"
flatview_destroy_rcu(void *view, void *root) "%p (root %p)"
+global_dirty_changed(unsigned int bitmask) "bitmask 0x%"PRIx32
# softmmu.c
vm_stop_flush_all(int ret) "ret %d"