author     Peter Maydell <peter.maydell@linaro.org>   2015-07-07 17:19:59 +0100
committer  Peter Maydell <peter.maydell@linaro.org>   2015-07-07 17:19:59 +0100
commit     9861b71fd63f04175fddd1e93a417bae4a7808d7 (patch)
tree       2e9bb2400821788865d19ac061eff7b7e86bc223
parent     f2562fbb7ac54d597cfe05f613d30296d1850d1b (diff)
parent     dd63169766abd2b8dc33f4451dac5e778458a47c (diff)
Merge remote-tracking branch 'remotes/juanquintela/tags/migration/20150707' into staging
migration/next for 20150707

# gpg: Signature made Tue Jul 7 13:56:30 2015 BST using RSA key ID 5872D723
# gpg: Good signature from "Juan Quintela <quintela@redhat.com>"
# gpg:                 aka "Juan Quintela <quintela@trasno.org>"

* remotes/juanquintela/tags/migration/20150707: (28 commits)
  migration: extend migration_bitmap
  migration: protect migration_bitmap
  check_section_footers: Check the correct section_id
  migration: Add migration events on target side
  migration: Make events a capability
  migration: create migration event
  migration: No need to call trace_migrate_set_state()
  migration: Use always helper to set state
  migration: ensure we start in NONE state
  migration: Use cmpxchg correctly
  migration: Add configuration section
  vmstate: Create optional sections
  global_state: Make section optional
  migration: create new section to store global state
  runstate: migration allows more transitions now
  runstate: Add runstate store
  Fix older machine type compatibility on power with section footers
  Fail more cleanly in mismatched RAM cases
  Sanity check RDMA remote data
  Sort destination RAMBlocks to be the same as the source
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
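
The most user-visible change in this pull is the new MIGRATION QMP event, gated behind the new 'events' migration capability (see the qapi-schema.json and qapi/event.json hunks below). A minimal QMP session that enables the capability on the source and then watches the state changes might look like the sketch below; the sequence and timing of status values depend on how the migration progresses, and the timestamps are illustrative only:

    -> { "execute": "migrate-set-capabilities",
         "arguments": { "capabilities": [
           { "capability": "events", "state": true } ] } }
    <- { "return": {} }
    <- { "timestamp": { "seconds": 1432121972, "microseconds": 744001 },
         "event": "MIGRATION", "data": { "status": "setup" } }
    <- { "timestamp": { "seconds": 1432121974, "microseconds": 181013 },
         "event": "MIGRATION", "data": { "status": "active" } }
    <- { "timestamp": { "seconds": 1432121976, "microseconds": 510190 },
         "event": "MIGRATION", "data": { "status": "completed" } }

The incoming side now emits matching events as well; see the qemu_start_incoming_migration() and process_incoming_migration_co() hunks in migration/migration.c below.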
-rw-r--r--  docs/qmp/qmp-events.txt        |  14
-rw-r--r--  exec.c                         |   5
-rw-r--r--  hw/i386/pc_piix.c              |   2
-rw-r--r--  hw/i386/pc_q35.c               |   2
-rw-r--r--  hw/ppc/spapr.c                 |   3
-rw-r--r--  include/exec/exec-all.h        |   3
-rw-r--r--  include/migration/migration.h  |   7
-rw-r--r--  include/migration/qemu-file.h  |  14
-rw-r--r--  include/migration/vmstate.h    |   2
-rw-r--r--  include/sysemu/sysemu.h        |   1
-rw-r--r--  migration/migration.c          | 176
-rw-r--r--  migration/qemu-file.c          |  16
-rw-r--r--  migration/ram.c                |  55
-rw-r--r--  migration/rdma.c               | 289
-rw-r--r--  migration/savevm.c             | 180
-rw-r--r--  migration/vmstate.c            |  11
-rw-r--r--  qapi-schema.json               |   5
-rw-r--r--  qapi/event.json                |  12
-rw-r--r--  trace-events                   |  16
-rw-r--r--  vl.c                           |  21
20 files changed, 668 insertions, 166 deletions
diff --git a/docs/qmp/qmp-events.txt b/docs/qmp/qmp-events.txt
index 4c13d48726..d92cc4833b 100644
--- a/docs/qmp/qmp-events.txt
+++ b/docs/qmp/qmp-events.txt
@@ -473,6 +473,20 @@ Example:
{ "timestamp": {"seconds": 1290688046, "microseconds": 417172},
"event": "SPICE_MIGRATE_COMPLETED" }
+MIGRATION
+---------
+
+Emitted when a migration event happens
+
+Data: None.
+
+ - "status": migration status
+ See MigrationStatus in ~/qapi-schema.json for possible values
+
+Example:
+
+{"timestamp": {"seconds": 1432121972, "microseconds": 744001},
+ "event": "MIGRATION", "data": {"status": "completed"}}
STOP
----
diff --git a/exec.c b/exec.c
index 251dc79e10..b7f7f9818f 100644
--- a/exec.c
+++ b/exec.c
@@ -1414,6 +1414,11 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
}
}
+ new_ram_size = MAX(old_ram_size,
+ (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
+ if (new_ram_size > old_ram_size) {
+ migration_bitmap_extend(old_ram_size, new_ram_size);
+ }
/* Keep the list sorted from biggest to smallest block. Unlike QTAILQ,
* QLIST (which has an RCU-friendly variant) does not have insertion at
* tail, so save the last element in last_block.
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 56cdcb9661..4c3cb40ce0 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -312,6 +312,8 @@ static void pc_compat_2_3(MachineState *machine)
if (kvm_enabled()) {
pcms->smm = ON_OFF_AUTO_OFF;
}
+ global_state_set_optional();
+ savevm_skip_configuration();
}
static void pc_compat_2_2(MachineState *machine)
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 8aa3a67fdf..43e6c18777 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -295,6 +295,8 @@ static void pc_compat_2_3(MachineState *machine)
if (kvm_enabled()) {
pcms->smm = ON_OFF_AUTO_OFF;
}
+ global_state_set_optional();
+ savevm_skip_configuration();
}
static void pc_compat_2_2(MachineState *machine)
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index f174e5a0f3..bcf5f0f6ba 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -34,6 +34,7 @@
#include "sysemu/cpus.h"
#include "sysemu/kvm.h"
#include "kvm_ppc.h"
+#include "migration/migration.h"
#include "mmu-hash64.h"
#include "qom/cpu.h"
@@ -1851,6 +1852,8 @@ static const TypeInfo spapr_machine_info = {
static void spapr_compat_2_3(Object *obj)
{
+ savevm_skip_section_footers();
+ global_state_set_optional();
}
static void spapr_compat_2_2(Object *obj)
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index d678114cb2..2e74760ade 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -365,4 +365,7 @@ static inline bool cpu_can_do_io(CPUState *cpu)
return cpu->can_do_io != 0;
}
+#if !defined(CONFIG_USER_ONLY)
+void migration_bitmap_extend(ram_addr_t old, ram_addr_t new);
+#endif
#endif
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 9387c8c9d4..b2711ef305 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -34,6 +34,7 @@
#define QEMU_VM_SECTION_FULL 0x04
#define QEMU_VM_SUBSECTION 0x05
#define QEMU_VM_VMDESCRIPTION 0x06
+#define QEMU_VM_CONFIGURATION 0x07
#define QEMU_VM_SECTION_FOOTER 0x7e
struct MigrationParams {
@@ -176,10 +177,11 @@ bool migrate_use_compression(void);
int migrate_compress_level(void);
int migrate_compress_threads(void);
int migrate_decompress_threads(void);
+bool migrate_use_events(void);
void ram_control_before_iterate(QEMUFile *f, uint64_t flags);
void ram_control_after_iterate(QEMUFile *f, uint64_t flags);
-void ram_control_load_hook(QEMUFile *f, uint64_t flags);
+void ram_control_load_hook(QEMUFile *f, uint64_t flags, void *data);
/* Whenever this is found in the data stream, the flags
* will be passed to ram_control_load_hook in the incoming-migration
@@ -197,4 +199,7 @@ size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
void ram_mig_init(void);
void savevm_skip_section_footers(void);
+void register_global_state(void);
+void global_state_set_optional(void);
+void savevm_skip_configuration(void);
#endif
diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index 4f67d79227..ea49f33fac 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -63,16 +63,20 @@ typedef ssize_t (QEMUFileWritevBufferFunc)(void *opaque, struct iovec *iov,
/*
* This function provides hooks around different
* stages of RAM migration.
+ * 'opaque' is the backend specific data in QEMUFile
+ * 'data' is call specific data associated with the 'flags' value
*/
-typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags);
+typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags,
+ void *data);
/*
* Constants used by ram_control_* hooks
*/
-#define RAM_CONTROL_SETUP 0
-#define RAM_CONTROL_ROUND 1
-#define RAM_CONTROL_HOOK 2
-#define RAM_CONTROL_FINISH 3
+#define RAM_CONTROL_SETUP 0
+#define RAM_CONTROL_ROUND 1
+#define RAM_CONTROL_HOOK 2
+#define RAM_CONTROL_FINISH 3
+#define RAM_CONTROL_BLOCK_REG 4
/*
* This function allows override of where the RAM page
diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index 0695d7c3de..f51ff693e9 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -820,6 +820,8 @@ int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd,
void vmstate_save_state(QEMUFile *f, const VMStateDescription *vmsd,
void *opaque, QJSON *vmdesc);
+bool vmstate_save_needed(const VMStateDescription *vmsd, void *opaque);
+
int vmstate_register_with_alias_id(DeviceState *dev, int instance_id,
const VMStateDescription *vmsd,
void *base, int alias_id,
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index df809518b4..44570d17e6 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -28,6 +28,7 @@ bool runstate_check(RunState state);
void runstate_set(RunState new_state);
int runstate_is_running(void);
bool runstate_needs_reset(void);
+bool runstate_store(char *str, size_t size);
typedef struct vm_change_state_entry VMChangeStateEntry;
typedef void VMChangeStateHandler(void *opaque, int running, RunState state);
diff --git a/migration/migration.c b/migration/migration.c
index c6ac08a0cb..45719a0f5f 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -26,6 +26,8 @@
#include "qemu/thread.h"
#include "qmp-commands.h"
#include "trace.h"
+#include "qapi/util.h"
+#include "qapi-event.h"
#define MAX_THROTTLE (32 << 20) /* Migration speed throttling */
@@ -97,6 +99,120 @@ void migration_incoming_state_destroy(void)
mis_current = NULL;
}
+
+typedef struct {
+ bool optional;
+ uint32_t size;
+ uint8_t runstate[100];
+} GlobalState;
+
+static GlobalState global_state;
+
+static int global_state_store(void)
+{
+ if (!runstate_store((char *)global_state.runstate,
+ sizeof(global_state.runstate))) {
+ error_report("runstate name too big: %s", global_state.runstate);
+ trace_migrate_state_too_big();
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static char *global_state_get_runstate(void)
+{
+ return (char *)global_state.runstate;
+}
+
+void global_state_set_optional(void)
+{
+ global_state.optional = true;
+}
+
+static bool global_state_needed(void *opaque)
+{
+ GlobalState *s = opaque;
+ char *runstate = (char *)s->runstate;
+
+ /* If it is not optional, it is mandatory */
+
+ if (s->optional == false) {
+ return true;
+ }
+
+ /* If state is running or paused, it is not needed */
+
+ if (strcmp(runstate, "running") == 0 ||
+ strcmp(runstate, "paused") == 0) {
+ return false;
+ }
+
+ /* for any other state it is needed */
+ return true;
+}
+
+static int global_state_post_load(void *opaque, int version_id)
+{
+ GlobalState *s = opaque;
+ int ret = 0;
+ char *runstate = (char *)s->runstate;
+
+ trace_migrate_global_state_post_load(runstate);
+
+ if (strcmp(runstate, "running") != 0) {
+ Error *local_err = NULL;
+ int r = qapi_enum_parse(RunState_lookup, runstate, RUN_STATE_MAX,
+ -1, &local_err);
+
+ if (r == -1) {
+ if (local_err) {
+ error_report_err(local_err);
+ }
+ return -EINVAL;
+ }
+ ret = vm_stop_force_state(r);
+ }
+
+ return ret;
+}
+
+static void global_state_pre_save(void *opaque)
+{
+ GlobalState *s = opaque;
+
+ trace_migrate_global_state_pre_save((char *)s->runstate);
+ s->size = strlen((char *)s->runstate) + 1;
+}
+
+static const VMStateDescription vmstate_globalstate = {
+ .name = "globalstate",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .post_load = global_state_post_load,
+ .pre_save = global_state_pre_save,
+ .needed = global_state_needed,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32(size, GlobalState),
+ VMSTATE_BUFFER(runstate, GlobalState),
+ VMSTATE_END_OF_LIST()
+ },
+};
+
+void register_global_state(void)
+{
+ /* We would use it independently that we receive it */
+ strcpy((char *)&global_state.runstate, "");
+ vmstate_register(NULL, 0, &vmstate_globalstate, &global_state);
+}
+
+static void migrate_generate_event(int new_state)
+{
+ if (migrate_use_events()) {
+ qapi_event_send_migration(new_state, &error_abort);
+ trace_migrate_set_state(new_state);
+ }
+}
+
/*
* Called on -incoming with a defer: uri.
* The migration can be started later after any parameters have been
@@ -114,6 +230,7 @@ void qemu_start_incoming_migration(const char *uri, Error **errp)
{
const char *p;
+ qapi_event_send_migration(MIGRATION_STATUS_SETUP, &error_abort);
if (!strcmp(uri, "defer")) {
deferred_incoming_migration(errp);
} else if (strstart(uri, "tcp:", &p)) {
@@ -142,7 +259,7 @@ static void process_incoming_migration_co(void *opaque)
int ret;
migration_incoming_state_new(f);
-
+ migrate_generate_event(MIGRATION_STATUS_ACTIVE);
ret = qemu_loadvm_state(f);
qemu_fclose(f);
@@ -150,10 +267,12 @@ static void process_incoming_migration_co(void *opaque)
migration_incoming_state_destroy();
if (ret < 0) {
+ migrate_generate_event(MIGRATION_STATUS_FAILED);
error_report("load of migration failed: %s", strerror(-ret));
migrate_decompress_threads_join();
exit(EXIT_FAILURE);
}
+ migrate_generate_event(MIGRATION_STATUS_COMPLETED);
qemu_announce_self();
/* Make sure all file formats flush their mutable metadata */
@@ -164,10 +283,20 @@ static void process_incoming_migration_co(void *opaque)
exit(EXIT_FAILURE);
}
- if (autostart) {
+ /* runstate == "" means that we haven't received it through the
+ * wire, so we obey autostart. runstate == runing means that we
+ * need to run it, we need to make sure that we do it after
+ * everything else has finished. Every other state change is done
+ * at the post_load function */
+
+ if (strcmp(global_state_get_runstate(), "running") == 0) {
vm_start();
- } else {
- runstate_set(RUN_STATE_PAUSED);
+ } else if (strcmp(global_state_get_runstate(), "") == 0) {
+ if (autostart) {
+ vm_start();
+ } else {
+ runstate_set(RUN_STATE_PAUSED);
+ }
}
migrate_decompress_threads_join();
}
@@ -392,8 +521,8 @@ void qmp_migrate_set_parameters(bool has_compress_level,
static void migrate_set_state(MigrationState *s, int old_state, int new_state)
{
- if (atomic_cmpxchg(&s->state, old_state, new_state) == new_state) {
- trace_migrate_set_state(new_state);
+ if (atomic_cmpxchg(&s->state, old_state, new_state) == old_state) {
+ migrate_generate_event(new_state);
}
}
@@ -432,8 +561,7 @@ void migrate_fd_error(MigrationState *s)
{
trace_migrate_fd_error();
assert(s->file == NULL);
- s->state = MIGRATION_STATUS_FAILED;
- trace_migrate_set_state(MIGRATION_STATUS_FAILED);
+ migrate_set_state(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_FAILED);
notifier_list_notify(&migration_state_notifiers, s);
}
@@ -517,8 +645,7 @@ static MigrationState *migrate_init(const MigrationParams *params)
s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS] =
decompress_thread_count;
s->bandwidth_limit = bandwidth_limit;
- s->state = MIGRATION_STATUS_SETUP;
- trace_migrate_set_state(MIGRATION_STATUS_SETUP);
+ migrate_set_state(s, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);
s->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
return s;
@@ -577,7 +704,6 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
error_setg(errp, QERR_MIGRATION_ACTIVE);
return;
}
-
if (runstate_check(RUN_STATE_INMIGRATE)) {
error_setg(errp, "Guest is waiting for an incoming migration");
return;
@@ -592,6 +718,12 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
return;
}
+ /* We are starting a new migration, so we want to start in a clean
+ state. This change is only needed if previous migration
+ failed/was cancelled. We don't use migrate_set_state() because
+ we are setting the initial state, not changing it. */
+ s->state = MIGRATION_STATUS_NONE;
+
s = migrate_init(&params);
if (strstart(uri, "tcp:", &p)) {
@@ -611,7 +743,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
} else {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri",
"a valid migration protocol");
- s->state = MIGRATION_STATUS_FAILED;
+ migrate_set_state(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_FAILED);
return;
}
@@ -740,6 +872,15 @@ int migrate_decompress_threads(void)
return s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS];
}
+bool migrate_use_events(void)
+{
+ MigrationState *s;
+
+ s = migrate_get_current();
+
+ return s->enabled_capabilities[MIGRATION_CAPABILITY_EVENTS];
+}
+
int migrate_use_xbzrle(void)
{
MigrationState *s;
@@ -793,10 +934,13 @@ static void *migration_thread(void *opaque)
qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
old_vm_running = runstate_is_running();
- ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
- if (ret >= 0) {
- qemu_file_set_rate_limit(s->file, INT64_MAX);
- qemu_savevm_state_complete(s->file);
+ ret = global_state_store();
+ if (!ret) {
+ ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+ if (ret >= 0) {
+ qemu_file_set_rate_limit(s->file, INT64_MAX);
+ qemu_savevm_state_complete(s->file);
+ }
}
qemu_mutex_unlock_iothread();
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 557c1c1a62..6bb3dc15cd 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -129,7 +129,7 @@ void ram_control_before_iterate(QEMUFile *f, uint64_t flags)
int ret = 0;
if (f->ops->before_ram_iterate) {
- ret = f->ops->before_ram_iterate(f, f->opaque, flags);
+ ret = f->ops->before_ram_iterate(f, f->opaque, flags, NULL);
if (ret < 0) {
qemu_file_set_error(f, ret);
}
@@ -141,24 +141,30 @@ void ram_control_after_iterate(QEMUFile *f, uint64_t flags)
int ret = 0;
if (f->ops->after_ram_iterate) {
- ret = f->ops->after_ram_iterate(f, f->opaque, flags);
+ ret = f->ops->after_ram_iterate(f, f->opaque, flags, NULL);
if (ret < 0) {
qemu_file_set_error(f, ret);
}
}
}
-void ram_control_load_hook(QEMUFile *f, uint64_t flags)
+void ram_control_load_hook(QEMUFile *f, uint64_t flags, void *data)
{
int ret = -EINVAL;
if (f->ops->hook_ram_load) {
- ret = f->ops->hook_ram_load(f, f->opaque, flags);
+ ret = f->ops->hook_ram_load(f, f->opaque, flags, data);
if (ret < 0) {
qemu_file_set_error(f, ret);
}
} else {
- qemu_file_set_error(f, ret);
+ /*
+ * Hook is a hook specifically requested by the source sending a flag
+ * that expects there to be a hook on the destination.
+ */
+ if (flags == RAM_CONTROL_HOOK) {
+ qemu_file_set_error(f, ret);
+ }
}
}
diff --git a/migration/ram.c b/migration/ram.c
index 57368e1575..c696814196 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -222,6 +222,7 @@ static RAMBlock *last_seen_block;
static RAMBlock *last_sent_block;
static ram_addr_t last_offset;
static unsigned long *migration_bitmap;
+static QemuMutex migration_bitmap_mutex;
static uint64_t migration_dirty_pages;
static uint32_t last_version;
static bool ram_bulk_stage;
@@ -494,6 +495,7 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
return 1;
}
+/* Called with rcu_read_lock() to protect migration_bitmap */
static inline
ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
ram_addr_t start)
@@ -502,26 +504,31 @@ ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
unsigned long nr = base + (start >> TARGET_PAGE_BITS);
uint64_t mr_size = TARGET_PAGE_ALIGN(memory_region_size(mr));
unsigned long size = base + (mr_size >> TARGET_PAGE_BITS);
+ unsigned long *bitmap;
unsigned long next;
+ bitmap = atomic_rcu_read(&migration_bitmap);
if (ram_bulk_stage && nr > base) {
next = nr + 1;
} else {
- next = find_next_bit(migration_bitmap, size, nr);
+ next = find_next_bit(bitmap, size, nr);
}
if (next < size) {
- clear_bit(next, migration_bitmap);
+ clear_bit(next, bitmap);
migration_dirty_pages--;
}
return (next - base) << TARGET_PAGE_BITS;
}
+/* Called with rcu_read_lock() to protect migration_bitmap */
static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
{
+ unsigned long *bitmap;
+ bitmap = atomic_rcu_read(&migration_bitmap);
migration_dirty_pages +=
- cpu_physical_memory_sync_dirty_bitmap(migration_bitmap, start, length);
+ cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
}
@@ -563,11 +570,13 @@ static void migration_bitmap_sync(void)
trace_migration_bitmap_sync_start();
address_space_sync_dirty_bitmap(&address_space_memory);
+ qemu_mutex_lock(&migration_bitmap_mutex);
rcu_read_lock();
QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
migration_bitmap_sync_range(block->mr->ram_addr, block->used_length);
}
rcu_read_unlock();
+ qemu_mutex_unlock(&migration_bitmap_mutex);
trace_migration_bitmap_sync_end(migration_dirty_pages
- num_dirty_pages_init);
@@ -1017,10 +1026,15 @@ void free_xbzrle_decoded_buf(void)
static void migration_end(void)
{
- if (migration_bitmap) {
+ /* caller have hold iothread lock or is in a bh, so there is
+ * no writing race against this migration_bitmap
+ */
+ unsigned long *bitmap = migration_bitmap;
+ atomic_rcu_set(&migration_bitmap, NULL);
+ if (bitmap) {
memory_global_dirty_log_stop();
- g_free(migration_bitmap);
- migration_bitmap = NULL;
+ synchronize_rcu();
+ g_free(bitmap);
}
XBZRLE_cache_lock();
@@ -1051,6 +1065,30 @@ static void reset_ram_globals(void)
#define MAX_WAIT 50 /* ms, half buffered_file limit */
+void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
+{
+ /* called in qemu main thread, so there is
+ * no writing race against this migration_bitmap
+ */
+ if (migration_bitmap) {
+ unsigned long *old_bitmap = migration_bitmap, *bitmap;
+ bitmap = bitmap_new(new);
+
+ /* prevent migration_bitmap content from being set bit
+ * by migration_bitmap_sync_range() at the same time.
+ * it is safe to migration if migration_bitmap is cleared bit
+ * at the same time.
+ */
+ qemu_mutex_lock(&migration_bitmap_mutex);
+ bitmap_copy(bitmap, old_bitmap, old);
+ bitmap_set(bitmap, old, new - old);
+ atomic_rcu_set(&migration_bitmap, bitmap);
+ qemu_mutex_unlock(&migration_bitmap_mutex);
+ migration_dirty_pages += new - old;
+ synchronize_rcu();
+ g_free(old_bitmap);
+ }
+}
/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
* long-running RCU critical section. When rcu-reclaims in the code
@@ -1067,6 +1105,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
dirty_rate_high_cnt = 0;
bitmap_sync_count = 0;
migration_bitmap_sync_init();
+ qemu_mutex_init(&migration_bitmap_mutex);
if (migrate_use_xbzrle()) {
XBZRLE_cache_lock();
@@ -1477,6 +1516,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
error_report_err(local_err);
}
}
+ ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
+ block->idstr);
break;
}
}
@@ -1545,7 +1586,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
break;
default:
if (flags & RAM_SAVE_FLAG_HOOK) {
- ram_control_load_hook(f, flags);
+ ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
} else {
error_report("Unknown combination of migration flags: %#x",
flags);
diff --git a/migration/rdma.c b/migration/rdma.c
index b777273b59..f106b2a818 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -215,17 +215,19 @@ static void network_to_caps(RDMACapabilities *cap)
* the information. It's small anyway, so a list is overkill.
*/
typedef struct RDMALocalBlock {
- uint8_t *local_host_addr; /* local virtual address */
- uint64_t remote_host_addr; /* remote virtual address */
- uint64_t offset;
- uint64_t length;
- struct ibv_mr **pmr; /* MRs for chunk-level registration */
- struct ibv_mr *mr; /* MR for non-chunk-level registration */
- uint32_t *remote_keys; /* rkeys for chunk-level registration */
- uint32_t remote_rkey; /* rkeys for non-chunk-level registration */
- int index; /* which block are we */
- bool is_ram_block;
- int nb_chunks;
+ char *block_name;
+ uint8_t *local_host_addr; /* local virtual address */
+ uint64_t remote_host_addr; /* remote virtual address */
+ uint64_t offset;
+ uint64_t length;
+ struct ibv_mr **pmr; /* MRs for chunk-level registration */
+ struct ibv_mr *mr; /* MR for non-chunk-level registration */
+ uint32_t *remote_keys; /* rkeys for chunk-level registration */
+ uint32_t remote_rkey; /* rkeys for non-chunk-level registration */
+ int index; /* which block are we */
+ unsigned int src_index; /* (Only used on dest) */
+ bool is_ram_block;
+ int nb_chunks;
unsigned long *transit_bitmap;
unsigned long *unregister_bitmap;
} RDMALocalBlock;
@@ -353,6 +355,9 @@ typedef struct RDMAContext {
RDMALocalBlocks local_ram_blocks;
RDMADestBlock *dest_blocks;
+ /* Index of the next RAMBlock received during block registration */
+ unsigned int next_src_index;
+
/*
* Migration on *destination* started.
* Then use coroutine yield function.
@@ -411,7 +416,7 @@ static void network_to_control(RDMAControlHeader *control)
*/
typedef struct QEMU_PACKED {
union QEMU_PACKED {
- uint64_t current_addr; /* offset into the ramblock of the chunk */
+ uint64_t current_addr; /* offset into the ram_addr_t space */
uint64_t chunk; /* chunk to lookup if unregistering */
} key;
uint32_t current_index; /* which ramblock the chunk belongs to */
@@ -419,8 +424,19 @@ typedef struct QEMU_PACKED {
uint64_t chunks; /* how many sequential chunks to register */
} RDMARegister;
-static void register_to_network(RDMARegister *reg)
+static void register_to_network(RDMAContext *rdma, RDMARegister *reg)
{
+ RDMALocalBlock *local_block;
+ local_block = &rdma->local_ram_blocks.block[reg->current_index];
+
+ if (local_block->is_ram_block) {
+ /*
+ * current_addr as passed in is an address in the local ram_addr_t
+ * space, we need to translate this for the destination
+ */
+ reg->key.current_addr -= local_block->offset;
+ reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset;
+ }
reg->key.current_addr = htonll(reg->key.current_addr);
reg->current_index = htonl(reg->current_index);
reg->chunks = htonll(reg->chunks);
@@ -436,13 +452,19 @@ static void network_to_register(RDMARegister *reg)
typedef struct QEMU_PACKED {
uint32_t value; /* if zero, we will madvise() */
uint32_t block_idx; /* which ram block index */
- uint64_t offset; /* where in the remote ramblock this chunk */
+ uint64_t offset; /* Address in remote ram_addr_t space */
uint64_t length; /* length of the chunk */
} RDMACompress;
-static void compress_to_network(RDMACompress *comp)
+static void compress_to_network(RDMAContext *rdma, RDMACompress *comp)
{
comp->value = htonl(comp->value);
+ /*
+ * comp->offset as passed in is an address in the local ram_addr_t
+ * space, we need to translate this for the destination
+ */
+ comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset;
+ comp->offset += rdma->dest_blocks[comp->block_idx].offset;
comp->block_idx = htonl(comp->block_idx);
comp->offset = htonll(comp->offset);
comp->length = htonll(comp->length);
@@ -511,27 +533,27 @@ static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
return result;
}
-static int rdma_add_block(RDMAContext *rdma, void *host_addr,
+static int rdma_add_block(RDMAContext *rdma, const char *block_name,
+ void *host_addr,
ram_addr_t block_offset, uint64_t length)
{
RDMALocalBlocks *local = &rdma->local_ram_blocks;
- RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
- (void *)(uintptr_t)block_offset);
+ RDMALocalBlock *block;
RDMALocalBlock *old = local->block;
- assert(block == NULL);
-
local->block = g_malloc0(sizeof(RDMALocalBlock) * (local->nb_blocks + 1));
if (local->nb_blocks) {
int x;
- for (x = 0; x < local->nb_blocks; x++) {
- g_hash_table_remove(rdma->blockmap,
- (void *)(uintptr_t)old[x].offset);
- g_hash_table_insert(rdma->blockmap,
- (void *)(uintptr_t)old[x].offset,
- &local->block[x]);
+ if (rdma->blockmap) {
+ for (x = 0; x < local->nb_blocks; x++) {
+ g_hash_table_remove(rdma->blockmap,
+ (void *)(uintptr_t)old[x].offset);
+ g_hash_table_insert(rdma->blockmap,
+ (void *)(uintptr_t)old[x].offset,
+ &local->block[x]);
+ }
}
memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
g_free(old);
@@ -539,10 +561,12 @@ static int rdma_add_block(RDMAContext *rdma, void *host_addr,
block = &local->block[local->nb_blocks];
+ block->block_name = g_strdup(block_name);
block->local_host_addr = host_addr;
block->offset = block_offset;
block->length = length;
block->index = local->nb_blocks;
+ block->src_index = ~0U; /* Filled in by the receipt of the block list */
block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
block->transit_bitmap = bitmap_new(block->nb_chunks);
bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
@@ -552,9 +576,12 @@ static int rdma_add_block(RDMAContext *rdma, void *host_addr,
block->is_ram_block = local->init ? false : true;
- g_hash_table_insert(rdma->blockmap, (void *) block_offset, block);
+ if (rdma->blockmap) {
+ g_hash_table_insert(rdma->blockmap, (void *) block_offset, block);
+ }
- trace_rdma_add_block(local->nb_blocks, (uintptr_t) block->local_host_addr,
+ trace_rdma_add_block(block_name, local->nb_blocks,
+ (uintptr_t) block->local_host_addr,
block->offset, block->length,
(uintptr_t) (block->local_host_addr + block->length),
BITS_TO_LONGS(block->nb_chunks) *
@@ -574,7 +601,7 @@ static int rdma_add_block(RDMAContext *rdma, void *host_addr,
static int qemu_rdma_init_one_block(const char *block_name, void *host_addr,
ram_addr_t block_offset, ram_addr_t length, void *opaque)
{
- return rdma_add_block(opaque, host_addr, block_offset, length);
+ return rdma_add_block(opaque, block_name, host_addr, block_offset, length);
}
/*
@@ -587,7 +614,6 @@ static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
RDMALocalBlocks *local = &rdma->local_ram_blocks;
assert(rdma->blockmap == NULL);
- rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
memset(local, 0, sizeof *local);
qemu_ram_foreach_block(qemu_rdma_init_one_block, rdma);
trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
@@ -597,16 +623,19 @@ static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
return 0;
}
-static int rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset)
+/*
+ * Note: If used outside of cleanup, the caller must ensure that the destination
+ * block structures are also updated
+ */
+static int rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
{
RDMALocalBlocks *local = &rdma->local_ram_blocks;
- RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
- (void *) block_offset);
RDMALocalBlock *old = local->block;
int x;
- assert(block);
-
+ if (rdma->blockmap) {
+ g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset);
+ }
if (block->pmr) {
int j;
@@ -636,8 +665,14 @@ static int rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset)
g_free(block->remote_keys);
block->remote_keys = NULL;
- for (x = 0; x < local->nb_blocks; x++) {
- g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)old[x].offset);
+ g_free(block->block_name);
+ block->block_name = NULL;
+
+ if (rdma->blockmap) {
+ for (x = 0; x < local->nb_blocks; x++) {
+ g_hash_table_remove(rdma->blockmap,
+ (void *)(uintptr_t)old[x].offset);
+ }
}
if (local->nb_blocks > 1) {
@@ -659,8 +694,7 @@ static int rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset)
local->block = NULL;
}
- trace_rdma_delete_block(local->nb_blocks,
- (uintptr_t)block->local_host_addr,
+ trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr,
block->offset, block->length,
(uintptr_t)(block->local_host_addr + block->length),
BITS_TO_LONGS(block->nb_chunks) *
@@ -670,7 +704,7 @@ static int rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset)
local->nb_blocks--;
- if (local->nb_blocks) {
+ if (local->nb_blocks && rdma->blockmap) {
for (x = 0; x < local->nb_blocks; x++) {
g_hash_table_insert(rdma->blockmap,
(void *)(uintptr_t)local->block[x].offset,
@@ -1223,7 +1257,7 @@ const char *print_wrid(int wrid)
/*
* Perform a non-optimized memory unregistration after every transfer
- * for demonsration purposes, only if pin-all is not requested.
+ * for demonstration purposes, only if pin-all is not requested.
*
* Potential optimizations:
* 1. Start a new thread to run this function continuously
@@ -1289,7 +1323,7 @@ static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
rdma->total_registrations--;
reg.key.chunk = chunk;
- register_to_network(&reg);
+ register_to_network(rdma, &reg);
ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
&resp, NULL, NULL);
if (ret < 0) {
@@ -1910,7 +1944,7 @@ retry:
trace_qemu_rdma_write_one_zero(chunk, sge.length,
current_index, current_addr);
- compress_to_network(&comp);
+ compress_to_network(rdma, &comp);
ret = qemu_rdma_exchange_send(rdma, &head,
(uint8_t *) &comp, NULL, NULL, NULL);
@@ -1937,7 +1971,7 @@ retry:
trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index,
current_addr);
- register_to_network(&reg);
+ register_to_network(rdma, &reg);
ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
&resp, &reg_result_idx, NULL);
if (ret < 0) {
@@ -2198,7 +2232,7 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
if (rdma->local_ram_blocks.block) {
while (rdma->local_ram_blocks.nb_blocks) {
- rdma_delete_block(rdma, rdma->local_ram_blocks.block->offset);
+ rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]);
}
}
@@ -2271,6 +2305,14 @@ static int qemu_rdma_source_init(RDMAContext *rdma, Error **errp, bool pin_all)
goto err_rdma_source_init;
}
+ /* Build the hash that maps from offset to RAMBlock */
+ rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
+ for (idx = 0; idx < rdma->local_ram_blocks.nb_blocks; idx++) {
+ g_hash_table_insert(rdma->blockmap,
+ (void *)(uintptr_t)rdma->local_ram_blocks.block[idx].offset,
+ &rdma->local_ram_blocks.block[idx]);
+ }
+
for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
ret = qemu_rdma_reg_control(rdma, idx);
if (ret) {
@@ -2880,6 +2922,14 @@ err_rdma_dest_wait:
return ret;
}
+static int dest_ram_sort_func(const void *a, const void *b)
+{
+ unsigned int a_index = ((const RDMALocalBlock *)a)->src_index;
+ unsigned int b_index = ((const RDMALocalBlock *)b)->src_index;
+
+ return (a_index < b_index) ? -1 : (a_index != b_index);
+}
+
/*
* During each iteration of the migration, we listen for instructions
* by the source VM to perform dynamic page registrations before they
@@ -2889,8 +2939,7 @@ err_rdma_dest_wait:
*
* Keep doing this until the source tells us to stop.
*/
-static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque,
- uint64_t flags)
+static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque)
{
RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
.type = RDMA_CONTROL_REGISTER_RESULT,
@@ -2920,7 +2969,7 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque,
CHECK_ERROR_STATE();
do {
- trace_qemu_rdma_registration_handle_wait(flags);
+ trace_qemu_rdma_registration_handle_wait();
ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE);
@@ -2943,6 +2992,13 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque,
trace_qemu_rdma_registration_handle_compress(comp->length,
comp->block_idx,
comp->offset);
+ if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) {
+ error_report("rdma: 'compress' bad block index %u (vs %d)",
+ (unsigned int)comp->block_idx,
+ rdma->local_ram_blocks.nb_blocks);
+ ret = -EIO;
+ break;
+ }
block = &(rdma->local_ram_blocks.block[comp->block_idx]);
host_addr = block->local_host_addr +
@@ -2958,6 +3014,13 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque,
case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
trace_qemu_rdma_registration_handle_ram_blocks();
+ /* Sort our local RAM Block list so it's the same as the source,
+ * we can do this since we've filled in a src_index in the list
+ * as we received the RAMBlock list earlier.
+ */
+ qsort(rdma->local_ram_blocks.block,
+ rdma->local_ram_blocks.nb_blocks,
+ sizeof(RDMALocalBlock), dest_ram_sort_func);
if (rdma->pin_all) {
ret = qemu_rdma_reg_whole_ram_blocks(rdma);
if (ret) {
@@ -2985,6 +3048,12 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque,
rdma->dest_blocks[i].length = local->block[i].length;
dest_block_to_network(&rdma->dest_blocks[i]);
+ trace_qemu_rdma_registration_handle_ram_blocks_loop(
+ local->block[i].block_name,
+ local->block[i].offset,
+ local->block[i].length,
+ local->block[i].local_host_addr,
+ local->block[i].src_index);
}
blocks.len = rdma->local_ram_blocks.nb_blocks
@@ -3018,8 +3087,23 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque,
trace_qemu_rdma_registration_handle_register_loop(count,
reg->current_index, reg->key.current_addr, reg->chunks);
+ if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) {
+ error_report("rdma: 'register' bad block index %u (vs %d)",
+ (unsigned int)reg->current_index,
+ rdma->local_ram_blocks.nb_blocks);
+ ret = -ENOENT;
+ break;
+ }
block = &(rdma->local_ram_blocks.block[reg->current_index]);
if (block->is_ram_block) {
+ if (block->offset > reg->key.current_addr) {
+ error_report("rdma: bad register address for block %s"
+ " offset: %" PRIx64 " current_addr: %" PRIx64,
+ block->block_name, block->offset,
+ reg->key.current_addr);
+ ret = -ERANGE;
+ break;
+ }
host_addr = (block->local_host_addr +
(reg->key.current_addr - block->offset));
chunk = ram_chunk_index(block->local_host_addr,
@@ -3028,6 +3112,14 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque,
chunk = reg->key.chunk;
host_addr = block->local_host_addr +
(reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
+ /* Check for particularly bad chunk value */
+ if (host_addr < (void *)block->local_host_addr) {
+ error_report("rdma: bad chunk for block %s"
+ " chunk: %" PRIx64,
+ block->block_name, reg->key.chunk);
+ ret = -ERANGE;
+ break;
+ }
}
chunk_start = ram_chunk_start(block, chunk);
chunk_end = ram_chunk_end(block, chunk + reg->chunks);
@@ -3108,8 +3200,56 @@ out:
return ret;
}
+/* Destination:
+ * Called via a ram_control_load_hook during the initial RAM load section which
+ * lists the RAMBlocks by name. This lets us know the order of the RAMBlocks
+ * on the source.
+ * We've already built our local RAMBlock list, but not yet sent the list to
+ * the source.
+ */
+static int rdma_block_notification_handle(QEMUFileRDMA *rfile, const char *name)
+{
+ RDMAContext *rdma = rfile->rdma;
+ int curr;
+ int found = -1;
+
+ /* Find the matching RAMBlock in our local list */
+ for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) {
+ if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) {
+ found = curr;
+ break;
+ }
+ }
+
+ if (found == -1) {
+ error_report("RAMBlock '%s' not found on destination", name);
+ return -ENOENT;
+ }
+
+ rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index;
+ trace_rdma_block_notification_handle(name, rdma->next_src_index);
+ rdma->next_src_index++;
+
+ return 0;
+}
+
+static int rdma_load_hook(QEMUFile *f, void *opaque, uint64_t flags, void *data)
+{
+ switch (flags) {
+ case RAM_CONTROL_BLOCK_REG:
+ return rdma_block_notification_handle(opaque, data);
+
+ case RAM_CONTROL_HOOK:
+ return qemu_rdma_registration_handle(f, opaque);
+
+ default:
+ /* Shouldn't be called with any other values */
+ abort();
+ }
+}
+
static int qemu_rdma_registration_start(QEMUFile *f, void *opaque,
- uint64_t flags)
+ uint64_t flags, void *data)
{
QEMUFileRDMA *rfile = opaque;
RDMAContext *rdma = rfile->rdma;
@@ -3128,7 +3268,7 @@ static int qemu_rdma_registration_start(QEMUFile *f, void *opaque,
* First, flush writes, if any.
*/
static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
- uint64_t flags)
+ uint64_t flags, void *data)
{
Error *local_err = NULL, **errp = &local_err;
QEMUFileRDMA *rfile = opaque;
@@ -3148,7 +3288,7 @@ static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
if (flags == RAM_CONTROL_SETUP) {
RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
RDMALocalBlocks *local = &rdma->local_ram_blocks;
- int reg_result_idx, i, j, nb_dest_blocks;
+ int reg_result_idx, i, nb_dest_blocks;
head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
trace_qemu_rdma_registration_stop_ram();
@@ -3184,9 +3324,11 @@ static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
*/
if (local->nb_blocks != nb_dest_blocks) {
- ERROR(errp, "ram blocks mismatch #1! "
+ ERROR(errp, "ram blocks mismatch (Number of blocks %d vs %d) "
"Your QEMU command line parameters are probably "
- "not identical on both the source and destination.");
+ "not identical on both the source and destination.",
+ local->nb_blocks, nb_dest_blocks);
+ rdma->error_state = -EINVAL;
return -EINVAL;
}
@@ -3196,30 +3338,18 @@ static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
for (i = 0; i < nb_dest_blocks; i++) {
network_to_dest_block(&rdma->dest_blocks[i]);
- /* search local ram blocks */
- for (j = 0; j < local->nb_blocks; j++) {
- if (rdma->dest_blocks[i].offset != local->block[j].offset) {
- continue;
- }
-
- if (rdma->dest_blocks[i].length != local->block[j].length) {
- ERROR(errp, "ram blocks mismatch #2! "
- "Your QEMU command line parameters are probably "
- "not identical on both the source and destination.");
- return -EINVAL;
- }
- local->block[j].remote_host_addr =
- rdma->dest_blocks[i].remote_host_addr;
- local->block[j].remote_rkey = rdma->dest_blocks[i].remote_rkey;
- break;
- }
-
- if (j >= local->nb_blocks) {
- ERROR(errp, "ram blocks mismatch #3! "
- "Your QEMU command line parameters are probably "
- "not identical on both the source and destination.");
+ /* We require that the blocks are in the same order */
+ if (rdma->dest_blocks[i].length != local->block[i].length) {
+ ERROR(errp, "Block %s/%d has a different length %" PRIu64
+ "vs %" PRIu64, local->block[i].block_name, i,
+ local->block[i].length,
+ rdma->dest_blocks[i].length);
+ rdma->error_state = -EINVAL;
return -EINVAL;
}
+ local->block[i].remote_host_addr =
+ rdma->dest_blocks[i].remote_host_addr;
+ local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey;
}
}
@@ -3250,7 +3380,7 @@ static const QEMUFileOps rdma_read_ops = {
.get_buffer = qemu_rdma_get_buffer,
.get_fd = qemu_rdma_get_fd,
.close = qemu_rdma_close,
- .hook_ram_load = qemu_rdma_registration_handle,
+ .hook_ram_load = rdma_load_hook,
};
static const QEMUFileOps rdma_write_ops = {
@@ -3263,12 +3393,13 @@ static const QEMUFileOps rdma_write_ops = {
static void *qemu_fopen_rdma(RDMAContext *rdma, const char *mode)
{
- QEMUFileRDMA *r = g_malloc0(sizeof(QEMUFileRDMA));
+ QEMUFileRDMA *r;
if (qemu_file_mode_is_not_valid(mode)) {
return NULL;
}
+ r = g_malloc0(sizeof(QEMUFileRDMA));
r->rdma = rdma;
if (mode[0] == 'w') {
@@ -3287,7 +3418,7 @@ static void rdma_accept_incoming_migration(void *opaque)
QEMUFile *f;
Error *local_err = NULL, **errp = &local_err;
- trace_qemu_dma_accept_incoming_migration();
+ trace_qemu_rdma_accept_incoming_migration();
ret = qemu_rdma_accept(rdma);
if (ret) {
@@ -3295,7 +3426,7 @@ static void rdma_accept_incoming_migration(void *opaque)
return;
}
- trace_qemu_dma_accept_incoming_migration_accepted();
+ trace_qemu_rdma_accept_incoming_migration_accepted();
f = qemu_fopen_rdma(rdma, "rb");
if (f == NULL) {
diff --git a/migration/savevm.c b/migration/savevm.c
index 9e0e286797..86735fc53a 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -246,11 +246,55 @@ typedef struct SaveStateEntry {
typedef struct SaveState {
QTAILQ_HEAD(, SaveStateEntry) handlers;
int global_section_id;
+ bool skip_configuration;
+ uint32_t len;
+ const char *name;
} SaveState;
static SaveState savevm_state = {
.handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers),
.global_section_id = 0,
+ .skip_configuration = false,
+};
+
+void savevm_skip_configuration(void)
+{
+ savevm_state.skip_configuration = true;
+}
+
+
+static void configuration_pre_save(void *opaque)
+{
+ SaveState *state = opaque;
+ const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
+
+ state->len = strlen(current_name);
+ state->name = current_name;
+}
+
+static int configuration_post_load(void *opaque, int version_id)
+{
+ SaveState *state = opaque;
+ const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
+
+ if (strncmp(state->name, current_name, state->len) != 0) {
+ error_report("Machine type received is '%s' and local is '%s'",
+ state->name, current_name);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static const VMStateDescription vmstate_configuration = {
+ .name = "configuration",
+ .version_id = 1,
+ .post_load = configuration_post_load,
+ .pre_save = configuration_pre_save,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32(len, SaveState),
+ VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, 0, len),
+ VMSTATE_END_OF_LIST()
+ },
};
static void dump_vmstate_vmsd(FILE *out_file,
@@ -653,41 +697,6 @@ static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
}
}
-/*
- * Read a footer off the wire and check that it matches the expected section
- *
- * Returns: true if the footer was good
- * false if there is a problem (and calls error_report to say why)
- */
-static bool check_section_footer(QEMUFile *f, SaveStateEntry *se)
-{
- uint8_t read_mark;
- uint32_t read_section_id;
-
- if (skip_section_footers) {
- /* No footer to check */
- return true;
- }
-
- read_mark = qemu_get_byte(f);
-
- if (read_mark != QEMU_VM_SECTION_FOOTER) {
- error_report("Missing section footer for %s", se->idstr);
- return false;
- }
-
- read_section_id = qemu_get_be32(f);
- if (read_section_id != se->section_id) {
- error_report("Mismatched section id in footer for %s -"
- " read 0x%x expected 0x%x",
- se->idstr, read_section_id, se->section_id);
- return false;
- }
-
- /* All good */
- return true;
-}
-
bool qemu_savevm_state_blocked(Error **errp)
{
SaveStateEntry *se;
@@ -723,6 +732,11 @@ void qemu_savevm_state_begin(QEMUFile *f,
se->ops->set_params(params, se->opaque);
}
+ if (!savevm_state.skip_configuration) {
+ qemu_put_byte(f, QEMU_VM_CONFIGURATION);
+ vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
+ }
+
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
if (!se->ops || !se->ops->save_live_setup) {
continue;
@@ -836,6 +850,11 @@ void qemu_savevm_state_complete(QEMUFile *f)
if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
continue;
}
+ if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
+ trace_savevm_section_skip(se->idstr, se->section_id);
+ continue;
+ }
+
trace_savevm_section_start(se->idstr, se->section_id);
json_start_object(vmdesc, NULL);
@@ -949,6 +968,9 @@ static int qemu_save_device_state(QEMUFile *f)
if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
continue;
}
+ if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
+ continue;
+ }
save_section_header(f, se, QEMU_VM_SECTION_FULL);
@@ -989,6 +1011,41 @@ struct LoadStateEntry {
int version_id;
};
+/*
+ * Read a footer off the wire and check that it matches the expected section
+ *
+ * Returns: true if the footer was good
+ * false if there is a problem (and calls error_report to say why)
+ */
+static bool check_section_footer(QEMUFile *f, LoadStateEntry *le)
+{
+ uint8_t read_mark;
+ uint32_t read_section_id;
+
+ if (skip_section_footers) {
+ /* No footer to check */
+ return true;
+ }
+
+ read_mark = qemu_get_byte(f);
+
+ if (read_mark != QEMU_VM_SECTION_FOOTER) {
+ error_report("Missing section footer for %s", le->se->idstr);
+ return false;
+ }
+
+ read_section_id = qemu_get_be32(f);
+ if (read_section_id != le->section_id) {
+ error_report("Mismatched section id in footer for %s -"
+ " read 0x%x expected 0x%x",
+ le->se->idstr, read_section_id, le->section_id);
+ return false;
+ }
+
+ /* All good */
+ return true;
+}
+
void loadvm_free_handlers(MigrationIncomingState *mis)
{
LoadStateEntry *le, *new_le;
@@ -1029,6 +1086,18 @@ int qemu_loadvm_state(QEMUFile *f)
return -ENOTSUP;
}
+ if (!savevm_state.skip_configuration) {
+ if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
+ error_report("Configuration section missing");
+ return -EINVAL;
+ }
+ ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
+
+ if (ret) {
+ return ret;
+ }
+ }
+
while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) {
uint32_t instance_id, version_id, section_id;
SaveStateEntry *se;
@@ -1082,7 +1151,7 @@ int qemu_loadvm_state(QEMUFile *f)
" device '%s'", instance_id, idstr);
goto out;
}
- if (!check_section_footer(f, le->se)) {
+ if (!check_section_footer(f, le)) {
ret = -EINVAL;
goto out;
}
@@ -1109,7 +1178,7 @@ int qemu_loadvm_state(QEMUFile *f)
section_id, le->se->idstr);
goto out;
}
- if (!check_section_footer(f, le->se)) {
+ if (!check_section_footer(f, le)) {
ret = -EINVAL;
goto out;
}
@@ -1127,16 +1196,35 @@ int qemu_loadvm_state(QEMUFile *f)
* Try to read in the VMDESC section as well, so that dumping tools that
* intercept our migration stream have the chance to see it.
*/
- if (qemu_get_byte(f) == QEMU_VM_VMDESCRIPTION) {
- uint32_t size = qemu_get_be32(f);
- uint8_t *buf = g_malloc(0x1000);
-
- while (size > 0) {
- uint32_t read_chunk = MIN(size, 0x1000);
- qemu_get_buffer(f, buf, read_chunk);
- size -= read_chunk;
+
+ /* We've got to be careful; if we don't read the data and just shut the fd
+ * then the sender can error if we close while it's still sending.
+ * We also mustn't read data that isn't there; some transports (RDMA)
+ * will stall waiting for that data when the source has already closed.
+ */
+ if (should_send_vmdesc()) {
+ uint8_t *buf;
+ uint32_t size;
+ section_type = qemu_get_byte(f);
+
+ if (section_type != QEMU_VM_VMDESCRIPTION) {
+ error_report("Expected vmdescription section, but got %d",
+ section_type);
+ /*
+ * It doesn't seem worth failing at this point since
+ * we apparently have an otherwise valid VM state
+ */
+ } else {
+ buf = g_malloc(0x1000);
+ size = qemu_get_be32(f);
+
+ while (size > 0) {
+ uint32_t read_chunk = MIN(size, 0x1000);
+ qemu_get_buffer(f, buf, read_chunk);
+ size -= read_chunk;
+ }
+ g_free(buf);
}
- g_free(buf);
}
cpu_synchronize_all_post_init();
diff --git a/migration/vmstate.c b/migration/vmstate.c
index 6138d1acb7..e8ccf22f67 100644
--- a/migration/vmstate.c
+++ b/migration/vmstate.c
@@ -276,6 +276,17 @@ static void vmsd_desc_field_end(const VMStateDescription *vmsd, QJSON *vmdesc,
json_end_object(vmdesc);
}
+
+bool vmstate_save_needed(const VMStateDescription *vmsd, void *opaque)
+{
+ if (vmsd->needed && !vmsd->needed(opaque)) {
+ /* optional section not needed */
+ return false;
+ }
+ return true;
+}
+
+
void vmstate_save_state(QEMUFile *f, const VMStateDescription *vmsd,
void *opaque, QJSON *vmdesc)
{
diff --git a/qapi-schema.json b/qapi-schema.json
index 106008cdeb..1285b8c74a 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -523,6 +523,9 @@
# minimize migration traffic. The feature is disabled by default.
# (since 2.4 )
#
+# @events: generate events for each migration state change
+# (since 2.4 )
+#
# @auto-converge: If enabled, QEMU will automatically throttle down the guest
# to speed up convergence of RAM migration. (since 1.6)
#
@@ -530,7 +533,7 @@
##
{ 'enum': 'MigrationCapability',
'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks',
- 'compress'] }
+ 'compress', 'events'] }
##
# @MigrationCapabilityStatus
diff --git a/qapi/event.json b/qapi/event.json
index 378dda572a..f0cef010f0 100644
--- a/qapi/event.json
+++ b/qapi/event.json
@@ -243,6 +243,18 @@
{ 'event': 'SPICE_MIGRATE_COMPLETED' }
##
+# @MIGRATION
+#
+# Emitted when a migration event happens
+#
+# @status: @MigrationStatus describing the current migration status.
+#
+# Since: 2.4
+##
+{ 'event': 'MIGRATION',
+ 'data': {'status': 'MigrationStatus'}}
+
+##
# @ACPI_DEVICE_OST
#
# Emitted when guest executes ACPI _OST method.
diff --git a/trace-events b/trace-events
index d24d80aed2..2d395c59a9 100644
--- a/trace-events
+++ b/trace-events
@@ -1191,6 +1191,7 @@ qemu_loadvm_state_section_partend(uint32_t section_id) "%u"
qemu_loadvm_state_section_startfull(uint32_t section_id, const char *idstr, uint32_t instance_id, uint32_t version_id) "%u(%s) %u %u"
savevm_section_start(const char *id, unsigned int section_id) "%s, section_id %u"
savevm_section_end(const char *id, unsigned int section_id, int ret) "%s, section_id %u -> %d"
+savevm_section_skip(const char *id, unsigned int section_id) "%s, section_id %u"
savevm_state_begin(void) ""
savevm_state_header(void) ""
savevm_state_iterate(void) ""
@@ -1403,10 +1404,13 @@ migrate_fd_error(void) ""
migrate_fd_cancel(void) ""
migrate_pending(uint64_t size, uint64_t max) "pending size %" PRIu64 " max %" PRIu64
migrate_transferred(uint64_t tranferred, uint64_t time_spent, double bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %g max_size %" PRId64
+migrate_state_too_big(void) ""
+migrate_global_state_post_load(const char *state) "loaded state: %s"
+migrate_global_state_pre_save(const char *state) "saved state: %s"
# migration/rdma.c
-qemu_dma_accept_incoming_migration(void) ""
-qemu_dma_accept_incoming_migration_accepted(void) ""
+qemu_rdma_accept_incoming_migration(void) ""
+qemu_rdma_accept_incoming_migration_accepted(void) ""
qemu_rdma_accept_pin_state(bool pin) "%d"
qemu_rdma_accept_pin_verbsc(void *verbs) "Verbs context after listen: %p"
qemu_rdma_block_for_wrid_miss(const char *wcompstr, int wcomp, const char *gcompstr, uint64_t req) "A Wanted wrid %s (%d) but got %s (%" PRIu64 ")"
@@ -1433,13 +1437,14 @@ qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" PRIu6
qemu_rdma_registration_handle_compress(int64_t length, int index, int64_t offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64
qemu_rdma_registration_handle_finished(void) ""
qemu_rdma_registration_handle_ram_blocks(void) ""
+qemu_rdma_registration_handle_ram_blocks_loop(const char *name, uint64_t offset, uint64_t length, void *local_host_addr, unsigned int src_index) "%s: @%" PRIx64 "/%" PRIu64 " host:@%p src_index: %u"
qemu_rdma_registration_handle_register(int requests) "%d requests"
qemu_rdma_registration_handle_register_loop(int req, int index, uint64_t addr, uint64_t chunks) "Registration request (%d): index %d, current_addr %" PRIu64 " chunks: %" PRIu64
qemu_rdma_registration_handle_register_rkey(int rkey) "%x"
qemu_rdma_registration_handle_unregister(int requests) "%d requests"
qemu_rdma_registration_handle_unregister_loop(int count, int index, uint64_t chunk) "Unregistration request (%d): index %d, chunk %" PRIu64
qemu_rdma_registration_handle_unregister_success(uint64_t chunk) "%" PRIu64
-qemu_rdma_registration_handle_wait(uint64_t flags) "Waiting for next request %" PRIu64
+qemu_rdma_registration_handle_wait(void) ""
qemu_rdma_registration_start(uint64_t flags) "%" PRIu64
qemu_rdma_registration_stop(uint64_t flags) "%" PRIu64
qemu_rdma_registration_stop_ram(void) ""
@@ -1458,8 +1463,9 @@ qemu_rdma_write_one_recvregres(int mykey, int theirkey, uint64_t chunk) "Receive
qemu_rdma_write_one_sendreg(uint64_t chunk, int len, int index, int64_t offset) "Sending registration request chunk %" PRIu64 " for %d bytes, index: %d, offset: %" PRId64
qemu_rdma_write_one_top(uint64_t chunks, uint64_t size) "Writing %" PRIu64 " chunks, (%" PRIu64 " MB)"
qemu_rdma_write_one_zero(uint64_t chunk, int len, int index, int64_t offset) "Entire chunk is zero, sending compress: %" PRIu64 " for %d bytes, index: %d, offset: %" PRId64
-rdma_add_block(int block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Added Block: %d, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d"
-rdma_delete_block(int block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Deleted Block: %d, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d"
+rdma_add_block(const char *block_name, int block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Added Block: '%s':%d, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d"
+rdma_block_notification_handle(const char *name, int index) "%s at %d"
+rdma_delete_block(void *block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Deleted Block: %p, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d"
rdma_start_incoming_migration(void) ""
rdma_start_incoming_migration_after_dest_init(void) ""
rdma_start_incoming_migration_after_rdma_listen(void) ""
diff --git a/vl.c b/vl.c
index 69ad90c87f..cfa6133308 100644
--- a/vl.c
+++ b/vl.c
@@ -573,8 +573,14 @@ static const RunStateTransition runstate_transitions_def[] = {
{ RUN_STATE_DEBUG, RUN_STATE_RUNNING },
{ RUN_STATE_DEBUG, RUN_STATE_FINISH_MIGRATE },
- { RUN_STATE_INMIGRATE, RUN_STATE_RUNNING },
+ { RUN_STATE_INMIGRATE, RUN_STATE_INTERNAL_ERROR },
+ { RUN_STATE_INMIGRATE, RUN_STATE_IO_ERROR },
{ RUN_STATE_INMIGRATE, RUN_STATE_PAUSED },
+ { RUN_STATE_INMIGRATE, RUN_STATE_RUNNING },
+ { RUN_STATE_INMIGRATE, RUN_STATE_SHUTDOWN },
+ { RUN_STATE_INMIGRATE, RUN_STATE_SUSPENDED },
+ { RUN_STATE_INMIGRATE, RUN_STATE_WATCHDOG },
+ { RUN_STATE_INMIGRATE, RUN_STATE_GUEST_PANICKED },
{ RUN_STATE_INTERNAL_ERROR, RUN_STATE_PAUSED },
{ RUN_STATE_INTERNAL_ERROR, RUN_STATE_FINISH_MIGRATE },
@@ -634,6 +640,18 @@ bool runstate_check(RunState state)
return current_run_state == state;
}
+bool runstate_store(char *str, size_t size)
+{
+ const char *state = RunState_lookup[current_run_state];
+ size_t len = strlen(state) + 1;
+
+ if (len > size) {
+ return false;
+ }
+ memcpy(str, state, len);
+ return true;
+}
+
static void runstate_init(void)
{
const RunStateTransition *p;
@@ -4606,6 +4624,7 @@ int main(int argc, char **argv, char **envp)
return 0;
}
+ register_global_state();
if (incoming) {
Error *local_err = NULL;
qemu_start_incoming_migration(incoming, &local_err);
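
The subtlest part of the series is the resizable dirty bitmap in migration/ram.c: migration_bitmap becomes an RCU-published pointer, the writer that grows it (memory hotplug via ram_block_add()) takes migration_bitmap_mutex so migration_bitmap_sync_range() cannot set bits in the soon-to-be-freed copy, and readers load the pointer once under rcu_read_lock(). The pattern is condensed into the sketch below; the *_sketch function names are illustrative, while the helpers they call are QEMU's own as used in the hunks above, so this is a restatement of the in-tree pattern rather than a standalone program.

    /* Writer side (QEMU main thread), condensed from migration_bitmap_extend():
     * grow the bitmap to cover a newly added RAMBlock without stalling readers. */
    static void migration_bitmap_extend_sketch(ram_addr_t old, ram_addr_t new)
    {
        unsigned long *old_bitmap = migration_bitmap;
        unsigned long *bitmap = bitmap_new(new);       /* larger replacement        */

        qemu_mutex_lock(&migration_bitmap_mutex);      /* block bitmap sync writers */
        bitmap_copy(bitmap, old_bitmap, old);          /* keep existing dirty bits  */
        bitmap_set(bitmap, old, new - old);            /* new pages start out dirty */
        atomic_rcu_set(&migration_bitmap, bitmap);     /* publish the new pointer   */
        qemu_mutex_unlock(&migration_bitmap_mutex);

        migration_dirty_pages += new - old;
        synchronize_rcu();                             /* wait out readers still on
                                                        * the old bitmap            */
        g_free(old_bitmap);
    }

    /* Reader side, condensed from migration_bitmap_find_and_reset_dirty():
     * callers hold rcu_read_lock(), so the snapshot stays valid even if the
     * writer swaps the pointer while we scan it. */
    static bool migration_bitmap_test_and_clear_sketch(unsigned long nr,
                                                       unsigned long size)
    {
        unsigned long *bitmap = atomic_rcu_read(&migration_bitmap);
        unsigned long next = find_next_bit(bitmap, size, nr);

        if (next < size) {
            clear_bit(next, bitmap);
            migration_dirty_pages--;
            return true;
        }
        return false;
    }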