diff options
Diffstat (limited to 'migration')
-rw-r--r-- | migration/migration.c | 176 | ||||
-rw-r--r-- | migration/qemu-file.c | 16 | ||||
-rw-r--r-- | migration/ram.c | 55 | ||||
-rw-r--r-- | migration/rdma.c | 289 | ||||
-rw-r--r-- | migration/savevm.c | 180 | ||||
-rw-r--r-- | migration/vmstate.c | 11 |
6 files changed, 574 insertions, 153 deletions
diff --git a/migration/migration.c b/migration/migration.c index c6ac08a0cb..45719a0f5f 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -26,6 +26,8 @@ #include "qemu/thread.h" #include "qmp-commands.h" #include "trace.h" +#include "qapi/util.h" +#include "qapi-event.h" #define MAX_THROTTLE (32 << 20) /* Migration speed throttling */ @@ -97,6 +99,120 @@ void migration_incoming_state_destroy(void) mis_current = NULL; } + +typedef struct { + bool optional; + uint32_t size; + uint8_t runstate[100]; +} GlobalState; + +static GlobalState global_state; + +static int global_state_store(void) +{ + if (!runstate_store((char *)global_state.runstate, + sizeof(global_state.runstate))) { + error_report("runstate name too big: %s", global_state.runstate); + trace_migrate_state_too_big(); + return -EINVAL; + } + return 0; +} + +static char *global_state_get_runstate(void) +{ + return (char *)global_state.runstate; +} + +void global_state_set_optional(void) +{ + global_state.optional = true; +} + +static bool global_state_needed(void *opaque) +{ + GlobalState *s = opaque; + char *runstate = (char *)s->runstate; + + /* If it is not optional, it is mandatory */ + + if (s->optional == false) { + return true; + } + + /* If state is running or paused, it is not needed */ + + if (strcmp(runstate, "running") == 0 || + strcmp(runstate, "paused") == 0) { + return false; + } + + /* for any other state it is needed */ + return true; +} + +static int global_state_post_load(void *opaque, int version_id) +{ + GlobalState *s = opaque; + int ret = 0; + char *runstate = (char *)s->runstate; + + trace_migrate_global_state_post_load(runstate); + + if (strcmp(runstate, "running") != 0) { + Error *local_err = NULL; + int r = qapi_enum_parse(RunState_lookup, runstate, RUN_STATE_MAX, + -1, &local_err); + + if (r == -1) { + if (local_err) { + error_report_err(local_err); + } + return -EINVAL; + } + ret = vm_stop_force_state(r); + } + + return ret; +} + +static void global_state_pre_save(void *opaque) +{ + GlobalState *s = opaque; + + trace_migrate_global_state_pre_save((char *)s->runstate); + s->size = strlen((char *)s->runstate) + 1; +} + +static const VMStateDescription vmstate_globalstate = { + .name = "globalstate", + .version_id = 1, + .minimum_version_id = 1, + .post_load = global_state_post_load, + .pre_save = global_state_pre_save, + .needed = global_state_needed, + .fields = (VMStateField[]) { + VMSTATE_UINT32(size, GlobalState), + VMSTATE_BUFFER(runstate, GlobalState), + VMSTATE_END_OF_LIST() + }, +}; + +void register_global_state(void) +{ + /* We would use it independently that we receive it */ + strcpy((char *)&global_state.runstate, ""); + vmstate_register(NULL, 0, &vmstate_globalstate, &global_state); +} + +static void migrate_generate_event(int new_state) +{ + if (migrate_use_events()) { + qapi_event_send_migration(new_state, &error_abort); + trace_migrate_set_state(new_state); + } +} + /* * Called on -incoming with a defer: uri. * The migration can be started later after any parameters have been @@ -114,6 +230,7 @@ void qemu_start_incoming_migration(const char *uri, Error **errp) { const char *p; + qapi_event_send_migration(MIGRATION_STATUS_SETUP, &error_abort); if (!strcmp(uri, "defer")) { deferred_incoming_migration(errp); } else if (strstart(uri, "tcp:", &p)) { @@ -142,7 +259,7 @@ static void process_incoming_migration_co(void *opaque) int ret; migration_incoming_state_new(f); - + migrate_generate_event(MIGRATION_STATUS_ACTIVE); ret = qemu_loadvm_state(f); qemu_fclose(f); @@ -150,10 +267,12 @@ static void process_incoming_migration_co(void *opaque) migration_incoming_state_destroy(); if (ret < 0) { + migrate_generate_event(MIGRATION_STATUS_FAILED); error_report("load of migration failed: %s", strerror(-ret)); migrate_decompress_threads_join(); exit(EXIT_FAILURE); } + migrate_generate_event(MIGRATION_STATUS_COMPLETED); qemu_announce_self(); /* Make sure all file formats flush their mutable metadata */ @@ -164,10 +283,20 @@ static void process_incoming_migration_co(void *opaque) exit(EXIT_FAILURE); } - if (autostart) { + /* runstate == "" means that we haven't received it through the + * wire, so we obey autostart. runstate == runing means that we + * need to run it, we need to make sure that we do it after + * everything else has finished. Every other state change is done + * at the post_load function */ + + if (strcmp(global_state_get_runstate(), "running") == 0) { vm_start(); - } else { - runstate_set(RUN_STATE_PAUSED); + } else if (strcmp(global_state_get_runstate(), "") == 0) { + if (autostart) { + vm_start(); + } else { + runstate_set(RUN_STATE_PAUSED); + } } migrate_decompress_threads_join(); } @@ -392,8 +521,8 @@ void qmp_migrate_set_parameters(bool has_compress_level, static void migrate_set_state(MigrationState *s, int old_state, int new_state) { - if (atomic_cmpxchg(&s->state, old_state, new_state) == new_state) { - trace_migrate_set_state(new_state); + if (atomic_cmpxchg(&s->state, old_state, new_state) == old_state) { + migrate_generate_event(new_state); } } @@ -432,8 +561,7 @@ void migrate_fd_error(MigrationState *s) { trace_migrate_fd_error(); assert(s->file == NULL); - s->state = MIGRATION_STATUS_FAILED; - trace_migrate_set_state(MIGRATION_STATUS_FAILED); + migrate_set_state(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_FAILED); notifier_list_notify(&migration_state_notifiers, s); } @@ -517,8 +645,7 @@ static MigrationState *migrate_init(const MigrationParams *params) s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS] = decompress_thread_count; s->bandwidth_limit = bandwidth_limit; - s->state = MIGRATION_STATUS_SETUP; - trace_migrate_set_state(MIGRATION_STATUS_SETUP); + migrate_set_state(s, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP); s->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); return s; @@ -577,7 +704,6 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, error_setg(errp, QERR_MIGRATION_ACTIVE); return; } - if (runstate_check(RUN_STATE_INMIGRATE)) { error_setg(errp, "Guest is waiting for an incoming migration"); return; @@ -592,6 +718,12 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, return; } + /* We are starting a new migration, so we want to start in a clean + state. This change is only needed if previous migration + failed/was cancelled. We don't use migrate_set_state() because + we are setting the initial state, not changing it. */ + s->state = MIGRATION_STATUS_NONE; + s = migrate_init(¶ms); if (strstart(uri, "tcp:", &p)) { @@ -611,7 +743,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, } else { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri", "a valid migration protocol"); - s->state = MIGRATION_STATUS_FAILED; + migrate_set_state(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_FAILED); return; } @@ -740,6 +872,15 @@ int migrate_decompress_threads(void) return s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS]; } +bool migrate_use_events(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->enabled_capabilities[MIGRATION_CAPABILITY_EVENTS]; +} + int migrate_use_xbzrle(void) { MigrationState *s; @@ -793,10 +934,13 @@ static void *migration_thread(void *opaque) qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER); old_vm_running = runstate_is_running(); - ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); - if (ret >= 0) { - qemu_file_set_rate_limit(s->file, INT64_MAX); - qemu_savevm_state_complete(s->file); + ret = global_state_store(); + if (!ret) { + ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); + if (ret >= 0) { + qemu_file_set_rate_limit(s->file, INT64_MAX); + qemu_savevm_state_complete(s->file); + } } qemu_mutex_unlock_iothread(); diff --git a/migration/qemu-file.c b/migration/qemu-file.c index 557c1c1a62..6bb3dc15cd 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -129,7 +129,7 @@ void ram_control_before_iterate(QEMUFile *f, uint64_t flags) int ret = 0; if (f->ops->before_ram_iterate) { - ret = f->ops->before_ram_iterate(f, f->opaque, flags); + ret = f->ops->before_ram_iterate(f, f->opaque, flags, NULL); if (ret < 0) { qemu_file_set_error(f, ret); } @@ -141,24 +141,30 @@ void ram_control_after_iterate(QEMUFile *f, uint64_t flags) int ret = 0; if (f->ops->after_ram_iterate) { - ret = f->ops->after_ram_iterate(f, f->opaque, flags); + ret = f->ops->after_ram_iterate(f, f->opaque, flags, NULL); if (ret < 0) { qemu_file_set_error(f, ret); } } } -void ram_control_load_hook(QEMUFile *f, uint64_t flags) +void ram_control_load_hook(QEMUFile *f, uint64_t flags, void *data) { int ret = -EINVAL; if (f->ops->hook_ram_load) { - ret = f->ops->hook_ram_load(f, f->opaque, flags); + ret = f->ops->hook_ram_load(f, f->opaque, flags, data); if (ret < 0) { qemu_file_set_error(f, ret); } } else { - qemu_file_set_error(f, ret); + /* + * Hook is a hook specifically requested by the source sending a flag + * that expects there to be a hook on the destination. + */ + if (flags == RAM_CONTROL_HOOK) { + qemu_file_set_error(f, ret); + } } } diff --git a/migration/ram.c b/migration/ram.c index 57368e1575..c696814196 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -222,6 +222,7 @@ static RAMBlock *last_seen_block; static RAMBlock *last_sent_block; static ram_addr_t last_offset; static unsigned long *migration_bitmap; +static QemuMutex migration_bitmap_mutex; static uint64_t migration_dirty_pages; static uint32_t last_version; static bool ram_bulk_stage; @@ -494,6 +495,7 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data, return 1; } +/* Called with rcu_read_lock() to protect migration_bitmap */ static inline ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr, ram_addr_t start) @@ -502,26 +504,31 @@ ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr, unsigned long nr = base + (start >> TARGET_PAGE_BITS); uint64_t mr_size = TARGET_PAGE_ALIGN(memory_region_size(mr)); unsigned long size = base + (mr_size >> TARGET_PAGE_BITS); + unsigned long *bitmap; unsigned long next; + bitmap = atomic_rcu_read(&migration_bitmap); if (ram_bulk_stage && nr > base) { next = nr + 1; } else { - next = find_next_bit(migration_bitmap, size, nr); + next = find_next_bit(bitmap, size, nr); } if (next < size) { - clear_bit(next, migration_bitmap); + clear_bit(next, bitmap); migration_dirty_pages--; } return (next - base) << TARGET_PAGE_BITS; } +/* Called with rcu_read_lock() to protect migration_bitmap */ static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length) { + unsigned long *bitmap; + bitmap = atomic_rcu_read(&migration_bitmap); migration_dirty_pages += - cpu_physical_memory_sync_dirty_bitmap(migration_bitmap, start, length); + cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length); } @@ -563,11 +570,13 @@ static void migration_bitmap_sync(void) trace_migration_bitmap_sync_start(); address_space_sync_dirty_bitmap(&address_space_memory); + qemu_mutex_lock(&migration_bitmap_mutex); rcu_read_lock(); QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { migration_bitmap_sync_range(block->mr->ram_addr, block->used_length); } rcu_read_unlock(); + qemu_mutex_unlock(&migration_bitmap_mutex); trace_migration_bitmap_sync_end(migration_dirty_pages - num_dirty_pages_init); @@ -1017,10 +1026,15 @@ void free_xbzrle_decoded_buf(void) static void migration_end(void) { - if (migration_bitmap) { + /* caller have hold iothread lock or is in a bh, so there is + * no writing race against this migration_bitmap + */ + unsigned long *bitmap = migration_bitmap; + atomic_rcu_set(&migration_bitmap, NULL); + if (bitmap) { memory_global_dirty_log_stop(); - g_free(migration_bitmap); - migration_bitmap = NULL; + synchronize_rcu(); + g_free(bitmap); } XBZRLE_cache_lock(); @@ -1051,6 +1065,30 @@ static void reset_ram_globals(void) #define MAX_WAIT 50 /* ms, half buffered_file limit */ +void migration_bitmap_extend(ram_addr_t old, ram_addr_t new) +{ + /* called in qemu main thread, so there is + * no writing race against this migration_bitmap + */ + if (migration_bitmap) { + unsigned long *old_bitmap = migration_bitmap, *bitmap; + bitmap = bitmap_new(new); + + /* prevent migration_bitmap content from being set bit + * by migration_bitmap_sync_range() at the same time. + * it is safe to migration if migration_bitmap is cleared bit + * at the same time. + */ + qemu_mutex_lock(&migration_bitmap_mutex); + bitmap_copy(bitmap, old_bitmap, old); + bitmap_set(bitmap, old, new - old); + atomic_rcu_set(&migration_bitmap, bitmap); + qemu_mutex_unlock(&migration_bitmap_mutex); + migration_dirty_pages += new - old; + synchronize_rcu(); + g_free(old_bitmap); + } +} /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has * long-running RCU critical section. When rcu-reclaims in the code @@ -1067,6 +1105,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque) dirty_rate_high_cnt = 0; bitmap_sync_count = 0; migration_bitmap_sync_init(); + qemu_mutex_init(&migration_bitmap_mutex); if (migrate_use_xbzrle()) { XBZRLE_cache_lock(); @@ -1477,6 +1516,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) error_report_err(local_err); } } + ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, + block->idstr); break; } } @@ -1545,7 +1586,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) break; default: if (flags & RAM_SAVE_FLAG_HOOK) { - ram_control_load_hook(f, flags); + ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); } else { error_report("Unknown combination of migration flags: %#x", flags); diff --git a/migration/rdma.c b/migration/rdma.c index b777273b59..f106b2a818 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -215,17 +215,19 @@ static void network_to_caps(RDMACapabilities *cap) * the information. It's small anyway, so a list is overkill. */ typedef struct RDMALocalBlock { - uint8_t *local_host_addr; /* local virtual address */ - uint64_t remote_host_addr; /* remote virtual address */ - uint64_t offset; - uint64_t length; - struct ibv_mr **pmr; /* MRs for chunk-level registration */ - struct ibv_mr *mr; /* MR for non-chunk-level registration */ - uint32_t *remote_keys; /* rkeys for chunk-level registration */ - uint32_t remote_rkey; /* rkeys for non-chunk-level registration */ - int index; /* which block are we */ - bool is_ram_block; - int nb_chunks; + char *block_name; + uint8_t *local_host_addr; /* local virtual address */ + uint64_t remote_host_addr; /* remote virtual address */ + uint64_t offset; + uint64_t length; + struct ibv_mr **pmr; /* MRs for chunk-level registration */ + struct ibv_mr *mr; /* MR for non-chunk-level registration */ + uint32_t *remote_keys; /* rkeys for chunk-level registration */ + uint32_t remote_rkey; /* rkeys for non-chunk-level registration */ + int index; /* which block are we */ + unsigned int src_index; /* (Only used on dest) */ + bool is_ram_block; + int nb_chunks; unsigned long *transit_bitmap; unsigned long *unregister_bitmap; } RDMALocalBlock; @@ -353,6 +355,9 @@ typedef struct RDMAContext { RDMALocalBlocks local_ram_blocks; RDMADestBlock *dest_blocks; + /* Index of the next RAMBlock received during block registration */ + unsigned int next_src_index; + /* * Migration on *destination* started. * Then use coroutine yield function. @@ -411,7 +416,7 @@ static void network_to_control(RDMAControlHeader *control) */ typedef struct QEMU_PACKED { union QEMU_PACKED { - uint64_t current_addr; /* offset into the ramblock of the chunk */ + uint64_t current_addr; /* offset into the ram_addr_t space */ uint64_t chunk; /* chunk to lookup if unregistering */ } key; uint32_t current_index; /* which ramblock the chunk belongs to */ @@ -419,8 +424,19 @@ typedef struct QEMU_PACKED { uint64_t chunks; /* how many sequential chunks to register */ } RDMARegister; -static void register_to_network(RDMARegister *reg) +static void register_to_network(RDMAContext *rdma, RDMARegister *reg) { + RDMALocalBlock *local_block; + local_block = &rdma->local_ram_blocks.block[reg->current_index]; + + if (local_block->is_ram_block) { + /* + * current_addr as passed in is an address in the local ram_addr_t + * space, we need to translate this for the destination + */ + reg->key.current_addr -= local_block->offset; + reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset; + } reg->key.current_addr = htonll(reg->key.current_addr); reg->current_index = htonl(reg->current_index); reg->chunks = htonll(reg->chunks); @@ -436,13 +452,19 @@ static void network_to_register(RDMARegister *reg) typedef struct QEMU_PACKED { uint32_t value; /* if zero, we will madvise() */ uint32_t block_idx; /* which ram block index */ - uint64_t offset; /* where in the remote ramblock this chunk */ + uint64_t offset; /* Address in remote ram_addr_t space */ uint64_t length; /* length of the chunk */ } RDMACompress; -static void compress_to_network(RDMACompress *comp) +static void compress_to_network(RDMAContext *rdma, RDMACompress *comp) { comp->value = htonl(comp->value); + /* + * comp->offset as passed in is an address in the local ram_addr_t + * space, we need to translate this for the destination + */ + comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset; + comp->offset += rdma->dest_blocks[comp->block_idx].offset; comp->block_idx = htonl(comp->block_idx); comp->offset = htonll(comp->offset); comp->length = htonll(comp->length); @@ -511,27 +533,27 @@ static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block, return result; } -static int rdma_add_block(RDMAContext *rdma, void *host_addr, +static int rdma_add_block(RDMAContext *rdma, const char *block_name, + void *host_addr, ram_addr_t block_offset, uint64_t length) { RDMALocalBlocks *local = &rdma->local_ram_blocks; - RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap, - (void *)(uintptr_t)block_offset); + RDMALocalBlock *block; RDMALocalBlock *old = local->block; - assert(block == NULL); - local->block = g_malloc0(sizeof(RDMALocalBlock) * (local->nb_blocks + 1)); if (local->nb_blocks) { int x; - for (x = 0; x < local->nb_blocks; x++) { - g_hash_table_remove(rdma->blockmap, - (void *)(uintptr_t)old[x].offset); - g_hash_table_insert(rdma->blockmap, - (void *)(uintptr_t)old[x].offset, - &local->block[x]); + if (rdma->blockmap) { + for (x = 0; x < local->nb_blocks; x++) { + g_hash_table_remove(rdma->blockmap, + (void *)(uintptr_t)old[x].offset); + g_hash_table_insert(rdma->blockmap, + (void *)(uintptr_t)old[x].offset, + &local->block[x]); + } } memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks); g_free(old); @@ -539,10 +561,12 @@ static int rdma_add_block(RDMAContext *rdma, void *host_addr, block = &local->block[local->nb_blocks]; + block->block_name = g_strdup(block_name); block->local_host_addr = host_addr; block->offset = block_offset; block->length = length; block->index = local->nb_blocks; + block->src_index = ~0U; /* Filled in by the receipt of the block list */ block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL; block->transit_bitmap = bitmap_new(block->nb_chunks); bitmap_clear(block->transit_bitmap, 0, block->nb_chunks); @@ -552,9 +576,12 @@ static int rdma_add_block(RDMAContext *rdma, void *host_addr, block->is_ram_block = local->init ? false : true; - g_hash_table_insert(rdma->blockmap, (void *) block_offset, block); + if (rdma->blockmap) { + g_hash_table_insert(rdma->blockmap, (void *) block_offset, block); + } - trace_rdma_add_block(local->nb_blocks, (uintptr_t) block->local_host_addr, + trace_rdma_add_block(block_name, local->nb_blocks, + (uintptr_t) block->local_host_addr, block->offset, block->length, (uintptr_t) (block->local_host_addr + block->length), BITS_TO_LONGS(block->nb_chunks) * @@ -574,7 +601,7 @@ static int rdma_add_block(RDMAContext *rdma, void *host_addr, static int qemu_rdma_init_one_block(const char *block_name, void *host_addr, ram_addr_t block_offset, ram_addr_t length, void *opaque) { - return rdma_add_block(opaque, host_addr, block_offset, length); + return rdma_add_block(opaque, block_name, host_addr, block_offset, length); } /* @@ -587,7 +614,6 @@ static int qemu_rdma_init_ram_blocks(RDMAContext *rdma) RDMALocalBlocks *local = &rdma->local_ram_blocks; assert(rdma->blockmap == NULL); - rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal); memset(local, 0, sizeof *local); qemu_ram_foreach_block(qemu_rdma_init_one_block, rdma); trace_qemu_rdma_init_ram_blocks(local->nb_blocks); @@ -597,16 +623,19 @@ static int qemu_rdma_init_ram_blocks(RDMAContext *rdma) return 0; } -static int rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset) +/* + * Note: If used outside of cleanup, the caller must ensure that the destination + * block structures are also updated + */ +static int rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block) { RDMALocalBlocks *local = &rdma->local_ram_blocks; - RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap, - (void *) block_offset); RDMALocalBlock *old = local->block; int x; - assert(block); - + if (rdma->blockmap) { + g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset); + } if (block->pmr) { int j; @@ -636,8 +665,14 @@ static int rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset) g_free(block->remote_keys); block->remote_keys = NULL; - for (x = 0; x < local->nb_blocks; x++) { - g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)old[x].offset); + g_free(block->block_name); + block->block_name = NULL; + + if (rdma->blockmap) { + for (x = 0; x < local->nb_blocks; x++) { + g_hash_table_remove(rdma->blockmap, + (void *)(uintptr_t)old[x].offset); + } } if (local->nb_blocks > 1) { @@ -659,8 +694,7 @@ static int rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset) local->block = NULL; } - trace_rdma_delete_block(local->nb_blocks, - (uintptr_t)block->local_host_addr, + trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr, block->offset, block->length, (uintptr_t)(block->local_host_addr + block->length), BITS_TO_LONGS(block->nb_chunks) * @@ -670,7 +704,7 @@ static int rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset) local->nb_blocks--; - if (local->nb_blocks) { + if (local->nb_blocks && rdma->blockmap) { for (x = 0; x < local->nb_blocks; x++) { g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)local->block[x].offset, @@ -1223,7 +1257,7 @@ const char *print_wrid(int wrid) /* * Perform a non-optimized memory unregistration after every transfer - * for demonsration purposes, only if pin-all is not requested. + * for demonstration purposes, only if pin-all is not requested. * * Potential optimizations: * 1. Start a new thread to run this function continuously @@ -1289,7 +1323,7 @@ static int qemu_rdma_unregister_waiting(RDMAContext *rdma) rdma->total_registrations--; reg.key.chunk = chunk; - register_to_network(®); + register_to_network(rdma, ®); ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) ®, &resp, NULL, NULL); if (ret < 0) { @@ -1910,7 +1944,7 @@ retry: trace_qemu_rdma_write_one_zero(chunk, sge.length, current_index, current_addr); - compress_to_network(&comp); + compress_to_network(rdma, &comp); ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &comp, NULL, NULL, NULL); @@ -1937,7 +1971,7 @@ retry: trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index, current_addr); - register_to_network(®); + register_to_network(rdma, ®); ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) ®, &resp, ®_result_idx, NULL); if (ret < 0) { @@ -2198,7 +2232,7 @@ static void qemu_rdma_cleanup(RDMAContext *rdma) if (rdma->local_ram_blocks.block) { while (rdma->local_ram_blocks.nb_blocks) { - rdma_delete_block(rdma, rdma->local_ram_blocks.block->offset); + rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]); } } @@ -2271,6 +2305,14 @@ static int qemu_rdma_source_init(RDMAContext *rdma, Error **errp, bool pin_all) goto err_rdma_source_init; } + /* Build the hash that maps from offset to RAMBlock */ + rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal); + for (idx = 0; idx < rdma->local_ram_blocks.nb_blocks; idx++) { + g_hash_table_insert(rdma->blockmap, + (void *)(uintptr_t)rdma->local_ram_blocks.block[idx].offset, + &rdma->local_ram_blocks.block[idx]); + } + for (idx = 0; idx < RDMA_WRID_MAX; idx++) { ret = qemu_rdma_reg_control(rdma, idx); if (ret) { @@ -2880,6 +2922,14 @@ err_rdma_dest_wait: return ret; } +static int dest_ram_sort_func(const void *a, const void *b) +{ + unsigned int a_index = ((const RDMALocalBlock *)a)->src_index; + unsigned int b_index = ((const RDMALocalBlock *)b)->src_index; + + return (a_index < b_index) ? -1 : (a_index != b_index); +} + /* * During each iteration of the migration, we listen for instructions * by the source VM to perform dynamic page registrations before they @@ -2889,8 +2939,7 @@ err_rdma_dest_wait: * * Keep doing this until the source tells us to stop. */ -static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque, - uint64_t flags) +static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque) { RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult), .type = RDMA_CONTROL_REGISTER_RESULT, @@ -2920,7 +2969,7 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque, CHECK_ERROR_STATE(); do { - trace_qemu_rdma_registration_handle_wait(flags); + trace_qemu_rdma_registration_handle_wait(); ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE); @@ -2943,6 +2992,13 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque, trace_qemu_rdma_registration_handle_compress(comp->length, comp->block_idx, comp->offset); + if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) { + error_report("rdma: 'compress' bad block index %u (vs %d)", + (unsigned int)comp->block_idx, + rdma->local_ram_blocks.nb_blocks); + ret = -EIO; + break; + } block = &(rdma->local_ram_blocks.block[comp->block_idx]); host_addr = block->local_host_addr + @@ -2958,6 +3014,13 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque, case RDMA_CONTROL_RAM_BLOCKS_REQUEST: trace_qemu_rdma_registration_handle_ram_blocks(); + /* Sort our local RAM Block list so it's the same as the source, + * we can do this since we've filled in a src_index in the list + * as we received the RAMBlock list earlier. + */ + qsort(rdma->local_ram_blocks.block, + rdma->local_ram_blocks.nb_blocks, + sizeof(RDMALocalBlock), dest_ram_sort_func); if (rdma->pin_all) { ret = qemu_rdma_reg_whole_ram_blocks(rdma); if (ret) { @@ -2985,6 +3048,12 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque, rdma->dest_blocks[i].length = local->block[i].length; dest_block_to_network(&rdma->dest_blocks[i]); + trace_qemu_rdma_registration_handle_ram_blocks_loop( + local->block[i].block_name, + local->block[i].offset, + local->block[i].length, + local->block[i].local_host_addr, + local->block[i].src_index); } blocks.len = rdma->local_ram_blocks.nb_blocks @@ -3018,8 +3087,23 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque, trace_qemu_rdma_registration_handle_register_loop(count, reg->current_index, reg->key.current_addr, reg->chunks); + if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) { + error_report("rdma: 'register' bad block index %u (vs %d)", + (unsigned int)reg->current_index, + rdma->local_ram_blocks.nb_blocks); + ret = -ENOENT; + break; + } block = &(rdma->local_ram_blocks.block[reg->current_index]); if (block->is_ram_block) { + if (block->offset > reg->key.current_addr) { + error_report("rdma: bad register address for block %s" + " offset: %" PRIx64 " current_addr: %" PRIx64, + block->block_name, block->offset, + reg->key.current_addr); + ret = -ERANGE; + break; + } host_addr = (block->local_host_addr + (reg->key.current_addr - block->offset)); chunk = ram_chunk_index(block->local_host_addr, @@ -3028,6 +3112,14 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque, chunk = reg->key.chunk; host_addr = block->local_host_addr + (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT)); + /* Check for particularly bad chunk value */ + if (host_addr < (void *)block->local_host_addr) { + error_report("rdma: bad chunk for block %s" + " chunk: %" PRIx64, + block->block_name, reg->key.chunk); + ret = -ERANGE; + break; + } } chunk_start = ram_chunk_start(block, chunk); chunk_end = ram_chunk_end(block, chunk + reg->chunks); @@ -3108,8 +3200,56 @@ out: return ret; } +/* Destination: + * Called via a ram_control_load_hook during the initial RAM load section which + * lists the RAMBlocks by name. This lets us know the order of the RAMBlocks + * on the source. + * We've already built our local RAMBlock list, but not yet sent the list to + * the source. + */ +static int rdma_block_notification_handle(QEMUFileRDMA *rfile, const char *name) +{ + RDMAContext *rdma = rfile->rdma; + int curr; + int found = -1; + + /* Find the matching RAMBlock in our local list */ + for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) { + if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) { + found = curr; + break; + } + } + + if (found == -1) { + error_report("RAMBlock '%s' not found on destination", name); + return -ENOENT; + } + + rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index; + trace_rdma_block_notification_handle(name, rdma->next_src_index); + rdma->next_src_index++; + + return 0; +} + +static int rdma_load_hook(QEMUFile *f, void *opaque, uint64_t flags, void *data) +{ + switch (flags) { + case RAM_CONTROL_BLOCK_REG: + return rdma_block_notification_handle(opaque, data); + + case RAM_CONTROL_HOOK: + return qemu_rdma_registration_handle(f, opaque); + + default: + /* Shouldn't be called with any other values */ + abort(); + } +} + static int qemu_rdma_registration_start(QEMUFile *f, void *opaque, - uint64_t flags) + uint64_t flags, void *data) { QEMUFileRDMA *rfile = opaque; RDMAContext *rdma = rfile->rdma; @@ -3128,7 +3268,7 @@ static int qemu_rdma_registration_start(QEMUFile *f, void *opaque, * First, flush writes, if any. */ static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque, - uint64_t flags) + uint64_t flags, void *data) { Error *local_err = NULL, **errp = &local_err; QEMUFileRDMA *rfile = opaque; @@ -3148,7 +3288,7 @@ static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque, if (flags == RAM_CONTROL_SETUP) { RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT }; RDMALocalBlocks *local = &rdma->local_ram_blocks; - int reg_result_idx, i, j, nb_dest_blocks; + int reg_result_idx, i, nb_dest_blocks; head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST; trace_qemu_rdma_registration_stop_ram(); @@ -3184,9 +3324,11 @@ static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque, */ if (local->nb_blocks != nb_dest_blocks) { - ERROR(errp, "ram blocks mismatch #1! " + ERROR(errp, "ram blocks mismatch (Number of blocks %d vs %d) " "Your QEMU command line parameters are probably " - "not identical on both the source and destination."); + "not identical on both the source and destination.", + local->nb_blocks, nb_dest_blocks); + rdma->error_state = -EINVAL; return -EINVAL; } @@ -3196,30 +3338,18 @@ static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque, for (i = 0; i < nb_dest_blocks; i++) { network_to_dest_block(&rdma->dest_blocks[i]); - /* search local ram blocks */ - for (j = 0; j < local->nb_blocks; j++) { - if (rdma->dest_blocks[i].offset != local->block[j].offset) { - continue; - } - - if (rdma->dest_blocks[i].length != local->block[j].length) { - ERROR(errp, "ram blocks mismatch #2! " - "Your QEMU command line parameters are probably " - "not identical on both the source and destination."); - return -EINVAL; - } - local->block[j].remote_host_addr = - rdma->dest_blocks[i].remote_host_addr; - local->block[j].remote_rkey = rdma->dest_blocks[i].remote_rkey; - break; - } - - if (j >= local->nb_blocks) { - ERROR(errp, "ram blocks mismatch #3! " - "Your QEMU command line parameters are probably " - "not identical on both the source and destination."); + /* We require that the blocks are in the same order */ + if (rdma->dest_blocks[i].length != local->block[i].length) { + ERROR(errp, "Block %s/%d has a different length %" PRIu64 + "vs %" PRIu64, local->block[i].block_name, i, + local->block[i].length, + rdma->dest_blocks[i].length); + rdma->error_state = -EINVAL; return -EINVAL; } + local->block[i].remote_host_addr = + rdma->dest_blocks[i].remote_host_addr; + local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey; } } @@ -3250,7 +3380,7 @@ static const QEMUFileOps rdma_read_ops = { .get_buffer = qemu_rdma_get_buffer, .get_fd = qemu_rdma_get_fd, .close = qemu_rdma_close, - .hook_ram_load = qemu_rdma_registration_handle, + .hook_ram_load = rdma_load_hook, }; static const QEMUFileOps rdma_write_ops = { @@ -3263,12 +3393,13 @@ static const QEMUFileOps rdma_write_ops = { static void *qemu_fopen_rdma(RDMAContext *rdma, const char *mode) { - QEMUFileRDMA *r = g_malloc0(sizeof(QEMUFileRDMA)); + QEMUFileRDMA *r; if (qemu_file_mode_is_not_valid(mode)) { return NULL; } + r = g_malloc0(sizeof(QEMUFileRDMA)); r->rdma = rdma; if (mode[0] == 'w') { @@ -3287,7 +3418,7 @@ static void rdma_accept_incoming_migration(void *opaque) QEMUFile *f; Error *local_err = NULL, **errp = &local_err; - trace_qemu_dma_accept_incoming_migration(); + trace_qemu_rdma_accept_incoming_migration(); ret = qemu_rdma_accept(rdma); if (ret) { @@ -3295,7 +3426,7 @@ static void rdma_accept_incoming_migration(void *opaque) return; } - trace_qemu_dma_accept_incoming_migration_accepted(); + trace_qemu_rdma_accept_incoming_migration_accepted(); f = qemu_fopen_rdma(rdma, "rb"); if (f == NULL) { diff --git a/migration/savevm.c b/migration/savevm.c index 9e0e286797..86735fc53a 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -246,11 +246,55 @@ typedef struct SaveStateEntry { typedef struct SaveState { QTAILQ_HEAD(, SaveStateEntry) handlers; int global_section_id; + bool skip_configuration; + uint32_t len; + const char *name; } SaveState; static SaveState savevm_state = { .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers), .global_section_id = 0, + .skip_configuration = false, +}; + +void savevm_skip_configuration(void) +{ + savevm_state.skip_configuration = true; +} + + +static void configuration_pre_save(void *opaque) +{ + SaveState *state = opaque; + const char *current_name = MACHINE_GET_CLASS(current_machine)->name; + + state->len = strlen(current_name); + state->name = current_name; +} + +static int configuration_post_load(void *opaque, int version_id) +{ + SaveState *state = opaque; + const char *current_name = MACHINE_GET_CLASS(current_machine)->name; + + if (strncmp(state->name, current_name, state->len) != 0) { + error_report("Machine type received is '%s' and local is '%s'", + state->name, current_name); + return -EINVAL; + } + return 0; +} + +static const VMStateDescription vmstate_configuration = { + .name = "configuration", + .version_id = 1, + .post_load = configuration_post_load, + .pre_save = configuration_pre_save, + .fields = (VMStateField[]) { + VMSTATE_UINT32(len, SaveState), + VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, 0, len), + VMSTATE_END_OF_LIST() + }, }; static void dump_vmstate_vmsd(FILE *out_file, @@ -653,41 +697,6 @@ static void save_section_footer(QEMUFile *f, SaveStateEntry *se) } } -/* - * Read a footer off the wire and check that it matches the expected section - * - * Returns: true if the footer was good - * false if there is a problem (and calls error_report to say why) - */ -static bool check_section_footer(QEMUFile *f, SaveStateEntry *se) -{ - uint8_t read_mark; - uint32_t read_section_id; - - if (skip_section_footers) { - /* No footer to check */ - return true; - } - - read_mark = qemu_get_byte(f); - - if (read_mark != QEMU_VM_SECTION_FOOTER) { - error_report("Missing section footer for %s", se->idstr); - return false; - } - - read_section_id = qemu_get_be32(f); - if (read_section_id != se->section_id) { - error_report("Mismatched section id in footer for %s -" - " read 0x%x expected 0x%x", - se->idstr, read_section_id, se->section_id); - return false; - } - - /* All good */ - return true; -} - bool qemu_savevm_state_blocked(Error **errp) { SaveStateEntry *se; @@ -723,6 +732,11 @@ void qemu_savevm_state_begin(QEMUFile *f, se->ops->set_params(params, se->opaque); } + if (!savevm_state.skip_configuration) { + qemu_put_byte(f, QEMU_VM_CONFIGURATION); + vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0); + } + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { if (!se->ops || !se->ops->save_live_setup) { continue; @@ -836,6 +850,11 @@ void qemu_savevm_state_complete(QEMUFile *f) if ((!se->ops || !se->ops->save_state) && !se->vmsd) { continue; } + if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) { + trace_savevm_section_skip(se->idstr, se->section_id); + continue; + } + trace_savevm_section_start(se->idstr, se->section_id); json_start_object(vmdesc, NULL); @@ -949,6 +968,9 @@ static int qemu_save_device_state(QEMUFile *f) if ((!se->ops || !se->ops->save_state) && !se->vmsd) { continue; } + if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) { + continue; + } save_section_header(f, se, QEMU_VM_SECTION_FULL); @@ -989,6 +1011,41 @@ struct LoadStateEntry { int version_id; }; +/* + * Read a footer off the wire and check that it matches the expected section + * + * Returns: true if the footer was good + * false if there is a problem (and calls error_report to say why) + */ +static bool check_section_footer(QEMUFile *f, LoadStateEntry *le) +{ + uint8_t read_mark; + uint32_t read_section_id; + + if (skip_section_footers) { + /* No footer to check */ + return true; + } + + read_mark = qemu_get_byte(f); + + if (read_mark != QEMU_VM_SECTION_FOOTER) { + error_report("Missing section footer for %s", le->se->idstr); + return false; + } + + read_section_id = qemu_get_be32(f); + if (read_section_id != le->section_id) { + error_report("Mismatched section id in footer for %s -" + " read 0x%x expected 0x%x", + le->se->idstr, read_section_id, le->section_id); + return false; + } + + /* All good */ + return true; +} + void loadvm_free_handlers(MigrationIncomingState *mis) { LoadStateEntry *le, *new_le; @@ -1029,6 +1086,18 @@ int qemu_loadvm_state(QEMUFile *f) return -ENOTSUP; } + if (!savevm_state.skip_configuration) { + if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) { + error_report("Configuration section missing"); + return -EINVAL; + } + ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0); + + if (ret) { + return ret; + } + } + while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) { uint32_t instance_id, version_id, section_id; SaveStateEntry *se; @@ -1082,7 +1151,7 @@ int qemu_loadvm_state(QEMUFile *f) " device '%s'", instance_id, idstr); goto out; } - if (!check_section_footer(f, le->se)) { + if (!check_section_footer(f, le)) { ret = -EINVAL; goto out; } @@ -1109,7 +1178,7 @@ int qemu_loadvm_state(QEMUFile *f) section_id, le->se->idstr); goto out; } - if (!check_section_footer(f, le->se)) { + if (!check_section_footer(f, le)) { ret = -EINVAL; goto out; } @@ -1127,16 +1196,35 @@ int qemu_loadvm_state(QEMUFile *f) * Try to read in the VMDESC section as well, so that dumping tools that * intercept our migration stream have the chance to see it. */ - if (qemu_get_byte(f) == QEMU_VM_VMDESCRIPTION) { - uint32_t size = qemu_get_be32(f); - uint8_t *buf = g_malloc(0x1000); - - while (size > 0) { - uint32_t read_chunk = MIN(size, 0x1000); - qemu_get_buffer(f, buf, read_chunk); - size -= read_chunk; + + /* We've got to be careful; if we don't read the data and just shut the fd + * then the sender can error if we close while it's still sending. + * We also mustn't read data that isn't there; some transports (RDMA) + * will stall waiting for that data when the source has already closed. + */ + if (should_send_vmdesc()) { + uint8_t *buf; + uint32_t size; + section_type = qemu_get_byte(f); + + if (section_type != QEMU_VM_VMDESCRIPTION) { + error_report("Expected vmdescription section, but got %d", + section_type); + /* + * It doesn't seem worth failing at this point since + * we apparently have an otherwise valid VM state + */ + } else { + buf = g_malloc(0x1000); + size = qemu_get_be32(f); + + while (size > 0) { + uint32_t read_chunk = MIN(size, 0x1000); + qemu_get_buffer(f, buf, read_chunk); + size -= read_chunk; + } + g_free(buf); } - g_free(buf); } cpu_synchronize_all_post_init(); diff --git a/migration/vmstate.c b/migration/vmstate.c index 6138d1acb7..e8ccf22f67 100644 --- a/migration/vmstate.c +++ b/migration/vmstate.c @@ -276,6 +276,17 @@ static void vmsd_desc_field_end(const VMStateDescription *vmsd, QJSON *vmdesc, json_end_object(vmdesc); } + +bool vmstate_save_needed(const VMStateDescription *vmsd, void *opaque) +{ + if (vmsd->needed && !vmsd->needed(opaque)) { + /* optional section not needed */ + return false; + } + return true; +} + + void vmstate_save_state(QEMUFile *f, const VMStateDescription *vmsd, void *opaque, QJSON *vmdesc) { |