diff options
Diffstat (limited to 'migration/colo.c')
-rw-r--r-- | migration/colo.c | 102 |
1 files changed, 92 insertions, 10 deletions
diff --git a/migration/colo.c b/migration/colo.c index 93c85c538b..712308ed5e 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -20,6 +20,8 @@ #include "qapi/error.h" #include "migration/failover.h" +static bool vmstate_loading; + #define COLO_BUFFER_BASE_SIZE (4 * 1024 * 1024) bool colo_supported(void) @@ -51,6 +53,19 @@ static void secondary_vm_do_failover(void) int old_state; MigrationIncomingState *mis = migration_incoming_get_current(); + /* Can not do failover during the process of VM's loading VMstate, Or + * it will break the secondary VM. + */ + if (vmstate_loading) { + old_state = failover_set_state(FAILOVER_STATUS_ACTIVE, + FAILOVER_STATUS_RELAUNCH); + if (old_state != FAILOVER_STATUS_ACTIVE) { + error_report("Unknown error while do failover for secondary VM," + "old_state: %s", FailoverStatus_lookup[old_state]); + } + return; + } + migrate_set_state(&mis->state, MIGRATION_STATUS_COLO, MIGRATION_STATUS_COMPLETED); @@ -59,6 +74,18 @@ static void secondary_vm_do_failover(void) /* recover runstate to normal migration finish state */ autostart = true; } + /* + * Make sure COLO incoming thread not block in recv or send, + * If mis->from_src_file and mis->to_src_file use the same fd, + * The second shutdown() will return -1, we ignore this value, + * It is harmless. + */ + if (mis->from_src_file) { + qemu_file_shutdown(mis->from_src_file); + } + if (mis->to_src_file) { + qemu_file_shutdown(mis->to_src_file); + } old_state = failover_set_state(FAILOVER_STATUS_ACTIVE, FAILOVER_STATUS_COMPLETED); @@ -67,6 +94,8 @@ static void secondary_vm_do_failover(void) "secondary VM", FailoverStatus_lookup[old_state]); return; } + /* Notify COLO incoming thread that failover work is finished */ + qemu_sem_post(&mis->colo_incoming_sem); /* For Secondary VM, jump to incoming co */ if (mis->migration_incoming_co) { qemu_coroutine_enter(mis->migration_incoming_co); @@ -81,6 +110,18 @@ static void primary_vm_do_failover(void) migrate_set_state(&s->state, MIGRATION_STATUS_COLO, MIGRATION_STATUS_COMPLETED); + /* + * Wake up COLO thread which may blocked in recv() or send(), + * The s->rp_state.from_dst_file and s->to_dst_file may use the + * same fd, but we still shutdown the fd for twice, it is harmless. + */ + if (s->to_dst_file) { + qemu_file_shutdown(s->to_dst_file); + } + if (s->rp_state.from_dst_file) { + qemu_file_shutdown(s->rp_state.from_dst_file); + } + old_state = failover_set_state(FAILOVER_STATUS_ACTIVE, FAILOVER_STATUS_COMPLETED); if (old_state != FAILOVER_STATUS_ACTIVE) { @@ -88,6 +129,8 @@ static void primary_vm_do_failover(void) FailoverStatus_lookup[old_state]); return; } + /* Notify COLO thread that failover work is finished */ + qemu_sem_post(&s->colo_exit_sem); } void colo_do_failover(MigrationState *s) @@ -302,7 +345,7 @@ static void colo_process_checkpoint(MigrationState *s) { QIOChannelBuffer *bioc; QEMUFile *fb = NULL; - int64_t current_time, checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); + int64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); Error *local_err = NULL; int ret; @@ -332,26 +375,21 @@ static void colo_process_checkpoint(MigrationState *s) qemu_mutex_unlock_iothread(); trace_colo_vm_state_change("stop", "run"); + timer_mod(s->colo_delay_timer, + current_time + s->parameters.x_checkpoint_delay); + while (s->state == MIGRATION_STATUS_COLO) { if (failover_get_state() != FAILOVER_STATUS_NONE) { error_report("failover request"); goto out; } - current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); - if (current_time - checkpoint_time < - s->parameters.x_checkpoint_delay) { - int64_t delay_ms; + qemu_sem_wait(&s->colo_checkpoint_sem); - delay_ms = s->parameters.x_checkpoint_delay - - (current_time - checkpoint_time); - g_usleep(delay_ms * 1000); - } ret = colo_do_checkpoint_transaction(s, bioc, fb); if (ret < 0) { goto out; } - checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); } out: @@ -364,14 +402,41 @@ out: qemu_fclose(fb); } + timer_del(s->colo_delay_timer); + + /* Hope this not to be too long to wait here */ + qemu_sem_wait(&s->colo_exit_sem); + qemu_sem_destroy(&s->colo_exit_sem); + /* + * Must be called after failover BH is completed, + * Or the failover BH may shutdown the wrong fd that + * re-used by other threads after we release here. + */ if (s->rp_state.from_dst_file) { qemu_fclose(s->rp_state.from_dst_file); } } +void colo_checkpoint_notify(void *opaque) +{ + MigrationState *s = opaque; + int64_t next_notify_time; + + qemu_sem_post(&s->colo_checkpoint_sem); + s->colo_checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); + next_notify_time = s->colo_checkpoint_time + + s->parameters.x_checkpoint_delay; + timer_mod(s->colo_delay_timer, next_notify_time); +} + void migrate_start_colo_process(MigrationState *s) { qemu_mutex_unlock_iothread(); + qemu_sem_init(&s->colo_checkpoint_sem, 0); + s->colo_delay_timer = timer_new_ms(QEMU_CLOCK_HOST, + colo_checkpoint_notify, s); + + qemu_sem_init(&s->colo_exit_sem, 0); migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_COLO); colo_process_checkpoint(s); @@ -410,6 +475,8 @@ void *colo_process_incoming_thread(void *opaque) uint64_t value; Error *local_err = NULL; + qemu_sem_init(&mis->colo_incoming_sem, 0); + migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_COLO); @@ -496,13 +563,23 @@ void *colo_process_incoming_thread(void *opaque) qemu_mutex_lock_iothread(); qemu_system_reset(VMRESET_SILENT); + vmstate_loading = true; if (qemu_loadvm_state(fb) < 0) { error_report("COLO: loadvm failed"); qemu_mutex_unlock_iothread(); goto out; } + + vmstate_loading = false; qemu_mutex_unlock_iothread(); + if (failover_get_state() == FAILOVER_STATUS_RELAUNCH) { + failover_set_state(FAILOVER_STATUS_RELAUNCH, + FAILOVER_STATUS_NONE); + failover_request_active(NULL); + goto out; + } + colo_send_message(mis->to_src_file, COLO_MESSAGE_VMSTATE_LOADED, &local_err); if (local_err) { @@ -511,6 +588,7 @@ void *colo_process_incoming_thread(void *opaque) } out: + vmstate_loading = false; /* Throw the unreported error message after exited from loop */ if (local_err) { error_report_err(local_err); @@ -520,6 +598,10 @@ out: qemu_fclose(fb); } + /* Hope this not to be too long to loop here */ + qemu_sem_wait(&mis->colo_incoming_sem); + qemu_sem_destroy(&mis->colo_incoming_sem); + /* Must be called after failover BH is completed */ if (mis->to_src_file) { qemu_fclose(mis->to_src_file); } |