Diffstat (limited to 'migration')
-rw-r--r--  migration/migration.c    | 425
-rw-r--r--  migration/migration.h    |  35
-rw-r--r--  migration/postcopy-ram.c | 258
-rw-r--r--  migration/ram.c          |   3
-rw-r--r--  migration/socket.c       |   4
-rw-r--r--  migration/trace-events   |   6
6 files changed, 560 insertions(+), 171 deletions(-)
diff --git a/migration/migration.c b/migration/migration.c
index 4de3b551fe..d3a1c494c0 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -132,6 +132,11 @@ void migration_object_init(void)
     }
 }
 
+void migration_object_finalize(void)
+{
+    object_unref(OBJECT(current_migration));
+}
+
 /* For outgoing */
 MigrationState *migrate_get_current(void)
 {
@@ -591,14 +596,15 @@ static void populate_disk_info(MigrationInfo *info)
     }
 }
 
-MigrationInfo *qmp_query_migrate(Error **errp)
+static void fill_source_migration_info(MigrationInfo *info)
 {
-    MigrationInfo *info = g_malloc0(sizeof(*info));
     MigrationState *s = migrate_get_current();
 
     switch (s->state) {
     case MIGRATION_STATUS_NONE:
         /* no migration has happened ever */
+        /* do not overwrite destination migration status */
+        return;
         break;
     case MIGRATION_STATUS_SETUP:
         info->has_status = true;
@@ -613,7 +619,7 @@ MigrationInfo *qmp_query_migrate(Error **errp)
         info->has_status = true;
         info->has_total_time = true;
         info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME)
-            - s->total_time;
+            - s->start_time;
         info->has_expected_downtime = true;
         info->expected_downtime = s->expected_downtime;
         info->has_setup_time = true;
@@ -649,8 +655,6 @@ MigrationInfo *qmp_query_migrate(Error **errp)
         break;
     }
     info->status = s->state;
-
-    return info;
 }
 
 /**
@@ -714,6 +718,41 @@ static bool migrate_caps_check(bool *cap_list,
     return true;
 }
 
+static void fill_destination_migration_info(MigrationInfo *info)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+
+    switch (mis->state) {
+    case MIGRATION_STATUS_NONE:
+        return;
+        break;
+    case MIGRATION_STATUS_SETUP:
+    case MIGRATION_STATUS_CANCELLING:
+    case MIGRATION_STATUS_CANCELLED:
+    case MIGRATION_STATUS_ACTIVE:
+    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
+    case MIGRATION_STATUS_FAILED:
+    case MIGRATION_STATUS_COLO:
+        info->has_status = true;
+        break;
+    case MIGRATION_STATUS_COMPLETED:
+        info->has_status = true;
+        fill_destination_postcopy_migration_info(info);
+        break;
+    }
+    info->status = mis->state;
+}
+
+MigrationInfo *qmp_query_migrate(Error **errp)
+{
+    MigrationInfo *info = g_malloc0(sizeof(*info));
+
+    fill_destination_migration_info(info);
+    fill_source_migration_info(info);
+
+    return info;
+}
+
 void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
                                   Error **errp)
 {
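A side effect of this split is that query-migrate can now be answered on either
side of a migration: the destination state is filled in first, and the source
fill deliberately returns early in MIGRATION_STATUS_NONE so it cannot clobber
what the destination reported. A minimal standalone sketch of that ordering
(plain C, not QEMU code; all names are illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    enum state { ST_NONE, ST_ACTIVE };

    struct info { bool has_status; enum state status; };

    /* Destination side: reports only if this QEMU was ever incoming. */
    static void fill_dst(struct info *i, enum state dst)
    {
        if (dst == ST_NONE) {
            return;             /* nothing to report */
        }
        i->has_status = true;
        i->status = dst;
    }

    /* Source side: returns early so it never clobbers destination info. */
    static void fill_src(struct info *i, enum state src)
    {
        if (src == ST_NONE) {
            return;
        }
        i->has_status = true;
        i->status = src;
    }

    int main(void)
    {
        struct info i = { false, ST_NONE };

        fill_dst(&i, ST_ACTIVE);    /* incoming side is active */
        fill_src(&i, ST_NONE);      /* no outgoing migration: stays silent */
        printf("has_status=%d status=%d\n", i.has_status, (int)i.status);
        return 0;
    }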
@@ -741,22 +780,20 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
 static bool migrate_params_check(MigrationParameters *params, Error **errp)
 {
     if (params->has_compress_level &&
-        (params->compress_level < 0 || params->compress_level > 9)) {
+        (params->compress_level > 9)) {
         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level",
                    "is invalid, it should be in the range of 0 to 9");
         return false;
     }
 
-    if (params->has_compress_threads &&
-        (params->compress_threads < 1 || params->compress_threads > 255)) {
+    if (params->has_compress_threads && (params->compress_threads < 1)) {
         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                    "compress_threads",
                    "is invalid, it should be in the range of 1 to 255");
         return false;
     }
 
-    if (params->has_decompress_threads &&
-        (params->decompress_threads < 1 || params->decompress_threads > 255)) {
+    if (params->has_decompress_threads && (params->decompress_threads < 1)) {
         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                    "decompress_threads",
                    "is invalid, it should be in the range of 1 to 255");
@@ -781,38 +818,31 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp)
         return false;
     }
 
-    if (params->has_max_bandwidth &&
-        (params->max_bandwidth < 0 || params->max_bandwidth > SIZE_MAX)) {
+    if (params->has_max_bandwidth && (params->max_bandwidth > SIZE_MAX)) {
         error_setg(errp, "Parameter 'max_bandwidth' expects an integer in the"
                          " range of 0 to %zu bytes/second", SIZE_MAX);
         return false;
     }
 
     if (params->has_downtime_limit &&
-        (params->downtime_limit < 0 ||
-         params->downtime_limit > MAX_MIGRATE_DOWNTIME)) {
+        (params->downtime_limit > MAX_MIGRATE_DOWNTIME)) {
         error_setg(errp, "Parameter 'downtime_limit' expects an integer in "
                          "the range of 0 to %d milliseconds",
                          MAX_MIGRATE_DOWNTIME);
         return false;
     }
 
-    if (params->has_x_checkpoint_delay && (params->x_checkpoint_delay < 0)) {
-        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
-                    "x_checkpoint_delay",
-                    "is invalid, it should be positive");
-        return false;
-    }
-
-    if (params->has_x_multifd_channels &&
-        (params->x_multifd_channels < 1 || params->x_multifd_channels > 255)) {
+    /* x_checkpoint_delay is now always positive */
+
+    if (params->has_x_multifd_channels && (params->x_multifd_channels < 1)) {
         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                    "multifd_channels",
                    "is invalid, it should be in the range of 1 to 255");
         return false;
     }
 
     if (params->has_x_multifd_page_count &&
-        (params->x_multifd_page_count < 1 ||
-         params->x_multifd_page_count > 10000)) {
+            (params->x_multifd_page_count < 1 ||
+             params->x_multifd_page_count > 10000)) {
         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                    "multifd_page_count",
                    "is invalid, it should be in the range of 1 to 10000");
@@ -1077,6 +1107,8 @@ static void migrate_fd_cleanup(void *opaque)
     qemu_bh_delete(s->cleanup_bh);
     s->cleanup_bh = NULL;
 
+    qemu_savevm_state_cleanup();
+
     if (s->to_dst_file) {
         Error *local_err = NULL;
 
@@ -1127,8 +1159,6 @@ void migrate_fd_error(MigrationState *s, const Error *error)
     migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                       MIGRATION_STATUS_FAILED);
     migrate_set_error(s, error);
-    notifier_list_notify(&migration_state_notifiers, s);
-    block_cleanup_parameters(s);
 }
 
 static void migrate_fd_cancel(MigrationState *s)
@@ -1174,7 +1204,6 @@ static void migrate_fd_cancel(MigrationState *s)
             s->block_inactive = false;
         }
     }
-    block_cleanup_parameters(s);
 }
 
 void add_migration_state_change_notifier(Notifier *notify)
@@ -1268,7 +1297,11 @@ MigrationState *migrate_init(void)
     migrate_set_state(&s->state, MIGRATION_STATUS_NONE,
                       MIGRATION_STATUS_SETUP);
 
-    s->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+    s->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+    s->total_time = 0;
+    s->vm_was_running = false;
+    s->iteration_initial_bytes = 0;
+    s->threshold_size = 0;
 
     return s;
 }
@@ -1508,6 +1541,15 @@ bool migrate_zero_blocks(void)
     return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS];
 }
 
+bool migrate_postcopy_blocktime(void)
+{
+    MigrationState *s;
+
+    s = migrate_get_current();
+
+    return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME];
+}
+
 bool migrate_use_compression(void)
 {
     MigrationState *s;
@@ -1843,7 +1885,7 @@ static int await_return_path_close_on_source(MigrationState *ms)
  * Switch from normal iteration to postcopy
  * Returns non-0 on error
  */
-static int postcopy_start(MigrationState *ms, bool *old_vm_running)
+static int postcopy_start(MigrationState *ms)
 {
     int ret;
     QIOChannelBuffer *bioc;
@@ -1861,7 +1903,6 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running)
     trace_postcopy_start_set_run();
 
     qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
-    *old_vm_running = runstate_is_running();
     global_state_store();
     ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
     if (ret < 0) {
@@ -2051,21 +2092,17 @@ static int migration_maybe_pause(MigrationState *s,
  * The caller 'breaks' the loop when this returns.
  *
  * @s: Current migration state
- * @current_active_state: The migration state we expect to be in
- * @*old_vm_running: Pointer to old_vm_running flag
- * @*start_time: Pointer to time to update
  */
-static void migration_completion(MigrationState *s, int current_active_state,
-                                 bool *old_vm_running,
-                                 int64_t *start_time)
+static void migration_completion(MigrationState *s)
 {
     int ret;
+    int current_active_state = s->state;
 
     if (s->state == MIGRATION_STATUS_ACTIVE) {
         qemu_mutex_lock_iothread();
-        *start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+        s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
         qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
-        *old_vm_running = runstate_is_running();
+        s->vm_was_running = runstate_is_running();
 
         ret = global_state_store();
         if (!ret) {
@@ -2152,6 +2189,155 @@ bool migrate_colo_enabled(void)
     return s->enabled_capabilities[MIGRATION_CAPABILITY_X_COLO];
 }
 
+static void migration_calculate_complete(MigrationState *s)
+{
+    uint64_t bytes = qemu_ftell(s->to_dst_file);
+    int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
+    s->total_time = end_time - s->start_time;
+    if (!s->downtime) {
+        /*
+         * It's still not set, so this is a precopy migration.  For
+         * postcopy, downtime is calculated during postcopy_start().
+         */
+        s->downtime = end_time - s->downtime_start;
+    }
+
+    if (s->total_time) {
+        s->mbps = ((double) bytes * 8.0) / s->total_time / 1000;
+    }
+}
+
+static void migration_update_counters(MigrationState *s,
+                                      int64_t current_time)
+{
+    uint64_t transferred, time_spent;
+    double bandwidth;
+
+    if (current_time < s->iteration_start_time + BUFFER_DELAY) {
+        return;
+    }
+
+    transferred = qemu_ftell(s->to_dst_file) - s->iteration_initial_bytes;
+    time_spent = current_time - s->iteration_start_time;
+    bandwidth = (double)transferred / time_spent;
+    s->threshold_size = bandwidth * s->parameters.downtime_limit;
+
+    s->mbps = (((double) transferred * 8.0) /
+               ((double) time_spent / 1000.0)) / 1000.0 / 1000.0;
+
+    /*
+     * If we haven't sent anything, we don't want to
+     * recalculate. 10000 is a small enough number for our purposes.
+     */
+    if (ram_counters.dirty_pages_rate && transferred > 10000) {
+        s->expected_downtime = ram_counters.dirty_pages_rate *
+            qemu_target_page_size() / bandwidth;
+    }
+
+    qemu_file_reset_rate_limit(s->to_dst_file);
+
+    s->iteration_start_time = current_time;
+    s->iteration_initial_bytes = qemu_ftell(s->to_dst_file);
+
+    trace_migrate_transferred(transferred, time_spent,
+                              bandwidth, s->threshold_size);
+}
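The helper above concentrates the transfer-rate bookkeeping in one place. A
worked example of the same arithmetic (standalone C, not QEMU code; the
numbers are invented) shows how threshold_size and expected_downtime fall out
of one sampling window:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* One BUFFER_DELAY-sized sampling window. */
        uint64_t transferred = 100 * 1024 * 1024;     /* bytes this window */
        uint64_t time_spent = 100;                    /* window length, ms */
        double bandwidth = (double)transferred / time_spent;  /* bytes/ms */

        /* Remaining data must fit into downtime_limit at this bandwidth. */
        int64_t downtime_limit = 300;                 /* ms */
        int64_t threshold_size = (int64_t)(bandwidth * downtime_limit);

        /* Dirtying rate vs. bandwidth gives the downtime estimate (ms). */
        uint64_t dirty_pages_rate = 40000;            /* pages/s */
        uint64_t page_size = 4096;                    /* bytes */
        int64_t expected_downtime =
            (int64_t)(dirty_pages_rate * page_size / bandwidth);

        double mbps = ((double)transferred * 8.0) /
                      ((double)time_spent / 1000.0) / 1000.0 / 1000.0;

        /* threshold=314572800 bytes, expected downtime=156 ms, 8388.6 mbps */
        printf("threshold=%" PRId64 " bytes, expected downtime=%" PRId64
               " ms, rate=%.1f mbps\n",
               threshold_size, expected_downtime, mbps);
        return 0;
    }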
+
+/* Migration thread iteration status */
+typedef enum {
+    MIG_ITERATE_RESUME,         /* Resume current iteration */
+    MIG_ITERATE_SKIP,           /* Skip current iteration */
+    MIG_ITERATE_BREAK,          /* Break the loop */
+} MigIterateState;
+
+/*
+ * Return MIG_ITERATE_SKIP to restart the loop directly,
+ * MIG_ITERATE_BREAK to leave it, or MIG_ITERATE_RESUME to carry on
+ * with the rest of the iteration.
+ */
+static MigIterateState migration_iteration_run(MigrationState *s)
+{
+    uint64_t pending_size, pend_post, pend_nonpost;
+    bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
+
+    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size,
+                              &pend_nonpost, &pend_post);
+    pending_size = pend_nonpost + pend_post;
+
+    trace_migrate_pending(pending_size, s->threshold_size,
+                          pend_post, pend_nonpost);
+
+    if (pending_size && pending_size >= s->threshold_size) {
+        /* Still a significant amount to transfer */
+        if (migrate_postcopy() && !in_postcopy &&
+            pend_nonpost <= s->threshold_size &&
+            atomic_read(&s->start_postcopy)) {
+            if (postcopy_start(s)) {
+                error_report("%s: postcopy failed to start", __func__);
+            }
+            return MIG_ITERATE_SKIP;
+        }
+        /* Just another iteration step */
+        qemu_savevm_state_iterate(s->to_dst_file,
+            s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
+    } else {
+        trace_migration_thread_low_pending(pending_size);
+        migration_completion(s);
+        return MIG_ITERATE_BREAK;
+    }
+
+    return MIG_ITERATE_RESUME;
+}
+
+static void migration_iteration_finish(MigrationState *s)
+{
+    /* If we enabled cpu throttling for auto-converge, turn it off. */
+    cpu_throttle_stop();
+
+    qemu_mutex_lock_iothread();
+    switch (s->state) {
+    case MIGRATION_STATUS_COMPLETED:
+        migration_calculate_complete(s);
+        runstate_set(RUN_STATE_POSTMIGRATE);
+        break;
+
+    case MIGRATION_STATUS_ACTIVE:
+        /*
+         * We should really assert here, but since it's during
+         * migration, let's try to reduce the usage of assertions.
+         */
+        if (!migrate_colo_enabled()) {
+            error_report("%s: critical error: calling COLO code without "
+                         "COLO enabled", __func__);
+        }
+        migrate_start_colo_process(s);
+        /*
+         * Fixme: we will run VM in COLO no matter its old running state.
+         * After exited COLO, we will keep running.
+         */
+        s->vm_was_running = true;
+        /* Fallthrough */
+    case MIGRATION_STATUS_FAILED:
+    case MIGRATION_STATUS_CANCELLED:
+        if (s->vm_was_running) {
+            vm_start();
+        } else {
+            if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
+                runstate_set(RUN_STATE_POSTMIGRATE);
+            }
+        }
+        break;
+
+    default:
+        /* Should not reach here, but if so, forgive the VM. */
+        error_report("%s: Unknown ending state %d", __func__, s->state);
+        break;
+    }
+    qemu_bh_schedule(s->cleanup_bh);
+    qemu_mutex_unlock_iothread();
+}
+
 /*
  * Master migration thread on the source VM.
  * It drives the migration and pumps the data down the outgoing channel.
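With the loop body factored out, the migration thread reduces to dispatching on
the returned MigIterateState. A condensed sketch of that control flow (plain C,
not the real thread; rate limiting and error checks omitted):

    #include <stdio.h>

    typedef enum { ITERATE_RESUME, ITERATE_SKIP, ITERATE_BREAK } IterState;

    static IterState run_one_iteration(int *pending, int threshold)
    {
        if (*pending < threshold) {
            return ITERATE_BREAK;     /* little left: run completion */
        }
        *pending -= 2;                /* send some pages */
        return ITERATE_RESUME;        /* fall through to bookkeeping */
    }

    int main(void)
    {
        int pending = 10;
        const int threshold = 3;

        while (pending > 0) {
            IterState st = run_one_iteration(&pending, threshold);
            if (st == ITERATE_SKIP) {
                continue;             /* e.g. postcopy just started */
            } else if (st == ITERATE_BREAK) {
                break;                /* completion ran; leave the loop */
            }
            printf("iteration done, pending=%d\n", pending);
        }
        return 0;                     /* finish/cleanup would run here */
    }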
@@ -2159,26 +2345,12 @@ bool migrate_colo_enabled(void)
 static void *migration_thread(void *opaque)
 {
     MigrationState *s = opaque;
-    /* Used by the bandwidth calcs, updated later */
-    int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
     int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
-    int64_t initial_bytes = 0;
-    /*
-     * The final stage happens when the remaining data is smaller than
-     * this threshold; it's calculated from the requested downtime and
-     * measured bandwidth
-     */
-    int64_t threshold_size = 0;
-    int64_t start_time = initial_time;
-    int64_t end_time;
-    bool old_vm_running = false;
-    bool entered_postcopy = false;
-    /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
-    enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
-    bool enable_colo = migrate_colo_enabled();
 
     rcu_register_thread();
 
+    s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
     qemu_savevm_state_header(s->to_dst_file);
 
     /*
@@ -2213,122 +2385,38 @@ static void *migration_thread(void *opaque)
     while (s->state == MIGRATION_STATUS_ACTIVE ||
            s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
         int64_t current_time;
-        uint64_t pending_size;
 
         if (!qemu_file_rate_limit(s->to_dst_file)) {
-            uint64_t pend_post, pend_nonpost;
-
-            qemu_savevm_state_pending(s->to_dst_file, threshold_size,
-                                      &pend_nonpost, &pend_post);
-            pending_size = pend_nonpost + pend_post;
-            trace_migrate_pending(pending_size, threshold_size,
-                                  pend_post, pend_nonpost);
-            if (pending_size && pending_size >= threshold_size) {
-                /* Still a significant amount to transfer */
-
-                if (migrate_postcopy() &&
-                    s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE &&
-                    pend_nonpost <= threshold_size &&
-                    atomic_read(&s->start_postcopy)) {
-
-                    if (!postcopy_start(s, &old_vm_running)) {
-                        current_active_state = MIGRATION_STATUS_POSTCOPY_ACTIVE;
-                        entered_postcopy = true;
-                    }
-
-                    continue;
-                }
-                /* Just another iteration step */
-                qemu_savevm_state_iterate(s->to_dst_file, entered_postcopy);
-            } else {
-                trace_migration_thread_low_pending(pending_size);
-                migration_completion(s, current_active_state,
-                                     &old_vm_running, &start_time);
+            MigIterateState iter_state = migration_iteration_run(s);
+            if (iter_state == MIG_ITERATE_SKIP) {
+                continue;
+            } else if (iter_state == MIG_ITERATE_BREAK) {
                 break;
             }
         }
 
         if (qemu_file_get_error(s->to_dst_file)) {
-            migrate_set_state(&s->state, current_active_state,
-                              MIGRATION_STATUS_FAILED);
+            if (migration_is_setup_or_active(s->state)) {
+                migrate_set_state(&s->state, s->state,
+                                  MIGRATION_STATUS_FAILED);
+            }
             trace_migration_thread_file_err();
             break;
        }
+
         current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
-        if (current_time >= initial_time + BUFFER_DELAY) {
-            uint64_t transferred_bytes = qemu_ftell(s->to_dst_file) -
-                                         initial_bytes;
-            uint64_t time_spent = current_time - initial_time;
-            double bandwidth = (double)transferred_bytes / time_spent;
-            threshold_size = bandwidth * s->parameters.downtime_limit;
-
-            s->mbps = (((double) transferred_bytes * 8.0) /
-                    ((double) time_spent / 1000.0)) / 1000.0 / 1000.0;
-
-            trace_migrate_transferred(transferred_bytes, time_spent,
-                                      bandwidth, threshold_size);
-            /* if we haven't sent anything, we don't want to recalculate
-               10000 is a small enough number for our purposes */
-            if (ram_counters.dirty_pages_rate && transferred_bytes > 10000) {
-                s->expected_downtime = ram_counters.dirty_pages_rate *
-                    qemu_target_page_size() / bandwidth;
-            }
-
-            qemu_file_reset_rate_limit(s->to_dst_file);
-            initial_time = current_time;
-            initial_bytes = qemu_ftell(s->to_dst_file);
-        }
+
+        migration_update_counters(s, current_time);
+
         if (qemu_file_rate_limit(s->to_dst_file)) {
             /* usleep expects microseconds */
-            g_usleep((initial_time + BUFFER_DELAY - current_time)*1000);
+            g_usleep((s->iteration_start_time + BUFFER_DELAY -
+                      current_time) * 1000);
         }
     }
 
     trace_migration_thread_after_loop();
-    /* If we enabled cpu throttling for auto-converge, turn it off. */
-    cpu_throttle_stop();
-    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
-
-    qemu_mutex_lock_iothread();
-    /*
-     * The resource has been allocated by migration will be reused in COLO
-     * process, so don't release them.
-     */
-    if (!enable_colo) {
-        qemu_savevm_state_cleanup();
-    }
-    if (s->state == MIGRATION_STATUS_COMPLETED) {
-        uint64_t transferred_bytes = qemu_ftell(s->to_dst_file);
-        s->total_time = end_time - s->total_time;
-        if (!entered_postcopy) {
-            s->downtime = end_time - start_time;
-        }
-        if (s->total_time) {
-            s->mbps = (((double) transferred_bytes * 8.0) /
-                       ((double) s->total_time)) / 1000;
-        }
-        runstate_set(RUN_STATE_POSTMIGRATE);
-    } else {
-        if (s->state == MIGRATION_STATUS_ACTIVE && enable_colo) {
-            migrate_start_colo_process(s);
-            qemu_savevm_state_cleanup();
-            /*
-             * Fixme: we will run VM in COLO no matter its old running state.
-             * After exited COLO, we will keep running.
-             */
-            old_vm_running = true;
-        }
-        if (old_vm_running && !entered_postcopy) {
-            vm_start();
-        } else {
-            if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
-                runstate_set(RUN_STATE_POSTMIGRATE);
-            }
-        }
-    }
-    qemu_bh_schedule(s->cleanup_bh);
-    qemu_mutex_unlock_iothread();
-
+    migration_iteration_finish(s);
     rcu_unregister_thread();
     return NULL;
 }
@@ -2375,10 +2463,15 @@ void migration_global_dump(Monitor *mon)
 {
     MigrationState *ms = migrate_get_current();
 
-    monitor_printf(mon, "globals: store-global-state=%d, only_migratable=%d, "
-                   "send-configuration=%d, send-section-footer=%d\n",
-                   ms->store_global_state, ms->only_migratable,
-                   ms->send_configuration, ms->send_section_footer);
+    monitor_printf(mon, "globals:\n");
+    monitor_printf(mon, "store-global-state: %s\n",
+                   ms->store_global_state ? "on" : "off");
+    monitor_printf(mon, "only-migratable: %s\n",
+                   ms->only_migratable ? "on" : "off");
+    monitor_printf(mon, "send-configuration: %s\n",
+                   ms->send_configuration ? "on" : "off");
+    monitor_printf(mon, "send-section-footer: %s\n",
+                   ms->send_section_footer ? "on" : "off");
 }
"on" : "off"); } #define DEFINE_PROP_MIG_CAP(name, x) \ @@ -2394,33 +2487,33 @@ static Property migration_properties[] = { send_section_footer, true), /* Migration parameters */ - DEFINE_PROP_INT64("x-compress-level", MigrationState, + DEFINE_PROP_UINT8("x-compress-level", MigrationState, parameters.compress_level, DEFAULT_MIGRATE_COMPRESS_LEVEL), - DEFINE_PROP_INT64("x-compress-threads", MigrationState, + DEFINE_PROP_UINT8("x-compress-threads", MigrationState, parameters.compress_threads, DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT), - DEFINE_PROP_INT64("x-decompress-threads", MigrationState, + DEFINE_PROP_UINT8("x-decompress-threads", MigrationState, parameters.decompress_threads, DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT), - DEFINE_PROP_INT64("x-cpu-throttle-initial", MigrationState, + DEFINE_PROP_UINT8("x-cpu-throttle-initial", MigrationState, parameters.cpu_throttle_initial, DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL), - DEFINE_PROP_INT64("x-cpu-throttle-increment", MigrationState, + DEFINE_PROP_UINT8("x-cpu-throttle-increment", MigrationState, parameters.cpu_throttle_increment, DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT), - DEFINE_PROP_INT64("x-max-bandwidth", MigrationState, + DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState, parameters.max_bandwidth, MAX_THROTTLE), - DEFINE_PROP_INT64("x-downtime-limit", MigrationState, + DEFINE_PROP_UINT64("x-downtime-limit", MigrationState, parameters.downtime_limit, DEFAULT_MIGRATE_SET_DOWNTIME), - DEFINE_PROP_INT64("x-checkpoint-delay", MigrationState, + DEFINE_PROP_UINT32("x-checkpoint-delay", MigrationState, parameters.x_checkpoint_delay, DEFAULT_MIGRATE_X_CHECKPOINT_DELAY), - DEFINE_PROP_INT64("x-multifd-channels", MigrationState, + DEFINE_PROP_UINT8("x-multifd-channels", MigrationState, parameters.x_multifd_channels, DEFAULT_MIGRATE_MULTIFD_CHANNELS), - DEFINE_PROP_INT64("x-multifd-page-count", MigrationState, + DEFINE_PROP_UINT32("x-multifd-page-count", MigrationState, parameters.x_multifd_page_count, DEFAULT_MIGRATE_MULTIFD_PAGE_COUNT), DEFINE_PROP_SIZE("xbzrle-cache-size", MigrationState, diff --git a/migration/migration.h b/migration/migration.h index 663415fe48..f2bc1aaf85 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -22,6 +22,8 @@ #include "hw/qdev.h" #include "io/channel.h" +struct PostcopyBlocktimeContext; + /* State for the incoming migration */ struct MigrationIncomingState { QEMUFile *from_src_file; @@ -59,10 +61,20 @@ struct MigrationIncomingState { /* The coroutine we should enter (back) after failover */ Coroutine *migration_incoming_co; QemuSemaphore colo_incoming_sem; + + /* + * PostcopyBlocktimeContext to keep information for postcopy + * live migration, to calculate vCPU block time + * */ + struct PostcopyBlocktimeContext *blocktime_ctx; }; MigrationIncomingState *migration_incoming_get_current(void); void migration_incoming_state_destroy(void); +/* + * Functions to work with blocktime context + */ +void fill_destination_postcopy_migration_info(MigrationInfo *info); #define TYPE_MIGRATION "migration" @@ -90,6 +102,17 @@ struct MigrationState QEMUBH *cleanup_bh; QEMUFile *to_dst_file; + /* bytes already send at the beggining of current interation */ + uint64_t iteration_initial_bytes; + /* time at the start of current iteration */ + int64_t iteration_start_time; + /* + * The final stage happens when the remaining data is smaller than + * this threshold; it's calculated from the requested downtime and + * measured bandwidth + */ + int64_t threshold_size; + /* params from 'migrate-set-parameters' */ MigrationParameters 
diff --git a/migration/migration.h b/migration/migration.h
index 663415fe48..f2bc1aaf85 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -22,6 +22,8 @@
 #include "hw/qdev.h"
 #include "io/channel.h"
 
+struct PostcopyBlocktimeContext;
+
 /* State for the incoming migration */
 struct MigrationIncomingState {
     QEMUFile *from_src_file;
@@ -59,10 +61,20 @@ struct MigrationIncomingState {
     /* The coroutine we should enter (back) after failover */
     Coroutine *migration_incoming_co;
     QemuSemaphore colo_incoming_sem;
+
+    /*
+     * PostcopyBlocktimeContext to keep information for postcopy
+     * live migration, to calculate vCPU block time
+     */
+    struct PostcopyBlocktimeContext *blocktime_ctx;
 };
 
 MigrationIncomingState *migration_incoming_get_current(void);
 void migration_incoming_state_destroy(void);
+/*
+ * Functions to work with blocktime context
+ */
+void fill_destination_postcopy_migration_info(MigrationInfo *info);
 
 #define TYPE_MIGRATION "migration"
 
@@ -90,6 +102,17 @@ struct MigrationState
     QEMUBH *cleanup_bh;
     QEMUFile *to_dst_file;
 
+    /* bytes already sent at the beginning of the current iteration */
+    uint64_t iteration_initial_bytes;
+    /* time at the start of the current iteration */
+    int64_t iteration_start_time;
+    /*
+     * The final stage happens when the remaining data is smaller than
+     * this threshold; it's calculated from the requested downtime and
+     * measured bandwidth
+     */
+    int64_t threshold_size;
+
     /* params from 'migrate-set-parameters' */
     MigrationParameters parameters;
 
@@ -103,11 +126,22 @@ struct MigrationState
     } rp_state;
 
     double mbps;
+    /* Timestamp when the most recent migration started (ms) */
+    int64_t start_time;
+    /* Total time used by the latest migration (ms) */
     int64_t total_time;
+    /* Timestamp (ms) when the VM was stopped to migrate the last data */
+    int64_t downtime_start;
     int64_t downtime;
     int64_t expected_downtime;
     bool enabled_capabilities[MIGRATION_CAPABILITY__MAX];
     int64_t setup_time;
+    /*
+     * Whether the guest was running when we entered the completion
+     * stage. If migration is interrupted for any reason, we need to
+     * continue running the guest on the source.
+     */
+    bool vm_was_running;
 
     /* Flag set once the migration has been asked to enter postcopy */
     bool start_postcopy;
@@ -201,6 +235,7 @@ int migrate_compress_level(void);
 int migrate_compress_threads(void);
 int migrate_decompress_threads(void);
 bool migrate_use_events(void);
+bool migrate_postcopy_blocktime(void);
 
 /* Sending on the return path - generic and then for each message type */
 void migrate_send_rp_shut(MigrationIncomingState *mis,
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index bec6c2c66b..7814da5b4b 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -61,6 +61,101 @@ struct PostcopyDiscardState {
 #include <sys/eventfd.h>
 #include <linux/userfaultfd.h>
 
+typedef struct PostcopyBlocktimeContext {
+    /* time when page fault initiated per vCPU */
+    int64_t *page_fault_vcpu_time;
+    /* page address per vCPU */
+    uintptr_t *vcpu_addr;
+    int64_t total_blocktime;
+    /* blocktime per vCPU */
+    int64_t *vcpu_blocktime;
+    /* point in time when last page fault was initiated */
+    int64_t last_begin;
+    /* number of vCPUs currently suspended */
+    int smp_cpus_down;
+
+    /*
+     * Handler for exit event, necessary for
+     * releasing whole blocktime_ctx
+     */
+    Notifier exit_notifier;
+} PostcopyBlocktimeContext;
+
+static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
+{
+    g_free(ctx->page_fault_vcpu_time);
+    g_free(ctx->vcpu_addr);
+    g_free(ctx->vcpu_blocktime);
+    g_free(ctx);
+}
+
+static void migration_exit_cb(Notifier *n, void *data)
+{
+    PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
+                                                 exit_notifier);
+    destroy_blocktime_context(ctx);
+}
+
+static struct PostcopyBlocktimeContext *blocktime_context_new(void)
+{
+    PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
+    ctx->page_fault_vcpu_time = g_new0(int64_t, smp_cpus);
+    ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
+    ctx->vcpu_blocktime = g_new0(int64_t, smp_cpus);
+
+    ctx->exit_notifier.notify = migration_exit_cb;
+    qemu_add_exit_notifier(&ctx->exit_notifier);
+    return ctx;
+}
+
+static int64List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
+{
+    int64List *list = NULL, *entry = NULL;
+    int i;
+
+    for (i = smp_cpus - 1; i >= 0; i--) {
+        entry = g_new0(int64List, 1);
+        entry->value = ctx->vcpu_blocktime[i];
+        entry->next = list;
+        list = entry;
+    }
+
+    return list;
+}
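get_vcpu_blocktime_list() builds the QAPI list back to front: walking the array
in reverse while prepending yields a singly linked list in forward order with
no tail pointer. The same idiom in isolation (plain C, without the QAPI types):

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct I64List { struct I64List *next; long long value; } I64List;

    static I64List *to_list(const long long *v, int n)
    {
        I64List *list = NULL;
        for (int i = n - 1; i >= 0; i--) {
            I64List *e = calloc(1, sizeof(*e));
            e->value = v[i];
            e->next = list;   /* prepend: e becomes the new head */
            list = e;
        }
        return list;
    }

    int main(void)
    {
        long long blocktime[] = {7, 0, 42};
        for (I64List *e = to_list(blocktime, 3); e; e = e->next) {
            printf("%lld\n", e->value);   /* prints 7, 0, 42 in order */
        }
        return 0;
    }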
+
+/*
+ * This function populates MigrationInfo from postcopy's blocktime
+ * context. It will not populate MigrationInfo unless the
+ * postcopy-blocktime capability was set.
+ *
+ * @info: pointer to MigrationInfo to populate
+ */
+void fill_destination_postcopy_migration_info(MigrationInfo *info)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
+
+    if (!bc) {
+        return;
+    }
+
+    info->has_postcopy_blocktime = true;
+    info->postcopy_blocktime = bc->total_blocktime;
+    info->has_postcopy_vcpu_blocktime = true;
+    info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
+}
+
+static uint64_t get_postcopy_total_blocktime(void)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
+
+    if (!bc) {
+        return 0;
+    }
+
+    return bc->total_blocktime;
+}
 
 /**
  * receive_ufd_features: check userfault fd features, to request only supported
@@ -153,6 +248,19 @@ static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
         }
     }
 
+#ifdef UFFD_FEATURE_THREAD_ID
+    if (migrate_postcopy_blocktime() && mis &&
+        UFFD_FEATURE_THREAD_ID & supported_features) {
+        /* kernel supports that feature */
+        /* don't create blocktime_context if it already exists */
+        if (!mis->blocktime_ctx) {
+            mis->blocktime_ctx = blocktime_context_new();
+        }
+
+        asked_features |= UFFD_FEATURE_THREAD_ID;
+    }
+#endif
+
     /*
      * request features, even if asked_features is 0, due to
      * kernel expects UFFD_API before UFFDIO_REGISTER, per
@@ -423,6 +531,9 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
         munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
         mis->postcopy_tmp_zero_page = NULL;
     }
+    trace_postcopy_ram_incoming_cleanup_blocktime(
+            get_postcopy_total_blocktime());
+
     trace_postcopy_ram_incoming_cleanup_exit();
     return 0;
 }
@@ -494,6 +605,142 @@ static int ram_block_enable_notify(const char *block_name, void *host_addr,
     return 0;
 }
 
+static int get_mem_fault_cpu_index(uint32_t pid)
+{
+    CPUState *cpu_iter;
+
+    CPU_FOREACH(cpu_iter) {
+        if (cpu_iter->thread_id == pid) {
+            trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
+            return cpu_iter->cpu_index;
+        }
+    }
+    trace_get_mem_fault_cpu_index(-1, pid);
+    return -1;
+}
+
+/*
+ * This function is called when a page fault occurs. It tracks the
+ * start of a vCPU's blocking time.
+ *
+ * @addr: faulted host virtual address
+ * @ptid: faulted process thread id
+ * @rb: ramblock appropriate to addr
+ */
+static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
+                                          RAMBlock *rb)
+{
+    int cpu, already_received;
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
+    int64_t now_ms;
+
+    if (!dc || ptid == 0) {
+        return;
+    }
+    cpu = get_mem_fault_cpu_index(ptid);
+    if (cpu < 0) {
+        return;
+    }
+
+    now_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+    if (dc->vcpu_addr[cpu] == 0) {
+        atomic_inc(&dc->smp_cpus_down);
+    }
+
+    atomic_xchg__nocheck(&dc->last_begin, now_ms);
+    atomic_xchg__nocheck(&dc->page_fault_vcpu_time[cpu], now_ms);
+    atomic_xchg__nocheck(&dc->vcpu_addr[cpu], addr);
+
+    /*
+     * Check it here, not at the beginning of the function, because
+     * this check could occur earlier than bitmap_set in
+     * qemu_ufd_copy_ioctl.
+     */
+    already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
+    if (already_received) {
+        atomic_xchg__nocheck(&dc->vcpu_addr[cpu], 0);
+        atomic_xchg__nocheck(&dc->page_fault_vcpu_time[cpu], 0);
+        atomic_dec(&dc->smp_cpus_down);
+    }
+    trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
+                                        cpu, already_received);
+}
+
+/*
+ * This function computes and traces the blocktime per vCPU; the total
+ * blocktime is also accumulated here, when every vCPU was blocked
+ * simultaneously.
+ *
+ * Assume we have 3 vCPUs:
+ *
+ *      S1        E1           S1               E1
+ * -----***********------------xxx***************------------------------> CPU1
+ *
+ *             S2                E2
+ * ------------****************xxx---------------------------------------> CPU2
+ *
+ *                         S3            E3
+ * ------------------------****xxx********-------------------------------> CPU3
+ *
+ * We have the sequence S1,S2,E1,S3,S1,E2,E3,E1
+ * S2,E1 - doesn't match the condition, because the sequence S1,S2,E1
+ * doesn't include CPU3
+ * S3,S1,E2 - this sequence includes all CPUs, so the overlap is S1,E2 -
+ * it's a part of the total blocktime.
+ * S1 - here is last_begin
+ * Legend of the picture:
+ *              * - blocktime per vCPU
+ *              x - overlapped blocktime (total blocktime)
+ *
+ * @addr: host virtual address
+ */
+static void mark_postcopy_blocktime_end(uintptr_t addr)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
+    int i, affected_cpu = 0;
+    int64_t now_ms;
+    bool vcpu_total_blocktime = false;
+    int64_t read_vcpu_time;
+
+    if (!dc) {
+        return;
+    }
+
+    now_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
+    /*
+     * Look up the vCPU in order to clear it. This algorithm is
+     * straightforward but not optimal; a better one would keep a tree
+     * or hash where the key is an address and the value is a list of
+     * vCPUs.
+     */
+    for (i = 0; i < smp_cpus; i++) {
+        uint64_t vcpu_blocktime = 0;
+
+        read_vcpu_time = atomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
+        if (atomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
+            read_vcpu_time == 0) {
+            continue;
+        }
+        atomic_xchg__nocheck(&dc->vcpu_addr[i], 0);
+        vcpu_blocktime = now_ms - read_vcpu_time;
+        affected_cpu += 1;
+        /*
+         * We need to know whether this call was for a faulted page;
+         * the other possibility is a prefetched page, in which case
+         * we shouldn't account total blocktime here.
+         */
+        if (!vcpu_total_blocktime &&
+            atomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
+            vcpu_total_blocktime = true;
+        }
+        /* Continue the cycle: one page could affect several vCPUs. */
+        dc->vcpu_blocktime[i] += vcpu_blocktime;
+    }
+
+    atomic_sub(&dc->smp_cpus_down, affected_cpu);
+    if (vcpu_total_blocktime) {
+        dc->total_blocktime += now_ms - atomic_fetch_add(&dc->last_begin, 0);
+    }
+    trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
+                                      affected_cpu);
+}
+
 /*
  * Handle faults detected by the USERFAULT markings
  */
@@ -571,8 +818,11 @@ static void *postcopy_ram_fault_thread(void *opaque)
         rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
         trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                 qemu_ram_get_idstr(rb),
-                                                rb_offset);
+                                                rb_offset,
+                                                msg.arg.pagefault.feat.ptid);
+
+        mark_postcopy_blocktime_begin((uintptr_t)(msg.arg.pagefault.address),
+                                      msg.arg.pagefault.feat.ptid, rb);
         /*
          * Send the request to the source - we want to request one
          * of our host page sizes (which is >= TPS)
@@ -662,6 +912,8 @@ static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr,
     if (!ret) {
         ramblock_recv_bitmap_set_range(rb, host_addr,
                                        pagesize / qemu_target_page_size());
+        mark_postcopy_blocktime_end((uintptr_t)host_addr);
+
     }
     return ret;
 }
@@ -759,6 +1011,10 @@ void *postcopy_get_tmp_page(MigrationIncomingState *mis)
 
 #else
 /* No target OS support, stubs just fail */
+void fill_destination_postcopy_migration_info(MigrationInfo *info)
+{
+}
+
 bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
 {
     error_report("%s: No OS support", __func__);
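To make the accounting above concrete, here is a toy model (plain C, not QEMU
code, no atomics) of one page arrival while all three vCPUs are blocked,
mirroring the "x" region of the diagram: per-vCPU blocktime accumulates each
vCPU's own fault window, while the total only grows by the overlap measured
from last_begin:

    #include <stdio.h>

    #define NCPU 3

    int main(void)
    {
        long long begin[NCPU] = {0};   /* fault begin per vCPU, 0 = running */
        long long per_cpu[NCPU] = {0};
        long long total = 0, last_begin = 0;
        int down = 0;

        /* S1 at t=10, S2 at t=12, S3 at t=15: now all three are blocked. */
        begin[0] = 10; down++; last_begin = 10;
        begin[1] = 12; down++; last_begin = 12;
        begin[2] = 15; down++; last_begin = 15;

        /* A page arrives at t=20 and unblocks CPU0 and CPU1 (E1, E2). */
        long long now = 20;
        int all_down = (down == NCPU); /* overlap counts only if all block */
        for (int i = 0; i < 2; i++) {
            per_cpu[i] += now - begin[i];  /* own windows: 10 and 8 ms */
            begin[i] = 0;
            down--;
        }
        if (all_down) {
            total += now - last_begin;     /* overlap "x": 20 - 15 = 5 ms */
        }

        printf("cpu0=%lld cpu1=%lld cpu2=%lld total=%lld\n",
               per_cpu[0], per_cpu[1], per_cpu[2], total);
        return 0;
    }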
diff --git a/migration/ram.c b/migration/ram.c
index 021d583b9b..cb1950f3eb 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -237,7 +237,8 @@ static RAMState *ram_state;
 
 uint64_t ram_bytes_remaining(void)
 {
-    return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE;
+    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
+                       0;
 }
 
 MigrationStats ram_counters;
diff --git a/migration/socket.c b/migration/socket.c
index dee869044a..3a8232dd2d 100644
--- a/migration/socket.c
+++ b/migration/socket.c
@@ -172,7 +172,6 @@ static void socket_start_incoming_migration(SocketAddress *saddr,
 
     if (qio_channel_socket_listen_sync(listen_ioc, saddr, errp) < 0) {
         object_unref(OBJECT(listen_ioc));
-        qapi_free_SocketAddress(saddr);
         return;
     }
 
@@ -181,7 +180,6 @@ static void socket_start_incoming_migration(SocketAddress *saddr,
                           socket_accept_incoming_migration,
                           listen_ioc,
                           (GDestroyNotify)object_unref);
-    qapi_free_SocketAddress(saddr);
 }
 
 void tcp_start_incoming_migration(const char *host_port, Error **errp)
@@ -191,6 +189,7 @@ void tcp_start_incoming_migration(const char *host_port, Error **errp)
     if (!err) {
         socket_start_incoming_migration(saddr, &err);
     }
+    qapi_free_SocketAddress(saddr);
     error_propagate(errp, err);
 }
 
@@ -198,4 +197,5 @@ void unix_start_incoming_migration(const char *path, Error **errp)
 {
     SocketAddress *saddr = unix_build_address(path);
     socket_start_incoming_migration(saddr, errp);
+    qapi_free_SocketAddress(saddr);
 }
diff --git a/migration/trace-events b/migration/trace-events
index 6f29fcc686..141e773305 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -115,6 +115,8 @@ process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"
 process_incoming_migration_co_postcopy_end_main(void) ""
 migration_set_incoming_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s"
 migration_set_outgoing_channel(void *ioc, const char *ioctype, const char *hostname) "ioc=%p ioctype=%s hostname=%s"
+mark_postcopy_blocktime_begin(uint64_t addr, void *dd, int64_t time, int cpu, int received) "addr: 0x%" PRIx64 ", dd: %p, time: %" PRId64 ", cpu: %d, already_received: %d"
+mark_postcopy_blocktime_end(uint64_t addr, void *dd, int64_t time, int affected_cpu) "addr: 0x%" PRIx64 ", dd: %p, time: %" PRId64 ", affected_cpu: %d"
 
 # migration/rdma.c
 qemu_rdma_accept_incoming_migration(void) ""
@@ -191,15 +193,17 @@ postcopy_ram_enable_notify(void) ""
 postcopy_ram_fault_thread_entry(void) ""
 postcopy_ram_fault_thread_exit(void) ""
 postcopy_ram_fault_thread_quit(void) ""
-postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) "Request for HVA=0x%" PRIx64 " rb=%s offset=0x%zx"
+postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset, uint32_t pid) "Request for HVA=0x%" PRIx64 " rb=%s offset=0x%zx pid=%u"
 postcopy_ram_incoming_cleanup_closeuf(void) ""
 postcopy_ram_incoming_cleanup_entry(void) ""
 postcopy_ram_incoming_cleanup_exit(void) ""
 postcopy_ram_incoming_cleanup_join(void) ""
+postcopy_ram_incoming_cleanup_blocktime(uint64_t total) "total blocktime %" PRIu64
 save_xbzrle_page_skipping(void) ""
 save_xbzrle_page_overflow(void) ""
 ram_save_iterate_big_wait(uint64_t milliseconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations"
 ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64
+get_mem_fault_cpu_index(int cpu, uint32_t pid) "cpu: %d, pid: %u"
 
 # migration/exec.c
 migration_exec_outgoing(const char *cmd) "cmd=%s"
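The socket.c hunks are an ownership fix: socket_start_incoming_migration used
to free the SocketAddress on some paths but not others, so the
tcp_/unix_start_incoming_migration callers now own saddr and free it exactly
once whether listening succeeded or failed. The pattern in isolation (plain C,
not QEMU code; a qapi_free-style destructor is assumed):

    #include <stdlib.h>

    typedef struct Addr { char *host; } Addr;

    static void addr_free(Addr *a)
    {
        if (a) {
            free(a->host);
            free(a);
        }
    }

    /* Borrows a; never frees it (the listen call copies what it needs). */
    static int start_listen(const Addr *a)
    {
        return (a && a->host) ? 0 : -1;
    }

    int incoming(Addr *a)
    {
        int ret = start_listen(a);
        addr_free(a);   /* one free on every path: no leak, no double free */
        return ret;
    }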