Diffstat (limited to 'migration')
-rw-r--r--  migration/migration.c    | 425
-rw-r--r--  migration/migration.h    |  35
-rw-r--r--  migration/postcopy-ram.c | 258
-rw-r--r--  migration/ram.c          |   3
-rw-r--r--  migration/socket.c       |   4
-rw-r--r--  migration/trace-events   |   6
6 files changed, 560 insertions, 171 deletions
diff --git a/migration/migration.c b/migration/migration.c
index 4de3b551fe..d3a1c494c0 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -132,6 +132,11 @@ void migration_object_init(void)
}
}
+void migration_object_finalize(void)
+{
+ object_unref(OBJECT(current_migration));
+}
+
/* For outgoing */
MigrationState *migrate_get_current(void)
{
@@ -591,14 +596,15 @@ static void populate_disk_info(MigrationInfo *info)
}
}
-MigrationInfo *qmp_query_migrate(Error **errp)
+static void fill_source_migration_info(MigrationInfo *info)
{
- MigrationInfo *info = g_malloc0(sizeof(*info));
MigrationState *s = migrate_get_current();
switch (s->state) {
case MIGRATION_STATUS_NONE:
/* no migration has happened ever */
+ /* do not overwrite destination migration status */
+ return;
break;
case MIGRATION_STATUS_SETUP:
info->has_status = true;
@@ -613,7 +619,7 @@ MigrationInfo *qmp_query_migrate(Error **errp)
info->has_status = true;
info->has_total_time = true;
info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME)
- - s->total_time;
+ - s->start_time;
info->has_expected_downtime = true;
info->expected_downtime = s->expected_downtime;
info->has_setup_time = true;
@@ -649,8 +655,6 @@ MigrationInfo *qmp_query_migrate(Error **errp)
break;
}
info->status = s->state;
-
- return info;
}
/**
@@ -714,6 +718,41 @@ static bool migrate_caps_check(bool *cap_list,
return true;
}
+static void fill_destination_migration_info(MigrationInfo *info)
+{
+ MigrationIncomingState *mis = migration_incoming_get_current();
+
+ switch (mis->state) {
+ case MIGRATION_STATUS_NONE:
+ return;
+ break;
+ case MIGRATION_STATUS_SETUP:
+ case MIGRATION_STATUS_CANCELLING:
+ case MIGRATION_STATUS_CANCELLED:
+ case MIGRATION_STATUS_ACTIVE:
+ case MIGRATION_STATUS_POSTCOPY_ACTIVE:
+ case MIGRATION_STATUS_FAILED:
+ case MIGRATION_STATUS_COLO:
+ info->has_status = true;
+ break;
+ case MIGRATION_STATUS_COMPLETED:
+ info->has_status = true;
+ fill_destination_postcopy_migration_info(info);
+ break;
+ }
+ info->status = mis->state;
+}
+
+MigrationInfo *qmp_query_migrate(Error **errp)
+{
+ MigrationInfo *info = g_malloc0(sizeof(*info));
+
+ fill_destination_migration_info(info);
+ fill_source_migration_info(info);
+
+ return info;
+}
+
void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
Error **errp)
{
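Editorial illustration (not part of the diff): with the source/destination split above, a single query-migrate now also reports destination-side postcopy blocktime. A hypothetical QMP exchange, assuming the postcopy-blocktime capability was enabled on the destination and with invented values:

-> { "execute": "migrate-set-capabilities", "arguments": {
       "capabilities": [ { "capability": "postcopy-blocktime", "state": true } ] } }
<- { "return": {} }
-> { "execute": "query-migrate" }
<- { "return": { "status": "completed", "postcopy-blocktime": 3,
       "postcopy-vcpu-blocktime": [ 1, 2, 0, 1 ] } }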
@@ -741,22 +780,20 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
static bool migrate_params_check(MigrationParameters *params, Error **errp)
{
if (params->has_compress_level &&
- (params->compress_level < 0 || params->compress_level > 9)) {
+ (params->compress_level > 9)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level",
"is invalid, it should be in the range of 0 to 9");
return false;
}
- if (params->has_compress_threads &&
- (params->compress_threads < 1 || params->compress_threads > 255)) {
+ if (params->has_compress_threads && (params->compress_threads < 1)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"compress_threads",
"is invalid, it should be in the range of 1 to 255");
return false;
}
- if (params->has_decompress_threads &&
- (params->decompress_threads < 1 || params->decompress_threads > 255)) {
+ if (params->has_decompress_threads && (params->decompress_threads < 1)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"decompress_threads",
"is invalid, it should be in the range of 1 to 255");
@@ -781,38 +818,31 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp)
return false;
}
- if (params->has_max_bandwidth &&
- (params->max_bandwidth < 0 || params->max_bandwidth > SIZE_MAX)) {
+ if (params->has_max_bandwidth && (params->max_bandwidth > SIZE_MAX)) {
error_setg(errp, "Parameter 'max_bandwidth' expects an integer in the"
" range of 0 to %zu bytes/second", SIZE_MAX);
return false;
}
if (params->has_downtime_limit &&
- (params->downtime_limit < 0 ||
- params->downtime_limit > MAX_MIGRATE_DOWNTIME)) {
+ (params->downtime_limit > MAX_MIGRATE_DOWNTIME)) {
error_setg(errp, "Parameter 'downtime_limit' expects an integer in "
"the range of 0 to %d milliseconds",
MAX_MIGRATE_DOWNTIME);
return false;
}
- if (params->has_x_checkpoint_delay && (params->x_checkpoint_delay < 0)) {
- error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
- "x_checkpoint_delay",
- "is invalid, it should be positive");
- return false;
- }
- if (params->has_x_multifd_channels &&
- (params->x_multifd_channels < 1 || params->x_multifd_channels > 255)) {
+ /* x_checkpoint_delay is unsigned now, so it can no longer be negative */
+
+ if (params->has_x_multifd_channels && (params->x_multifd_channels < 1)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"multifd_channels",
"is invalid, it should be in the range of 1 to 255");
return false;
}
if (params->has_x_multifd_page_count &&
- (params->x_multifd_page_count < 1 ||
- params->x_multifd_page_count > 10000)) {
+ (params->x_multifd_page_count < 1 ||
+ params->x_multifd_page_count > 10000)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"multifd_page_count",
"is invalid, it should be in the range of 1 to 10000");
@@ -1077,6 +1107,8 @@ static void migrate_fd_cleanup(void *opaque)
qemu_bh_delete(s->cleanup_bh);
s->cleanup_bh = NULL;
+ qemu_savevm_state_cleanup();
+
if (s->to_dst_file) {
Error *local_err = NULL;
@@ -1127,8 +1159,6 @@ void migrate_fd_error(MigrationState *s, const Error *error)
migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
MIGRATION_STATUS_FAILED);
migrate_set_error(s, error);
- notifier_list_notify(&migration_state_notifiers, s);
- block_cleanup_parameters(s);
}
static void migrate_fd_cancel(MigrationState *s)
@@ -1174,7 +1204,6 @@ static void migrate_fd_cancel(MigrationState *s)
s->block_inactive = false;
}
}
- block_cleanup_parameters(s);
}
void add_migration_state_change_notifier(Notifier *notify)
@@ -1268,7 +1297,11 @@ MigrationState *migrate_init(void)
migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);
- s->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+ s->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+ s->total_time = 0;
+ s->vm_was_running = false;
+ s->iteration_initial_bytes = 0;
+ s->threshold_size = 0;
return s;
}
@@ -1508,6 +1541,15 @@ bool migrate_zero_blocks(void)
return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS];
}
+bool migrate_postcopy_blocktime(void)
+{
+ MigrationState *s;
+
+ s = migrate_get_current();
+
+ return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME];
+}
+
bool migrate_use_compression(void)
{
MigrationState *s;
@@ -1843,7 +1885,7 @@ static int await_return_path_close_on_source(MigrationState *ms)
* Switch from normal iteration to postcopy
* Returns non-0 on error
*/
-static int postcopy_start(MigrationState *ms, bool *old_vm_running)
+static int postcopy_start(MigrationState *ms)
{
int ret;
QIOChannelBuffer *bioc;
@@ -1861,7 +1903,6 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running)
trace_postcopy_start_set_run();
qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
- *old_vm_running = runstate_is_running();
global_state_store();
ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
if (ret < 0) {
@@ -2051,21 +2092,17 @@ static int migration_maybe_pause(MigrationState *s,
* The caller 'breaks' the loop when this returns.
*
* @s: Current migration state
- * @current_active_state: The migration state we expect to be in
- * @*old_vm_running: Pointer to old_vm_running flag
- * @*start_time: Pointer to time to update
*/
-static void migration_completion(MigrationState *s, int current_active_state,
- bool *old_vm_running,
- int64_t *start_time)
+static void migration_completion(MigrationState *s)
{
int ret;
+ int current_active_state = s->state;
if (s->state == MIGRATION_STATUS_ACTIVE) {
qemu_mutex_lock_iothread();
- *start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+ s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
- *old_vm_running = runstate_is_running();
+ s->vm_was_running = runstate_is_running();
ret = global_state_store();
if (!ret) {
@@ -2152,6 +2189,155 @@ bool migrate_colo_enabled(void)
return s->enabled_capabilities[MIGRATION_CAPABILITY_X_COLO];
}
+static void migration_calculate_complete(MigrationState *s)
+{
+ uint64_t bytes = qemu_ftell(s->to_dst_file);
+ int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
+ s->total_time = end_time - s->start_time;
+ if (!s->downtime) {
+ /*
+ * It's still not set, so we are precopy migration. For
+ * postcopy, downtime is calculated during postcopy_start().
+ */
+ s->downtime = end_time - s->downtime_start;
+ }
+
+ if (s->total_time) {
+ s->mbps = ((double) bytes * 8.0) / s->total_time / 1000;
+ }
+}
+
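Editorial sanity check on the units above (invented numbers): bytes * 8.0 is bits, s->total_time is in milliseconds, so bits/ms equals kbit/s and the trailing /1000 yields Mbit/s. For example, bytes = 2147483648 (2 GiB) moved in total_time = 20000 ms gives (2147483648 * 8.0) / 20000 / 1000 ≈ 859 Mbit/s.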
+static void migration_update_counters(MigrationState *s,
+ int64_t current_time)
+{
+ uint64_t transferred, time_spent;
+ double bandwidth;
+
+ if (current_time < s->iteration_start_time + BUFFER_DELAY) {
+ return;
+ }
+
+ transferred = qemu_ftell(s->to_dst_file) - s->iteration_initial_bytes;
+ time_spent = current_time - s->iteration_start_time;
+ bandwidth = (double)transferred / time_spent;
+ s->threshold_size = bandwidth * s->parameters.downtime_limit;
+
+ s->mbps = (((double) transferred * 8.0) /
+ ((double) time_spent / 1000.0)) / 1000.0 / 1000.0;
+
+ /*
+ * if we haven't sent anything, we don't want to
+ * recalculate. 10000 is a small enough number for our purposes
+ */
+ if (ram_counters.dirty_pages_rate && transferred > 10000) {
+ s->expected_downtime = ram_counters.dirty_pages_rate *
+ qemu_target_page_size() / bandwidth;
+ }
+
+ qemu_file_reset_rate_limit(s->to_dst_file);
+
+ s->iteration_start_time = current_time;
+ s->iteration_initial_bytes = qemu_ftell(s->to_dst_file);
+
+ trace_migrate_transferred(transferred, time_spent,
+ bandwidth, s->threshold_size);
+}
+
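Worked example of the convergence math above (editorial, invented numbers): if 100 MB were sent in a 1000 ms window, bandwidth = 100000000 / 1000 = 100000 bytes/ms, so with downtime_limit = 300 ms the threshold becomes 100000 * 300 = 30 MB; migration enters its final stage once the remaining dirty data fits in that budget. Likewise, with dirty_pages_rate = 5000 pages/s and a 4096-byte target page, expected_downtime = 5000 * 4096 / 100000 ≈ 205, i.e. roughly 205 ms to retransmit one second's worth of newly dirtied memory.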
+/* Migration thread iteration status */
+typedef enum {
+ MIG_ITERATE_RESUME, /* Resume current iteration */
+ MIG_ITERATE_SKIP, /* Skip current iteration */
+ MIG_ITERATE_BREAK, /* Break the loop */
+} MigIterateState;
+
+/*
+ * Return the iteration status: resume the main loop, skip to the
+ * next iteration, or break out of it.
+ */
+static MigIterateState migration_iteration_run(MigrationState *s)
+{
+ uint64_t pending_size, pend_post, pend_nonpost;
+ bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
+
+ qemu_savevm_state_pending(s->to_dst_file, s->threshold_size,
+ &pend_nonpost, &pend_post);
+ pending_size = pend_nonpost + pend_post;
+
+ trace_migrate_pending(pending_size, s->threshold_size,
+ pend_post, pend_nonpost);
+
+ if (pending_size && pending_size >= s->threshold_size) {
+ /* Still a significant amount to transfer */
+ if (migrate_postcopy() && !in_postcopy &&
+ pend_nonpost <= s->threshold_size &&
+ atomic_read(&s->start_postcopy)) {
+ if (postcopy_start(s)) {
+ error_report("%s: postcopy failed to start", __func__);
+ }
+ return MIG_ITERATE_SKIP;
+ }
+ /* Just another iteration step */
+ qemu_savevm_state_iterate(s->to_dst_file,
+ s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
+ } else {
+ trace_migration_thread_low_pending(pending_size);
+ migration_completion(s);
+ return MIG_ITERATE_BREAK;
+ }
+
+ return MIG_ITERATE_RESUME;
+}
+
+static void migration_iteration_finish(MigrationState *s)
+{
+ /* If we enabled cpu throttling for auto-converge, turn it off. */
+ cpu_throttle_stop();
+
+ qemu_mutex_lock_iothread();
+ switch (s->state) {
+ case MIGRATION_STATUS_COMPLETED:
+ migration_calculate_complete(s);
+ runstate_set(RUN_STATE_POSTMIGRATE);
+ break;
+
+ case MIGRATION_STATUS_ACTIVE:
+ /*
+ * We should really assert here, but since it's during
+ * migration, let's try to reduce the usage of assertions.
+ */
+ if (!migrate_colo_enabled()) {
+ error_report("%s: critical error: calling COLO code without "
+ "COLO enabled", __func__);
+ }
+ migrate_start_colo_process(s);
+ /*
+ * Fixme: we will run VM in COLO no matter its old running state.
+ * After exited COLO, we will keep running.
+ */
+ s->vm_was_running = true;
+ /* Fallthrough */
+ case MIGRATION_STATUS_FAILED:
+ case MIGRATION_STATUS_CANCELLED:
+ if (s->vm_was_running) {
+ vm_start();
+ } else {
+ if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
+ runstate_set(RUN_STATE_POSTMIGRATE);
+ }
+ }
+ break;
+
+ default:
+ /* Should not reach here, but if so, forgive the VM. */
+ error_report("%s: Unknown ending state %d", __func__, s->state);
+ break;
+ }
+ qemu_bh_schedule(s->cleanup_bh);
+ qemu_mutex_unlock_iothread();
+}
+
/*
* Master migration thread on the source VM.
* It drives the migration and pumps the data down the outgoing channel.
@@ -2159,26 +2345,12 @@ bool migrate_colo_enabled(void)
static void *migration_thread(void *opaque)
{
MigrationState *s = opaque;
- /* Used by the bandwidth calcs, updated later */
- int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
- int64_t initial_bytes = 0;
- /*
- * The final stage happens when the remaining data is smaller than
- * this threshold; it's calculated from the requested downtime and
- * measured bandwidth
- */
- int64_t threshold_size = 0;
- int64_t start_time = initial_time;
- int64_t end_time;
- bool old_vm_running = false;
- bool entered_postcopy = false;
- /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
- enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
- bool enable_colo = migrate_colo_enabled();
rcu_register_thread();
+ s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
qemu_savevm_state_header(s->to_dst_file);
/*
@@ -2213,122 +2385,38 @@ static void *migration_thread(void *opaque)
while (s->state == MIGRATION_STATUS_ACTIVE ||
s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
int64_t current_time;
- uint64_t pending_size;
if (!qemu_file_rate_limit(s->to_dst_file)) {
- uint64_t pend_post, pend_nonpost;
-
- qemu_savevm_state_pending(s->to_dst_file, threshold_size,
- &pend_nonpost, &pend_post);
- pending_size = pend_nonpost + pend_post;
- trace_migrate_pending(pending_size, threshold_size,
- pend_post, pend_nonpost);
- if (pending_size && pending_size >= threshold_size) {
- /* Still a significant amount to transfer */
-
- if (migrate_postcopy() &&
- s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE &&
- pend_nonpost <= threshold_size &&
- atomic_read(&s->start_postcopy)) {
-
- if (!postcopy_start(s, &old_vm_running)) {
- current_active_state = MIGRATION_STATUS_POSTCOPY_ACTIVE;
- entered_postcopy = true;
- }
-
- continue;
- }
- /* Just another iteration step */
- qemu_savevm_state_iterate(s->to_dst_file, entered_postcopy);
- } else {
- trace_migration_thread_low_pending(pending_size);
- migration_completion(s, current_active_state,
- &old_vm_running, &start_time);
+ MigIterateState iter_state = migration_iteration_run(s);
+ if (iter_state == MIG_ITERATE_SKIP) {
+ continue;
+ } else if (iter_state == MIG_ITERATE_BREAK) {
break;
}
}
if (qemu_file_get_error(s->to_dst_file)) {
- migrate_set_state(&s->state, current_active_state,
- MIGRATION_STATUS_FAILED);
+ if (migration_is_setup_or_active(s->state)) {
+ migrate_set_state(&s->state, s->state,
+ MIGRATION_STATUS_FAILED);
+ }
trace_migration_thread_file_err();
break;
}
+
current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
- if (current_time >= initial_time + BUFFER_DELAY) {
- uint64_t transferred_bytes = qemu_ftell(s->to_dst_file) -
- initial_bytes;
- uint64_t time_spent = current_time - initial_time;
- double bandwidth = (double)transferred_bytes / time_spent;
- threshold_size = bandwidth * s->parameters.downtime_limit;
-
- s->mbps = (((double) transferred_bytes * 8.0) /
- ((double) time_spent / 1000.0)) / 1000.0 / 1000.0;
-
- trace_migrate_transferred(transferred_bytes, time_spent,
- bandwidth, threshold_size);
- /* if we haven't sent anything, we don't want to recalculate
- 10000 is a small enough number for our purposes */
- if (ram_counters.dirty_pages_rate && transferred_bytes > 10000) {
- s->expected_downtime = ram_counters.dirty_pages_rate *
- qemu_target_page_size() / bandwidth;
- }
- qemu_file_reset_rate_limit(s->to_dst_file);
- initial_time = current_time;
- initial_bytes = qemu_ftell(s->to_dst_file);
- }
+ migration_update_counters(s, current_time);
+
if (qemu_file_rate_limit(s->to_dst_file)) {
/* usleep expects microseconds */
- g_usleep((initial_time + BUFFER_DELAY - current_time)*1000);
+ g_usleep((s->iteration_start_time + BUFFER_DELAY -
+ current_time) * 1000);
}
}
trace_migration_thread_after_loop();
- /* If we enabled cpu throttling for auto-converge, turn it off. */
- cpu_throttle_stop();
- end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
-
- qemu_mutex_lock_iothread();
- /*
- * The resource has been allocated by migration will be reused in COLO
- * process, so don't release them.
- */
- if (!enable_colo) {
- qemu_savevm_state_cleanup();
- }
- if (s->state == MIGRATION_STATUS_COMPLETED) {
- uint64_t transferred_bytes = qemu_ftell(s->to_dst_file);
- s->total_time = end_time - s->total_time;
- if (!entered_postcopy) {
- s->downtime = end_time - start_time;
- }
- if (s->total_time) {
- s->mbps = (((double) transferred_bytes * 8.0) /
- ((double) s->total_time)) / 1000;
- }
- runstate_set(RUN_STATE_POSTMIGRATE);
- } else {
- if (s->state == MIGRATION_STATUS_ACTIVE && enable_colo) {
- migrate_start_colo_process(s);
- qemu_savevm_state_cleanup();
- /*
- * Fixme: we will run VM in COLO no matter its old running state.
- * After exited COLO, we will keep running.
- */
- old_vm_running = true;
- }
- if (old_vm_running && !entered_postcopy) {
- vm_start();
- } else {
- if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
- runstate_set(RUN_STATE_POSTMIGRATE);
- }
- }
- }
- qemu_bh_schedule(s->cleanup_bh);
- qemu_mutex_unlock_iothread();
-
+ migration_iteration_finish(s);
rcu_unregister_thread();
return NULL;
}
@@ -2375,10 +2463,15 @@ void migration_global_dump(Monitor *mon)
{
MigrationState *ms = migrate_get_current();
- monitor_printf(mon, "globals: store-global-state=%d, only_migratable=%d, "
- "send-configuration=%d, send-section-footer=%d\n",
- ms->store_global_state, ms->only_migratable,
- ms->send_configuration, ms->send_section_footer);
+ monitor_printf(mon, "globals:\n");
+ monitor_printf(mon, "store-global-state: %s\n",
+ ms->store_global_state ? "on" : "off");
+ monitor_printf(mon, "only-migratable: %s\n",
+ ms->only_migratable ? "on" : "off");
+ monitor_printf(mon, "send-configuration: %s\n",
+ ms->send_configuration ? "on" : "off");
+ monitor_printf(mon, "send-section-footer: %s\n",
+ ms->send_section_footer ? "on" : "off");
}
#define DEFINE_PROP_MIG_CAP(name, x) \
@@ -2394,33 +2487,33 @@ static Property migration_properties[] = {
send_section_footer, true),
/* Migration parameters */
- DEFINE_PROP_INT64("x-compress-level", MigrationState,
+ DEFINE_PROP_UINT8("x-compress-level", MigrationState,
parameters.compress_level,
DEFAULT_MIGRATE_COMPRESS_LEVEL),
- DEFINE_PROP_INT64("x-compress-threads", MigrationState,
+ DEFINE_PROP_UINT8("x-compress-threads", MigrationState,
parameters.compress_threads,
DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT),
- DEFINE_PROP_INT64("x-decompress-threads", MigrationState,
+ DEFINE_PROP_UINT8("x-decompress-threads", MigrationState,
parameters.decompress_threads,
DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT),
- DEFINE_PROP_INT64("x-cpu-throttle-initial", MigrationState,
+ DEFINE_PROP_UINT8("x-cpu-throttle-initial", MigrationState,
parameters.cpu_throttle_initial,
DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL),
- DEFINE_PROP_INT64("x-cpu-throttle-increment", MigrationState,
+ DEFINE_PROP_UINT8("x-cpu-throttle-increment", MigrationState,
parameters.cpu_throttle_increment,
DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT),
- DEFINE_PROP_INT64("x-max-bandwidth", MigrationState,
+ DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState,
parameters.max_bandwidth, MAX_THROTTLE),
- DEFINE_PROP_INT64("x-downtime-limit", MigrationState,
+ DEFINE_PROP_UINT64("x-downtime-limit", MigrationState,
parameters.downtime_limit,
DEFAULT_MIGRATE_SET_DOWNTIME),
- DEFINE_PROP_INT64("x-checkpoint-delay", MigrationState,
+ DEFINE_PROP_UINT32("x-checkpoint-delay", MigrationState,
parameters.x_checkpoint_delay,
DEFAULT_MIGRATE_X_CHECKPOINT_DELAY),
- DEFINE_PROP_INT64("x-multifd-channels", MigrationState,
+ DEFINE_PROP_UINT8("x-multifd-channels", MigrationState,
parameters.x_multifd_channels,
DEFAULT_MIGRATE_MULTIFD_CHANNELS),
- DEFINE_PROP_INT64("x-multifd-page-count", MigrationState,
+ DEFINE_PROP_UINT32("x-multifd-page-count", MigrationState,
parameters.x_multifd_page_count,
DEFAULT_MIGRATE_MULTIFD_PAGE_COUNT),
DEFINE_PROP_SIZE("xbzrle-cache-size", MigrationState,
diff --git a/migration/migration.h b/migration/migration.h
index 663415fe48..f2bc1aaf85 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -22,6 +22,8 @@
#include "hw/qdev.h"
#include "io/channel.h"
+struct PostcopyBlocktimeContext;
+
/* State for the incoming migration */
struct MigrationIncomingState {
QEMUFile *from_src_file;
@@ -59,10 +61,20 @@ struct MigrationIncomingState {
/* The coroutine we should enter (back) after failover */
Coroutine *migration_incoming_co;
QemuSemaphore colo_incoming_sem;
+
+ /*
+ * PostcopyBlocktimeContext holds the information postcopy live
+ * migration needs to calculate vCPU block time
+ */
+ struct PostcopyBlocktimeContext *blocktime_ctx;
};
MigrationIncomingState *migration_incoming_get_current(void);
void migration_incoming_state_destroy(void);
+/*
+ * Functions to work with blocktime context
+ */
+void fill_destination_postcopy_migration_info(MigrationInfo *info);
#define TYPE_MIGRATION "migration"
@@ -90,6 +102,17 @@ struct MigrationState
QEMUBH *cleanup_bh;
QEMUFile *to_dst_file;
+ /* bytes already sent at the beginning of the current iteration */
+ uint64_t iteration_initial_bytes;
+ /* time at the start of current iteration */
+ int64_t iteration_start_time;
+ /*
+ * The final stage happens when the remaining data is smaller than
+ * this threshold; it's calculated from the requested downtime and
+ * measured bandwidth
+ */
+ int64_t threshold_size;
+
/* params from 'migrate-set-parameters' */
MigrationParameters parameters;
@@ -103,11 +126,22 @@ struct MigrationState
} rp_state;
double mbps;
+ /* Timestamp (ms) when the most recent migration started */
+ int64_t start_time;
+ /* Total time used by latest migration (ms) */
int64_t total_time;
+ /* Timestamp (ms) when the VM was stopped to migrate the remaining data */
+ int64_t downtime_start;
int64_t downtime;
int64_t expected_downtime;
bool enabled_capabilities[MIGRATION_CAPABILITY__MAX];
int64_t setup_time;
+ /*
+ * Whether the guest was running when we entered the completion stage.
+ * If migration is interrupted for any reason, we need to continue
+ * running the guest on the source.
+ */
+ bool vm_was_running;
/* Flag set once the migration has been asked to enter postcopy */
bool start_postcopy;
@@ -201,6 +235,7 @@ int migrate_compress_level(void);
int migrate_compress_threads(void);
int migrate_decompress_threads(void);
bool migrate_use_events(void);
+bool migrate_postcopy_blocktime(void);
/* Sending on the return path - generic and then for each message type */
void migrate_send_rp_shut(MigrationIncomingState *mis,
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index bec6c2c66b..7814da5b4b 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -61,6 +61,101 @@ struct PostcopyDiscardState {
#include <sys/eventfd.h>
#include <linux/userfaultfd.h>
+typedef struct PostcopyBlocktimeContext {
+ /* time when the page fault was initiated, per vCPU */
+ int64_t *page_fault_vcpu_time;
+ /* page address per vCPU */
+ uintptr_t *vcpu_addr;
+ int64_t total_blocktime;
+ /* blocktime per vCPU */
+ int64_t *vcpu_blocktime;
+ /* point in time when last page fault was initiated */
+ int64_t last_begin;
+ /* number of vCPUs suspended */
+ int smp_cpus_down;
+
+ /*
+ * Handler for exit event, necessary for
+ * releasing the whole blocktime_ctx
+ */
+ Notifier exit_notifier;
+} PostcopyBlocktimeContext;
+
+static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
+{
+ g_free(ctx->page_fault_vcpu_time);
+ g_free(ctx->vcpu_addr);
+ g_free(ctx->vcpu_blocktime);
+ g_free(ctx);
+}
+
+static void migration_exit_cb(Notifier *n, void *data)
+{
+ PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
+ exit_notifier);
+ destroy_blocktime_context(ctx);
+}
+
+static struct PostcopyBlocktimeContext *blocktime_context_new(void)
+{
+ PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
+ ctx->page_fault_vcpu_time = g_new0(int64_t, smp_cpus);
+ ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
+ ctx->vcpu_blocktime = g_new0(int64_t, smp_cpus);
+
+ ctx->exit_notifier.notify = migration_exit_cb;
+ qemu_add_exit_notifier(&ctx->exit_notifier);
+ return ctx;
+}
+
+static int64List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
+{
+ int64List *list = NULL, *entry = NULL;
+ int i;
+
+ for (i = smp_cpus - 1; i >= 0; i--) {
+ entry = g_new0(int64List, 1);
+ entry->value = ctx->vcpu_blocktime[i];
+ entry->next = list;
+ list = entry;
+ }
+
+ return list;
+}
+
+/*
+ * This function populates MigrationInfo from postcopy's
+ * blocktime context. It does not touch MigrationInfo unless
+ * the postcopy-blocktime capability was set.
+ *
+ * @info: pointer to MigrationInfo to populate
+ */
+void fill_destination_postcopy_migration_info(MigrationInfo *info)
+{
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
+
+ if (!bc) {
+ return;
+ }
+
+ info->has_postcopy_blocktime = true;
+ info->postcopy_blocktime = bc->total_blocktime;
+ info->has_postcopy_vcpu_blocktime = true;
+ info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
+}
+
+static uint64_t get_postcopy_total_blocktime(void)
+{
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
+
+ if (!bc) {
+ return 0;
+ }
+
+ return bc->total_blocktime;
+}
/**
* receive_ufd_features: check userfault fd features, to request only supported
@@ -153,6 +248,19 @@ static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
}
}
+#ifdef UFFD_FEATURE_THREAD_ID
+ if (migrate_postcopy_blocktime() && mis &&
+ UFFD_FEATURE_THREAD_ID & supported_features) {
+ /* kernel supports that feature */
+ /* don't create blocktime_context if it exists */
+ if (!mis->blocktime_ctx) {
+ mis->blocktime_ctx = blocktime_context_new();
+ }
+
+ asked_features |= UFFD_FEATURE_THREAD_ID;
+ }
+#endif
+
/*
* request features, even if asked_features is 0, due to
* kernel expects UFFD_API before UFFDIO_REGISTER, per
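For context, the UFFD_API handshake that ufd_check_and_apply builds on looks roughly as follows; a minimal editorial sketch assuming a Linux userfaultfd file descriptor (uffd_request_features is an illustrative name, not code from the patch):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

static int uffd_request_features(int ufd, uint64_t asked_features)
{
    struct uffdio_api api_struct = {
        .api = UFFD_API,
        .features = asked_features,   /* e.g. UFFD_FEATURE_THREAD_ID */
    };

    /* the kernel requires UFFD_API before any UFFDIO_REGISTER */
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        return -1;                    /* fails if a feature is unsupported */
    }
    /* api_struct.features now reports what the kernel actually enabled */
    return 0;
}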
@@ -423,6 +531,9 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
mis->postcopy_tmp_zero_page = NULL;
}
+ trace_postcopy_ram_incoming_cleanup_blocktime(
+ get_postcopy_total_blocktime());
+
trace_postcopy_ram_incoming_cleanup_exit();
return 0;
}
@@ -494,6 +605,142 @@ static int ram_block_enable_notify(const char *block_name, void *host_addr,
return 0;
}
+static int get_mem_fault_cpu_index(uint32_t pid)
+{
+ CPUState *cpu_iter;
+
+ CPU_FOREACH(cpu_iter) {
+ if (cpu_iter->thread_id == pid) {
+ trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
+ return cpu_iter->cpu_index;
+ }
+ }
+ trace_get_mem_fault_cpu_index(-1, pid);
+ return -1;
+}
+
+/*
+ * This function is called when a page fault occurs. It
+ * tracks the vCPU blocking time.
+ *
+ * @addr: faulted host virtual address
+ * @ptid: faulted process thread id
+ * @rb: ramblock appropriate to addr
+ */
+static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
+ RAMBlock *rb)
+{
+ int cpu, already_received;
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
+ int64_t now_ms;
+
+ if (!dc || ptid == 0) {
+ return;
+ }
+ cpu = get_mem_fault_cpu_index(ptid);
+ if (cpu < 0) {
+ return;
+ }
+
+ now_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+ if (dc->vcpu_addr[cpu] == 0) {
+ atomic_inc(&dc->smp_cpus_down);
+ }
+
+ atomic_xchg__nocheck(&dc->last_begin, now_ms);
+ atomic_xchg__nocheck(&dc->page_fault_vcpu_time[cpu], now_ms);
+ atomic_xchg__nocheck(&dc->vcpu_addr[cpu], addr);
+
+ /* check it here, not at the beginning of the function,
+ * because the check could occur earlier than bitmap_set in
+ * qemu_ufd_copy_ioctl */
+ already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
+ if (already_received) {
+ atomic_xchg__nocheck(&dc->vcpu_addr[cpu], 0);
+ atomic_xchg__nocheck(&dc->page_fault_vcpu_time[cpu], 0);
+ atomic_dec(&dc->smp_cpus_down);
+ }
+ trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
+ cpu, already_received);
+}
+
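The late bitmap test just above guards against the following interleaving (editorial sketch, thread labels only, no code from the patch):

  fault thread                          receiving thread (qemu_ufd_copy_ioctl)
  ------------                          --------------------------------------
  reads the userfault event for addr A
                                        UFFDIO_COPY places page A
                                        ramblock_recv_bitmap_set_range(A, ...)
                                        mark_postcopy_blocktime_end(A)
  vcpu_addr[cpu] = A, smp_cpus_down++
  ramblock_recv_bitmap_test(A) -> true
  undo: vcpu_addr[cpu] = 0, smp_cpus_down--

Without the undo, mark_postcopy_blocktime_end would already have run and found no matching vcpu_addr, so the vCPU would be counted as blocked forever.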
+/*
+ * This function provides the calculated blocktime per vCPU and traces it.
+ * Total blocktime is calculated in mark_postcopy_blocktime_end.
+ *
+ *
+ * Assume we have 3 CPUs
+ *
+ * S1 E1 S1 E1
+ * -----***********------------xxx***************------------------------> CPU1
+ *
+ * S2 E2
+ * ------------****************xxx---------------------------------------> CPU2
+ *
+ * S3 E3
+ * ------------------------****xxx********-------------------------------> CPU3
+ *
+ * We have the sequence S1,S2,E1,S3,S1,E2,E3,E1
+ * S2,E1 - does not match the condition, because the sequence S1,S2,E1 does
+ * not include CPU3
+ * S3,S1,E2 - the sequence includes all CPUs, so the overlap S1,E2 is
+ * part of the total blocktime.
+ * S1 - here is last_begin
+ * Legend of the picture:
+ * * - means blocktime per vCPU
+ * x - means overlapped blocktime (total blocktime)
+ *
+ * @addr: host virtual address
+ */
+static void mark_postcopy_blocktime_end(uintptr_t addr)
+{
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
+ int i, affected_cpu = 0;
+ int64_t now_ms;
+ bool vcpu_total_blocktime = false;
+ int64_t read_vcpu_time;
+
+ if (!dc) {
+ return;
+ }
+
+ now_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
+ /* look up the cpu, to clear it;
+ * this algorithm looks straightforward, but it's not
+ * optimal: a better algorithm would keep a tree or hash
+ * where the key is the address and the value is a list of */
+ for (i = 0; i < smp_cpus; i++) {
+ uint64_t vcpu_blocktime = 0;
+
+ read_vcpu_time = atomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
+ if (atomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
+ read_vcpu_time == 0) {
+ continue;
+ }
+ atomic_xchg__nocheck(&dc->vcpu_addr[i], 0);
+ vcpu_blocktime = now_ms - read_vcpu_time;
+ affected_cpu += 1;
+ /* we need to know whether mark_postcopy_blocktime_end was called for
+ * a faulted page; the other possible case is a prefetched
+ * page, and in that case we shouldn't be here */
+ if (!vcpu_total_blocktime &&
+ atomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
+ vcpu_total_blocktime = true;
+ }
+ /* continue the loop, since one page could affect several vCPUs */
+ dc->vcpu_blocktime[i] += vcpu_blocktime;
+ }
+
+ atomic_sub(&dc->smp_cpus_down, affected_cpu);
+ if (vcpu_total_blocktime) {
+ dc->total_blocktime += now_ms - atomic_fetch_add(&dc->last_begin, 0);
+ }
+ trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
+ affected_cpu);
+}
+
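Editorial worked instance of the S1,S2,E1,S3,S1,E2,E3,E1 sequence pictured above (times in ms, invented): CPU1 faults at 100 (S1) and CPU2 at 120 (S2); CPU1's page arrives at 140 (E1), so vcpu_blocktime[0] += 40 while total_blocktime is untouched, since only two vCPUs were ever down together. CPU3 then faults at 150 (S3) and CPU1 again at 160 (S1, which updates last_begin); when CPU2's page arrives at 170 (E2) all three vCPUs are down, so vcpu_blocktime[1] += 50 and total_blocktime += 170 - 160 = 10. The later E3 and E1 arrivals add only to their per-vCPU counters, because smp_cpus_down has already dropped below smp_cpus.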
/*
* Handle faults detected by the USERFAULT markings
*/
@@ -571,8 +818,11 @@ static void *postcopy_ram_fault_thread(void *opaque)
rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
qemu_ram_get_idstr(rb),
- rb_offset);
+ rb_offset,
+ msg.arg.pagefault.feat.ptid);
+ mark_postcopy_blocktime_begin((uintptr_t)(msg.arg.pagefault.address),
+ msg.arg.pagefault.feat.ptid, rb);
/*
* Send the request to the source - we want to request one
* of our host page sizes (which is >= TPS)
@@ -662,6 +912,8 @@ static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr,
if (!ret) {
ramblock_recv_bitmap_set_range(rb, host_addr,
pagesize / qemu_target_page_size());
+ mark_postcopy_blocktime_end((uintptr_t)host_addr);
+
}
return ret;
}
@@ -759,6 +1011,10 @@ void *postcopy_get_tmp_page(MigrationIncomingState *mis)
#else
/* No target OS support, stubs just fail */
+void fill_destination_postcopy_migration_info(MigrationInfo *info)
+{
+}
+
bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
error_report("%s: No OS support", __func__);
diff --git a/migration/ram.c b/migration/ram.c
index 021d583b9b..cb1950f3eb 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -237,7 +237,8 @@ static RAMState *ram_state;
uint64_t ram_bytes_remaining(void)
{
- return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE;
+ return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
+ 0;
}
MigrationStats ram_counters;
diff --git a/migration/socket.c b/migration/socket.c
index dee869044a..3a8232dd2d 100644
--- a/migration/socket.c
+++ b/migration/socket.c
@@ -172,7 +172,6 @@ static void socket_start_incoming_migration(SocketAddress *saddr,
if (qio_channel_socket_listen_sync(listen_ioc, saddr, errp) < 0) {
object_unref(OBJECT(listen_ioc));
- qapi_free_SocketAddress(saddr);
return;
}
@@ -181,7 +180,6 @@ static void socket_start_incoming_migration(SocketAddress *saddr,
socket_accept_incoming_migration,
listen_ioc,
(GDestroyNotify)object_unref);
- qapi_free_SocketAddress(saddr);
}
void tcp_start_incoming_migration(const char *host_port, Error **errp)
@@ -191,6 +189,7 @@ void tcp_start_incoming_migration(const char *host_port, Error **errp)
if (!err) {
socket_start_incoming_migration(saddr, &err);
}
+ qapi_free_SocketAddress(saddr);
error_propagate(errp, err);
}
@@ -198,4 +197,5 @@ void unix_start_incoming_migration(const char *path, Error **errp)
{
SocketAddress *saddr = unix_build_address(path);
socket_start_incoming_migration(saddr, errp);
+ qapi_free_SocketAddress(saddr);
}
diff --git a/migration/trace-events b/migration/trace-events
index 6f29fcc686..141e773305 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -115,6 +115,8 @@ process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"
process_incoming_migration_co_postcopy_end_main(void) ""
migration_set_incoming_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s"
migration_set_outgoing_channel(void *ioc, const char *ioctype, const char *hostname) "ioc=%p ioctype=%s hostname=%s"
+mark_postcopy_blocktime_begin(uint64_t addr, void *dd, int64_t time, int cpu, int received) "addr: 0x%" PRIx64 ", dd: %p, time: %" PRId64 ", cpu: %d, already_received: %d"
+mark_postcopy_blocktime_end(uint64_t addr, void *dd, int64_t time, int affected_cpu) "addr: 0x%" PRIx64 ", dd: %p, time: %" PRId64 ", affected_cpu: %d"
# migration/rdma.c
qemu_rdma_accept_incoming_migration(void) ""
@@ -191,15 +193,17 @@ postcopy_ram_enable_notify(void) ""
postcopy_ram_fault_thread_entry(void) ""
postcopy_ram_fault_thread_exit(void) ""
postcopy_ram_fault_thread_quit(void) ""
-postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) "Request for HVA=0x%" PRIx64 " rb=%s offset=0x%zx"
+postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset, uint32_t pid) "Request for HVA=0x%" PRIx64 " rb=%s offset=0x%zx pid=%u"
postcopy_ram_incoming_cleanup_closeuf(void) ""
postcopy_ram_incoming_cleanup_entry(void) ""
postcopy_ram_incoming_cleanup_exit(void) ""
postcopy_ram_incoming_cleanup_join(void) ""
+postcopy_ram_incoming_cleanup_blocktime(uint64_t total) "total blocktime %" PRIu64
save_xbzrle_page_skipping(void) ""
save_xbzrle_page_overflow(void) ""
ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations"
ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64
+get_mem_fault_cpu_index(int cpu, uint32_t pid) "cpu: %d, pid: %u"
# migration/exec.c
migration_exec_outgoing(const char *cmd) "cmd=%s"