aboutsummaryrefslogtreecommitdiff
path: root/migration/migration.c
diff options
context:
space:
mode:
authorPeter Xu <peterx@redhat.com>2023-02-08 15:28:13 -0500
committerJuan Quintela <quintela@redhat.com>2023-02-11 16:51:09 +0100
commit5655aab0794b5c82e61683cab215c5f745be8af3 (patch)
treef65486f6c63edc3d327dabfee346319214fab79c /migration/migration.c
parentb28fb58227aa88b940fd45b31b0f66c8e3b8cdc0 (diff)
migration: Postpone postcopy preempt channel to be after main
Postcopy with preempt-mode enabled needs two channels to communicate. The order of channel establishment is not guaranteed. It can happen that the dest QEMU got the preempt channel connection request before the main channel is established, then the migration may make no progress even during precopy due to the wrong order. To fix it, create the preempt channel only if we know the main channel is established. For a general postcopy migration, we delay it until postcopy_start(), that's where we already went through some part of precopy on the main channel. To make sure dest QEMU has already established the channel, we wait until we got the first PONG received. That's something we do at the start of precopy when postcopy enabled so it's guaranteed to happen sooner or later. For a postcopy recovery, we delay it to qemu_savevm_state_resume_prepare() where we'll have round trips of data on bitmap synchronizations, which means the main channel must have been established. Signed-off-by: Peter Xu <peterx@redhat.com> Reviewed-by: Juan Quintela <quintela@redhat.com> Signed-off-by: Juan Quintela <quintela@redhat.com>
Diffstat (limited to 'migration/migration.c')
-rw-r--r--migration/migration.c72
1 files changed, 55 insertions, 17 deletions
diff --git a/migration/migration.c b/migration/migration.c
index a2e362541d..a5c22e327d 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -235,6 +235,8 @@ void migration_object_init(void)
qemu_sem_init(&current_incoming->postcopy_pause_sem_dst, 0);
qemu_sem_init(&current_incoming->postcopy_pause_sem_fault, 0);
qemu_sem_init(&current_incoming->postcopy_pause_sem_fast_load, 0);
+ qemu_sem_init(&current_incoming->postcopy_qemufile_dst_done, 0);
+
qemu_mutex_init(&current_incoming->page_request_mutex);
current_incoming->page_requested = g_tree_new(page_request_addr_cmp);
@@ -737,6 +739,31 @@ void migration_fd_process_incoming(QEMUFile *f, Error **errp)
migration_incoming_process();
}
+/*
+ * Returns true when we want to start a new incoming migration process,
+ * false otherwise.
+ */
+static bool migration_should_start_incoming(bool main_channel)
+{
+ /* Multifd doesn't start unless all channels are established */
+ if (migrate_use_multifd()) {
+ return migration_has_all_channels();
+ }
+
+ /* Preempt channel only starts when the main channel is created */
+ if (migrate_postcopy_preempt()) {
+ return main_channel;
+ }
+
+ /*
+ * For all the rest types of migration, we should only reach here when
+ * it's the main channel that's being created, and we should always
+ * proceed with this channel.
+ */
+ assert(main_channel);
+ return true;
+}
+
void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
{
MigrationIncomingState *mis = migration_incoming_get_current();
@@ -798,7 +825,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
}
}
- if (migration_has_all_channels()) {
+ if (migration_should_start_incoming(default_channel)) {
/* If it's a recovery, we're done */
if (postcopy_try_recover()) {
return;
@@ -3159,6 +3186,13 @@ static int await_return_path_close_on_source(MigrationState *ms)
return ms->rp_state.error;
}
+static inline void
+migration_wait_main_channel(MigrationState *ms)
+{
+ /* Wait until one PONG message received */
+ qemu_sem_wait(&ms->rp_state.rp_pong_acks);
+}
+
/*
* Switch from normal iteration to postcopy
* Returns non-0 on error
@@ -3173,9 +3207,12 @@ static int postcopy_start(MigrationState *ms)
bool restart_block = false;
int cur_state = MIGRATION_STATUS_ACTIVE;
- if (postcopy_preempt_wait_channel(ms)) {
- migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILED);
- return -1;
+ if (migrate_postcopy_preempt()) {
+ migration_wait_main_channel(ms);
+ if (postcopy_preempt_establish_channel(ms)) {
+ migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILED);
+ return -1;
+ }
}
if (!migrate_pause_before_switchover()) {
@@ -3587,6 +3624,20 @@ static int postcopy_do_resume(MigrationState *s)
}
/*
+ * If preempt is enabled, re-establish the preempt channel. Note that
+ * we do it after resume prepare to make sure the main channel will be
+ * created before the preempt channel. E.g. with weak network, the
+ * dest QEMU may get messed up with the preempt and main channels on
+ * the order of connection setup. This guarantees the correct order.
+ */
+ ret = postcopy_preempt_establish_channel(s);
+ if (ret) {
+ error_report("%s: postcopy_preempt_establish_channel(): %d",
+ __func__, ret);
+ return ret;
+ }
+
+ /*
* Last handshake with destination on the resume (destination will
* switch to postcopy-active afterwards)
*/
@@ -3647,14 +3698,6 @@ static MigThrError postcopy_pause(MigrationState *s)
if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
/* Woken up by a recover procedure. Give it a shot */
- if (postcopy_preempt_wait_channel(s)) {
- /*
- * Preempt enabled, and new channel create failed; loop
- * back to wait for another recovery.
- */
- continue;
- }
-
/*
* Firstly, let's wake up the return path now, with a new
* return path channel.
@@ -4347,11 +4390,6 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
}
}
- /* This needs to be done before resuming a postcopy */
- if (migrate_postcopy_preempt()) {
- postcopy_preempt_setup(s);
- }
-
if (resume) {
/* Wakeup the main migration thread to do the recovery */
migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED,