about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Peter Maydell <peter.maydell@linaro.org> 2021-07-13 23:01:10 +0100
committer Peter Maydell <peter.maydell@linaro.org> 2021-07-13 23:01:10 +0100
commit 4598b0735025042c62e85a52e4c91fc0d50ec157 (patch)
tree fa8cdac0c9f98462264b8affe36cb7ef6bc5f189
parent 2a54fc454cf0dbf173d5dc95205febe381cfb7cc (diff)
parent 63268c4970a5f126cc9af75f3ccb8057abef5ec0 (diff)
Merge remote-tracking branch 'remotes/dgilbert-gitlab/tags/pull-migration-20210713a' into staging
Migration pull 2021-07-13

# gpg: Signature made Tue 13 Jul 2021 16:22:28 BST
# gpg: using RSA key 45F5C71B4A0CB7FB977A9FA90516331EBC5BFDE7
# gpg: Good signature from "Dr. David Alan Gilbert (RH2) <dgilbert@redhat.com>" [full]
# Primary key fingerprint: 45F5 C71B 4A0C B7FB 977A 9FA9 0516 331E BC5B FDE7

* remotes/dgilbert-gitlab/tags/pull-migration-20210713a:
  migration: Move bitmap_mutex out of migration_bitmap_clear_dirty()
  migration: Clear error at entry of migrate_fd_connect()
  migration: Don't do migrate cleanup if during postcopy resume
  migration: Release return path early for paused postcopy
  migration: failover: emit a warning when the card is not fully unplugged
  migration/rdma: prevent from double free the same mr

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r-- migration/migration.c 41
-rw-r--r-- migration/ram.c 13
-rw-r--r-- migration/rdma.c 1
3 files changed, 48 insertions, 7 deletions
diff --git a/migration/migration.c b/migration/migration.c
index 5ff7ba9d5c..2d306582eb 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1855,6 +1855,15 @@ void migrate_set_error(MigrationState *s, const Error *error)
}
}
+static void migrate_error_free(MigrationState *s)
+{
+ QEMU_LOCK_GUARD(&s->error_mutex);
+ if (s->error) {
+ error_free(s->error);
+ s->error = NULL;
+ }
+}
+
void migrate_fd_error(MigrationState *s, const Error *error)
{
trace_migrate_fd_error(error_get_pretty(error));
@@ -2818,12 +2827,12 @@ out:
* Maybe there is something we can do: it looks like a
* network down issue, and we pause for a recovery.
*/
+ qemu_fclose(rp);
+ ms->rp_state.from_dst_file = NULL;
+ rp = NULL;
if (postcopy_pause_return_path_thread(ms)) {
/* Reload rp, reset the rest */
- if (rp != ms->rp_state.from_dst_file) {
- qemu_fclose(rp);
- rp = ms->rp_state.from_dst_file;
- }
+ rp = ms->rp_state.from_dst_file;
ms->rp_state.error = false;
goto retry;
}
@@ -3701,6 +3710,10 @@ static void qemu_savevm_wait_unplug(MigrationState *s, int old_state,
while (timeout-- && qemu_savevm_state_guest_unplug_pending()) {
qemu_sem_timedwait(&s->wait_unplug_sem, 250);
}
+ if (qemu_savevm_state_guest_unplug_pending()) {
+ warn_report("migration: partially unplugged device on "
+ "failure");
+ }
}
migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, new_state);
@@ -3966,6 +3979,13 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
int64_t rate_limit;
bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
+ /*
+ * If there's a previous error, free it and prepare for another one.
+ * Also, if migration then completes successfully, no stale error will
+ * be dumped when migrate_fd_cleanup() is called.
+ */
+ migrate_error_free(s);
+
s->expected_downtime = s->parameters.downtime_limit;
if (resume) {
assert(s->cleanup_bh);
@@ -3975,7 +3995,18 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
}
if (error_in) {
migrate_fd_error(s, error_in);
- migrate_fd_cleanup(s);
+ if (resume) {
+ /*
+ * Don't do cleanup for resume if channel is invalid, but only dump
+ * the error. We wait for another channel connect from the user.
+ * The error_report still gives HMP user a hint on what failed.
+ * It's normally done in migrate_fd_cleanup(), but call it here
+ * explicitly.
+ */
+ error_report_err(error_copy(s->error));
+ } else {
+ migrate_fd_cleanup(s);
+ }
return;
}
diff --git a/migration/ram.c b/migration/ram.c
index 88ff34f574..b5fc454b2f 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -795,8 +795,6 @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs,
{
bool ret;
- QEMU_LOCK_GUARD(&rs->bitmap_mutex);
-
/*
* Clear dirty bitmap if needed. This _must_ be called before we
* send any of the page in the chunk because we need to make sure
@@ -2834,6 +2832,14 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
goto out;
}
+ /*
+ * We'll hold this lock for a fairly long time, but that's okay for two
+ * reasons. Firstly, the only other thread that can take it is the one
+ * calling qemu_guest_free_page_hint(), which should be rare; secondly,
+ * see MAX_WAIT below (if curious, see also commit 4508bd9ed8053ce), which
+ * guarantees that we'll at least release it on a regular basis.
+ */
+ qemu_mutex_lock(&rs->bitmap_mutex);
WITH_RCU_READ_LOCK_GUARD() {
if (ram_list.version != rs->last_version) {
ram_state_reset(rs);
@@ -2893,6 +2899,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
i++;
}
}
+ qemu_mutex_unlock(&rs->bitmap_mutex);
/*
* Must occur before EOS (or any QEMUFile operation)
@@ -3682,6 +3689,7 @@ void colo_flush_ram_cache(void)
unsigned long offset = 0;
memory_global_dirty_log_sync();
+ qemu_mutex_lock(&ram_state->bitmap_mutex);
WITH_RCU_READ_LOCK_GUARD() {
RAMBLOCK_FOREACH_NOT_IGNORED(block) {
ramblock_sync_dirty_bitmap(ram_state, block);
@@ -3710,6 +3718,7 @@ void colo_flush_ram_cache(void)
}
}
trace_colo_flush_ram_cache_end();
+ qemu_mutex_unlock(&ram_state->bitmap_mutex);
}
/**
diff --git a/migration/rdma.c b/migration/rdma.c
index 38a099f7ee..5c2d113aa9 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1143,6 +1143,7 @@ static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
for (i--; i >= 0; i--) {
ibv_dereg_mr(local->block[i].mr);
+ local->block[i].mr = NULL;
rdma->total_registrations--;
}