diff options
author | Peter Xu <peterx@redhat.com> | 2023-10-17 16:26:29 -0400 |
---|---|---|
committer | Juan Quintela <quintela@redhat.com> | 2023-11-02 11:35:03 +0100 |
commit | 7aa6070d09c5a6c83490599a3c564c64d7e2520a (patch) | |
tree | b9173cb1f3f58a8fc624cb9605494311f7d055db /migration/migration.c | |
parent | e7b428d6bc06a0dfffa7e47fb3b3ee47f1c93499 (diff) |
migration: Refactor error handling in source return path
rp_state.error was a boolean used to show error happened in return path
thread. That's not only duplicating error reporting (migrate_set_error),
but also not good enough in that we only do error_report() and set it to
true, we never can keep a history of the exact error and show it in
query-migrate.
To make this better, a few things done:
- Use error_setg() rather than error_report() across the whole lifecycle
of return path thread, keeping the error in an Error*.
- With above, no need to have mark_source_rp_bad(), remove it, alongside
with rp_state.error itself.
- Use migrate_set_error() to apply that captured error to the global
migration object when error occured in this thread.
- Do the same when detected qemufile error in source return path
We need to re-export qemu_file_get_error_obj() to do the last one.
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Message-ID: <20231017202633.296756-2-peterx@redhat.com>
Diffstat (limited to 'migration/migration.c')
-rw-r--r-- | migration/migration.c | 121 |
1 files changed, 53 insertions, 68 deletions
diff --git a/migration/migration.c b/migration/migration.c index 11c1490090..455ddc896a 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -99,7 +99,7 @@ static int migration_maybe_pause(MigrationState *s, int *current_active_state, int new_state); static void migrate_fd_cancel(MigrationState *s); -static int close_return_path_on_source(MigrationState *s); +static bool close_return_path_on_source(MigrationState *s); static void migration_downtime_start(MigrationState *s) { @@ -1475,7 +1475,6 @@ int migrate_init(MigrationState *s, Error **errp) s->to_dst_file = NULL; s->state = MIGRATION_STATUS_NONE; s->rp_state.from_dst_file = NULL; - s->rp_state.error = false; s->mbps = 0.0; s->pages_per_second = 0.0; s->downtime = 0; @@ -1883,16 +1882,6 @@ void qmp_migrate_continue(MigrationStatus state, Error **errp) qemu_sem_post(&s->pause_sem); } -/* migration thread support */ -/* - * Something bad happened to the RP stream, mark an error - * The caller shall print or trace something to indicate why - */ -static void mark_source_rp_bad(MigrationState *s) -{ - s->rp_state.error = true; -} - void migration_rp_wait(MigrationState *s) { qemu_sem_wait(&s->rp_state.rp_sem); @@ -1923,8 +1912,9 @@ static struct rp_cmd_args { * We're allowed to send more than requested (e.g. to round to our page size) * and we don't need to send pages that have already been sent. */ -static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname, - ram_addr_t start, size_t len) +static void +migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname, + ram_addr_t start, size_t len, Error **errp) { long our_host_ps = qemu_real_host_page_size(); @@ -1936,37 +1926,36 @@ static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname, */ if (!QEMU_IS_ALIGNED(start, our_host_ps) || !QEMU_IS_ALIGNED(len, our_host_ps)) { - error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT - " len: %zd", __func__, start, len); - mark_source_rp_bad(ms); + error_setg(errp, "MIG_RP_MSG_REQ_PAGES: Misaligned page request, start:" + RAM_ADDR_FMT " len: %zd", start, len); return; } - if (ram_save_queue_pages(rbname, start, len)) { - mark_source_rp_bad(ms); - } + ram_save_queue_pages(rbname, start, len, errp); } -static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name) +static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name, + Error **errp) { RAMBlock *block = qemu_ram_block_by_name(block_name); if (!block) { - error_report("%s: invalid block name '%s'", __func__, block_name); + error_setg(errp, "MIG_RP_MSG_RECV_BITMAP has invalid block name '%s'", + block_name); return -EINVAL; } /* Fetch the received bitmap and refresh the dirty bitmap */ - return ram_dirty_bitmap_reload(s, block); + return ram_dirty_bitmap_reload(s, block, errp); } -static int migrate_handle_rp_resume_ack(MigrationState *s, uint32_t value) +static int migrate_handle_rp_resume_ack(MigrationState *s, + uint32_t value, Error **errp) { trace_source_return_path_thread_resume_ack(value); if (value != MIGRATION_RESUME_ACK_VALUE) { - error_report("%s: illegal resume_ack value %"PRIu32, - __func__, value); + error_setg(errp, "illegal resume_ack value %"PRIu32, value); return -1; } @@ -2025,48 +2014,46 @@ static void *source_return_path_thread(void *opaque) uint32_t tmp32, sibling_error; ram_addr_t start = 0; /* =0 to silence warning */ size_t len = 0, expected_len; + Error *err = NULL; int res; trace_source_return_path_thread_entry(); rcu_register_thread(); - while (!ms->rp_state.error && !qemu_file_get_error(rp) && - migration_is_setup_or_active(ms->state)) { + while (migration_is_setup_or_active(ms->state)) { trace_source_return_path_thread_loop_top(); + header_type = qemu_get_be16(rp); header_len = qemu_get_be16(rp); if (qemu_file_get_error(rp)) { - mark_source_rp_bad(ms); + qemu_file_get_error_obj(rp, &err); goto out; } if (header_type >= MIG_RP_MSG_MAX || header_type == MIG_RP_MSG_INVALID) { - error_report("RP: Received invalid message 0x%04x length 0x%04x", - header_type, header_len); - mark_source_rp_bad(ms); + error_setg(&err, "Received invalid message 0x%04x length 0x%04x", + header_type, header_len); goto out; } if ((rp_cmd_args[header_type].len != -1 && header_len != rp_cmd_args[header_type].len) || header_len > sizeof(buf)) { - error_report("RP: Received '%s' message (0x%04x) with" - "incorrect length %d expecting %zu", - rp_cmd_args[header_type].name, header_type, header_len, - (size_t)rp_cmd_args[header_type].len); - mark_source_rp_bad(ms); + error_setg(&err, "Received '%s' message (0x%04x) with" + "incorrect length %d expecting %zu", + rp_cmd_args[header_type].name, header_type, header_len, + (size_t)rp_cmd_args[header_type].len); goto out; } /* We know we've got a valid header by this point */ res = qemu_get_buffer(rp, buf, header_len); if (res != header_len) { - error_report("RP: Failed reading data for message 0x%04x" - " read %d expected %d", - header_type, res, header_len); - mark_source_rp_bad(ms); + error_setg(&err, "Failed reading data for message 0x%04x" + " read %d expected %d", + header_type, res, header_len); goto out; } @@ -2076,8 +2063,7 @@ static void *source_return_path_thread(void *opaque) sibling_error = ldl_be_p(buf); trace_source_return_path_thread_shut(sibling_error); if (sibling_error) { - error_report("RP: Sibling indicated error %d", sibling_error); - mark_source_rp_bad(ms); + error_setg(&err, "Sibling indicated error %d", sibling_error); } /* * We'll let the main thread deal with closing the RP @@ -2095,7 +2081,10 @@ static void *source_return_path_thread(void *opaque) case MIG_RP_MSG_REQ_PAGES: start = ldq_be_p(buf); len = ldl_be_p(buf + 8); - migrate_handle_rp_req_pages(ms, NULL, start, len); + migrate_handle_rp_req_pages(ms, NULL, start, len, &err); + if (err) { + goto out; + } break; case MIG_RP_MSG_REQ_PAGES_ID: @@ -2110,32 +2099,32 @@ static void *source_return_path_thread(void *opaque) expected_len += tmp32; } if (header_len != expected_len) { - error_report("RP: Req_Page_id with length %d expecting %zd", - header_len, expected_len); - mark_source_rp_bad(ms); + error_setg(&err, "Req_Page_id with length %d expecting %zd", + header_len, expected_len); + goto out; + } + migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len, + &err); + if (err) { goto out; } - migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len); break; case MIG_RP_MSG_RECV_BITMAP: if (header_len < 1) { - error_report("%s: missing block name", __func__); - mark_source_rp_bad(ms); + error_setg(&err, "MIG_RP_MSG_RECV_BITMAP missing block name"); goto out; } /* Format: len (1B) + idstr (<255B). This ends the idstr. */ buf[buf[0] + 1] = '\0'; - if (migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1))) { - mark_source_rp_bad(ms); + if (migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1), &err)) { goto out; } break; case MIG_RP_MSG_RESUME_ACK: tmp32 = ldl_be_p(buf); - if (migrate_handle_rp_resume_ack(ms, tmp32)) { - mark_source_rp_bad(ms); + if (migrate_handle_rp_resume_ack(ms, tmp32, &err)) { goto out; } break; @@ -2151,13 +2140,15 @@ static void *source_return_path_thread(void *opaque) } out: - if (qemu_file_get_error(rp)) { + if (err) { + migrate_set_error(ms, err); + error_free(err); trace_source_return_path_thread_bad_end(); - mark_source_rp_bad(ms); } trace_source_return_path_thread_end(); rcu_unregister_thread(); + return NULL; } @@ -2179,12 +2170,11 @@ static int open_return_path_on_source(MigrationState *ms) return 0; } -static int close_return_path_on_source(MigrationState *ms) +/* Return true if error detected, or false otherwise */ +static bool close_return_path_on_source(MigrationState *ms) { - int ret; - if (!ms->rp_state.rp_thread_created) { - return 0; + return false; } trace_migration_return_path_end_before(); @@ -2202,18 +2192,13 @@ static int close_return_path_on_source(MigrationState *ms) } } - trace_await_return_path_close_on_source_joining(); qemu_thread_join(&ms->rp_state.rp_thread); ms->rp_state.rp_thread_created = false; - trace_await_return_path_close_on_source_close(); - - ret = ms->rp_state.error; - ms->rp_state.error = false; - migration_release_dst_files(ms); + trace_migration_return_path_end_after(); - trace_migration_return_path_end_after(ret); - return ret; + /* Return path will persist the error in MigrationState when quit */ + return migrate_has_error(ms); } static inline void |