diff options
author | Peter Xu <peterx@redhat.com> | 2023-10-17 16:26:29 -0400 |
---|---|---|
committer | Juan Quintela <quintela@redhat.com> | 2023-11-02 11:35:03 +0100 |
commit | 7aa6070d09c5a6c83490599a3c564c64d7e2520a (patch) | |
tree | b9173cb1f3f58a8fc624cb9605494311f7d055db /migration | |
parent | e7b428d6bc06a0dfffa7e47fb3b3ee47f1c93499 (diff) |
migration: Refactor error handling in source return path
rp_state.error was a boolean used to show error happened in return path
thread. That's not only duplicating error reporting (migrate_set_error),
but also not good enough in that we only do error_report() and set it to
true, we never can keep a history of the exact error and show it in
query-migrate.
To make this better, a few things done:
- Use error_setg() rather than error_report() across the whole lifecycle
of return path thread, keeping the error in an Error*.
- With above, no need to have mark_source_rp_bad(), remove it, alongside
with rp_state.error itself.
- Use migrate_set_error() to apply that captured error to the global
migration object when error occured in this thread.
- Do the same when detected qemufile error in source return path
We need to re-export qemu_file_get_error_obj() to do the last one.
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Message-ID: <20231017202633.296756-2-peterx@redhat.com>
Diffstat (limited to 'migration')
-rw-r--r-- | migration/migration.c | 121 | ||||
-rw-r--r-- | migration/migration.h | 1 | ||||
-rw-r--r-- | migration/qemu-file.c | 2 | ||||
-rw-r--r-- | migration/qemu-file.h | 1 | ||||
-rw-r--r-- | migration/ram.c | 41 | ||||
-rw-r--r-- | migration/ram.h | 5 | ||||
-rw-r--r-- | migration/trace-events | 4 |
7 files changed, 80 insertions, 95 deletions
diff --git a/migration/migration.c b/migration/migration.c index 11c1490090..455ddc896a 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -99,7 +99,7 @@ static int migration_maybe_pause(MigrationState *s, int *current_active_state, int new_state); static void migrate_fd_cancel(MigrationState *s); -static int close_return_path_on_source(MigrationState *s); +static bool close_return_path_on_source(MigrationState *s); static void migration_downtime_start(MigrationState *s) { @@ -1475,7 +1475,6 @@ int migrate_init(MigrationState *s, Error **errp) s->to_dst_file = NULL; s->state = MIGRATION_STATUS_NONE; s->rp_state.from_dst_file = NULL; - s->rp_state.error = false; s->mbps = 0.0; s->pages_per_second = 0.0; s->downtime = 0; @@ -1883,16 +1882,6 @@ void qmp_migrate_continue(MigrationStatus state, Error **errp) qemu_sem_post(&s->pause_sem); } -/* migration thread support */ -/* - * Something bad happened to the RP stream, mark an error - * The caller shall print or trace something to indicate why - */ -static void mark_source_rp_bad(MigrationState *s) -{ - s->rp_state.error = true; -} - void migration_rp_wait(MigrationState *s) { qemu_sem_wait(&s->rp_state.rp_sem); @@ -1923,8 +1912,9 @@ static struct rp_cmd_args { * We're allowed to send more than requested (e.g. to round to our page size) * and we don't need to send pages that have already been sent. */ -static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname, - ram_addr_t start, size_t len) +static void +migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname, + ram_addr_t start, size_t len, Error **errp) { long our_host_ps = qemu_real_host_page_size(); @@ -1936,37 +1926,36 @@ static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname, */ if (!QEMU_IS_ALIGNED(start, our_host_ps) || !QEMU_IS_ALIGNED(len, our_host_ps)) { - error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT - " len: %zd", __func__, start, len); - mark_source_rp_bad(ms); + error_setg(errp, "MIG_RP_MSG_REQ_PAGES: Misaligned page request, start:" + RAM_ADDR_FMT " len: %zd", start, len); return; } - if (ram_save_queue_pages(rbname, start, len)) { - mark_source_rp_bad(ms); - } + ram_save_queue_pages(rbname, start, len, errp); } -static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name) +static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name, + Error **errp) { RAMBlock *block = qemu_ram_block_by_name(block_name); if (!block) { - error_report("%s: invalid block name '%s'", __func__, block_name); + error_setg(errp, "MIG_RP_MSG_RECV_BITMAP has invalid block name '%s'", + block_name); return -EINVAL; } /* Fetch the received bitmap and refresh the dirty bitmap */ - return ram_dirty_bitmap_reload(s, block); + return ram_dirty_bitmap_reload(s, block, errp); } -static int migrate_handle_rp_resume_ack(MigrationState *s, uint32_t value) +static int migrate_handle_rp_resume_ack(MigrationState *s, + uint32_t value, Error **errp) { trace_source_return_path_thread_resume_ack(value); if (value != MIGRATION_RESUME_ACK_VALUE) { - error_report("%s: illegal resume_ack value %"PRIu32, - __func__, value); + error_setg(errp, "illegal resume_ack value %"PRIu32, value); return -1; } @@ -2025,48 +2014,46 @@ static void *source_return_path_thread(void *opaque) uint32_t tmp32, sibling_error; ram_addr_t start = 0; /* =0 to silence warning */ size_t len = 0, expected_len; + Error *err = NULL; int res; trace_source_return_path_thread_entry(); rcu_register_thread(); - while (!ms->rp_state.error && !qemu_file_get_error(rp) && - migration_is_setup_or_active(ms->state)) { + while (migration_is_setup_or_active(ms->state)) { trace_source_return_path_thread_loop_top(); + header_type = qemu_get_be16(rp); header_len = qemu_get_be16(rp); if (qemu_file_get_error(rp)) { - mark_source_rp_bad(ms); + qemu_file_get_error_obj(rp, &err); goto out; } if (header_type >= MIG_RP_MSG_MAX || header_type == MIG_RP_MSG_INVALID) { - error_report("RP: Received invalid message 0x%04x length 0x%04x", - header_type, header_len); - mark_source_rp_bad(ms); + error_setg(&err, "Received invalid message 0x%04x length 0x%04x", + header_type, header_len); goto out; } if ((rp_cmd_args[header_type].len != -1 && header_len != rp_cmd_args[header_type].len) || header_len > sizeof(buf)) { - error_report("RP: Received '%s' message (0x%04x) with" - "incorrect length %d expecting %zu", - rp_cmd_args[header_type].name, header_type, header_len, - (size_t)rp_cmd_args[header_type].len); - mark_source_rp_bad(ms); + error_setg(&err, "Received '%s' message (0x%04x) with" + "incorrect length %d expecting %zu", + rp_cmd_args[header_type].name, header_type, header_len, + (size_t)rp_cmd_args[header_type].len); goto out; } /* We know we've got a valid header by this point */ res = qemu_get_buffer(rp, buf, header_len); if (res != header_len) { - error_report("RP: Failed reading data for message 0x%04x" - " read %d expected %d", - header_type, res, header_len); - mark_source_rp_bad(ms); + error_setg(&err, "Failed reading data for message 0x%04x" + " read %d expected %d", + header_type, res, header_len); goto out; } @@ -2076,8 +2063,7 @@ static void *source_return_path_thread(void *opaque) sibling_error = ldl_be_p(buf); trace_source_return_path_thread_shut(sibling_error); if (sibling_error) { - error_report("RP: Sibling indicated error %d", sibling_error); - mark_source_rp_bad(ms); + error_setg(&err, "Sibling indicated error %d", sibling_error); } /* * We'll let the main thread deal with closing the RP @@ -2095,7 +2081,10 @@ static void *source_return_path_thread(void *opaque) case MIG_RP_MSG_REQ_PAGES: start = ldq_be_p(buf); len = ldl_be_p(buf + 8); - migrate_handle_rp_req_pages(ms, NULL, start, len); + migrate_handle_rp_req_pages(ms, NULL, start, len, &err); + if (err) { + goto out; + } break; case MIG_RP_MSG_REQ_PAGES_ID: @@ -2110,32 +2099,32 @@ static void *source_return_path_thread(void *opaque) expected_len += tmp32; } if (header_len != expected_len) { - error_report("RP: Req_Page_id with length %d expecting %zd", - header_len, expected_len); - mark_source_rp_bad(ms); + error_setg(&err, "Req_Page_id with length %d expecting %zd", + header_len, expected_len); + goto out; + } + migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len, + &err); + if (err) { goto out; } - migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len); break; case MIG_RP_MSG_RECV_BITMAP: if (header_len < 1) { - error_report("%s: missing block name", __func__); - mark_source_rp_bad(ms); + error_setg(&err, "MIG_RP_MSG_RECV_BITMAP missing block name"); goto out; } /* Format: len (1B) + idstr (<255B). This ends the idstr. */ buf[buf[0] + 1] = '\0'; - if (migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1))) { - mark_source_rp_bad(ms); + if (migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1), &err)) { goto out; } break; case MIG_RP_MSG_RESUME_ACK: tmp32 = ldl_be_p(buf); - if (migrate_handle_rp_resume_ack(ms, tmp32)) { - mark_source_rp_bad(ms); + if (migrate_handle_rp_resume_ack(ms, tmp32, &err)) { goto out; } break; @@ -2151,13 +2140,15 @@ static void *source_return_path_thread(void *opaque) } out: - if (qemu_file_get_error(rp)) { + if (err) { + migrate_set_error(ms, err); + error_free(err); trace_source_return_path_thread_bad_end(); - mark_source_rp_bad(ms); } trace_source_return_path_thread_end(); rcu_unregister_thread(); + return NULL; } @@ -2179,12 +2170,11 @@ static int open_return_path_on_source(MigrationState *ms) return 0; } -static int close_return_path_on_source(MigrationState *ms) +/* Return true if error detected, or false otherwise */ +static bool close_return_path_on_source(MigrationState *ms) { - int ret; - if (!ms->rp_state.rp_thread_created) { - return 0; + return false; } trace_migration_return_path_end_before(); @@ -2202,18 +2192,13 @@ static int close_return_path_on_source(MigrationState *ms) } } - trace_await_return_path_close_on_source_joining(); qemu_thread_join(&ms->rp_state.rp_thread); ms->rp_state.rp_thread_created = false; - trace_await_return_path_close_on_source_close(); - - ret = ms->rp_state.error; - ms->rp_state.error = false; - migration_release_dst_files(ms); + trace_migration_return_path_end_after(); - trace_migration_return_path_end_after(ret); - return ret; + /* Return path will persist the error in MigrationState when quit */ + return migrate_has_error(ms); } static inline void diff --git a/migration/migration.h b/migration/migration.h index 5944107ad5..615b517594 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -308,7 +308,6 @@ struct MigrationState { /* Protected by qemu_file_lock */ QEMUFile *from_dst_file; QemuThread rp_thread; - bool error; /* * We can also check non-zero of rp_thread, but there's no "official" * way to do this, so this bool makes it slightly more elegant. diff --git a/migration/qemu-file.c b/migration/qemu-file.c index d64500310d..94231ff295 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -137,7 +137,7 @@ QEMUFile *qemu_file_new_input(QIOChannel *ioc) * * If errp is specified, a verbose error message will be copied over. */ -static int qemu_file_get_error_obj(QEMUFile *f, Error **errp) +int qemu_file_get_error_obj(QEMUFile *f, Error **errp) { if (!f->last_error) { return 0; diff --git a/migration/qemu-file.h b/migration/qemu-file.h index 1774116f79..8aec9fabf7 100644 --- a/migration/qemu-file.h +++ b/migration/qemu-file.h @@ -68,6 +68,7 @@ int coroutine_mixed_fn qemu_peek_byte(QEMUFile *f, int offset); void qemu_file_skip(QEMUFile *f, int size); int qemu_file_get_error_obj_any(QEMUFile *f1, QEMUFile *f2, Error **errp); void qemu_file_set_error_obj(QEMUFile *f, int ret, Error *err); +int qemu_file_get_error_obj(QEMUFile *f, Error **errp); void qemu_file_set_error(QEMUFile *f, int ret); int qemu_file_shutdown(QEMUFile *f); QEMUFile *qemu_file_get_return_path(QEMUFile *f); diff --git a/migration/ram.c b/migration/ram.c index 34724e8fe8..d05ffddbc8 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1898,7 +1898,8 @@ static void migration_page_queue_free(RAMState *rs) * @start: starting address from the start of the RAMBlock * @len: length (in bytes) to send */ -int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) +int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len, + Error **errp) { RAMBlock *ramblock; RAMState *rs = ram_state; @@ -1915,7 +1916,7 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) * Shouldn't happen, we can't reuse the last RAMBlock if * it's the 1st request. */ - error_report("ram_save_queue_pages no previous block"); + error_setg(errp, "MIG_RP_MSG_REQ_PAGES has no previous block"); return -1; } } else { @@ -1923,16 +1924,17 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) if (!ramblock) { /* We shouldn't be asked for a non-existent RAMBlock */ - error_report("ram_save_queue_pages no block '%s'", rbname); + error_setg(errp, "MIG_RP_MSG_REQ_PAGES has no block '%s'", rbname); return -1; } rs->last_req_rb = ramblock; } trace_ram_save_queue_pages(ramblock->idstr, start, len); if (!offset_in_ramblock(ramblock, start + len - 1)) { - error_report("%s request overrun start=" RAM_ADDR_FMT " len=" - RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, - __func__, start, len, ramblock->used_length); + error_setg(errp, "MIG_RP_MSG_REQ_PAGES request overrun, " + "start=" RAM_ADDR_FMT " len=" + RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, + start, len, ramblock->used_length); return -1; } @@ -1964,9 +1966,9 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) assert(len % page_size == 0); while (len) { if (ram_save_host_page_urgent(pss)) { - error_report("%s: ram_save_host_page_urgent() failed: " - "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, - __func__, ramblock->idstr, start); + error_setg(errp, "ram_save_host_page_urgent() failed: " + "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, + ramblock->idstr, start); ret = -1; break; } @@ -4110,7 +4112,7 @@ static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs) * This is only used when the postcopy migration is paused but wants * to resume from a middle point. */ -int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) +int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block, Error **errp) { int ret = -EINVAL; /* from_dst_file is always valid because we're within rp_thread */ @@ -4124,8 +4126,8 @@ int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) trace_ram_dirty_bitmap_reload_begin(block->idstr); if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) { - error_report("%s: incorrect state %s", __func__, - MigrationStatus_str(s->state)); + error_setg(errp, "Reload bitmap in incorrect state %s", + MigrationStatus_str(s->state)); return -EINVAL; } @@ -4142,9 +4144,8 @@ int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) /* The size of the bitmap should match with our ramblock */ if (size != local_size) { - error_report("%s: ramblock '%s' bitmap size mismatch " - "(0x%"PRIx64" != 0x%"PRIx64")", __func__, - block->idstr, size, local_size); + error_setg(errp, "ramblock '%s' bitmap size mismatch (0x%"PRIx64 + " != 0x%"PRIx64")", block->idstr, size, local_size); return -EINVAL; } @@ -4153,15 +4154,15 @@ int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) ret = qemu_file_get_error(file); if (ret || size != local_size) { - error_report("%s: read bitmap failed for ramblock '%s': %d" - " (size 0x%"PRIx64", got: 0x%"PRIx64")", - __func__, block->idstr, ret, local_size, size); + error_setg(errp, "read bitmap failed for ramblock '%s': %d" + " (size 0x%"PRIx64", got: 0x%"PRIx64")", + block->idstr, ret, local_size, size); return -EIO; } if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) { - error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64, - __func__, block->idstr, end_mark); + error_setg(errp, "ramblock '%s' end mark incorrect: 0x%"PRIx64, + block->idstr, end_mark); return -EINVAL; } diff --git a/migration/ram.h b/migration/ram.h index 9f3ad1ee81..6e2c2c1950 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -50,7 +50,8 @@ uint64_t ram_bytes_total(void); void mig_throttle_counter_reset(void); uint64_t ram_pagesize_summary(void); -int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len); +int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len, + Error **errp); void ram_postcopy_migrated_memory_release(MigrationState *ms); /* For outgoing discard bitmap */ void ram_postcopy_send_discard_bitmap(MigrationState *ms); @@ -70,7 +71,7 @@ void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr); void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, size_t nr); int64_t ramblock_recv_bitmap_send(QEMUFile *file, const char *block_name); -int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb); +int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb, Error **errp); bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start); void postcopy_preempt_shutdown_file(MigrationState *s); void *postcopy_preempt_thread(void *opaque); diff --git a/migration/trace-events b/migration/trace-events index e54f317e3b..de4a743c8a 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -150,8 +150,6 @@ multifd_tls_outgoing_handshake_complete(void *ioc) "ioc=%p" multifd_set_outgoing_channel(void *ioc, const char *ioctype, const char *hostname) "ioc=%p ioctype=%s hostname=%s" # migration.c -await_return_path_close_on_source_close(void) "" -await_return_path_close_on_source_joining(void) "" migrate_set_state(const char *new_state) "new state %s" migrate_fd_cleanup(void) "" migrate_fd_error(const char *error_desc) "error=%s" @@ -168,7 +166,7 @@ migration_completion_postcopy_end_after_complete(void) "" migration_rate_limit_pre(int ms) "%d ms" migration_rate_limit_post(int urgent) "urgent: %d" migration_return_path_end_before(void) "" -migration_return_path_end_after(int rp_error) "%d" +migration_return_path_end_after(void) "" migration_thread_after_loop(void) "" migration_thread_file_err(void) "" migration_thread_setup_complete(void) "" |