| author | Peter Maydell <peter.maydell@linaro.org> | 2022-12-15 14:52:12 +0000 |
|---|---|---|
| committer | Peter Maydell <peter.maydell@linaro.org> | 2022-12-15 14:52:13 +0000 |
| commit | 928eac953918cbd9b237d7cb8b937a6fc575d009 (patch) | |
| tree | 1ff8dc7745a748859b089739e1076a205100b380 /migration | |
| parent | 48804eebd4a327e4b11f902ba80a00876ee53a43 (diff) | |
| parent | 7f401b80445e8746202a6d643410ba1b9eeb3cb1 (diff) | |
Merge tag 'next-8.0-pull-request' of https://gitlab.com/juan.quintela/qemu into staging
Migration patches for 8.0
Hi
These are the patches that I had to drop from the last pull request because they weren't fixes:
- The AVX2 patch is dropped; Intel posted a fix and I have to redo it
- The fix for out-of-order channels is out: Daniel NACKed it and I need to redo it
# gpg: Signature made Thu 15 Dec 2022 09:38:29 GMT
# gpg: using RSA key 1899FF8EDEBF58CCEE034B82F487EF185872D723
# gpg: Good signature from "Juan Quintela <quintela@redhat.com>" [full]
# gpg: aka "Juan Quintela <quintela@trasno.org>" [full]
# Primary key fingerprint: 1899 FF8E DEBF 58CC EE03 4B82 F487 EF18 5872 D723
* tag 'next-8.0-pull-request' of https://gitlab.com/juan.quintela/qemu:
migration: Drop rs->f
migration: Remove old preempt code around state maintainance
migration: Send requested page directly in rp-return thread
migration: Move last_sent_block into PageSearchStatus
migration: Make PageSearchStatus part of RAMState
migration: Add pss_init()
migration: Introduce pss_channel
migration: Teach PSS about host page
migration: Use atomic ops properly for page accountings
migration: Yield bitmap_mutex properly when sending/sleeping
migration: Remove RAMState.f references in compression code
migration: Trivial cleanup save_page_header() on same block check
migration: Cleanup xbzrle zero page cache update logic
migration: Add postcopy_preempt_active()
migration: Take bitmap mutex when completing ram migration
migration: Export ram_release_page()
migration: Export ram_transferred_ram()
multifd: Create page_count fields into both MultiFD{Recv,Send}Params
multifd: Create page_size fields into both MultiFD{Recv,Send}Params
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
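The change most visible throughout the diff below comes from "migration: Use atomic ops properly for page accountings": counters that several threads can bump concurrently (the multifd sender threads, the return-path thread) move out of plain `ram_counters.xxx += n` updates into a separate `MigrationAtomicStats` structure updated with `stat64_add()` and read with `stat64_get()`. Below is a minimal standalone sketch of the same pattern using C11 atomics; QEMU itself uses its `Stat64` type from `qemu/stats64.h`, and the names here are illustrative only, not the QEMU API.

```c
#include <inttypes.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for QEMU's MigrationAtomicStats (Stat64 fields). */
typedef struct {
    _Atomic uint64_t transferred;
    _Atomic uint64_t duplicate;
    _Atomic uint64_t normal;
    _Atomic uint64_t postcopy_bytes;
} AtomicStats;

static AtomicStats stats;

/* Any sender thread can account bytes without holding a lock. */
static void stats_add_transferred(uint64_t bytes)
{
    atomic_fetch_add_explicit(&stats.transferred, bytes, memory_order_relaxed);
}

/* Query side (e.g. servicing query-migrate) reads a plain snapshot. */
static uint64_t stats_get_transferred(void)
{
    return atomic_load_explicit(&stats.transferred, memory_order_relaxed);
}

int main(void)
{
    stats_add_transferred(4096);
    printf("transferred=%" PRIu64 "\n", stats_get_transferred());
    return 0;
}
```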
Diffstat (limited to 'migration')
| -rw-r--r-- | migration/migration.c | 47 |
| -rw-r--r-- | migration/migration.h | 7 |
| -rw-r--r-- | migration/multifd-zlib.c | 14 |
| -rw-r--r-- | migration/multifd-zstd.c | 12 |
| -rw-r--r-- | migration/multifd.c | 27 |
| -rw-r--r-- | migration/multifd.h | 8 |
| -rw-r--r-- | migration/ram.c | 729 |
| -rw-r--r-- | migration/ram.h | 23 |
8 files changed, 419 insertions, 448 deletions
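The two multifd patches at the end of the list cache the guest page size and the per-packet page count in `MultiFD{Send,Recv}Params` at setup time (`p->page_size = qemu_target_page_size()`, `p->page_count = page_count`), so the nocomp/zlib/zstd send and receive hooks in the diff below stop recomputing them for every packet. A hedged sketch of that setup-once/use-in-hot-path split, with simplified stand-in types rather than the exact QEMU structures:

```c
#include <stdint.h>

#define PACKET_SIZE (512 * 1024)      /* stand-in for MULTIFD_PACKET_SIZE */

/* Simplified stand-in for MultiFDSendParams. */
typedef struct {
    uint32_t page_size;   /* guest page size, cached once at setup */
    uint32_t page_count;  /* number of pages in a full packet */
    uint32_t normal_num;  /* pages queued in the current packet */
} SendParams;

/* Channel setup: compute the per-channel constants a single time. */
static void channel_setup(SendParams *p, uint32_t target_page_size)
{
    p->page_size = target_page_size;
    p->page_count = PACKET_SIZE / target_page_size;
}

/* Hot path: the per-packet code only reads the cached fields. */
static uint32_t packet_payload_bytes(const SendParams *p)
{
    return p->normal_num * p->page_size;
}

int main(void)
{
    SendParams p = { .normal_num = 3 };

    channel_setup(&p, 4096);
    return packet_payload_bytes(&p) == 3 * 4096 ? 0 : 1;
}
```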
diff --git a/migration/migration.c b/migration/migration.c index 78a0b010d4..52b5d39244 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1045,13 +1045,13 @@ static void populate_ram_info(MigrationInfo *info, MigrationState *s) size_t page_size = qemu_target_page_size(); info->ram = g_malloc0(sizeof(*info->ram)); - info->ram->transferred = ram_counters.transferred; + info->ram->transferred = stat64_get(&ram_atomic_counters.transferred); info->ram->total = ram_bytes_total(); - info->ram->duplicate = ram_counters.duplicate; + info->ram->duplicate = stat64_get(&ram_atomic_counters.duplicate); /* legacy value. It is not used anymore */ info->ram->skipped = 0; - info->ram->normal = ram_counters.normal; - info->ram->normal_bytes = ram_counters.normal * page_size; + info->ram->normal = stat64_get(&ram_atomic_counters.normal); + info->ram->normal_bytes = info->ram->normal * page_size; info->ram->mbps = s->mbps; info->ram->dirty_sync_count = ram_counters.dirty_sync_count; info->ram->dirty_sync_missed_zero_copy = @@ -1062,7 +1062,7 @@ static void populate_ram_info(MigrationInfo *info, MigrationState *s) info->ram->pages_per_second = s->pages_per_second; info->ram->precopy_bytes = ram_counters.precopy_bytes; info->ram->downtime_bytes = ram_counters.downtime_bytes; - info->ram->postcopy_bytes = ram_counters.postcopy_bytes; + info->ram->postcopy_bytes = stat64_get(&ram_atomic_counters.postcopy_bytes); if (migrate_use_xbzrle()) { info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache)); @@ -2840,8 +2840,11 @@ static int migrate_handle_rp_resume_ack(MigrationState *s, uint32_t value) return 0; } -/* Release ms->rp_state.from_dst_file in a safe way */ -static void migration_release_from_dst_file(MigrationState *ms) +/* + * Release ms->rp_state.from_dst_file (and postcopy_qemufile_src if + * existed) in a safe way. + */ +static void migration_release_dst_files(MigrationState *ms) { QEMUFile *file; @@ -2854,6 +2857,18 @@ static void migration_release_from_dst_file(MigrationState *ms) ms->rp_state.from_dst_file = NULL; } + /* + * Do the same to postcopy fast path socket too if there is. No + * locking needed because this qemufile should only be managed by + * return path thread. + */ + if (ms->postcopy_qemufile_src) { + migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src); + qemu_file_shutdown(ms->postcopy_qemufile_src); + qemu_fclose(ms->postcopy_qemufile_src); + ms->postcopy_qemufile_src = NULL; + } + qemu_fclose(file); } @@ -2998,7 +3013,7 @@ out: * Maybe there is something we can do: it looks like a * network down issue, and we pause for a recovery. */ - migration_release_from_dst_file(ms); + migration_release_dst_files(ms); rp = NULL; if (postcopy_pause_return_path_thread(ms)) { /* @@ -3016,7 +3031,7 @@ out: } trace_source_return_path_thread_end(); - migration_release_from_dst_file(ms); + migration_release_dst_files(ms); rcu_unregister_thread(); return NULL; } @@ -3539,18 +3554,6 @@ static MigThrError postcopy_pause(MigrationState *s) qemu_file_shutdown(file); qemu_fclose(file); - /* - * Do the same to postcopy fast path socket too if there is. No - * locking needed because no racer as long as we do this before setting - * status to paused. 
- */ - if (s->postcopy_qemufile_src) { - migration_ioc_unregister_yank_from_file(s->postcopy_qemufile_src); - qemu_file_shutdown(s->postcopy_qemufile_src); - qemu_fclose(s->postcopy_qemufile_src); - s->postcopy_qemufile_src = NULL; - } - migrate_set_state(&s->state, s->state, MIGRATION_STATUS_POSTCOPY_PAUSED); @@ -4391,8 +4394,6 @@ static Property migration_properties[] = { DEFINE_PROP_SIZE("announce-step", MigrationState, parameters.announce_step, DEFAULT_MIGRATE_ANNOUNCE_STEP), - DEFINE_PROP_BOOL("x-postcopy-preempt-break-huge", MigrationState, - postcopy_preempt_break_huge, true), DEFINE_PROP_STRING("tls-creds", MigrationState, parameters.tls_creds), DEFINE_PROP_STRING("tls-hostname", MigrationState, parameters.tls_hostname), DEFINE_PROP_STRING("tls-authz", MigrationState, parameters.tls_authz), diff --git a/migration/migration.h b/migration/migration.h index cdad8aceaa..ae4ffd3454 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -340,13 +340,6 @@ struct MigrationState { bool send_configuration; /* Whether we send section footer during migration */ bool send_section_footer; - /* - * Whether we allow break sending huge pages when postcopy preempt is - * enabled. When disabled, we won't interrupt precopy within sending a - * host huge page, which is the old behavior of vanilla postcopy. - * NOTE: this parameter is ignored if postcopy preempt is not enabled. - */ - bool postcopy_preempt_break_huge; /* Needed by postcopy-pause state */ QemuSemaphore postcopy_pause_sem; diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 18213a9513..37770248e1 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -116,7 +116,6 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) { struct zlib_data *z = p->data; - size_t page_size = qemu_target_page_size(); z_stream *zs = &z->zs; uint32_t out_size = 0; int ret; @@ -135,8 +134,8 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) * with compression. zlib does not guarantee that this is safe, * therefore copy the page before calling deflate(). 
*/ - memcpy(z->buf, p->pages->block->host + p->normal[i], page_size); - zs->avail_in = page_size; + memcpy(z->buf, p->pages->block->host + p->normal[i], p->page_size); + zs->avail_in = p->page_size; zs->next_in = z->buf; zs->avail_out = available; @@ -242,12 +241,11 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p) static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) { struct zlib_data *z = p->data; - size_t page_size = qemu_target_page_size(); z_stream *zs = &z->zs; uint32_t in_size = p->next_packet_size; /* we measure the change of total_out */ uint32_t out_size = zs->total_out; - uint32_t expected_size = p->normal_num * page_size; + uint32_t expected_size = p->normal_num * p->page_size; uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; int ret; int i; @@ -274,7 +272,7 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) flush = Z_SYNC_FLUSH; } - zs->avail_out = page_size; + zs->avail_out = p->page_size; zs->next_out = p->host + p->normal[i]; /* @@ -288,8 +286,8 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) do { ret = inflate(zs, flush); } while (ret == Z_OK && zs->avail_in - && (zs->total_out - start) < page_size); - if (ret == Z_OK && (zs->total_out - start) < page_size) { + && (zs->total_out - start) < p->page_size); + if (ret == Z_OK && (zs->total_out - start) < p->page_size) { error_setg(errp, "multifd %u: inflate generated too few output", p->id); return -1; diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index d788d309f2..f4a8e1ed1f 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -113,7 +113,6 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) { struct zstd_data *z = p->data; - size_t page_size = qemu_target_page_size(); int ret; uint32_t i; @@ -128,7 +127,7 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) flush = ZSTD_e_flush; } z->in.src = p->pages->block->host + p->normal[i]; - z->in.size = page_size; + z->in.size = p->page_size; z->in.pos = 0; /* @@ -241,8 +240,7 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp) { uint32_t in_size = p->next_packet_size; uint32_t out_size = 0; - size_t page_size = qemu_target_page_size(); - uint32_t expected_size = p->normal_num * page_size; + uint32_t expected_size = p->normal_num * p->page_size; uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; struct zstd_data *z = p->data; int ret; @@ -265,7 +263,7 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp) for (i = 0; i < p->normal_num; i++) { z->out.dst = p->host + p->normal[i]; - z->out.size = page_size; + z->out.size = p->page_size; z->out.pos = 0; /* @@ -279,8 +277,8 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp) do { ret = ZSTD_decompressStream(z->zds, &z->out, &z->in); } while (ret > 0 && (z->in.size - z->in.pos > 0) - && (z->out.pos < page_size)); - if (ret > 0 && (z->out.pos < page_size)) { + && (z->out.pos < p->page_size)); + if (ret > 0 && (z->out.pos < p->page_size)) { error_setg(errp, "multifd %u: decompressStream buffer too small", p->id); return -1; diff --git a/migration/multifd.c b/migration/multifd.c index 509bbbe3bf..000ca4d4ec 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -87,15 +87,14 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) { MultiFDPages_t *pages = p->pages; - size_t page_size = qemu_target_page_size(); for (int 
i = 0; i < p->normal_num; i++) { p->iov[p->iovs_num].iov_base = pages->block->host + p->normal[i]; - p->iov[p->iovs_num].iov_len = page_size; + p->iov[p->iovs_num].iov_len = p->page_size; p->iovs_num++; } - p->next_packet_size = p->normal_num * page_size; + p->next_packet_size = p->normal_num * p->page_size; p->flags |= MULTIFD_FLAG_NOCOMP; return 0; } @@ -139,7 +138,6 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p) static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp) { uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; - size_t page_size = qemu_target_page_size(); if (flags != MULTIFD_FLAG_NOCOMP) { error_setg(errp, "multifd %u: flags received %x flags expected %x", @@ -148,7 +146,7 @@ static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp) } for (int i = 0; i < p->normal_num; i++) { p->iov[i].iov_base = p->host + p->normal[i]; - p->iov[i].iov_len = page_size; + p->iov[i].iov_len = p->page_size; } return qio_channel_readv_all(p->c, p->iov, p->normal_num, errp); } @@ -281,8 +279,6 @@ static void multifd_send_fill_packet(MultiFDSendParams *p) static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) { MultiFDPacket_t *packet = p->packet; - size_t page_size = qemu_target_page_size(); - uint32_t page_count = MULTIFD_PACKET_SIZE / page_size; RAMBlock *block; int i; @@ -309,10 +305,10 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) * If we received a packet that is 100 times bigger than expected * just stop migration. It is a magic number. */ - if (packet->pages_alloc > page_count) { + if (packet->pages_alloc > p->page_count) { error_setg(errp, "multifd: received packet " "with size %u and expected a size of %u", - packet->pages_alloc, page_count) ; + packet->pages_alloc, p->page_count) ; return -1; } @@ -344,7 +340,7 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) for (i = 0; i < p->normal_num; i++) { uint64_t offset = be64_to_cpu(packet->offset[i]); - if (offset > (block->used_length - page_size)) { + if (offset > (block->used_length - p->page_size)) { error_setg(errp, "multifd: offset too long %" PRIu64 " (max " RAM_ADDR_FMT ")", offset, block->used_length); @@ -433,11 +429,10 @@ static int multifd_send_pages(QEMUFile *f) p->packet_num = multifd_send_state->packet_num++; multifd_send_state->pages = p->pages; p->pages = pages; - transferred = ((uint64_t) pages->num) * qemu_target_page_size() - + p->packet_len; + transferred = ((uint64_t) pages->num) * p->page_size + p->packet_len; qemu_file_acct_rate_limit(f, transferred); ram_counters.multifd_bytes += transferred; - ram_counters.transferred += transferred; + stat64_add(&ram_atomic_counters.transferred, transferred); qemu_mutex_unlock(&p->mutex); qemu_sem_post(&p->sem); @@ -629,7 +624,7 @@ int multifd_send_sync_main(QEMUFile *f) p->pending_job++; qemu_file_acct_rate_limit(f, p->packet_len); ram_counters.multifd_bytes += p->packet_len; - ram_counters.transferred += p->packet_len; + stat64_add(&ram_atomic_counters.transferred, p->packet_len); qemu_mutex_unlock(&p->mutex); qemu_sem_post(&p->sem); @@ -947,6 +942,8 @@ int multifd_save_setup(Error **errp) /* We need one extra place for the packet header */ p->iov = g_new0(struct iovec, page_count + 1); p->normal = g_new0(ram_addr_t, page_count); + p->page_size = qemu_target_page_size(); + p->page_count = page_count; if (migrate_use_zero_copy_send()) { p->write_flags = QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; @@ -1194,6 +1191,8 @@ int multifd_load_setup(Error **errp) p->name = 
g_strdup_printf("multifdrecv_%d", i); p->iov = g_new0(struct iovec, page_count); p->normal = g_new0(ram_addr_t, page_count); + p->page_count = page_count; + p->page_size = qemu_target_page_size(); } for (i = 0; i < thread_count; i++) { diff --git a/migration/multifd.h b/migration/multifd.h index 519f498643..e2802a9ce2 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -80,6 +80,10 @@ typedef struct { bool registered_yank; /* packet allocated len */ uint32_t packet_len; + /* guest page size */ + uint32_t page_size; + /* number of pages in a full packet */ + uint32_t page_count; /* multifd flags for sending ram */ int write_flags; @@ -143,6 +147,10 @@ typedef struct { QIOChannel *c; /* packet allocated len */ uint32_t packet_len; + /* guest page size */ + uint32_t page_size; + /* number of pages in a full packet */ + uint32_t page_count; /* syncs main thread and channels */ QemuSemaphore sem_sync; diff --git a/migration/ram.c b/migration/ram.c index 1338e47665..334309f1c6 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -85,6 +85,26 @@ XBZRLECacheStats xbzrle_counters; +/* used by the search for pages to send */ +struct PageSearchStatus { + /* The migration channel used for a specific host page */ + QEMUFile *pss_channel; + /* Last block from where we have sent data */ + RAMBlock *last_sent_block; + /* Current block being searched */ + RAMBlock *block; + /* Current page to search from */ + unsigned long page; + /* Set once we wrap around */ + bool complete_round; + /* Whether we're sending a host page */ + bool host_page_sending; + /* The start/end of current host page. Invalid if host_page_sending==false */ + unsigned long host_page_start; + unsigned long host_page_end; +}; +typedef struct PageSearchStatus PageSearchStatus; + /* struct contains XBZRLE cache and a static page used by the compression */ static struct { @@ -162,6 +182,11 @@ out: return ret; } +static bool postcopy_preempt_active(void) +{ + return migrate_postcopy_preempt() && migration_in_postcopy(); +} + bool ramblock_is_ignored(RAMBlock *block) { return !qemu_ram_is_migratable(block) || @@ -296,30 +321,17 @@ struct RAMSrcPageRequest { QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req; }; -typedef struct { - /* - * Cached ramblock/offset values if preempted. They're only meaningful if - * preempted==true below. - */ - RAMBlock *ram_block; - unsigned long ram_page; - /* - * Whether a postcopy preemption just happened. Will be reset after - * precopy recovered to background migration. - */ - bool preempted; -} PostcopyPreemptState; - /* State of RAM for migration */ struct RAMState { - /* QEMUFile used for this migration */ - QEMUFile *f; + /* + * PageSearchStatus structures for the channels when send pages. + * Protected by the bitmap_mutex. 
+ */ + PageSearchStatus pss[RAM_CHANNEL_MAX]; /* UFFD file descriptor, used in 'write-tracking' migration */ int uffdio_fd; /* Last block that we have visited searching for dirty pages */ RAMBlock *last_seen_block; - /* Last block from where we have sent data */ - RAMBlock *last_sent_block; /* Last dirty target page we have sent */ ram_addr_t last_page; /* last ram version we have seen */ @@ -357,21 +369,18 @@ struct RAMState { uint64_t target_page_count; /* number of dirty bits in the bitmap */ uint64_t migration_dirty_pages; - /* Protects modification of the bitmap and migration dirty pages */ + /* + * Protects: + * - dirty/clear bitmap + * - migration_dirty_pages + * - pss structures + */ QemuMutex bitmap_mutex; /* The RAMBlock used in the last src_page_requests */ RAMBlock *last_req_rb; /* Queue of outstanding page requests from the destination */ QemuMutex src_page_req_mutex; QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests; - - /* Postcopy preemption informations */ - PostcopyPreemptState postcopy_preempt_state; - /* - * Current channel we're using on src VM. Only valid if postcopy-preempt - * is enabled. - */ - unsigned int postcopy_channel; }; typedef struct RAMState RAMState; @@ -379,11 +388,6 @@ static RAMState *ram_state; static NotifierWithReturnList precopy_notifier_list; -static void postcopy_preempt_reset(RAMState *rs) -{ - memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState)); -} - /* Whether postcopy has queued requests? */ static bool postcopy_has_request(RAMState *rs) { @@ -420,18 +424,25 @@ uint64_t ram_bytes_remaining(void) 0; } +/* + * NOTE: not all stats in ram_counters are used in reality. See comments + * for struct MigrationAtomicStats. The ultimate result of ram migration + * counters will be a merged version with both ram_counters and the atomic + * fields in ram_atomic_counters. + */ MigrationStats ram_counters; +MigrationAtomicStats ram_atomic_counters; -static void ram_transferred_add(uint64_t bytes) +void ram_transferred_add(uint64_t bytes) { if (runstate_is_running()) { ram_counters.precopy_bytes += bytes; } else if (migration_in_postcopy()) { - ram_counters.postcopy_bytes += bytes; + stat64_add(&ram_atomic_counters.postcopy_bytes, bytes); } else { ram_counters.downtime_bytes += bytes; } - ram_counters.transferred += bytes; + stat64_add(&ram_atomic_counters.transferred, bytes); } void dirty_sync_missed_zero_copy(void) @@ -439,39 +450,6 @@ void dirty_sync_missed_zero_copy(void) ram_counters.dirty_sync_missed_zero_copy++; } -/* used by the search for pages to send */ -struct PageSearchStatus { - /* Current block being searched */ - RAMBlock *block; - /* Current page to search from */ - unsigned long page; - /* Set once we wrap around */ - bool complete_round; - /* - * [POSTCOPY-ONLY] Whether current page is explicitly requested by - * postcopy. When set, the request is "urgent" because the dest QEMU - * threads are waiting for us. - */ - bool postcopy_requested; - /* - * [POSTCOPY-ONLY] The target channel to use to send current page. - * - * Note: This may _not_ match with the value in postcopy_requested - * above. Let's imagine the case where the postcopy request is exactly - * the page that we're sending in progress during precopy. In this case - * we'll have postcopy_requested set to true but the target channel - * will be the precopy channel (so that we don't split brain on that - * specific page since the precopy channel already contains partial of - * that page data). 
- * - * Besides that specific use case, postcopy_target_channel should - * always be equal to postcopy_requested, because by default we send - * postcopy pages via postcopy preempt channel. - */ - bool postcopy_target_channel; -}; -typedef struct PageSearchStatus PageSearchStatus; - CompressionStats compression_counters; struct CompressParam { @@ -517,11 +495,28 @@ static QemuThread *decompress_threads; static QemuMutex decomp_done_lock; static QemuCond decomp_done_cond; +static int ram_save_host_page_urgent(PageSearchStatus *pss); + static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, ram_addr_t offset, uint8_t *source_buf); -static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss, - bool postcopy_requested); +/* NOTE: page is the PFN not real ram_addr_t. */ +static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page) +{ + pss->block = rb; + pss->page = page; + pss->complete_round = false; +} + +/* + * Check whether two PSSs are actively sending the same page. Return true + * if it is, false otherwise. + */ +static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2) +{ + return pss1->host_page_sending && pss2->host_page_sending && + (pss1->host_page_start == pss2->host_page_start); +} static void *do_data_compress(void *opaque) { @@ -647,28 +642,30 @@ exit: * * Returns the number of bytes written * - * @f: QEMUFile where to send the data + * @pss: current PSS channel status * @block: block that contains the page we want to send * @offset: offset inside the block for the page * in the lower bits, it contains flags */ -static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block, +static size_t save_page_header(PageSearchStatus *pss, RAMBlock *block, ram_addr_t offset) { size_t size, len; + bool same_block = (block == pss->last_sent_block); + QEMUFile *f = pss->pss_channel; - if (block == rs->last_sent_block) { + if (same_block) { offset |= RAM_SAVE_FLAG_CONTINUE; } qemu_put_be64(f, offset); size = 8; - if (!(offset & RAM_SAVE_FLAG_CONTINUE)) { + if (!same_block) { len = strlen(block->idstr); qemu_put_byte(f, len); qemu_put_buffer(f, (uint8_t *)block->idstr, len); size += 1 + len; - rs->last_sent_block = block; + pss->last_sent_block = block; } return size; } @@ -719,7 +716,7 @@ void mig_throttle_counter_reset(void) rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); rs->num_dirty_pages_period = 0; - rs->bytes_xfer_prev = ram_counters.transferred; + rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred); } /** @@ -736,10 +733,6 @@ void mig_throttle_counter_reset(void) */ static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) { - if (!rs->xbzrle_enabled) { - return; - } - /* We don't care if this fails to allocate a new cache page * as long as it updated an old one */ cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, @@ -756,17 +749,19 @@ static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) * -1 means that xbzrle would be longer than normal * * @rs: current RAM state + * @pss: current PSS channel * @current_data: pointer to the address of the page contents * @current_addr: addr of the page * @block: block that contains the page we want to send * @offset: offset inside the block for the page */ -static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, - ram_addr_t current_addr, RAMBlock *block, - ram_addr_t offset) +static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss, + uint8_t **current_data, ram_addr_t 
current_addr, + RAMBlock *block, ram_addr_t offset) { int encoded_len = 0, bytes_xbzrle; uint8_t *prev_cached_page; + QEMUFile *file = pss->pss_channel; if (!cache_is_cached(XBZRLE.cache, current_addr, ram_counters.dirty_sync_count)) { @@ -831,11 +826,11 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, } /* Send XBZRLE based compressed page */ - bytes_xbzrle = save_page_header(rs, rs->f, block, + bytes_xbzrle = save_page_header(pss, block, offset | RAM_SAVE_FLAG_XBZRLE); - qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE); - qemu_put_be16(rs->f, encoded_len); - qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len); + qemu_put_byte(file, ENCODING_FLAG_XBZRLE); + qemu_put_be16(file, encoded_len); + qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len); bytes_xbzrle += encoded_len + 1 + 2; /* * Like compressed_size (please see update_compress_thread_counts), @@ -849,26 +844,38 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, } /** - * migration_bitmap_find_dirty: find the next dirty page from start + * pss_find_next_dirty: find the next dirty page of current ramblock * - * Returns the page offset within memory region of the start of a dirty page + * This function updates pss->page to point to the next dirty page index + * within the ramblock to migrate, or the end of ramblock when nothing + * found. Note that when pss->host_page_sending==true it means we're + * during sending a host page, so we won't look for dirty page that is + * outside the host page boundary. * - * @rs: current RAM state - * @rb: RAMBlock where to search for dirty pages - * @start: page where we start the search + * @pss: the current page search status */ -static inline -unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb, - unsigned long start) +static void pss_find_next_dirty(PageSearchStatus *pss) { + RAMBlock *rb = pss->block; unsigned long size = rb->used_length >> TARGET_PAGE_BITS; unsigned long *bitmap = rb->bmap; if (ramblock_is_ignored(rb)) { - return size; + /* Points directly to the end, so we know no dirty page */ + pss->page = size; + return; + } + + /* + * If during sending a host page, only look for dirty pages within the + * current host page being send. 
+ */ + if (pss->host_page_sending) { + assert(pss->host_page_end); + size = MIN(size, pss->host_page_end); } - return find_next_bit(bitmap, size, start); + pss->page = find_next_bit(bitmap, size, pss->page); } static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb, @@ -1083,8 +1090,9 @@ uint64_t ram_pagesize_summary(void) uint64_t ram_get_total_transferred_pages(void) { - return ram_counters.normal + ram_counters.duplicate + - compression_counters.pages + xbzrle_counters.pages; + return stat64_get(&ram_atomic_counters.normal) + + stat64_get(&ram_atomic_counters.duplicate) + + compression_counters.pages + xbzrle_counters.pages; } static void migration_update_rates(RAMState *rs, int64_t end_time) @@ -1143,8 +1151,8 @@ static void migration_trigger_throttle(RAMState *rs) { MigrationState *s = migrate_get_current(); uint64_t threshold = s->parameters.throttle_trigger_threshold; - - uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev; + uint64_t bytes_xfer_period = + stat64_get(&ram_atomic_counters.transferred) - rs->bytes_xfer_prev; uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; @@ -1207,7 +1215,7 @@ static void migration_bitmap_sync(RAMState *rs) /* reset period counters */ rs->time_last_bitmap_sync = end_time; rs->num_dirty_pages_period = 0; - rs->bytes_xfer_prev = ram_counters.transferred; + rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred); } if (migrate_use_events()) { qapi_event_send_migration_pass(ram_counters.dirty_sync_count); @@ -1234,7 +1242,7 @@ static void migration_bitmap_sync_precopy(RAMState *rs) } } -static void ram_release_page(const char *rbname, uint64_t offset) +void ram_release_page(const char *rbname, uint64_t offset) { if (!migrate_release_ram() || !migration_in_postcopy()) { return; @@ -1249,19 +1257,19 @@ static void ram_release_page(const char *rbname, uint64_t offset) * Returns the size of data written to the file, 0 means the page is not * a zero page * - * @rs: current RAM state - * @file: the file where the data is saved + * @pss: current PSS channel * @block: block that contains the page we want to send * @offset: offset inside the block for the page */ -static int save_zero_page_to_file(RAMState *rs, QEMUFile *file, +static int save_zero_page_to_file(PageSearchStatus *pss, RAMBlock *block, ram_addr_t offset) { uint8_t *p = block->host + offset; + QEMUFile *file = pss->pss_channel; int len = 0; if (buffer_is_zero(p, TARGET_PAGE_SIZE)) { - len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO); + len += save_page_header(pss, block, offset | RAM_SAVE_FLAG_ZERO); qemu_put_byte(file, 0); len += 1; ram_release_page(block->idstr, offset); @@ -1274,16 +1282,17 @@ static int save_zero_page_to_file(RAMState *rs, QEMUFile *file, * * Returns the number of pages written. 
* - * @rs: current RAM state + * @pss: current PSS channel * @block: block that contains the page we want to send * @offset: offset inside the block for the page */ -static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) +static int save_zero_page(PageSearchStatus *pss, RAMBlock *block, + ram_addr_t offset) { - int len = save_zero_page_to_file(rs, rs->f, block, offset); + int len = save_zero_page_to_file(pss, block, offset); if (len) { - ram_counters.duplicate++; + stat64_add(&ram_atomic_counters.duplicate, 1); ram_transferred_add(len); return 1; } @@ -1297,15 +1306,15 @@ static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) * * Return true if the pages has been saved, otherwise false is returned. */ -static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, - int *pages) +static bool control_save_page(PageSearchStatus *pss, RAMBlock *block, + ram_addr_t offset, int *pages) { uint64_t bytes_xmit = 0; int ret; *pages = -1; - ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE, - &bytes_xmit); + ret = ram_control_save_page(pss->pss_channel, block->offset, offset, + TARGET_PAGE_SIZE, &bytes_xmit); if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { return false; } @@ -1320,9 +1329,9 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, } if (bytes_xmit > 0) { - ram_counters.normal++; + stat64_add(&ram_atomic_counters.normal, 1); } else if (bytes_xmit == 0) { - ram_counters.duplicate++; + stat64_add(&ram_atomic_counters.duplicate, 1); } return true; @@ -1333,26 +1342,28 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, * * Returns the number of pages written. * - * @rs: current RAM state + * @pss: current PSS channel * @block: block that contains the page we want to send * @offset: offset inside the block for the page * @buf: the page to be sent * @async: send to page asyncly */ -static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, - uint8_t *buf, bool async) +static int save_normal_page(PageSearchStatus *pss, RAMBlock *block, + ram_addr_t offset, uint8_t *buf, bool async) { - ram_transferred_add(save_page_header(rs, rs->f, block, + QEMUFile *file = pss->pss_channel; + + ram_transferred_add(save_page_header(pss, block, offset | RAM_SAVE_FLAG_PAGE)); if (async) { - qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE, + qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE, migrate_release_ram() && migration_in_postcopy()); } else { - qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE); + qemu_put_buffer(file, buf, TARGET_PAGE_SIZE); } ram_transferred_add(TARGET_PAGE_SIZE); - ram_counters.normal++; + stat64_add(&ram_atomic_counters.normal, 1); return 1; } @@ -1382,8 +1393,8 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss) XBZRLE_cache_lock(); if (rs->xbzrle_enabled && !migration_in_postcopy()) { - pages = save_xbzrle_page(rs, &p, current_addr, block, - offset); + pages = save_xbzrle_page(rs, pss, &p, current_addr, + block, offset); if (!rs->last_stage) { /* Can't send this cached data async, since the cache page * might get updated before it gets to the wire @@ -1394,7 +1405,7 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss) /* XBZRLE overflow or normal page */ if (pages == -1) { - pages = save_normal_page(rs, block, offset, p, send_async); + pages = save_normal_page(pss, block, offset, p, send_async); } XBZRLE_cache_unlock(); @@ -1402,13 +1413,13 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss) 
return pages; } -static int ram_save_multifd_page(RAMState *rs, RAMBlock *block, +static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block, ram_addr_t offset) { - if (multifd_queue_page(rs->f, block, offset) < 0) { + if (multifd_queue_page(file, block, offset) < 0) { return -1; } - ram_counters.normal++; + stat64_add(&ram_atomic_counters.normal, 1); return 1; } @@ -1417,14 +1428,15 @@ static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, ram_addr_t offset, uint8_t *source_buf) { RAMState *rs = ram_state; + PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; uint8_t *p = block->host + offset; int ret; - if (save_zero_page_to_file(rs, f, block, offset)) { + if (save_zero_page_to_file(pss, block, offset)) { return true; } - save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE); + save_page_header(pss, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE); /* * copy it to a internal buffer to avoid it being modified by VM @@ -1446,7 +1458,7 @@ update_compress_thread_counts(const CompressParam *param, int bytes_xmit) ram_transferred_add(bytes_xmit); if (param->zero_page) { - ram_counters.duplicate++; + stat64_add(&ram_atomic_counters.duplicate, 1); return; } @@ -1459,6 +1471,7 @@ static bool save_page_use_compression(RAMState *rs); static void flush_compressed_data(RAMState *rs) { + MigrationState *ms = migrate_get_current(); int idx, len, thread_count; if (!save_page_use_compression(rs)) { @@ -1477,7 +1490,7 @@ static void flush_compressed_data(RAMState *rs) for (idx = 0; idx < thread_count; idx++) { qemu_mutex_lock(&comp_param[idx].mutex); if (!comp_param[idx].quit) { - len = qemu_put_qemu_file(rs->f, comp_param[idx].file); + len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file); /* * it's safe to fetch zero_page without holding comp_done_lock * as there is no further request submitted to the thread, @@ -1496,11 +1509,11 @@ static inline void set_compress_params(CompressParam *param, RAMBlock *block, param->offset = offset; } -static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block, - ram_addr_t offset) +static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset) { int idx, thread_count, bytes_xmit = -1, pages = -1; bool wait = migrate_compress_wait_thread(); + MigrationState *ms = migrate_get_current(); thread_count = migrate_compress_threads(); qemu_mutex_lock(&comp_done_lock); @@ -1508,7 +1521,8 @@ retry: for (idx = 0; idx < thread_count; idx++) { if (comp_param[idx].done) { comp_param[idx].done = false; - bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file); + bytes_xmit = qemu_put_qemu_file(ms->to_dst_file, + comp_param[idx].file); qemu_mutex_lock(&comp_param[idx].mutex); set_compress_params(&comp_param[idx], block, offset); qemu_cond_signal(&comp_param[idx].cond); @@ -1544,14 +1558,9 @@ retry: */ static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again) { - /* - * This is not a postcopy requested page, mark it "not urgent", and use - * precopy channel to send it. 
- */ - pss->postcopy_requested = false; - pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY; + /* Update pss->page for the next dirty bit in ramblock */ + pss_find_next_dirty(pss); - pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); if (pss->complete_round && pss->block == rs->last_seen_block && pss->page >= rs->last_page) { /* @@ -1696,7 +1705,7 @@ static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS; /* Flush async buffers before un-protect. */ - qemu_fflush(rs->f); + qemu_fflush(pss->pss_channel); /* Un-protect memory range. */ res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, false, false); @@ -1999,55 +2008,6 @@ void ram_write_tracking_stop(void) } #endif /* defined(__linux__) */ -/* - * Check whether two addr/offset of the ramblock falls onto the same host huge - * page. Returns true if so, false otherwise. - */ -static bool offset_on_same_huge_page(RAMBlock *rb, uint64_t addr1, - uint64_t addr2) -{ - size_t page_size = qemu_ram_pagesize(rb); - - addr1 = ROUND_DOWN(addr1, page_size); - addr2 = ROUND_DOWN(addr2, page_size); - - return addr1 == addr2; -} - -/* - * Whether a previous preempted precopy huge page contains current requested - * page? Returns true if so, false otherwise. - * - * This should really happen very rarely, because it means when we were sending - * during background migration for postcopy we're sending exactly the page that - * some vcpu got faulted on on dest node. When it happens, we probably don't - * need to do much but drop the request, because we know right after we restore - * the precopy stream it'll be serviced. It'll slightly affect the order of - * postcopy requests to be serviced (e.g. it'll be the same as we move current - * request to the end of the queue) but it shouldn't be a big deal. The most - * imporant thing is we can _never_ try to send a partial-sent huge page on the - * POSTCOPY channel again, otherwise that huge page will got "split brain" on - * two channels (PRECOPY, POSTCOPY). - */ -static bool postcopy_preempted_contains(RAMState *rs, RAMBlock *block, - ram_addr_t offset) -{ - PostcopyPreemptState *state = &rs->postcopy_preempt_state; - - /* No preemption at all? */ - if (!state->preempted) { - return false; - } - - /* Not even the same ramblock? */ - if (state->ram_block != block) { - return false; - } - - return offset_on_same_huge_page(block, offset, - state->ram_page << TARGET_PAGE_BITS); -} - /** * get_queued_page: unqueue a page from the postcopy requests * @@ -2087,20 +2047,7 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) } while (block && !dirty); - if (block) { - /* See comment above postcopy_preempted_contains() */ - if (postcopy_preempted_contains(rs, block, offset)) { - trace_postcopy_preempt_hit(block->idstr, offset); - /* - * If what we preempted previously was exactly what we're - * requesting right now, restore the preempted precopy - * immediately, boosting its priority as it's requested by - * postcopy. - */ - postcopy_preempt_restore(rs, pss, true); - return true; - } - } else { + if (!block) { /* * Poll write faults too if background snapshot is enabled; that's * when we have vcpus got blocked by the write protected pages. @@ -2122,9 +2069,6 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) * really rare. 
*/ pss->complete_round = false; - /* Mark it an urgent request, meanwhile using POSTCOPY channel */ - pss->postcopy_requested = true; - pss->postcopy_target_channel = RAM_CHANNEL_POSTCOPY; } return !!block; @@ -2202,6 +2146,56 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) return -1; } + /* + * When with postcopy preempt, we send back the page directly in the + * rp-return thread. + */ + if (postcopy_preempt_active()) { + ram_addr_t page_start = start >> TARGET_PAGE_BITS; + size_t page_size = qemu_ram_pagesize(ramblock); + PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY]; + int ret = 0; + + qemu_mutex_lock(&rs->bitmap_mutex); + + pss_init(pss, ramblock, page_start); + /* + * Always use the preempt channel, and make sure it's there. It's + * safe to access without lock, because when rp-thread is running + * we should be the only one who operates on the qemufile + */ + pss->pss_channel = migrate_get_current()->postcopy_qemufile_src; + assert(pss->pss_channel); + + /* + * It must be either one or multiple of host page size. Just + * assert; if something wrong we're mostly split brain anyway. + */ + assert(len % page_size == 0); + while (len) { + if (ram_save_host_page_urgent(pss)) { + error_report("%s: ram_save_host_page_urgent() failed: " + "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, + __func__, ramblock->idstr, start); + ret = -1; + break; + } + /* + * NOTE: after ram_save_host_page_urgent() succeeded, pss->page + * will automatically be moved and point to the next host page + * we're going to send, so no need to update here. + * + * Normally QEMU never sends >1 host page in requests, so + * logically we don't even need that as the loop should only + * run once, but just to be consistent. + */ + len -= page_size; + }; + qemu_mutex_unlock(&rs->bitmap_mutex); + + return ret; + } + struct RAMSrcPageRequest *new_entry = g_new0(struct RAMSrcPageRequest, 1); new_entry->rb = ramblock; @@ -2240,7 +2234,8 @@ static bool save_page_use_compression(RAMState *rs) * has been properly handled by compression, otherwise needs other * paths to handle it */ -static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) +static bool save_compress_page(RAMState *rs, PageSearchStatus *pss, + RAMBlock *block, ram_addr_t offset) { if (!save_page_use_compression(rs)) { return false; @@ -2256,12 +2251,12 @@ static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) * We post the fist page as normal page as compression will take * much CPU resource. 
*/ - if (block != rs->last_sent_block) { + if (block != pss->last_sent_block) { flush_compressed_data(rs); return false; } - if (compress_page_with_multi_thread(rs, block, offset) > 0) { + if (compress_page_with_multi_thread(block, offset) > 0) { return true; } @@ -2283,20 +2278,20 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss) ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; int res; - if (control_save_page(rs, block, offset, &res)) { + if (control_save_page(pss, block, offset, &res)) { return res; } - if (save_compress_page(rs, block, offset)) { + if (save_compress_page(rs, pss, block, offset)) { return 1; } - res = save_zero_page(rs, block, offset); + res = save_zero_page(pss, block, offset); if (res > 0) { /* Must let xbzrle know, otherwise a previous (now 0'd) cached * page would be stale */ - if (!save_page_use_compression(rs)) { + if (rs->xbzrle_enabled) { XBZRLE_cache_lock(); xbzrle_cache_zero_page(rs, block->offset + offset); XBZRLE_cache_unlock(); @@ -2311,133 +2306,97 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss) * still see partially copied pages which is data corruption. */ if (migrate_use_multifd() && !migration_in_postcopy()) { - return ram_save_multifd_page(rs, block, offset); + return ram_save_multifd_page(pss->pss_channel, block, offset); } return ram_save_page(rs, pss); } -static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss) +/* Should be called before sending a host page */ +static void pss_host_page_prepare(PageSearchStatus *pss) { - MigrationState *ms = migrate_get_current(); - - /* Not enabled eager preempt? Then never do that. */ - if (!migrate_postcopy_preempt()) { - return false; - } + /* How many guest pages are there in one host page? */ + size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; - /* If the user explicitly disabled breaking of huge page, skip */ - if (!ms->postcopy_preempt_break_huge) { - return false; - } + pss->host_page_sending = true; + pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); + pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); +} - /* If the ramblock we're sending is a small page? Never bother. */ - if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) { - return false; - } +/* + * Whether the page pointed by PSS is within the host page being sent. + * Must be called after a previous pss_host_page_prepare(). + */ +static bool pss_within_range(PageSearchStatus *pss) +{ + ram_addr_t ram_addr; - /* Not in postcopy at all? */ - if (!migration_in_postcopy()) { - return false; - } + assert(pss->host_page_sending); - /* - * If we're already handling a postcopy request, don't preempt as this page - * has got the same high priority. - */ - if (pss->postcopy_requested) { + /* Over host-page boundary? */ + if (pss->page >= pss->host_page_end) { return false; } - /* If there's postcopy requests, then check it up! */ - return postcopy_has_request(rs); -} - -/* Returns true if we preempted precopy, false otherwise */ -static void postcopy_do_preempt(RAMState *rs, PageSearchStatus *pss) -{ - PostcopyPreemptState *p_state = &rs->postcopy_preempt_state; - - trace_postcopy_preempt_triggered(pss->block->idstr, pss->page); + ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; - /* - * Time to preempt precopy. Cache current PSS into preempt state, so that - * after handling the postcopy pages we can recover to it. 
We need to do - * so because the dest VM will have partial of the precopy huge page kept - * over in its tmp huge page caches; better move on with it when we can. - */ - p_state->ram_block = pss->block; - p_state->ram_page = pss->page; - p_state->preempted = true; + return offset_in_ramblock(pss->block, ram_addr); } -/* Whether we're preempted by a postcopy request during sending a huge page */ -static bool postcopy_preempt_triggered(RAMState *rs) +static void pss_host_page_finish(PageSearchStatus *pss) { - return rs->postcopy_preempt_state.preempted; + pss->host_page_sending = false; + /* This is not needed, but just to reset it */ + pss->host_page_start = pss->host_page_end = 0; } -static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss, - bool postcopy_requested) +/* + * Send an urgent host page specified by `pss'. Need to be called with + * bitmap_mutex held. + * + * Returns 0 if save host page succeeded, false otherwise. + */ +static int ram_save_host_page_urgent(PageSearchStatus *pss) { - PostcopyPreemptState *state = &rs->postcopy_preempt_state; - - assert(state->preempted); + bool page_dirty, sent = false; + RAMState *rs = ram_state; + int ret = 0; - pss->block = state->ram_block; - pss->page = state->ram_page; + trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); + pss_host_page_prepare(pss); - /* Whether this is a postcopy request? */ - pss->postcopy_requested = postcopy_requested; /* - * When restoring a preempted page, the old data resides in PRECOPY - * slow channel, even if postcopy_requested is set. So always use - * PRECOPY channel here. + * If precopy is sending the same page, let it be done in precopy, or + * we could send the same page in two channels and none of them will + * receive the whole page. */ - pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY; - - trace_postcopy_preempt_restored(pss->block->idstr, pss->page); - - /* Reset preempt state, most importantly, set preempted==false */ - postcopy_preempt_reset(rs); -} - -static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss) -{ - MigrationState *s = migrate_get_current(); - unsigned int channel = pss->postcopy_target_channel; - QEMUFile *next; - - if (channel != rs->postcopy_channel) { - if (channel == RAM_CHANNEL_PRECOPY) { - next = s->to_dst_file; - } else { - next = s->postcopy_qemufile_src; - } - /* Update and cache the current channel */ - rs->f = next; - rs->postcopy_channel = channel; - - /* - * If channel switched, reset last_sent_block since the old sent block - * may not be on the same channel. - */ - rs->last_sent_block = NULL; - - trace_postcopy_preempt_switch_channel(channel); + if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) { + trace_postcopy_preempt_hit(pss->block->idstr, + pss->page << TARGET_PAGE_BITS); + return 0; } - trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); -} - -/* We need to make sure rs->f always points to the default channel elsewhere */ -static void postcopy_preempt_reset_channel(RAMState *rs) -{ - if (migrate_postcopy_preempt() && migration_in_postcopy()) { - rs->postcopy_channel = RAM_CHANNEL_PRECOPY; - rs->f = migrate_get_current()->to_dst_file; - trace_postcopy_preempt_reset_channel(); + do { + page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); + + if (page_dirty) { + /* Be strict to return code; it must be 1, or what else? 
*/ + if (ram_save_target_page(rs, pss) != 1) { + error_report_once("%s: ram_save_target_page failed", __func__); + ret = -1; + goto out; + } + sent = true; + } + pss_find_next_dirty(pss); + } while (pss_within_range(pss)); +out: + pss_host_page_finish(pss); + /* For urgent requests, flush immediately if sent */ + if (sent) { + qemu_fflush(pss->pss_channel); } + return ret; } /** @@ -2448,9 +2407,14 @@ static void postcopy_preempt_reset_channel(RAMState *rs) * a host page in which case the remainder of the hostpage is sent. * Only dirty target pages are sent. Note that the host page size may * be a huge page for this block. + * * The saving stops at the boundary of the used_length of the block * if the RAMBlock isn't a multiple of the host page size. * + * The caller must be with ram_state.bitmap_mutex held to call this + * function. Note that this function can temporarily release the lock, but + * when the function is returned it'll make sure the lock is still held. + * * Returns the number of pages written or negative on error * * @rs: current RAM state @@ -2458,11 +2422,10 @@ static void postcopy_preempt_reset_channel(RAMState *rs) */ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) { + bool page_dirty, preempt_active = postcopy_preempt_active(); int tmppages, pages = 0; size_t pagesize_bits = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; - unsigned long hostpage_boundary = - QEMU_ALIGN_UP(pss->page + 1, pagesize_bits); unsigned long start_page = pss->page; int res; @@ -2471,51 +2434,49 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) return 0; } - if (migrate_postcopy_preempt() && migration_in_postcopy()) { - postcopy_preempt_choose_channel(rs, pss); - } + /* Update host page boundary information */ + pss_host_page_prepare(pss); do { - if (postcopy_needs_preempt(rs, pss)) { - postcopy_do_preempt(rs, pss); - break; - } + page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); /* Check the pages is dirty and if it is send it */ - if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) { - tmppages = ram_save_target_page(rs, pss); - if (tmppages < 0) { - return tmppages; - } - - pages += tmppages; + if (page_dirty) { /* - * Allow rate limiting to happen in the middle of huge pages if - * something is sent in the current iteration. + * Properly yield the lock only in postcopy preempt mode + * because both migration thread and rp-return thread can + * operate on the bitmaps. */ - if (pagesize_bits > 1 && tmppages > 0) { - migration_rate_limit(); + if (preempt_active) { + qemu_mutex_unlock(&rs->bitmap_mutex); + } + tmppages = ram_save_target_page(rs, pss); + if (tmppages >= 0) { + pages += tmppages; + /* + * Allow rate limiting to happen in the middle of huge pages if + * something is sent in the current iteration. 
+ */ + if (pagesize_bits > 1 && tmppages > 0) { + migration_rate_limit(); + } } + if (preempt_active) { + qemu_mutex_lock(&rs->bitmap_mutex); + } + } else { + tmppages = 0; } - pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); - } while ((pss->page < hostpage_boundary) && - offset_in_ramblock(pss->block, - ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)); - /* The offset we leave with is the min boundary of host page and block */ - pss->page = MIN(pss->page, hostpage_boundary); - /* - * When with postcopy preempt mode, flush the data as soon as possible for - * postcopy requests, because we've already sent a whole huge page, so the - * dst node should already have enough resource to atomically filling in - * the current missing page. - * - * More importantly, when using separate postcopy channel, we must do - * explicit flush or it won't flush until the buffer is full. - */ - if (migrate_postcopy_preempt() && pss->postcopy_requested) { - qemu_fflush(rs->f); - } + if (tmppages < 0) { + pss_host_page_finish(pss); + return tmppages; + } + + pss_find_next_dirty(pss); + } while (pss_within_range(pss)); + + pss_host_page_finish(pss); res = ram_save_release_protection(rs, pss, start_page); return (res < 0 ? res : pages); @@ -2536,7 +2497,7 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) */ static int ram_find_and_save_block(RAMState *rs) { - PageSearchStatus pss; + PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; int pages = 0; bool again, found; @@ -2557,35 +2518,24 @@ static int ram_find_and_save_block(RAMState *rs) rs->last_page = 0; } - pss.block = rs->last_seen_block; - pss.page = rs->last_page; - pss.complete_round = false; + pss_init(pss, rs->last_seen_block, rs->last_page); do { again = true; - found = get_queued_page(rs, &pss); + found = get_queued_page(rs, pss); if (!found) { - /* - * Recover previous precopy ramblock/offset if postcopy has - * preempted precopy. Otherwise find the next dirty bit. 
- */ - if (postcopy_preempt_triggered(rs)) { - postcopy_preempt_restore(rs, &pss, false); - found = true; - } else { - /* priority queue empty, so just search for something dirty */ - found = find_dirty_block(rs, &pss, &again); - } + /* priority queue empty, so just search for something dirty */ + found = find_dirty_block(rs, pss, &again); } if (found) { - pages = ram_save_host_page(rs, &pss); + pages = ram_save_host_page(rs, pss); } } while (!pages && again); - rs->last_seen_block = pss.block; - rs->last_page = pss.page; + rs->last_seen_block = pss->block; + rs->last_page = pss->page; return pages; } @@ -2595,9 +2545,9 @@ void acct_update_position(QEMUFile *f, size_t size, bool zero) uint64_t pages = size / TARGET_PAGE_SIZE; if (zero) { - ram_counters.duplicate += pages; + stat64_add(&ram_atomic_counters.duplicate, pages); } else { - ram_counters.normal += pages; + stat64_add(&ram_atomic_counters.normal, pages); ram_transferred_add(size); qemu_file_credit_transfer(f, size); } @@ -2699,13 +2649,16 @@ static void ram_save_cleanup(void *opaque) static void ram_state_reset(RAMState *rs) { + int i; + + for (i = 0; i < RAM_CHANNEL_MAX; i++) { + rs->pss[i].last_sent_block = NULL; + } + rs->last_seen_block = NULL; - rs->last_sent_block = NULL; rs->last_page = 0; rs->last_version = ram_list.version; rs->xbzrle_enabled = false; - postcopy_preempt_reset(rs); - rs->postcopy_channel = RAM_CHANNEL_PRECOPY; } #define MAX_WAIT 50 /* ms, half buffered_file limit */ @@ -2894,8 +2847,8 @@ void ram_postcopy_send_discard_bitmap(MigrationState *ms) migration_bitmap_sync(rs); /* Easiest way to make sure we don't resume in the middle of a host-page */ + rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL; rs->last_seen_block = NULL; - rs->last_sent_block = NULL; rs->last_page = 0; postcopy_each_ram_send_discard(ms); @@ -3132,7 +3085,7 @@ static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) ram_state_reset(rs); /* Update RAMState cache of output QEMUFile */ - rs->f = out; + rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out; trace_ram_state_resume_prepare(pages); } @@ -3223,7 +3176,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque) return -1; } } - (*rsp)->f = f; + (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; WITH_RCU_READ_LOCK_GUARD() { qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE); @@ -3349,8 +3302,6 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) } qemu_mutex_unlock(&rs->bitmap_mutex); - postcopy_preempt_reset_channel(rs); - /* * Must occur before EOS (or any QEMUFile operation) * because of RDMA protocol. 
@@ -3360,7 +3311,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) out: if (ret >= 0 && migration_is_setup_or_active(migrate_get_current()->state)) { - ret = multifd_send_sync_main(rs->f); + ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); if (ret < 0) { return ret; } @@ -3406,6 +3357,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque) /* try transferring iterative blocks of memory */ /* flush all remaining blocks regardless of rate limiting */ + qemu_mutex_lock(&rs->bitmap_mutex); while (true) { int pages; @@ -3419,6 +3371,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque) break; } } + qemu_mutex_unlock(&rs->bitmap_mutex); flush_compressed_data(rs); ram_control_after_iterate(f, RAM_CONTROL_FINISH); @@ -3428,9 +3381,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque) return ret; } - postcopy_preempt_reset_channel(rs); - - ret = multifd_send_sync_main(rs->f); + ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); if (ret < 0) { return ret; } diff --git a/migration/ram.h b/migration/ram.h index c7af65ac74..81cbb0947c 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -32,7 +32,27 @@ #include "qapi/qapi-types-migration.h" #include "exec/cpu-common.h" #include "io/channel.h" +#include "qemu/stats64.h" +/* + * These are the migration statistic counters that need to be updated using + * atomic ops (can be accessed by more than one thread). Here since we + * cannot modify MigrationStats directly to use Stat64 as it was defined in + * the QAPI scheme, we define an internal structure to hold them, and we + * propagate the real values when QMP queries happen. + * + * IOW, the corresponding fields within ram_counters on these specific + * fields will be always zero and not being used at all; they're just + * placeholders to make it QAPI-compatible. + */ +typedef struct { + Stat64 transferred; + Stat64 duplicate; + Stat64 normal; + Stat64 postcopy_bytes; +} MigrationAtomicStats; + +extern MigrationAtomicStats ram_atomic_counters; extern MigrationStats ram_counters; extern XBZRLECacheStats xbzrle_counters; extern CompressionStats compression_counters; @@ -65,6 +85,9 @@ int ram_load_postcopy(QEMUFile *f, int channel); void ram_handle_compressed(void *host, uint8_t ch, uint64_t size); +void ram_transferred_add(uint64_t bytes); +void ram_release_page(const char *rbname, uint64_t offset); + int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr); bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset); void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr); |
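The bulk of the ram.c changes above replace the old `hostpage_boundary` arithmetic with explicit host-page state kept in `PageSearchStatus`: `pss_host_page_prepare()` records the start/end of the host page being sent, `pss_find_next_dirty()` only scans for dirty bits inside that window, and `pss_within_range()` / `pss_host_page_finish()` bound the loop shared by `ram_save_host_page()` and the new `ram_save_host_page_urgent()`. Below is a self-contained sketch of that loop shape over a toy byte-per-page dirty bitmap; the helper names mirror the patch, but the bitmap walk and page counts are simplified.

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_PAGES    64                /* guest pages tracked by the toy bitmap */
#define GUEST_PFNS  8                 /* guest pages per host (huge) page, toy value */

typedef struct {
    unsigned long page;               /* current guest page index */
    bool host_page_sending;
    unsigned long host_page_start;
    unsigned long host_page_end;
} PageSearchStatus;

/* Mark the host page that contains pss->page as the one being sent. */
static void pss_host_page_prepare(PageSearchStatus *pss)
{
    pss->host_page_sending = true;
    pss->host_page_start = (pss->page / GUEST_PFNS) * GUEST_PFNS;
    pss->host_page_end = pss->host_page_start + GUEST_PFNS;
}

/* Advance pss->page to the next dirty page, bounded by the host page. */
static void pss_find_next_dirty(PageSearchStatus *pss, const uint8_t *bmap)
{
    unsigned long limit = pss->host_page_sending ? pss->host_page_end : NR_PAGES;

    while (pss->page < limit && !bmap[pss->page]) {
        pss->page++;
    }
}

static bool pss_within_range(const PageSearchStatus *pss)
{
    return pss->page < pss->host_page_end;
}

/* Send every remaining dirty guest page of the current host page. */
static int save_host_page(PageSearchStatus *pss, uint8_t *bmap)
{
    int pages = 0;

    pss_host_page_prepare(pss);
    do {
        if (bmap[pss->page]) {
            bmap[pss->page] = 0;      /* clear the dirty bit ... */
            pages++;                  /* ... a real sender would emit the page here */
        }
        pss_find_next_dirty(pss, bmap);
    } while (pss_within_range(pss));
    pss->host_page_sending = false;   /* pss_host_page_finish() in the patch */
    return pages;
}

int main(void)
{
    uint8_t bmap[NR_PAGES] = { 0 };
    PageSearchStatus pss = { .page = 10 };

    bmap[10] = bmap[12] = bmap[15] = 1;   /* dirty pages inside host page 8..15 */
    printf("sent %d pages\n", save_host_page(&pss, bmap));
    return 0;
}
```

In the patch itself this loop runs with `bitmap_mutex` held; in postcopy-preempt mode the lock is dropped around the actual page send so the migration thread and the rp-return thread can make progress on their two channels concurrently.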