diff options
Diffstat (limited to 'migration/postcopy-ram.c')
-rw-r--r-- | migration/postcopy-ram.c | 360 |
1 files changed, 294 insertions, 66 deletions
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index 032abfbf1a..efd77939af 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -23,6 +23,8 @@ #include "savevm.h" #include "postcopy-ram.h" #include "ram.h" +#include "qapi/error.h" +#include "qemu/notify.h" #include "sysemu/sysemu.h" #include "sysemu/balloon.h" #include "qemu/error-report.h" @@ -45,6 +47,33 @@ struct PostcopyDiscardState { unsigned int nsentcmds; }; +static NotifierWithReturnList postcopy_notifier_list; + +void postcopy_infrastructure_init(void) +{ + notifier_with_return_list_init(&postcopy_notifier_list); +} + +void postcopy_add_notifier(NotifierWithReturn *nn) +{ + notifier_with_return_list_add(&postcopy_notifier_list, nn); +} + +void postcopy_remove_notifier(NotifierWithReturn *n) +{ + notifier_with_return_remove(n); +} + +int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp) +{ + struct PostcopyNotifyData pnd; + pnd.reason = reason; + pnd.errp = errp; + + return notifier_with_return_list_notify(&postcopy_notifier_list, + &pnd); +} + /* Postcopy needs to detect accesses to pages that haven't yet been copied * across, and efficiently map new pages in, the techniques for doing this * are target OS specific. @@ -186,12 +215,6 @@ static int test_ramblock_postcopiable(const char *block_name, void *host_addr, RAMBlock *rb = qemu_ram_block_by_name(block_name); size_t pagesize = qemu_ram_pagesize(rb); - if (qemu_ram_is_shared(rb)) { - error_report("Postcopy on shared RAM (%s) is not yet supported", - block_name); - return 1; - } - if (length % pagesize) { error_report("Postcopy requires RAM blocks to be a page size multiple," " block %s is 0x" RAM_ADDR_FMT " bytes with a " @@ -215,6 +238,7 @@ bool postcopy_ram_supported_by_host(MigrationIncomingState *mis) struct uffdio_register reg_struct; struct uffdio_range range_struct; uint64_t feature_mask; + Error *local_err = NULL; if (qemu_target_page_size() > pagesize) { error_report("Target page size bigger than host page size"); @@ -228,6 +252,12 @@ bool postcopy_ram_supported_by_host(MigrationIncomingState *mis) goto out; } + /* Give devices a chance to object */ + if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) { + error_report_err(local_err); + goto out; + } + /* Version and features check */ if (!ufd_check_and_apply(ufd, mis)) { goto out; @@ -377,6 +407,13 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) trace_postcopy_ram_incoming_cleanup_entry(); if (mis->have_fault_thread) { + Error *local_err = NULL; + + if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) { + error_report_err(local_err); + return -1; + } + if (qemu_ram_foreach_block(cleanup_range, mis)) { return -1; } @@ -481,10 +518,63 @@ static int ram_block_enable_notify(const char *block_name, void *host_addr, error_report("%s userfault: Region doesn't support COPY", __func__); return -1; } + if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) { + RAMBlock *rb = qemu_ram_block_by_name(block_name); + qemu_ram_set_uf_zeroable(rb); + } return 0; } +int postcopy_wake_shared(struct PostCopyFD *pcfd, + uint64_t client_addr, + RAMBlock *rb) +{ + size_t pagesize = qemu_ram_pagesize(rb); + struct uffdio_range range; + int ret; + trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb)); + range.start = client_addr & ~(pagesize - 1); + range.len = pagesize; + ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range); + if (ret) { + error_report("%s: Failed to wake: %zx in %s (%s)", + __func__, (size_t)client_addr, qemu_ram_get_idstr(rb), + strerror(errno)); + } + return ret; +} + +/* + * Callback from shared fault handlers to ask for a page, + * the page must be specified by a RAMBlock and an offset in that rb + * Note: Only for use by shared fault handlers (in fault thread) + */ +int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb, + uint64_t client_addr, uint64_t rb_offset) +{ + size_t pagesize = qemu_ram_pagesize(rb); + uint64_t aligned_rbo = rb_offset & ~(pagesize - 1); + MigrationIncomingState *mis = migration_incoming_get_current(); + + trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb), + rb_offset); + if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) { + trace_postcopy_request_shared_page_present(pcfd->idstr, + qemu_ram_get_idstr(rb), rb_offset); + return postcopy_wake_shared(pcfd, client_addr, rb); + } + if (rb != mis->last_rb) { + mis->last_rb = rb; + migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), + aligned_rbo, pagesize); + } else { + /* Save some space */ + migrate_send_rp_req_pages(mis, NULL, aligned_rbo, pagesize); + } + return 0; +} + /* * Handle faults detected by the USERFAULT markings */ @@ -493,29 +583,44 @@ static void *postcopy_ram_fault_thread(void *opaque) MigrationIncomingState *mis = opaque; struct uffd_msg msg; int ret; + size_t index; RAMBlock *rb = NULL; - RAMBlock *last_rb = NULL; /* last RAMBlock we sent part of */ trace_postcopy_ram_fault_thread_entry(); + mis->last_rb = NULL; /* last RAMBlock we sent part of */ qemu_sem_post(&mis->fault_thread_sem); + struct pollfd *pfd; + size_t pfd_len = 2 + mis->postcopy_remote_fds->len; + + pfd = g_new0(struct pollfd, pfd_len); + + pfd[0].fd = mis->userfault_fd; + pfd[0].events = POLLIN; + pfd[1].fd = mis->userfault_event_fd; + pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */ + trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd); + for (index = 0; index < mis->postcopy_remote_fds->len; index++) { + struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds, + struct PostCopyFD, index); + pfd[2 + index].fd = pcfd->fd; + pfd[2 + index].events = POLLIN; + trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr, + pcfd->fd); + } + while (true) { ram_addr_t rb_offset; - struct pollfd pfd[2]; + int poll_result; /* * We're mainly waiting for the kernel to give us a faulting HVA, * however we can be told to quit via userfault_quit_fd which is * an eventfd */ - pfd[0].fd = mis->userfault_fd; - pfd[0].events = POLLIN; - pfd[0].revents = 0; - pfd[1].fd = mis->userfault_event_fd; - pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */ - pfd[1].revents = 0; - - if (poll(pfd, 2, -1 /* Wait forever */) == -1) { + + poll_result = poll(pfd, pfd_len, -1 /* Wait forever */); + if (poll_result == -1) { error_report("%s: userfault poll: %s", __func__, strerror(errno)); break; } @@ -535,57 +640,117 @@ static void *postcopy_ram_fault_thread(void *opaque) } } - ret = read(mis->userfault_fd, &msg, sizeof(msg)); - if (ret != sizeof(msg)) { - if (errno == EAGAIN) { - /* - * if a wake up happens on the other thread just after - * the poll, there is nothing to read. - */ - continue; + if (pfd[0].revents) { + poll_result--; + ret = read(mis->userfault_fd, &msg, sizeof(msg)); + if (ret != sizeof(msg)) { + if (errno == EAGAIN) { + /* + * if a wake up happens on the other thread just after + * the poll, there is nothing to read. + */ + continue; + } + if (ret < 0) { + error_report("%s: Failed to read full userfault " + "message: %s", + __func__, strerror(errno)); + break; + } else { + error_report("%s: Read %d bytes from userfaultfd " + "expected %zd", + __func__, ret, sizeof(msg)); + break; /* Lost alignment, don't know what we'd read next */ + } } - if (ret < 0) { - error_report("%s: Failed to read full userfault message: %s", - __func__, strerror(errno)); - break; - } else { - error_report("%s: Read %d bytes from userfaultfd expected %zd", - __func__, ret, sizeof(msg)); - break; /* Lost alignment, don't know what we'd read next */ + if (msg.event != UFFD_EVENT_PAGEFAULT) { + error_report("%s: Read unexpected event %ud from userfaultfd", + __func__, msg.event); + continue; /* It's not a page fault, shouldn't happen */ } - } - if (msg.event != UFFD_EVENT_PAGEFAULT) { - error_report("%s: Read unexpected event %ud from userfaultfd", - __func__, msg.event); - continue; /* It's not a page fault, shouldn't happen */ - } - rb = qemu_ram_block_from_host( - (void *)(uintptr_t)msg.arg.pagefault.address, - true, &rb_offset); - if (!rb) { - error_report("postcopy_ram_fault_thread: Fault outside guest: %" - PRIx64, (uint64_t)msg.arg.pagefault.address); - break; - } + rb = qemu_ram_block_from_host( + (void *)(uintptr_t)msg.arg.pagefault.address, + true, &rb_offset); + if (!rb) { + error_report("postcopy_ram_fault_thread: Fault outside guest: %" + PRIx64, (uint64_t)msg.arg.pagefault.address); + break; + } - rb_offset &= ~(qemu_ram_pagesize(rb) - 1); - trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address, + rb_offset &= ~(qemu_ram_pagesize(rb) - 1); + trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address, qemu_ram_get_idstr(rb), rb_offset); + /* + * Send the request to the source - we want to request one + * of our host page sizes (which is >= TPS) + */ + if (rb != mis->last_rb) { + mis->last_rb = rb; + migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), + rb_offset, qemu_ram_pagesize(rb)); + } else { + /* Save some space */ + migrate_send_rp_req_pages(mis, NULL, + rb_offset, qemu_ram_pagesize(rb)); + } + } - /* - * Send the request to the source - we want to request one - * of our host page sizes (which is >= TPS) - */ - if (rb != last_rb) { - last_rb = rb; - migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), - rb_offset, qemu_ram_pagesize(rb)); - } else { - /* Save some space */ - migrate_send_rp_req_pages(mis, NULL, - rb_offset, qemu_ram_pagesize(rb)); + /* Now handle any requests from external processes on shared memory */ + /* TODO: May need to handle devices deregistering during postcopy */ + for (index = 2; index < pfd_len && poll_result; index++) { + if (pfd[index].revents) { + struct PostCopyFD *pcfd = + &g_array_index(mis->postcopy_remote_fds, + struct PostCopyFD, index - 2); + + poll_result--; + if (pfd[index].revents & POLLERR) { + error_report("%s: POLLERR on poll %zd fd=%d", + __func__, index, pcfd->fd); + pfd[index].events = 0; + continue; + } + + ret = read(pcfd->fd, &msg, sizeof(msg)); + if (ret != sizeof(msg)) { + if (errno == EAGAIN) { + /* + * if a wake up happens on the other thread just after + * the poll, there is nothing to read. + */ + continue; + } + if (ret < 0) { + error_report("%s: Failed to read full userfault " + "message: %s (shared) revents=%d", + __func__, strerror(errno), + pfd[index].revents); + /*TODO: Could just disable this sharer */ + break; + } else { + error_report("%s: Read %d bytes from userfaultfd " + "expected %zd (shared)", + __func__, ret, sizeof(msg)); + /*TODO: Could just disable this sharer */ + break; /*Lost alignment,don't know what we'd read next*/ + } + } + if (msg.event != UFFD_EVENT_PAGEFAULT) { + error_report("%s: Read unexpected event %ud " + "from userfaultfd (shared)", + __func__, msg.event); + continue; /* It's not a page fault, shouldn't happen */ + } + /* Call the device handler registered with us */ + ret = pcfd->handler(pcfd, &msg); + if (ret) { + error_report("%s: Failed to resolve shared fault on %zd/%s", + __func__, index, pcfd->idstr); + /* TODO: Fail? Disable this sharer? */ + } + } } } trace_postcopy_ram_fault_thread_exit(); @@ -667,6 +832,22 @@ static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr, return ret; } +int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset) +{ + int i; + MigrationIncomingState *mis = migration_incoming_get_current(); + GArray *pcrfds = mis->postcopy_remote_fds; + + for (i = 0; i < pcrfds->len; i++) { + struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i); + int ret = cur->waker(cur, rb, offset); + if (ret) { + return ret; + } + } + return 0; +} + /* * Place a host page (from) at (host) atomically * returns 0 on success @@ -690,7 +871,8 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from, } trace_postcopy_place_page(host); - return 0; + return postcopy_notify_shared_wake(rb, + qemu_ram_block_host_offset(rb, host)); } /* @@ -700,17 +882,23 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from, int postcopy_place_page_zero(MigrationIncomingState *mis, void *host, RAMBlock *rb) { + size_t pagesize = qemu_ram_pagesize(rb); trace_postcopy_place_page_zero(host); - if (qemu_ram_pagesize(rb) == getpagesize()) { - if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, getpagesize(), - rb)) { + /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE + * but it's not available for everything (e.g. hugetlbpages) + */ + if (qemu_ram_is_uf_zeroable(rb)) { + if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, pagesize, rb)) { int e = errno; error_report("%s: %s zero host: %p", __func__, strerror(e), host); return -e; } + return postcopy_notify_shared_wake(rb, + qemu_ram_block_host_offset(rb, + host)); } else { /* The kernel can't use UFFDIO_ZEROPAGE for hugepages */ if (!mis->postcopy_tmp_zero_page) { @@ -730,8 +918,6 @@ int postcopy_place_page_zero(MigrationIncomingState *mis, void *host, return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb); } - - return 0; } /* @@ -784,6 +970,13 @@ int postcopy_ram_prepare_discard(MigrationIncomingState *mis) return -1; } +int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb, + uint64_t client_addr, uint64_t rb_offset) +{ + assert(0); + return -1; +} + int postcopy_ram_enable_notify(MigrationIncomingState *mis) { assert(0); @@ -810,6 +1003,13 @@ void *postcopy_get_tmp_page(MigrationIncomingState *mis) return NULL; } +int postcopy_wake_shared(struct PostCopyFD *pcfd, + uint64_t client_addr, + RAMBlock *rb) +{ + assert(0); + return -1; +} #endif /* ------------------------------------------------------------------------- */ @@ -927,3 +1127,31 @@ PostcopyState postcopy_state_set(PostcopyState new_state) { return atomic_xchg(&incoming_postcopy_state, new_state); } + +/* Register a handler for external shared memory postcopy + * called on the destination. + */ +void postcopy_register_shared_ufd(struct PostCopyFD *pcfd) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + + mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds, + *pcfd); +} + +/* Unregister a handler for external shared memory postcopy + */ +void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd) +{ + guint i; + MigrationIncomingState *mis = migration_incoming_get_current(); + GArray *pcrfds = mis->postcopy_remote_fds; + + for (i = 0; i < pcrfds->len; i++) { + struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i); + if (cur->fd == pcfd->fd) { + mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i); + return; + } + } +} |