diff options
Diffstat (limited to 'migration/rdma.c')
-rw-r--r-- | migration/rdma.c | 185 |
1 files changed, 33 insertions, 152 deletions
diff --git a/migration/rdma.c b/migration/rdma.c index 672d1958a9..94a55dd95b 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -21,7 +21,6 @@ #include "migration.h" #include "qemu-file.h" #include "ram.h" -#include "qemu-file-channel.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" #include "qemu/module.h" @@ -1371,30 +1370,6 @@ const char *print_wrid(int wrid) } /* - * RDMA requires memory registration (mlock/pinning), but this is not good for - * overcommitment. - * - * In preparation for the future where LRU information or workload-specific - * writable writable working set memory access behavior is available to QEMU - * it would be nice to have in place the ability to UN-register/UN-pin - * particular memory regions from the RDMA hardware when it is determine that - * those regions of memory will likely not be accessed again in the near future. - * - * While we do not yet have such information right now, the following - * compile-time option allows us to perform a non-optimized version of this - * behavior. - * - * By uncommenting this option, you will cause *all* RDMA transfers to be - * unregistered immediately after the transfer completes on both sides of the - * connection. This has no effect in 'rdma-pin-all' mode, only regular mode. - * - * This will have a terrible impact on migration performance, so until future - * workload information or LRU information is available, do not attempt to use - * this feature except for basic testing. - */ -/* #define RDMA_UNREGISTRATION_EXAMPLE */ - -/* * Perform a non-optimized memory unregistration after every transfer * for demonstration purposes, only if pin-all is not requested. * @@ -1487,34 +1462,6 @@ static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index, } /* - * Set bit for unregistration in the next iteration. - * We cannot transmit right here, but will unpin later. - */ -static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index, - uint64_t chunk, uint64_t wr_id) -{ - if (rdma->unregistrations[rdma->unregister_next] != 0) { - error_report("rdma migration: queue is full"); - } else { - RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]); - - if (!test_and_set_bit(chunk, block->unregister_bitmap)) { - trace_qemu_rdma_signal_unregister_append(chunk, - rdma->unregister_next); - - rdma->unregistrations[rdma->unregister_next++] = - qemu_rdma_make_wrid(wr_id, index, chunk); - - if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) { - rdma->unregister_next = 0; - } - } else { - trace_qemu_rdma_signal_unregister_already(chunk); - } - } -} - -/* * Consult the connection manager to see a work request * (of any kind) has completed. * Return the work request ID that completed. @@ -1571,18 +1518,6 @@ static uint64_t qemu_rdma_poll(RDMAContext *rdma, struct ibv_cq *cq, if (rdma->nb_sent > 0) { rdma->nb_sent--; } - - if (!rdma->pin_all) { - /* - * FYI: If one wanted to signal a specific chunk to be unregistered - * using LRU or workload-specific information, this is the function - * you would call to do so. That chunk would then get asynchronously - * unregistered later. - */ -#ifdef RDMA_UNREGISTRATION_EXAMPLE - qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id); -#endif - } } else { trace_qemu_rdma_poll_other(print_wrid(wr_id), wr_id, rdma->nb_sent); } @@ -2137,11 +2072,6 @@ retry: chunk_end = ram_chunk_end(block, chunk + chunks); - if (!rdma->pin_all) { -#ifdef RDMA_UNREGISTRATION_EXAMPLE - qemu_rdma_unregister_waiting(rdma); -#endif - } while (test_bit(chunk, block->transit_bitmap)) { (void)count; @@ -3278,33 +3208,17 @@ qio_channel_rdma_shutdown(QIOChannel *ioc, * Offset is an offset to be added to block_offset and used * to also lookup the corresponding RAMBlock. * - * @size > 0 : - * Initiate an transfer this size. - * - * @size == 0 : - * A 'hint' or 'advice' that means that we wish to speculatively - * and asynchronously unregister this memory. In this case, there is no - * guarantee that the unregister will actually happen, for example, - * if the memory is being actively transmitted. Additionally, the memory - * may be re-registered at any future time if a write within the same - * chunk was requested again, even if you attempted to unregister it - * here. - * - * @size < 0 : TODO, not yet supported - * Unregister the memory NOW. This means that the caller does not - * expect there to be any future RDMA transfers and we just want to clean - * things up. This is used in case the upper layer owns the memory and - * cannot wait for qemu_fclose() to occur. + * @size : Number of bytes to transfer * * @bytes_sent : User-specificed pointer to indicate how many bytes were * sent. Usually, this will not be more than a few bytes of * the protocol because most transfers are sent asynchronously. */ -static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque, +static size_t qemu_rdma_save_page(QEMUFile *f, ram_addr_t block_offset, ram_addr_t offset, size_t size, uint64_t *bytes_sent) { - QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque); + QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f)); RDMAContext *rdma; int ret; @@ -3323,61 +3237,27 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque, qemu_fflush(f); - if (size > 0) { - /* - * Add this page to the current 'chunk'. If the chunk - * is full, or the page doesn't belong to the current chunk, - * an actual RDMA write will occur and a new chunk will be formed. - */ - ret = qemu_rdma_write(f, rdma, block_offset, offset, size); - if (ret < 0) { - error_report("rdma migration: write error! %d", ret); - goto err; - } - - /* - * We always return 1 bytes because the RDMA - * protocol is completely asynchronous. We do not yet know - * whether an identified chunk is zero or not because we're - * waiting for other pages to potentially be merged with - * the current chunk. So, we have to call qemu_update_position() - * later on when the actual write occurs. - */ - if (bytes_sent) { - *bytes_sent = 1; - } - } else { - uint64_t index, chunk; - - /* TODO: Change QEMUFileOps prototype to be signed: size_t => long - if (size < 0) { - ret = qemu_rdma_drain_cq(f, rdma); - if (ret < 0) { - fprintf(stderr, "rdma: failed to synchronously drain" - " completion queue before unregistration.\n"); - goto err; - } - } - */ - - ret = qemu_rdma_search_ram_block(rdma, block_offset, - offset, size, &index, &chunk); - - if (ret) { - error_report("ram block search failed"); - goto err; - } - - qemu_rdma_signal_unregister(rdma, index, chunk, 0); + /* + * Add this page to the current 'chunk'. If the chunk + * is full, or the page doesn't belong to the current chunk, + * an actual RDMA write will occur and a new chunk will be formed. + */ + ret = qemu_rdma_write(f, rdma, block_offset, offset, size); + if (ret < 0) { + error_report("rdma migration: write error! %d", ret); + goto err; + } - /* - * TODO: Synchronous, guaranteed unregistration (should not occur during - * fast-path). Otherwise, unregisters will process on the next call to - * qemu_rdma_drain_cq() - if (size < 0) { - qemu_rdma_unregister_waiting(rdma); - } - */ + /* + * We always return 1 bytes because the RDMA + * protocol is completely asynchronous. We do not yet know + * whether an identified chunk is zero or not because we're + * waiting for other pages to potentially be merged with + * the current chunk. So, we have to call qemu_update_position() + * later on when the actual write occurs. + */ + if (bytes_sent) { + *bytes_sent = 1; } /* @@ -3950,14 +3830,15 @@ rdma_block_notification_handle(QIOChannelRDMA *rioc, const char *name) return 0; } -static int rdma_load_hook(QEMUFile *f, void *opaque, uint64_t flags, void *data) +static int rdma_load_hook(QEMUFile *f, uint64_t flags, void *data) { + QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f)); switch (flags) { case RAM_CONTROL_BLOCK_REG: - return rdma_block_notification_handle(opaque, data); + return rdma_block_notification_handle(rioc, data); case RAM_CONTROL_HOOK: - return qemu_rdma_registration_handle(f, opaque); + return qemu_rdma_registration_handle(f, rioc); default: /* Shouldn't be called with any other values */ @@ -3965,10 +3846,10 @@ static int rdma_load_hook(QEMUFile *f, void *opaque, uint64_t flags, void *data) } } -static int qemu_rdma_registration_start(QEMUFile *f, void *opaque, +static int qemu_rdma_registration_start(QEMUFile *f, uint64_t flags, void *data) { - QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque); + QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f)); RDMAContext *rdma; RCU_READ_LOCK_GUARD(); @@ -3994,10 +3875,10 @@ static int qemu_rdma_registration_start(QEMUFile *f, void *opaque, * Inform dest that dynamic registrations are done for now. * First, flush writes, if any. */ -static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque, +static int qemu_rdma_registration_stop(QEMUFile *f, uint64_t flags, void *data) { - QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque); + QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f)); RDMAContext *rdma; RDMAControlHeader head = { .len = 0, .repeat = 1 }; int ret = 0; @@ -4170,12 +4051,12 @@ static QEMUFile *qemu_fopen_rdma(RDMAContext *rdma, const char *mode) rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA)); if (mode[0] == 'w') { - rioc->file = qemu_fopen_channel_output(QIO_CHANNEL(rioc)); + rioc->file = qemu_file_new_output(QIO_CHANNEL(rioc)); rioc->rdmaout = rdma; rioc->rdmain = rdma->return_path; qemu_file_set_hooks(rioc->file, &rdma_write_hooks); } else { - rioc->file = qemu_fopen_channel_input(QIO_CHANNEL(rioc)); + rioc->file = qemu_file_new_input(QIO_CHANNEL(rioc)); rioc->rdmain = rdma; rioc->rdmaout = rdma->return_path; qemu_file_set_hooks(rioc->file, &rdma_read_hooks); |