From efb208dc9c3f1e881aecff21fb1c7a7b6b869480 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Wed, 19 May 2021 14:47:40 +0800 Subject: migration/rdma: Fix cm_event used before being initialized A segmentation fault was triggered when i try to abort a postcopy + rdma migration. since rdma_ack_cm_event releases a uninitialized cm_event in these case. like below: 2496 ret = rdma_get_cm_event(rdma->channel, &cm_event); 2497 if (ret) { 2498 perror("rdma_get_cm_event after rdma_connect"); 2499 ERROR(errp, "connecting to destination!"); 2500 rdma_ack_cm_event(cm_event); <<<< cause segmentation fault 2501 goto err_rdma_source_connect; 2502 } Refer to the rdma_get_cm_event() code, cm_event will be updated/changed only if rdma_get_cm_event() returns 0. So it's okey to remove the ack in error patch. Signed-off-by: Li Zhijian Message-Id: <20210519064740.10828-1-lizhijian@cn.fujitsu.com> Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Dr. David Alan Gilbert --- migration/rdma.c | 1 - 1 file changed, 1 deletion(-) (limited to 'migration') diff --git a/migration/rdma.c b/migration/rdma.c index 00eac34232..41726cc74a 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -2497,7 +2497,6 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error **errp) if (ret) { perror("rdma_get_cm_event after rdma_connect"); ERROR(errp, "connecting to destination!"); - rdma_ack_cm_event(cm_event); goto err_rdma_source_connect; } -- cgit v1.2.3 From 4e812d2338acb354b969b59f792f413f567c0ace Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Thu, 20 May 2021 16:11:45 +0800 Subject: migration/rdma: cleanup rdma in rdma_start_incoming_migration error path the error path after calling qemu_rdma_dest_init() should do rdma cleanup Signed-off-by: Li Zhijian Message-Id: <20210520081148.17001-1-lizhijian@cn.fujitsu.com> Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Dr. David Alan Gilbert --- migration/rdma.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'migration') diff --git a/migration/rdma.c b/migration/rdma.c index 41726cc74a..7e7595faab 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -4040,7 +4040,7 @@ void rdma_start_incoming_migration(const char *host_port, Error **errp) if (ret) { ERROR(errp, "listening on socket!"); - goto err; + goto cleanup_rdma; } trace_rdma_start_incoming_migration_after_rdma_listen(); @@ -4050,7 +4050,7 @@ void rdma_start_incoming_migration(const char *host_port, Error **errp) rdma_return_path = qemu_rdma_data_init(host_port, &local_err); if (rdma_return_path == NULL) { - goto err; + goto cleanup_rdma; } qemu_rdma_return_path_dest_init(rdma_return_path, rdma); @@ -4059,6 +4059,9 @@ void rdma_start_incoming_migration(const char *host_port, Error **errp) qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration, NULL, (void *)(intptr_t)rdma); return; + +cleanup_rdma: + qemu_rdma_cleanup(rdma); err: error_propagate(errp, local_err); if (rdma) { -- cgit v1.2.3 From f53b450ada3bf7fa1c88fbf4f13b864af7795bd3 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 25 May 2021 16:05:50 +0800 Subject: migration/rdma: Fix rdma_addrinfo res leaks rdma_freeaddrinfo() is the reverse operation of rdma_getaddrinfo() Signed-off-by: Li Zhijian Reviewed-by: Dr. David Alan Gilbert Message-Id: <20210525080552.28259-2-lizhijian@cn.fujitsu.com> Signed-off-by: Dr. David Alan Gilbert --- migration/rdma.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'migration') diff --git a/migration/rdma.c b/migration/rdma.c index 7e7595faab..651534e825 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -987,10 +987,12 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp) } } + rdma_freeaddrinfo(res); ERROR(errp, "could not resolve address %s", rdma->host); goto err_resolve_get_addr; route: + rdma_freeaddrinfo(res); qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id); ret = rdma_get_cm_event(rdma->channel, &cm_event); @@ -2593,6 +2595,7 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp) break; } + rdma_freeaddrinfo(res); if (!e) { ERROR(errp, "Error: could not rdma_bind_addr!"); goto err_dest_init_bind_addr; -- cgit v1.2.3 From 44bcfd45e9806c78d9d526d69b0590227d215a78 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 25 May 2021 16:05:51 +0800 Subject: migration/rdma: destination: create the return patch after the first accept destination side: $ build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.1.10:8888 (qemu) migrate_set_capability postcopy-ram on (qemu) dest_init RDMA Device opened: kernel name rocep1s0f0 uverbs device name uverbs0, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs0, infiniband class device path /sys/class/infiniband/rocep1s0f0, transport: (2) Ethernet Segmentation fault (core dumped) (gdb) bt #0 qemu_rdma_accept (rdma=0x0) at ../migration/rdma.c:3272 #1 rdma_accept_incoming_migration (opaque=0x0) at ../migration/rdma.c:3986 #2 0x0000563c9e51f02a in aio_dispatch_handler (ctx=ctx@entry=0x563ca0606010, node=0x563ca12b2150) at ../util/aio-posix.c:329 #3 0x0000563c9e51f752 in aio_dispatch_handlers (ctx=0x563ca0606010) at ../util/aio-posix.c:372 #4 aio_dispatch (ctx=0x563ca0606010) at ../util/aio-posix.c:382 #5 0x0000563c9e4f4d9e in aio_ctx_dispatch (source=, callback=, user_data=) at ../util/async.c:306 #6 0x00007fe96ef3fa9f in g_main_context_dispatch () at /lib64/libglib-2.0.so.0 #7 0x0000563c9e4ffeb8 in glib_pollfds_poll () at ../util/main-loop.c:231 #8 os_host_main_loop_wait (timeout=12188789) at ../util/main-loop.c:254 #9 main_loop_wait (nonblocking=nonblocking@entry=0) at ../util/main-loop.c:530 #10 0x0000563c9e3c7211 in qemu_main_loop () at ../softmmu/runstate.c:725 #11 0x0000563c9dfd46fe in main (argc=, argv=, envp=) at ../softmmu/main.c:50 The rdma return path will not be created when qemu incoming is starting since migrate_copy() is false at that moment, then a NULL return path rdma was referenced if the user enabled postcopy later. Signed-off-by: Li Zhijian Message-Id: <20210525080552.28259-3-lizhijian@cn.fujitsu.com> Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Dr. David Alan Gilbert --- migration/rdma.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'migration') diff --git a/migration/rdma.c b/migration/rdma.c index 651534e825..d829d08d07 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -316,6 +316,7 @@ typedef struct RDMALocalBlocks { typedef struct RDMAContext { char *host; int port; + char *host_port; RDMAWorkRequestData wr_data[RDMA_WRID_MAX]; @@ -2392,7 +2393,9 @@ static void qemu_rdma_cleanup(RDMAContext *rdma) rdma->channel = NULL; } g_free(rdma->host); + g_free(rdma->host_port); rdma->host = NULL; + rdma->host_port = NULL; } @@ -2648,6 +2651,7 @@ static void *qemu_rdma_data_init(const char *host_port, Error **errp) if (!inet_parse(addr, host_port, NULL)) { rdma->port = atoi(addr->port); rdma->host = g_strdup(addr->host); + rdma->host_port = g_strdup(host_port); } else { ERROR(errp, "bad RDMA migration address '%s'", host_port); g_free(rdma); @@ -3276,6 +3280,7 @@ static int qemu_rdma_accept(RDMAContext *rdma) .private_data = &cap, .private_data_len = sizeof(cap), }; + RDMAContext *rdma_return_path = NULL; struct rdma_cm_event *cm_event; struct ibv_context *verbs; int ret = -EINVAL; @@ -3291,6 +3296,20 @@ static int qemu_rdma_accept(RDMAContext *rdma) goto err_rdma_dest_wait; } + /* + * initialize the RDMAContext for return path for postcopy after first + * connection request reached. + */ + if (migrate_postcopy() && !rdma->is_return_path) { + rdma_return_path = qemu_rdma_data_init(rdma->host_port, NULL); + if (rdma_return_path == NULL) { + rdma_ack_cm_event(cm_event); + goto err_rdma_dest_wait; + } + + qemu_rdma_return_path_dest_init(rdma_return_path, rdma); + } + memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap)); network_to_caps(&cap); @@ -3406,6 +3425,7 @@ static int qemu_rdma_accept(RDMAContext *rdma) err_rdma_dest_wait: rdma->error_state = ret; qemu_rdma_cleanup(rdma); + g_free(rdma_return_path); return ret; } @@ -4048,17 +4068,6 @@ void rdma_start_incoming_migration(const char *host_port, Error **errp) trace_rdma_start_incoming_migration_after_rdma_listen(); - /* initialize the RDMAContext for return path */ - if (migrate_postcopy()) { - rdma_return_path = qemu_rdma_data_init(host_port, &local_err); - - if (rdma_return_path == NULL) { - goto cleanup_rdma; - } - - qemu_rdma_return_path_dest_init(rdma_return_path, rdma); - } - qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration, NULL, (void *)(intptr_t)rdma); return; @@ -4069,6 +4078,7 @@ err: error_propagate(errp, local_err); if (rdma) { g_free(rdma->host); + g_free(rdma->host_port); } g_free(rdma); g_free(rdma_return_path); -- cgit v1.2.3 From e49e49dd73b8b17f5f341b3e11c8b6878c43d3e1 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 25 May 2021 16:05:52 +0800 Subject: migration/rdma: source: poll cm_event from return path source side always blocks if postcopy is only enabled at source side. users are not able to cancel this migration in this case. Let source side have chance to cancel this migration Signed-off-by: Li Zhijian Message-Id: <20210525080552.28259-4-lizhijian@cn.fujitsu.com> Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Dr. David Alan Gilbert Typo fix --- migration/rdma.c | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) (limited to 'migration') diff --git a/migration/rdma.c b/migration/rdma.c index d829d08d07..1cdb4561f3 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -36,6 +36,7 @@ #include #include "trace.h" #include "qom/object.h" +#include /* * Print and error on both the Monitor and the Log file. @@ -2460,7 +2461,36 @@ err_rdma_source_init: return -1; } -static int qemu_rdma_connect(RDMAContext *rdma, Error **errp) +static int qemu_get_cm_event_timeout(RDMAContext *rdma, + struct rdma_cm_event **cm_event, + long msec, Error **errp) +{ + int ret; + struct pollfd poll_fd = { + .fd = rdma->channel->fd, + .events = POLLIN, + .revents = 0 + }; + + do { + ret = poll(&poll_fd, 1, msec); + } while (ret < 0 && errno == EINTR); + + if (ret == 0) { + ERROR(errp, "poll cm event timeout"); + return -1; + } else if (ret < 0) { + ERROR(errp, "failed to poll cm event, errno=%i", errno); + return -1; + } else if (poll_fd.revents & POLLIN) { + return rdma_get_cm_event(rdma->channel, cm_event); + } else { + ERROR(errp, "no POLLIN event, revent=%x", poll_fd.revents); + return -1; + } +} + +static int qemu_rdma_connect(RDMAContext *rdma, Error **errp, bool return_path) { RDMACapabilities cap = { .version = RDMA_CONTROL_VERSION_CURRENT, @@ -2498,7 +2528,11 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error **errp) goto err_rdma_source_connect; } - ret = rdma_get_cm_event(rdma->channel, &cm_event); + if (return_path) { + ret = qemu_get_cm_event_timeout(rdma, &cm_event, 5000, errp); + } else { + ret = rdma_get_cm_event(rdma->channel, &cm_event); + } if (ret) { perror("rdma_get_cm_event after rdma_connect"); ERROR(errp, "connecting to destination!"); @@ -4111,7 +4145,7 @@ void rdma_start_outgoing_migration(void *opaque, } trace_rdma_start_outgoing_migration_after_rdma_source_init(); - ret = qemu_rdma_connect(rdma, errp); + ret = qemu_rdma_connect(rdma, errp, false); if (ret) { goto err; @@ -4132,7 +4166,7 @@ void rdma_start_outgoing_migration(void *opaque, goto return_path_err; } - ret = qemu_rdma_connect(rdma_return_path, errp); + ret = qemu_rdma_connect(rdma_return_path, errp, true); if (ret) { goto return_path_err; -- cgit v1.2.3